[ZARCH] Fix constraints and source code formatting

This commit is contained in:
maamountki 2019-02-11 16:01:13 +02:00 committed by GitHub
parent 7039770165
commit 77fe70019f
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
67 changed files with 7439 additions and 7354 deletions

View File

@ -34,112 +34,112 @@ static FLOAT camax_kernel_32(BLASLONG n, FLOAT *x) {
FLOAT amax;
__asm__("vlef %%v0,0(%[x]),0\n\t"
"vlef %%v16,4(%[x]),0\n\t"
"vlef %%v0,8(%[x]),1\n\t"
"vlef %%v16,12(%[x]),1\n\t"
"vlef %%v0,16(%[x]),2\n\t"
"vlef %%v16,20(%[x]),2\n\t"
"vlef %%v0,24(%[x]),3\n\t"
"vlef %%v16,28(%[x]),3\n\t"
"vflpsb %%v0,%%v0\n\t"
"vflpsb %%v16,%%v16\n\t"
"vfasb %%v0,%%v0,%%v16\n\t"
"vleib %%v1,0,0\n\t"
"vleib %%v1,1,1\n\t"
"vleib %%v1,2,2\n\t"
"vleib %%v1,3,3\n\t"
"vleib %%v1,8,4\n\t"
"vleib %%v1,9,5\n\t"
"vleib %%v1,10,6\n\t"
"vleib %%v1,11,7\n\t"
"vleib %%v1,16,8\n\t"
"vleib %%v1,17,9\n\t"
"vleib %%v1,18,10\n\t"
"vleib %%v1,19,11\n\t"
"vleib %%v1,24,12\n\t"
"vleib %%v1,25,13\n\t"
"vleib %%v1,26,14\n\t"
"vleib %%v1,27,15\n\t"
"srlg %[n],%[n],5\n\t"
"xgr %%r1,%%r1\n\t"
"0:\n\t"
"pfd 1, 1024(%%r1,%[x])\n\t"
"vl %%v16,0(%%r1,%[x])\n\t"
"vl %%v2,16(%%r1,%[x])\n\t"
"vpkg %%v17,%%v16,%%v2\n\t"
"vperm %%v16,%%v16,%%v2,%%v1\n\t"
"vl %%v18,32(%%r1,%[x])\n\t"
"vl %%v2,48(%%r1,%[x])\n\t"
"vpkg %%v19,%%v18,%%v2\n\t"
"vperm %%v18,%%v18,%%v2,%%v1\n\t"
"vl %%v20,64(%%r1,%[x])\n\t"
"vl %%v2,80(%%r1,%[x])\n\t"
"vpkg %%v21,%%v20,%%v2\n\t"
"vperm %%v20,%%v20,%%v2,%%v1\n\t"
"vl %%v22,96(%%r1,%[x])\n\t"
"vl %%v2,112(%%r1,%[x])\n\t"
"vpkg %%v23,%%v22,%%v2\n\t"
"vperm %%v22,%%v22,%%v2,%%v1\n\t"
"vl %%v24,128(%%r1,%[x])\n\t"
"vl %%v2,144(%%r1,%[x])\n\t"
"vpkg %%v25,%%v24,%%v2\n\t"
"vperm %%v24,%%v24,%%v2,%%v1\n\t"
"vl %%v26,160(%%r1,%[x])\n\t"
"vl %%v2,176(%%r1,%[x])\n\t"
"vpkg %%v27,%%v26,%%v2\n\t"
"vperm %%v26,%%v26,%%v2,%%v1\n\t"
"vl %%v28,192(%%r1,%[x])\n\t"
"vl %%v2,208(%%r1,%[x])\n\t"
"vpkg %%v29,%%v28,%%v2\n\t"
"vperm %%v28,%%v28,%%v2,%%v1\n\t"
"vl %%v30,224(%%r1,%[x])\n\t"
"vl %%v2,240(%%r1,%[x])\n\t"
"vpkg %%v31,%%v30,%%v2\n\t"
"vperm %%v30,%%v30,%%v2,%%v1\n\t"
"vflpsb %%v16,%%v16\n\t"
"vflpsb %%v17,%%v17\n\t"
"vflpsb %%v18,%%v18\n\t"
"vflpsb %%v19,%%v19\n\t"
"vflpsb %%v20,%%v20\n\t"
"vflpsb %%v21,%%v21\n\t"
"vflpsb %%v22,%%v22\n\t"
"vflpsb %%v23,%%v23\n\t"
"vflpsb %%v24,%%v24\n\t"
"vflpsb %%v25,%%v25\n\t"
"vflpsb %%v26,%%v26\n\t"
"vflpsb %%v27,%%v27\n\t"
"vflpsb %%v28,%%v28\n\t"
"vflpsb %%v29,%%v29\n\t"
"vflpsb %%v30,%%v30\n\t"
"vflpsb %%v31,%%v31\n\t"
"vfasb %%v16,%%v16,%%v17\n\t"
"vfasb %%v18,%%v18,%%v19\n\t"
"vfasb %%v20,%%v20,%%v21\n\t"
"vfasb %%v22,%%v22,%%v23\n\t"
"vfasb %%v24,%%v24,%%v25\n\t"
"vfasb %%v26,%%v26,%%v27\n\t"
"vfasb %%v28,%%v28,%%v29\n\t"
"vfasb %%v30,%%v30,%%v31\n\t"
"vfmaxsb %%v16,%%v16,%%v24,0\n\t"
"vfmaxsb %%v18,%%v18,%%v26,0\n\t"
"vfmaxsb %%v20,%%v20,%%v28,0\n\t"
"vfmaxsb %%v22,%%v22,%%v30,0\n\t"
"vfmaxsb %%v16,%%v16,%%v20,0\n\t"
"vfmaxsb %%v18,%%v18,%%v22,0\n\t"
"vfmaxsb %%v16,%%v16,%%v18,0\n\t"
"vfmaxsb %%v0,%%v0,%%v16,0\n\t"
"agfi %%r1, 256\n\t"
"brctg %[n], 0b\n\t"
"veslg %%v16,%%v0,32\n\t"
"vfmaxsb %%v0,%%v0,%%v16,0\n\t"
"vrepf %%v16,%%v0,2\n\t"
"wfmaxsb %%v0,%%v0,%%v16,0\n\t"
"ler %[amax],%%f0"
: [amax] "=f"(amax),[n] "+&r"(n)
: "m"(*(const FLOAT (*)[n * 2]) x),[x] "a"(x)
: "cc", "r1", "v0", "v1", "v2", "v16", "v17", "v18", "v19", "v20",
"v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30",
"v31");
"vlef %%v16,4(%[x]),0\n\t"
"vlef %%v0,8(%[x]),1\n\t"
"vlef %%v16,12(%[x]),1\n\t"
"vlef %%v0,16(%[x]),2\n\t"
"vlef %%v16,20(%[x]),2\n\t"
"vlef %%v0,24(%[x]),3\n\t"
"vlef %%v16,28(%[x]),3\n\t"
"vflpsb %%v0,%%v0\n\t"
"vflpsb %%v16,%%v16\n\t"
"vfasb %%v0,%%v0,%%v16\n\t"
"vleib %%v1,0,0\n\t"
"vleib %%v1,1,1\n\t"
"vleib %%v1,2,2\n\t"
"vleib %%v1,3,3\n\t"
"vleib %%v1,8,4\n\t"
"vleib %%v1,9,5\n\t"
"vleib %%v1,10,6\n\t"
"vleib %%v1,11,7\n\t"
"vleib %%v1,16,8\n\t"
"vleib %%v1,17,9\n\t"
"vleib %%v1,18,10\n\t"
"vleib %%v1,19,11\n\t"
"vleib %%v1,24,12\n\t"
"vleib %%v1,25,13\n\t"
"vleib %%v1,26,14\n\t"
"vleib %%v1,27,15\n\t"
"srlg %[n],%[n],5\n\t"
"xgr %%r1,%%r1\n\t"
"0:\n\t"
"pfd 1, 1024(%%r1,%[x])\n\t"
"vl %%v16,0(%%r1,%[x])\n\t"
"vl %%v2,16(%%r1,%[x])\n\t"
"vpkg %%v17,%%v16,%%v2\n\t"
"vperm %%v16,%%v16,%%v2,%%v1\n\t"
"vl %%v18,32(%%r1,%[x])\n\t"
"vl %%v2,48(%%r1,%[x])\n\t"
"vpkg %%v19,%%v18,%%v2\n\t"
"vperm %%v18,%%v18,%%v2,%%v1\n\t"
"vl %%v20,64(%%r1,%[x])\n\t"
"vl %%v2,80(%%r1,%[x])\n\t"
"vpkg %%v21,%%v20,%%v2\n\t"
"vperm %%v20,%%v20,%%v2,%%v1\n\t"
"vl %%v22,96(%%r1,%[x])\n\t"
"vl %%v2,112(%%r1,%[x])\n\t"
"vpkg %%v23,%%v22,%%v2\n\t"
"vperm %%v22,%%v22,%%v2,%%v1\n\t"
"vl %%v24,128(%%r1,%[x])\n\t"
"vl %%v2,144(%%r1,%[x])\n\t"
"vpkg %%v25,%%v24,%%v2\n\t"
"vperm %%v24,%%v24,%%v2,%%v1\n\t"
"vl %%v26,160(%%r1,%[x])\n\t"
"vl %%v2,176(%%r1,%[x])\n\t"
"vpkg %%v27,%%v26,%%v2\n\t"
"vperm %%v26,%%v26,%%v2,%%v1\n\t"
"vl %%v28,192(%%r1,%[x])\n\t"
"vl %%v2,208(%%r1,%[x])\n\t"
"vpkg %%v29,%%v28,%%v2\n\t"
"vperm %%v28,%%v28,%%v2,%%v1\n\t"
"vl %%v30,224(%%r1,%[x])\n\t"
"vl %%v2,240(%%r1,%[x])\n\t"
"vpkg %%v31,%%v30,%%v2\n\t"
"vperm %%v30,%%v30,%%v2,%%v1\n\t"
"vflpsb %%v16,%%v16\n\t"
"vflpsb %%v17,%%v17\n\t"
"vflpsb %%v18,%%v18\n\t"
"vflpsb %%v19,%%v19\n\t"
"vflpsb %%v20,%%v20\n\t"
"vflpsb %%v21,%%v21\n\t"
"vflpsb %%v22,%%v22\n\t"
"vflpsb %%v23,%%v23\n\t"
"vflpsb %%v24,%%v24\n\t"
"vflpsb %%v25,%%v25\n\t"
"vflpsb %%v26,%%v26\n\t"
"vflpsb %%v27,%%v27\n\t"
"vflpsb %%v28,%%v28\n\t"
"vflpsb %%v29,%%v29\n\t"
"vflpsb %%v30,%%v30\n\t"
"vflpsb %%v31,%%v31\n\t"
"vfasb %%v16,%%v16,%%v17\n\t"
"vfasb %%v18,%%v18,%%v19\n\t"
"vfasb %%v20,%%v20,%%v21\n\t"
"vfasb %%v22,%%v22,%%v23\n\t"
"vfasb %%v24,%%v24,%%v25\n\t"
"vfasb %%v26,%%v26,%%v27\n\t"
"vfasb %%v28,%%v28,%%v29\n\t"
"vfasb %%v30,%%v30,%%v31\n\t"
"vfmaxsb %%v16,%%v16,%%v24,0\n\t"
"vfmaxsb %%v18,%%v18,%%v26,0\n\t"
"vfmaxsb %%v20,%%v20,%%v28,0\n\t"
"vfmaxsb %%v22,%%v22,%%v30,0\n\t"
"vfmaxsb %%v16,%%v16,%%v20,0\n\t"
"vfmaxsb %%v18,%%v18,%%v22,0\n\t"
"vfmaxsb %%v16,%%v16,%%v18,0\n\t"
"vfmaxsb %%v0,%%v0,%%v16,0\n\t"
"agfi %%r1, 256\n\t"
"brctg %[n], 0b\n\t"
"veslg %%v16,%%v0,32\n\t"
"vfmaxsb %%v0,%%v0,%%v16,0\n\t"
"vrepf %%v16,%%v0,2\n\t"
"wfmaxsb %%v0,%%v0,%%v16,0\n\t"
"ler %[amax],%%f0"
: [amax] "=f"(amax),[n] "+&r"(n)
: "m"(*(const struct { FLOAT x[n * 2]; } *) x),[x] "a"(x)
: "cc", "r1", "v0", "v1", "v2", "v16", "v17", "v18", "v19", "v20",
"v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30",
"v31");
return amax;
}

View File

@ -34,112 +34,112 @@ static FLOAT camin_kernel_32(BLASLONG n, FLOAT *x) {
FLOAT amin;
__asm__("vlef %%v0,0(%[x]),0\n\t"
"vlef %%v16,4(%[x]),0\n\t"
"vlef %%v0,8(%[x]),1\n\t"
"vlef %%v16,12(%[x]),1\n\t"
"vlef %%v0,16(%[x]),2\n\t"
"vlef %%v16,20(%[x]),2\n\t"
"vlef %%v0,24(%[x]),3\n\t"
"vlef %%v16,28(%[x]),3\n\t"
"vflpsb %%v0,%%v0\n\t"
"vflpsb %%v16,%%v16\n\t"
"vfasb %%v0,%%v0,%%v16\n\t"
"vleib %%v1,0,0\n\t"
"vleib %%v1,1,1\n\t"
"vleib %%v1,2,2\n\t"
"vleib %%v1,3,3\n\t"
"vleib %%v1,8,4\n\t"
"vleib %%v1,9,5\n\t"
"vleib %%v1,10,6\n\t"
"vleib %%v1,11,7\n\t"
"vleib %%v1,16,8\n\t"
"vleib %%v1,17,9\n\t"
"vleib %%v1,18,10\n\t"
"vleib %%v1,19,11\n\t"
"vleib %%v1,24,12\n\t"
"vleib %%v1,25,13\n\t"
"vleib %%v1,26,14\n\t"
"vleib %%v1,27,15\n\t"
"srlg %[n],%[n],5\n\t"
"xgr %%r1,%%r1\n\t"
"0:\n\t"
"pfd 1, 1024(%%r1,%[x])\n\t"
"vl %%v16,0(%%r1,%[x])\n\t"
"vl %%v2,16(%%r1,%[x])\n\t"
"vpkg %%v17,%%v16,%%v2\n\t"
"vperm %%v16,%%v16,%%v2,%%v1\n\t"
"vl %%v18,32(%%r1,%[x])\n\t"
"vl %%v2,48(%%r1,%[x])\n\t"
"vpkg %%v19,%%v18,%%v2\n\t"
"vperm %%v18,%%v18,%%v2,%%v1\n\t"
"vl %%v20,64(%%r1,%[x])\n\t"
"vl %%v2,80(%%r1,%[x])\n\t"
"vpkg %%v21,%%v20,%%v2\n\t"
"vperm %%v20,%%v20,%%v2,%%v1\n\t"
"vl %%v22,96(%%r1,%[x])\n\t"
"vl %%v2,112(%%r1,%[x])\n\t"
"vpkg %%v23,%%v22,%%v2\n\t"
"vperm %%v22,%%v22,%%v2,%%v1\n\t"
"vl %%v24,128(%%r1,%[x])\n\t"
"vl %%v2,144(%%r1,%[x])\n\t"
"vpkg %%v25,%%v24,%%v2\n\t"
"vperm %%v24,%%v24,%%v2,%%v1\n\t"
"vl %%v26,160(%%r1,%[x])\n\t"
"vl %%v2,176(%%r1,%[x])\n\t"
"vpkg %%v27,%%v26,%%v2\n\t"
"vperm %%v26,%%v26,%%v2,%%v1\n\t"
"vl %%v28,192(%%r1,%[x])\n\t"
"vl %%v2,208(%%r1,%[x])\n\t"
"vpkg %%v29,%%v28,%%v2\n\t"
"vperm %%v28,%%v28,%%v2,%%v1\n\t"
"vl %%v30,224(%%r1,%[x])\n\t"
"vl %%v2,240(%%r1,%[x])\n\t"
"vpkg %%v31,%%v30,%%v2\n\t"
"vperm %%v30,%%v30,%%v2,%%v1\n\t"
"vflpsb %%v16,%%v16\n\t"
"vflpsb %%v17,%%v17\n\t"
"vflpsb %%v18,%%v18\n\t"
"vflpsb %%v19,%%v19\n\t"
"vflpsb %%v20,%%v20\n\t"
"vflpsb %%v21,%%v21\n\t"
"vflpsb %%v22,%%v22\n\t"
"vflpsb %%v23,%%v23\n\t"
"vflpsb %%v24,%%v24\n\t"
"vflpsb %%v25,%%v25\n\t"
"vflpsb %%v26,%%v26\n\t"
"vflpsb %%v27,%%v27\n\t"
"vflpsb %%v28,%%v28\n\t"
"vflpsb %%v29,%%v29\n\t"
"vflpsb %%v30,%%v30\n\t"
"vflpsb %%v31,%%v31\n\t"
"vfasb %%v16,%%v16,%%v17\n\t"
"vfasb %%v18,%%v18,%%v19\n\t"
"vfasb %%v20,%%v20,%%v21\n\t"
"vfasb %%v22,%%v22,%%v23\n\t"
"vfasb %%v24,%%v24,%%v25\n\t"
"vfasb %%v26,%%v26,%%v27\n\t"
"vfasb %%v28,%%v28,%%v29\n\t"
"vfasb %%v30,%%v30,%%v31\n\t"
"vfminsb %%v16,%%v16,%%v24,0\n\t"
"vfminsb %%v18,%%v18,%%v26,0\n\t"
"vfminsb %%v20,%%v20,%%v28,0\n\t"
"vfminsb %%v22,%%v22,%%v30,0\n\t"
"vfminsb %%v16,%%v16,%%v20,0\n\t"
"vfminsb %%v18,%%v18,%%v22,0\n\t"
"vfminsb %%v16,%%v16,%%v18,0\n\t"
"vfminsb %%v0,%%v0,%%v16,0\n\t"
"agfi %%r1, 256\n\t"
"brctg %[n], 0b\n\t"
"veslg %%v16,%%v0,32\n\t"
"vfminsb %%v0,%%v0,%%v16,0\n\t"
"vrepf %%v16,%%v0,2\n\t"
"wfminsb %%v0,%%v0,%%v16,0\n\t"
"ler %[amin],%%f0"
: [amin] "=f"(amin),[n] "+&r"(n)
: "m"(*(const FLOAT (*)[n * 2]) x),[x] "a"(x)
: "cc", "r1", "v0", "v1", "v2", "v16", "v17", "v18", "v19", "v20",
"v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30",
"v31");
"vlef %%v16,4(%[x]),0\n\t"
"vlef %%v0,8(%[x]),1\n\t"
"vlef %%v16,12(%[x]),1\n\t"
"vlef %%v0,16(%[x]),2\n\t"
"vlef %%v16,20(%[x]),2\n\t"
"vlef %%v0,24(%[x]),3\n\t"
"vlef %%v16,28(%[x]),3\n\t"
"vflpsb %%v0,%%v0\n\t"
"vflpsb %%v16,%%v16\n\t"
"vfasb %%v0,%%v0,%%v16\n\t"
"vleib %%v1,0,0\n\t"
"vleib %%v1,1,1\n\t"
"vleib %%v1,2,2\n\t"
"vleib %%v1,3,3\n\t"
"vleib %%v1,8,4\n\t"
"vleib %%v1,9,5\n\t"
"vleib %%v1,10,6\n\t"
"vleib %%v1,11,7\n\t"
"vleib %%v1,16,8\n\t"
"vleib %%v1,17,9\n\t"
"vleib %%v1,18,10\n\t"
"vleib %%v1,19,11\n\t"
"vleib %%v1,24,12\n\t"
"vleib %%v1,25,13\n\t"
"vleib %%v1,26,14\n\t"
"vleib %%v1,27,15\n\t"
"srlg %[n],%[n],5\n\t"
"xgr %%r1,%%r1\n\t"
"0:\n\t"
"pfd 1, 1024(%%r1,%[x])\n\t"
"vl %%v16,0(%%r1,%[x])\n\t"
"vl %%v2,16(%%r1,%[x])\n\t"
"vpkg %%v17,%%v16,%%v2\n\t"
"vperm %%v16,%%v16,%%v2,%%v1\n\t"
"vl %%v18,32(%%r1,%[x])\n\t"
"vl %%v2,48(%%r1,%[x])\n\t"
"vpkg %%v19,%%v18,%%v2\n\t"
"vperm %%v18,%%v18,%%v2,%%v1\n\t"
"vl %%v20,64(%%r1,%[x])\n\t"
"vl %%v2,80(%%r1,%[x])\n\t"
"vpkg %%v21,%%v20,%%v2\n\t"
"vperm %%v20,%%v20,%%v2,%%v1\n\t"
"vl %%v22,96(%%r1,%[x])\n\t"
"vl %%v2,112(%%r1,%[x])\n\t"
"vpkg %%v23,%%v22,%%v2\n\t"
"vperm %%v22,%%v22,%%v2,%%v1\n\t"
"vl %%v24,128(%%r1,%[x])\n\t"
"vl %%v2,144(%%r1,%[x])\n\t"
"vpkg %%v25,%%v24,%%v2\n\t"
"vperm %%v24,%%v24,%%v2,%%v1\n\t"
"vl %%v26,160(%%r1,%[x])\n\t"
"vl %%v2,176(%%r1,%[x])\n\t"
"vpkg %%v27,%%v26,%%v2\n\t"
"vperm %%v26,%%v26,%%v2,%%v1\n\t"
"vl %%v28,192(%%r1,%[x])\n\t"
"vl %%v2,208(%%r1,%[x])\n\t"
"vpkg %%v29,%%v28,%%v2\n\t"
"vperm %%v28,%%v28,%%v2,%%v1\n\t"
"vl %%v30,224(%%r1,%[x])\n\t"
"vl %%v2,240(%%r1,%[x])\n\t"
"vpkg %%v31,%%v30,%%v2\n\t"
"vperm %%v30,%%v30,%%v2,%%v1\n\t"
"vflpsb %%v16,%%v16\n\t"
"vflpsb %%v17,%%v17\n\t"
"vflpsb %%v18,%%v18\n\t"
"vflpsb %%v19,%%v19\n\t"
"vflpsb %%v20,%%v20\n\t"
"vflpsb %%v21,%%v21\n\t"
"vflpsb %%v22,%%v22\n\t"
"vflpsb %%v23,%%v23\n\t"
"vflpsb %%v24,%%v24\n\t"
"vflpsb %%v25,%%v25\n\t"
"vflpsb %%v26,%%v26\n\t"
"vflpsb %%v27,%%v27\n\t"
"vflpsb %%v28,%%v28\n\t"
"vflpsb %%v29,%%v29\n\t"
"vflpsb %%v30,%%v30\n\t"
"vflpsb %%v31,%%v31\n\t"
"vfasb %%v16,%%v16,%%v17\n\t"
"vfasb %%v18,%%v18,%%v19\n\t"
"vfasb %%v20,%%v20,%%v21\n\t"
"vfasb %%v22,%%v22,%%v23\n\t"
"vfasb %%v24,%%v24,%%v25\n\t"
"vfasb %%v26,%%v26,%%v27\n\t"
"vfasb %%v28,%%v28,%%v29\n\t"
"vfasb %%v30,%%v30,%%v31\n\t"
"vfminsb %%v16,%%v16,%%v24,0\n\t"
"vfminsb %%v18,%%v18,%%v26,0\n\t"
"vfminsb %%v20,%%v20,%%v28,0\n\t"
"vfminsb %%v22,%%v22,%%v30,0\n\t"
"vfminsb %%v16,%%v16,%%v20,0\n\t"
"vfminsb %%v18,%%v18,%%v22,0\n\t"
"vfminsb %%v16,%%v16,%%v18,0\n\t"
"vfminsb %%v0,%%v0,%%v16,0\n\t"
"agfi %%r1, 256\n\t"
"brctg %[n], 0b\n\t"
"veslg %%v16,%%v0,32\n\t"
"vfminsb %%v0,%%v0,%%v16,0\n\t"
"vrepf %%v16,%%v0,2\n\t"
"wfminsb %%v0,%%v0,%%v16,0\n\t"
"ler %[amin],%%f0"
: [amin] "=f"(amin),[n] "+&r"(n)
: "m"(*(const struct { FLOAT x[n * 2]; } *) x),[x] "a"(x)
: "cc", "r1", "v0", "v1", "v2", "v16", "v17", "v18", "v19", "v20",
"v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30",
"v31");
return amin;
}

View File

@ -34,83 +34,83 @@ static FLOAT casum_kernel_32(BLASLONG n, FLOAT *x) {
FLOAT asum;
__asm__("vzero %%v24\n\t"
"vzero %%v25\n\t"
"vzero %%v26\n\t"
"vzero %%v27\n\t"
"vzero %%v28\n\t"
"vzero %%v29\n\t"
"vzero %%v30\n\t"
"vzero %%v31\n\t"
"srlg %[n],%[n],5\n\t"
"xgr %%r1,%%r1\n\t"
"0:\n\t"
"pfd 1, 1024(%%r1,%[x])\n\t"
"vl %%v16, 0(%%r1,%[x])\n\t"
"vl %%v17, 16(%%r1,%[x])\n\t"
"vl %%v18, 32(%%r1,%[x])\n\t"
"vl %%v19, 48(%%r1,%[x])\n\t"
"vl %%v20, 64(%%r1,%[x])\n\t"
"vl %%v21, 80(%%r1,%[x])\n\t"
"vl %%v22, 96(%%r1,%[x])\n\t"
"vl %%v23, 112(%%r1,%[x])\n\t"
"vflpsb %%v16, %%v16\n\t"
"vflpsb %%v17, %%v17\n\t"
"vflpsb %%v18, %%v18\n\t"
"vflpsb %%v19, %%v19\n\t"
"vflpsb %%v20, %%v20\n\t"
"vflpsb %%v21, %%v21\n\t"
"vflpsb %%v22, %%v22\n\t"
"vflpsb %%v23, %%v23\n\t"
"vfasb %%v24,%%v24,%%v16\n\t"
"vfasb %%v25,%%v25,%%v17\n\t"
"vfasb %%v26,%%v26,%%v18\n\t"
"vfasb %%v27,%%v27,%%v19\n\t"
"vfasb %%v28,%%v28,%%v20\n\t"
"vfasb %%v29,%%v29,%%v21\n\t"
"vfasb %%v30,%%v30,%%v22\n\t"
"vfasb %%v31,%%v31,%%v23\n\t"
"vl %%v16, 128(%%r1,%[x])\n\t"
"vl %%v17, 144(%%r1,%[x])\n\t"
"vl %%v18, 160(%%r1,%[x])\n\t"
"vl %%v19, 176(%%r1,%[x])\n\t"
"vl %%v20, 192(%%r1,%[x])\n\t"
"vl %%v21, 208(%%r1,%[x])\n\t"
"vl %%v22, 224(%%r1,%[x])\n\t"
"vl %%v23, 240(%%r1,%[x])\n\t"
"vflpsb %%v16, %%v16\n\t"
"vflpsb %%v17, %%v17\n\t"
"vflpsb %%v18, %%v18\n\t"
"vflpsb %%v19, %%v19\n\t"
"vflpsb %%v20, %%v20\n\t"
"vflpsb %%v21, %%v21\n\t"
"vflpsb %%v22, %%v22\n\t"
"vflpsb %%v23, %%v23\n\t"
"vfasb %%v24,%%v24,%%v16\n\t"
"vfasb %%v25,%%v25,%%v17\n\t"
"vfasb %%v26,%%v26,%%v18\n\t"
"vfasb %%v27,%%v27,%%v19\n\t"
"vfasb %%v28,%%v28,%%v20\n\t"
"vfasb %%v29,%%v29,%%v21\n\t"
"vfasb %%v30,%%v30,%%v22\n\t"
"vfasb %%v31,%%v31,%%v23\n\t"
"agfi %%r1,256\n\t"
"brctg %[n],0b\n\t"
"vfasb %%v24,%%v24,%%v25\n\t"
"vfasb %%v24,%%v24,%%v26\n\t"
"vfasb %%v24,%%v24,%%v27\n\t"
"vfasb %%v24,%%v24,%%v28\n\t"
"vfasb %%v24,%%v24,%%v29\n\t"
"vfasb %%v24,%%v24,%%v30\n\t"
"vfasb %%v24,%%v24,%%v31\n\t"
"veslg %%v25,%%v24,32\n\t"
"vfasb %%v24,%%v24,%%v25\n\t"
"vrepf %%v25,%%v24,2\n\t"
"vfasb %%v24,%%v24,%%v25\n\t"
"vstef %%v24,%[asum],0"
: [asum] "=m"(asum),[n] "+&r"(n)
: "m"(*(const FLOAT (*)[n * 2]) x),[x] "a"(x)
: "cc", "r1", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23",
"v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31");
"vzero %%v25\n\t"
"vzero %%v26\n\t"
"vzero %%v27\n\t"
"vzero %%v28\n\t"
"vzero %%v29\n\t"
"vzero %%v30\n\t"
"vzero %%v31\n\t"
"srlg %[n],%[n],5\n\t"
"xgr %%r1,%%r1\n\t"
"0:\n\t"
"pfd 1, 1024(%%r1,%[x])\n\t"
"vl %%v16, 0(%%r1,%[x])\n\t"
"vl %%v17, 16(%%r1,%[x])\n\t"
"vl %%v18, 32(%%r1,%[x])\n\t"
"vl %%v19, 48(%%r1,%[x])\n\t"
"vl %%v20, 64(%%r1,%[x])\n\t"
"vl %%v21, 80(%%r1,%[x])\n\t"
"vl %%v22, 96(%%r1,%[x])\n\t"
"vl %%v23, 112(%%r1,%[x])\n\t"
"vflpsb %%v16, %%v16\n\t"
"vflpsb %%v17, %%v17\n\t"
"vflpsb %%v18, %%v18\n\t"
"vflpsb %%v19, %%v19\n\t"
"vflpsb %%v20, %%v20\n\t"
"vflpsb %%v21, %%v21\n\t"
"vflpsb %%v22, %%v22\n\t"
"vflpsb %%v23, %%v23\n\t"
"vfasb %%v24,%%v24,%%v16\n\t"
"vfasb %%v25,%%v25,%%v17\n\t"
"vfasb %%v26,%%v26,%%v18\n\t"
"vfasb %%v27,%%v27,%%v19\n\t"
"vfasb %%v28,%%v28,%%v20\n\t"
"vfasb %%v29,%%v29,%%v21\n\t"
"vfasb %%v30,%%v30,%%v22\n\t"
"vfasb %%v31,%%v31,%%v23\n\t"
"vl %%v16, 128(%%r1,%[x])\n\t"
"vl %%v17, 144(%%r1,%[x])\n\t"
"vl %%v18, 160(%%r1,%[x])\n\t"
"vl %%v19, 176(%%r1,%[x])\n\t"
"vl %%v20, 192(%%r1,%[x])\n\t"
"vl %%v21, 208(%%r1,%[x])\n\t"
"vl %%v22, 224(%%r1,%[x])\n\t"
"vl %%v23, 240(%%r1,%[x])\n\t"
"vflpsb %%v16, %%v16\n\t"
"vflpsb %%v17, %%v17\n\t"
"vflpsb %%v18, %%v18\n\t"
"vflpsb %%v19, %%v19\n\t"
"vflpsb %%v20, %%v20\n\t"
"vflpsb %%v21, %%v21\n\t"
"vflpsb %%v22, %%v22\n\t"
"vflpsb %%v23, %%v23\n\t"
"vfasb %%v24,%%v24,%%v16\n\t"
"vfasb %%v25,%%v25,%%v17\n\t"
"vfasb %%v26,%%v26,%%v18\n\t"
"vfasb %%v27,%%v27,%%v19\n\t"
"vfasb %%v28,%%v28,%%v20\n\t"
"vfasb %%v29,%%v29,%%v21\n\t"
"vfasb %%v30,%%v30,%%v22\n\t"
"vfasb %%v31,%%v31,%%v23\n\t"
"agfi %%r1,256\n\t"
"brctg %[n],0b\n\t"
"vfasb %%v24,%%v24,%%v25\n\t"
"vfasb %%v24,%%v24,%%v26\n\t"
"vfasb %%v24,%%v24,%%v27\n\t"
"vfasb %%v24,%%v24,%%v28\n\t"
"vfasb %%v24,%%v24,%%v29\n\t"
"vfasb %%v24,%%v24,%%v30\n\t"
"vfasb %%v24,%%v24,%%v31\n\t"
"veslg %%v25,%%v24,32\n\t"
"vfasb %%v24,%%v24,%%v25\n\t"
"vrepf %%v25,%%v24,2\n\t"
"vfasb %%v24,%%v24,%%v25\n\t"
"vstef %%v24,%[asum],0"
: [asum] "=Q"(asum),[n] "+&r"(n)
: "m"(*(const struct { FLOAT x[n * 2]; } *) x),[x] "a"(x)
: "cc", "r1", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23",
"v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31");
return asum;
}

View File

@ -30,73 +30,73 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
/*
 * Single-precision complex AXPY step on contiguous data:
 *   without CONJ:  y[i] += alpha * x[i]
 *   with CONJ:     y[i] += alpha * conj(x[i])
 *
 * n      number of complex elements; the loop runs n >> 4 times and each
 *        pass handles 16 complex values (128 bytes) of x and of y.
 *        NOTE(review): the count-down loop (brctg) has no zero guard, so
 *        this presumably relies on n being a multiple of 16 that is at
 *        least 16 -- confirm at the call site.
 * x, y   interleaved (re, im) float pairs; y is updated in place.
 * alpha  alpha[0] = real part, alpha[1] = imaginary part.
 *
 * Each product is formed with two fused multiply-adds per vector:
 *   acc = x          * v0 + y      (v0 carries +/-alpha_r per lane)
 *   acc = swap(x)    * v1 + acc    (v1 carries +/-alpha_i per lane)
 * where swap(x) exchanges the real and imaginary float of every element.
 */
static void caxpy_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) {
  __asm__(
#if !defined(CONJ)
      /* v0 = ( ar,  ar,  ar,  ar), v1 = (-ai,  ai, -ai,  ai) */
      "vlrepf %%v0,0(%[alpha])\n\t"
      "vlef %%v1,4(%[alpha]),0\n\t"
      "vlef %%v1,4(%[alpha]),2\n\t"
      "vflcsb %%v1,%%v1\n\t"
      "vlef %%v1,4(%[alpha]),1\n\t"
      "vlef %%v1,4(%[alpha]),3\n\t"
#else
      /* v0 = ( ar, -ar,  ar, -ar), v1 = ( ai,  ai,  ai,  ai) */
      "vlef %%v0,0(%[alpha]),1\n\t"
      "vlef %%v0,0(%[alpha]),3\n\t"
      "vflcsb %%v0,%%v0\n\t"
      "vlef %%v0,0(%[alpha]),0\n\t"
      "vlef %%v0,0(%[alpha]),2\n\t"
      "vlrepf %%v1,4(%[alpha])\n\t"
#endif
      "srlg %[n],%[n],4\n\t"
      "xgr %%r1,%%r1\n\t" /* r1 = running byte offset into x and y */
      "0:\n\t"
      "pfd 1, 1024(%%r1,%[x])\n\t"
      "pfd 2, 1024(%%r1,%[y])\n\t"
      "vl %%v8,0(%%r1,%[x])\n\t"
      "vl %%v9,16(%%r1,%[x])\n\t"
      "vl %%v10,32(%%r1,%[x])\n\t"
      "vl %%v11,48(%%r1,%[x])\n\t"
      "vl %%v12,0(%%r1,%[y])\n\t"
      "vl %%v13,16(%%r1,%[y])\n\t"
      "vl %%v14,32(%%r1,%[y])\n\t"
      "vl %%v15,48(%%r1,%[y])\n\t"
      "vl %%v16,64(%%r1,%[x])\n\t"
      "vl %%v17,80(%%r1,%[x])\n\t"
      "vl %%v18,96(%%r1,%[x])\n\t"
      "vl %%v19,112(%%r1,%[x])\n\t"
      "vl %%v20,64(%%r1,%[y])\n\t"
      "vl %%v21,80(%%r1,%[y])\n\t"
      "vl %%v22,96(%%r1,%[y])\n\t"
      "vl %%v23,112(%%r1,%[y])\n\t"
      /* swap(x): rotating each doubleword by 32 bits exchanges the real
         and imaginary float of every complex element.  This has to be
         done before the first FMA group below overwrites the x
         registers; without it v24..v31 would be read uninitialised. */
      "verllg %%v24,%%v8,32\n\t"
      "verllg %%v25,%%v9,32\n\t"
      "verllg %%v26,%%v10,32\n\t"
      "verllg %%v27,%%v11,32\n\t"
      "verllg %%v28,%%v16,32\n\t"
      "verllg %%v29,%%v17,32\n\t"
      "verllg %%v30,%%v18,32\n\t"
      "verllg %%v31,%%v19,32\n\t"
      /* acc = x * v0 + y */
      "vfmasb %%v8,%%v8,%%v0,%%v12\n\t"
      "vfmasb %%v9,%%v9,%%v0,%%v13\n\t"
      "vfmasb %%v10,%%v10,%%v0,%%v14\n\t"
      "vfmasb %%v11,%%v11,%%v0,%%v15\n\t"
      "vfmasb %%v16,%%v16,%%v0,%%v20\n\t"
      "vfmasb %%v17,%%v17,%%v0,%%v21\n\t"
      "vfmasb %%v18,%%v18,%%v0,%%v22\n\t"
      "vfmasb %%v19,%%v19,%%v0,%%v23\n\t"
      /* acc = swap(x) * v1 + acc */
      "vfmasb %%v8,%%v24,%%v1,%%v8\n\t"
      "vfmasb %%v9,%%v25,%%v1,%%v9\n\t"
      "vfmasb %%v10,%%v26,%%v1,%%v10\n\t"
      "vfmasb %%v11,%%v27,%%v1,%%v11\n\t"
      "vfmasb %%v16,%%v28,%%v1,%%v16\n\t"
      "vfmasb %%v17,%%v29,%%v1,%%v17\n\t"
      "vfmasb %%v18,%%v30,%%v1,%%v18\n\t"
      "vfmasb %%v19,%%v31,%%v1,%%v19\n\t"
      "vst %%v8,0(%%r1,%[y])\n\t"
      "vst %%v9,16(%%r1,%[y])\n\t"
      "vst %%v10,32(%%r1,%[y])\n\t"
      "vst %%v11,48(%%r1,%[y])\n\t"
      "vst %%v16,64(%%r1,%[y])\n\t"
      "vst %%v17,80(%%r1,%[y])\n\t"
      "vst %%v18,96(%%r1,%[y])\n\t"
      "vst %%v19,112(%%r1,%[y])\n\t"
      "agfi %%r1,128\n\t"
      "brctg %[n],0b"
      /* The "m" operands describe the memory the asm touches: y is read
         and written, x and alpha are only read. */
      : "+m"(*(struct { FLOAT x[n * 2]; } *) y), [n] "+&r"(n)
      : [y] "a"(y), "m"(*(const struct { FLOAT x[n * 2]; } *) x), [x] "a"(x),
        "m"(*(const struct { FLOAT x[2]; } *) alpha), [alpha] "a"(alpha)
      : "cc", "r1", "v0", "v1", "v8", "v9", "v10", "v11", "v12", "v13",
        "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23",
        "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31");
}
int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i,

View File

@ -29,16 +29,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
/*
 * Copies complex single-precision data from x to y in 256-byte chunks
 * (32 complex elements per loop pass) using mvc.
 *
 * n     number of complex elements; the loop runs n >> 5 times.
 *       NOTE(review): the count-down loop (brctg) has no zero guard, so
 *       this presumably relies on n being a multiple of 32 that is at
 *       least 32 -- confirm at the call site.
 * x, y  source and destination; both pointers are advanced inside the
 *       asm (they are read-write operands), which only affects the local
 *       copies of the parameters.
 */
static void ccopy_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y) {
  __asm__(
      "srlg %[n],%[n],5\n\t"
      "0:\n\t"
      "pfd 1, 1024(%[x])\n\t"
      "pfd 2, 1024(%[y])\n\t"
      "mvc 0(256,%[y]),0(%[x])\n\t" /* move 256 bytes from x to y */
      "la %[x],256(%[x])\n\t"
      "la %[y],256(%[y])\n\t"
      "brctg %[n],0b"
      /* The "m" operands describe the memory the asm touches: all 2 * n
         floats of y are written, all 2 * n floats of x are read. */
      : "=m"(*(struct { FLOAT x[n * 2]; } *) y), [x] "+&a"(x), [y] "+&a"(y),
        [n] "+&r"(n)
      : "m"(*(const struct { FLOAT x[n * 2]; } *) x)
      : "cc");
}
int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) {

View File

@ -29,80 +29,80 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
/*
 * Accumulates the four real partial sums of a single-precision complex
 * dot product of x and y and stores them to d:
 *   d[0] = sum of x_re * y_re
 *   d[1] = sum of x_im * y_im
 *   d[2] = sum of x_re * y_im
 *   d[3] = sum of x_im * y_re
 * How these are combined into the final (conjugated or not) result is up
 * to the caller.
 *
 * n     number of complex elements; the loop runs n >> 4 times and each
 *       pass handles 16 complex values (128 bytes) of x and of y.
 *       NOTE(review): the count-down loop (brctg) has no zero guard, so
 *       this presumably relies on n being a multiple of 16 that is at
 *       least 16 -- confirm at the call site.
 * x, y  interleaved (re, im) float pairs; 2 * n floats are read from each.
 * d     output, four floats; overwritten.
 */
static void cdot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *d) {
  __asm__(
      /* Even accumulators (v24,v26,v28,v30) collect x * y lane by lane;
         odd ones (v25,v27,v29,v31) collect swap(x) * y. */
      "vzero %%v24\n\t"
      "vzero %%v25\n\t"
      "vzero %%v26\n\t"
      "vzero %%v27\n\t"
      "vzero %%v28\n\t"
      "vzero %%v29\n\t"
      "vzero %%v30\n\t"
      "vzero %%v31\n\t"
      "srlg %[n],%[n],4\n\t"
      "xgr %%r1,%%r1\n\t" /* r1 = running byte offset into x and y */
      "0:\n\t"
      "pfd 1, 1024(%%r1,%[x])\n\t"
      "pfd 1, 1024(%%r1,%[y])\n\t"
      /* First 64 bytes of the pass. */
      "vl %%v16, 0(%%r1,%[x])\n\t"
      "vl %%v17, 16(%%r1,%[x])\n\t"
      "vl %%v18, 32(%%r1,%[x])\n\t"
      "vl %%v19, 48(%%r1,%[x])\n\t"
      "vl %%v0, 0(%%r1,%[y])\n\t"
      "vl %%v1, 16(%%r1,%[y])\n\t"
      "vl %%v2, 32(%%r1,%[y])\n\t"
      "vl %%v3, 48(%%r1,%[y])\n\t"
      /* swap(x): rotating each doubleword by 32 bits exchanges the real
         and imaginary float of every complex element. */
      "verllg %%v20,%%v16,32\n\t"
      "verllg %%v21,%%v17,32\n\t"
      "verllg %%v22,%%v18,32\n\t"
      "verllg %%v23,%%v19,32\n\t"
      "vfmasb %%v24,%%v16,%%v0,%%v24\n\t"
      "vfmasb %%v25,%%v20,%%v0,%%v25\n\t"
      "vfmasb %%v26,%%v17,%%v1,%%v26\n\t"
      "vfmasb %%v27,%%v21,%%v1,%%v27\n\t"
      "vfmasb %%v28,%%v18,%%v2,%%v28\n\t"
      "vfmasb %%v29,%%v22,%%v2,%%v29\n\t"
      "vfmasb %%v30,%%v19,%%v3,%%v30\n\t"
      "vfmasb %%v31,%%v23,%%v3,%%v31\n\t"
      /* Second 64 bytes of the pass. */
      "vl %%v16, 64(%%r1,%[x])\n\t"
      "vl %%v17, 80(%%r1,%[x])\n\t"
      "vl %%v18, 96(%%r1,%[x])\n\t"
      "vl %%v19, 112(%%r1,%[x])\n\t"
      "vl %%v0, 64(%%r1,%[y])\n\t"
      "vl %%v1, 80(%%r1,%[y])\n\t"
      "vl %%v2, 96(%%r1,%[y])\n\t"
      "vl %%v3, 112(%%r1,%[y])\n\t"
      "verllg %%v20,%%v16,32\n\t"
      "verllg %%v21,%%v17,32\n\t"
      "verllg %%v22,%%v18,32\n\t"
      "verllg %%v23,%%v19,32\n\t"
      "vfmasb %%v24,%%v16,%%v0,%%v24\n\t"
      "vfmasb %%v25,%%v20,%%v0,%%v25\n\t"
      "vfmasb %%v26,%%v17,%%v1,%%v26\n\t"
      "vfmasb %%v27,%%v21,%%v1,%%v27\n\t"
      "vfmasb %%v28,%%v18,%%v2,%%v28\n\t"
      "vfmasb %%v29,%%v22,%%v2,%%v29\n\t"
      "vfmasb %%v30,%%v19,%%v3,%%v30\n\t"
      "vfmasb %%v31,%%v23,%%v3,%%v31\n\t"
      "agfi %%r1,128\n\t"
      "brctg %[n],0b\n\t"
      /* Reduce the straight products into v24: sum the accumulators,
         then add the upper doubleword onto the lower one. */
      "vfasb %%v24,%%v24,%%v26\n\t"
      "vfasb %%v24,%%v24,%%v28\n\t"
      "vfasb %%v24,%%v24,%%v30\n\t"
      "vrepg %%v26,%%v24,1\n\t"
      "vfasb %%v24,%%v24,%%v26\n\t"
      /* Same reduction for the swapped products into v25. */
      "vfasb %%v25,%%v25,%%v27\n\t"
      "vfasb %%v25,%%v25,%%v29\n\t"
      "vfasb %%v25,%%v25,%%v31\n\t"
      "vrepg %%v27,%%v25,1\n\t"
      "vfasb %%v25,%%v25,%%v27\n\t"
      /* Note the crossed lane order for v25: lane 1 goes to d[2] and
         lane 0 to d[3]. */
      "vstef %%v24,0(%[d]),0\n\t"
      "vstef %%v24,4(%[d]),1\n\t"
      "vstef %%v25,8(%[d]),1\n\t"
      "vstef %%v25,12(%[d]),0"
      /* The "m" operands describe the memory the asm touches: d[0..3] is
         written, all 2 * n floats of x and of y are read. */
      : "=m"(*(struct { FLOAT x[4]; } *) d), [n] "+&r"(n)
      : [d] "a"(d), "m"(*(const struct { FLOAT x[n * 2]; } *) x), [x] "a"(x),
        "m"(*(const struct { FLOAT x[n * 2]; } *) y), [y] "a"(y)
      : "cc", "r1", "v0", "v1", "v2", "v3", "v16", "v17", "v18", "v19", "v20",
        "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30",
        "v31");
}
OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y,

View File

@ -30,323 +30,331 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define NBMAX 2048
static void cgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) {
register FLOAT *ap0 = ap[0];
register FLOAT *ap1 = ap[1];
register FLOAT *ap2 = ap[2];
register FLOAT *ap3 = ap[3];
__asm__("vlrepg %%v16,0(%[x])\n\t"
"vlrepg %%v17,8(%[x])\n\t"
"vlrepg %%v18,16(%[x])\n\t"
"vlrepg %%v19,24(%[x])\n\t"
"vlrepg %%v17,8(%[x])\n\t"
"vlrepg %%v18,16(%[x])\n\t"
"vlrepg %%v19,24(%[x])\n\t"
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
"vlef %%v20,4(%[x]),0\n\t"
"vlef %%v20,4(%[x]),2\n\t"
"vflcsb %%v20,%%v20\n\t"
"vlef %%v20,0(%[x]),1\n\t"
"vlef %%v20,0(%[x]),3\n\t"
"vlef %%v21,12(%[x]),0\n\t"
"vlef %%v21,12(%[x]),2\n\t"
"vflcsb %%v21,%%v21\n\t"
"vlef %%v21,8(%[x]),1\n\t"
"vlef %%v21,8(%[x]),3\n\t"
"vlef %%v22,20(%[x]),0\n\t"
"vlef %%v22,20(%[x]),2\n\t"
"vflcsb %%v22,%%v22\n\t"
"vlef %%v22,16(%[x]),1\n\t"
"vlef %%v22,16(%[x]),3\n\t"
"vlef %%v23,28(%[x]),0\n\t"
"vlef %%v23,28(%[x]),2\n\t"
"vflcsb %%v23,%%v23\n\t"
"vlef %%v23,24(%[x]),1\n\t"
"vlef %%v23,24(%[x]),3\n\t"
"vlef %%v20,4(%[x]),0\n\t"
"vlef %%v20,4(%[x]),2\n\t"
"vflcsb %%v20,%%v20\n\t"
"vlef %%v20,0(%[x]),1\n\t"
"vlef %%v20,0(%[x]),3\n\t"
"vlef %%v21,12(%[x]),0\n\t"
"vlef %%v21,12(%[x]),2\n\t"
"vflcsb %%v21,%%v21\n\t"
"vlef %%v21,8(%[x]),1\n\t"
"vlef %%v21,8(%[x]),3\n\t"
"vlef %%v22,20(%[x]),0\n\t"
"vlef %%v22,20(%[x]),2\n\t"
"vflcsb %%v22,%%v22\n\t"
"vlef %%v22,16(%[x]),1\n\t"
"vlef %%v22,16(%[x]),3\n\t"
"vlef %%v23,28(%[x]),0\n\t"
"vlef %%v23,28(%[x]),2\n\t"
"vflcsb %%v23,%%v23\n\t"
"vlef %%v23,24(%[x]),1\n\t"
"vlef %%v23,24(%[x]),3\n\t"
#else
"vlef %%v20,0(%[x]),1\n\t"
"vlef %%v20,0(%[x]),3\n\t"
"vflcsb %%v20,%%v20\n\t"
"vlef %%v20,4(%[x]),0\n\t"
"vlef %%v20,4(%[x]),2\n\t"
"vlef %%v21,8(%[x]),1\n\t"
"vlef %%v21,8(%[x]),3\n\t"
"vflcsb %%v21,%%v21\n\t"
"vlef %%v21,12(%[x]),0\n\t"
"vlef %%v21,12(%[x]),2\n\t"
"vlef %%v22,16(%[x]),1\n\t"
"vlef %%v22,16(%[x]),3\n\t"
"vflcsb %%v22,%%v22\n\t"
"vlef %%v22,20(%[x]),0\n\t"
"vlef %%v22,20(%[x]),2\n\t"
"vlef %%v23,24(%[x]),1\n\t"
"vlef %%v23,24(%[x]),3\n\t"
"vflcsb %%v23,%%v23\n\t"
"vlef %%v23,28(%[x]),0\n\t"
"vlef %%v23,28(%[x]),2\n\t"
"vlef %%v20,0(%[x]),1\n\t"
"vlef %%v20,0(%[x]),3\n\t"
"vflcsb %%v20,%%v20\n\t"
"vlef %%v20,4(%[x]),0\n\t"
"vlef %%v20,4(%[x]),2\n\t"
"vlef %%v21,8(%[x]),1\n\t"
"vlef %%v21,8(%[x]),3\n\t"
"vflcsb %%v21,%%v21\n\t"
"vlef %%v21,12(%[x]),0\n\t"
"vlef %%v21,12(%[x]),2\n\t"
"vlef %%v22,16(%[x]),1\n\t"
"vlef %%v22,16(%[x]),3\n\t"
"vflcsb %%v22,%%v22\n\t"
"vlef %%v22,20(%[x]),0\n\t"
"vlef %%v22,20(%[x]),2\n\t"
"vlef %%v23,24(%[x]),1\n\t"
"vlef %%v23,24(%[x]),3\n\t"
"vflcsb %%v23,%%v23\n\t"
"vlef %%v23,28(%[x]),0\n\t"
"vlef %%v23,28(%[x]),2\n\t"
#endif
"vleib %%v1,0,0\n\t"
"vleib %%v1,1,1\n\t"
"vleib %%v1,2,2\n\t"
"vleib %%v1,3,3\n\t"
"vleib %%v1,0,4\n\t"
"vleib %%v1,1,5\n\t"
"vleib %%v1,2,6\n\t"
"vleib %%v1,3,7\n\t"
"vleib %%v1,8,8\n\t"
"vleib %%v1,9,9\n\t"
"vleib %%v1,10,10\n\t"
"vleib %%v1,11,11\n\t"
"vleib %%v1,8,12\n\t"
"vleib %%v1,9,13\n\t"
"vleib %%v1,10,14\n\t"
"vleib %%v1,11,15\n\t"
"vleib %%v2,4,0\n\t"
"vleib %%v2,5,1\n\t"
"vleib %%v2,6,2\n\t"
"vleib %%v2,7,3\n\t"
"vleib %%v2,4,4\n\t"
"vleib %%v2,5,5\n\t"
"vleib %%v2,6,6\n\t"
"vleib %%v2,7,7\n\t"
"vleib %%v2,12,8\n\t"
"vleib %%v2,13,9\n\t"
"vleib %%v2,14,10\n\t"
"vleib %%v2,15,11\n\t"
"vleib %%v2,12,12\n\t"
"vleib %%v2,13,13\n\t"
"vleib %%v2,14,14\n\t"
"vleib %%v2,15,15\n\t"
"xgr %%r1,%%r1\n\t"
"srlg %[n],%[n],1\n\t"
"0:\n\t"
"pfd 1,1024(%%r1,%[ap0])\n\t"
"pfd 1,1024(%%r1,%[ap1])\n\t"
"pfd 1,1024(%%r1,%[ap2])\n\t"
"pfd 1,1024(%%r1,%[ap3])\n\t"
"pfd 2,1024(%%r1,%[y])\n\t"
"vl %%v24,0(%%r1,%[ap0])\n\t"
"vperm %%v25,%%v24,%%v24,%%v2\n\t"
"vperm %%v24,%%v24,%%v24,%%v1\n\t"
"vl %%v26,0(%%r1,%[ap1])\n\t"
"vperm %%v27,%%v26,%%v26,%%v2\n\t"
"vperm %%v26,%%v26,%%v26,%%v1\n\t"
"vl %%v0,0(%%r1,%[y])\n\t"
"vfmasb %%v0,%%v24,%%v16,%%v0\n\t"
"vfmasb %%v0,%%v25,%%v20,%%v0\n\t"
"vfmasb %%v0,%%v26,%%v17,%%v0\n\t"
"vfmasb %%v0,%%v27,%%v21,%%v0\n\t"
"vl %%v28,0(%%r1,%[ap2])\n\t"
"vperm %%v29,%%v28,%%v28,%%v2\n\t"
"vperm %%v28,%%v28,%%v28,%%v1\n\t"
"vl %%v30,0(%%r1,%[ap3])\n\t"
"vperm %%v31,%%v30,%%v30,%%v2\n\t"
"vperm %%v30,%%v30,%%v30,%%v1\n\t"
"vfmasb %%v0,%%v28,%%v18,%%v0\n\t"
"vfmasb %%v0,%%v29,%%v22,%%v0\n\t"
"vfmasb %%v0,%%v30,%%v19,%%v0\n\t"
"vfmasb %%v0,%%v31,%%v23,%%v0\n\t"
"vst %%v0,0(%%r1,%[y])\n\t"
"agfi %%r1,16\n\t"
"brctg %[n],0b\n\t"
: "+m"(*(FLOAT (*)[n * 2]) y),[n] "+&r"(n)
: [y] "a"(y), "m"(*(const FLOAT (*)[n * 2]) ap[0]),[ap0] "a"(ap[0]),
"m"(*(const FLOAT (*)[n * 2]) ap[1]),[ap1] "a"(ap[1]),
"m"(*(const FLOAT (*)[n * 2]) ap[2]),[ap2] "a"(ap[2]),
"m"(*(const FLOAT (*)[n * 2]) ap[3]),[ap3] "a"(ap[3]),
"m"(*(const FLOAT (*)[8]) x),[x] "a"(x)
: "cc", "r1", "v0", "v1", "v2", "v16", "v17", "v18", "v19", "v20",
"v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30",
"v31");
"vleib %%v1,0,0\n\t"
"vleib %%v1,1,1\n\t"
"vleib %%v1,2,2\n\t"
"vleib %%v1,3,3\n\t"
"vleib %%v1,0,4\n\t"
"vleib %%v1,1,5\n\t"
"vleib %%v1,2,6\n\t"
"vleib %%v1,3,7\n\t"
"vleib %%v1,8,8\n\t"
"vleib %%v1,9,9\n\t"
"vleib %%v1,10,10\n\t"
"vleib %%v1,11,11\n\t"
"vleib %%v1,8,12\n\t"
"vleib %%v1,9,13\n\t"
"vleib %%v1,10,14\n\t"
"vleib %%v1,11,15\n\t"
"vleib %%v2,4,0\n\t"
"vleib %%v2,5,1\n\t"
"vleib %%v2,6,2\n\t"
"vleib %%v2,7,3\n\t"
"vleib %%v2,4,4\n\t"
"vleib %%v2,5,5\n\t"
"vleib %%v2,6,6\n\t"
"vleib %%v2,7,7\n\t"
"vleib %%v2,12,8\n\t"
"vleib %%v2,13,9\n\t"
"vleib %%v2,14,10\n\t"
"vleib %%v2,15,11\n\t"
"vleib %%v2,12,12\n\t"
"vleib %%v2,13,13\n\t"
"vleib %%v2,14,14\n\t"
"vleib %%v2,15,15\n\t"
"xgr %%r1,%%r1\n\t"
"srlg %[n],%[n],1\n\t"
"0:\n\t"
"pfd 1,1024(%%r1,%[ap0])\n\t"
"pfd 1,1024(%%r1,%[ap1])\n\t"
"pfd 1,1024(%%r1,%[ap2])\n\t"
"pfd 1,1024(%%r1,%[ap3])\n\t"
"pfd 2,1024(%%r1,%[y])\n\t"
"vl %%v24,0(%%r1,%[ap0])\n\t"
"vperm %%v25,%%v24,%%v24,%%v2\n\t"
"vperm %%v24,%%v24,%%v24,%%v1\n\t"
"vl %%v26,0(%%r1,%[ap1])\n\t"
"vperm %%v27,%%v26,%%v26,%%v2\n\t"
"vperm %%v26,%%v26,%%v26,%%v1\n\t"
"vl %%v0,0(%%r1,%[y])\n\t"
"vfmasb %%v0,%%v24,%%v16,%%v0\n\t"
"vfmasb %%v0,%%v25,%%v20,%%v0\n\t"
"vfmasb %%v0,%%v26,%%v17,%%v0\n\t"
"vfmasb %%v0,%%v27,%%v21,%%v0\n\t"
"vl %%v28,0(%%r1,%[ap2])\n\t"
"vperm %%v29,%%v28,%%v28,%%v2\n\t"
"vperm %%v28,%%v28,%%v28,%%v1\n\t"
"vl %%v30,0(%%r1,%[ap3])\n\t"
"vperm %%v31,%%v30,%%v30,%%v2\n\t"
"vperm %%v30,%%v30,%%v30,%%v1\n\t"
"vfmasb %%v0,%%v28,%%v18,%%v0\n\t"
"vfmasb %%v0,%%v29,%%v22,%%v0\n\t"
"vfmasb %%v0,%%v30,%%v19,%%v0\n\t"
"vfmasb %%v0,%%v31,%%v23,%%v0\n\t"
"vst %%v0,0(%%r1,%[y])\n\t"
"agfi %%r1,16\n\t"
"brctg %[n],0b\n\t"
: "+m"(*(struct { FLOAT x[n * 2]; } *) y),[n] "+&r"(n)
: [y] "a"(y), "m"(*(const struct { FLOAT x[n * 2]; } *) ap0),[ap0] "a"(ap0),
"m"(*(const struct { FLOAT x[n * 2]; } *) ap1),[ap1] "a"(ap1),
"m"(*(const struct { FLOAT x[n * 2]; } *) ap2),[ap2] "a"(ap2),
"m"(*(const struct { FLOAT x[n * 2]; } *) ap3),[ap3] "a"(ap3),
"m"(*(const struct { FLOAT x[8]; } *) x),[x] "a"(x)
: "cc", "r1", "v0", "v1", "v2", "v16", "v17", "v18", "v19", "v20",
"v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30",
"v31");
}
/*
 * Two-column complex GEMV update on single-precision data:
 *
 *     y[i] += ap[0][i] * x[0] + ap[1][i] * x[1]
 *
 * n  - number of complex rows; the loop runs n >> 1 times and handles two
 *      complex entries (16 bytes) per pass, so callers are expected to pass
 *      an even n >= 2 (the counter is decremented before it is tested).
 * ap - ap[0] and ap[1] point to the two matrix columns (n complex entries).
 * x  - two complex multipliers stored as (re, im) float pairs.
 * y  - accumulator of n complex entries, updated in place.
 *
 * With neither or both of CONJ/XCONJ defined the plain complex product is
 * accumulated (re += ar*xr - ai*xi, im += ar*xi + ai*xr); otherwise the
 * matrix entry is conjugated (re += ar*xr + ai*xi, im += ar*xi - ai*xr).
 */
static void cgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) {
  /* Column bases are kept in locals so they can be used as named operands. */
  register FLOAT *ap0 = ap[0];
  register FLOAT *ap1 = ap[1];

  __asm__(
          /* v16/v17 = [xr, xi, xr, xi] for x[0] and x[1] */
          "vlrepg %%v16,0(%[x])\n\t"
          "vlrepg %%v17,8(%[x])\n\t"
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
          /* v18/v19 = [-xi, xr, -xi, xr] */
          "vlef   %%v18,4(%[x]),0\n\t"
          "vlef   %%v18,4(%[x]),2\n\t"
          "vflcsb %%v18,%%v18\n\t"
          "vlef   %%v18,0(%[x]),1\n\t"
          "vlef   %%v18,0(%[x]),3\n\t"
          "vlef   %%v19,12(%[x]),0\n\t"
          "vlef   %%v19,12(%[x]),2\n\t"
          "vflcsb %%v19,%%v19\n\t"
          "vlef   %%v19,8(%[x]),1\n\t"
          "vlef   %%v19,8(%[x]),3\n\t"
#else
          /* v18/v19 = [xi, -xr, xi, -xr] */
          "vlef   %%v18,0(%[x]),1\n\t"
          "vlef   %%v18,0(%[x]),3\n\t"
          "vflcsb %%v18,%%v18\n\t"
          "vlef   %%v18,4(%[x]),0\n\t"
          "vlef   %%v18,4(%[x]),2\n\t"
          "vlef   %%v19,8(%[x]),1\n\t"
          "vlef   %%v19,8(%[x]),3\n\t"
          "vflcsb %%v19,%%v19\n\t"
          "vlef   %%v19,12(%[x]),0\n\t"
          "vlef   %%v19,12(%[x]),2\n\t"
#endif
          /* v1: permute mask duplicating the real parts  -> [ar0, ar0, ar1, ar1] */
          "vleib  %%v1,0,0\n\t"
          "vleib  %%v1,1,1\n\t"
          "vleib  %%v1,2,2\n\t"
          "vleib  %%v1,3,3\n\t"
          "vleib  %%v1,0,4\n\t"
          "vleib  %%v1,1,5\n\t"
          "vleib  %%v1,2,6\n\t"
          "vleib  %%v1,3,7\n\t"
          "vleib  %%v1,8,8\n\t"
          "vleib  %%v1,9,9\n\t"
          "vleib  %%v1,10,10\n\t"
          "vleib  %%v1,11,11\n\t"
          "vleib  %%v1,8,12\n\t"
          "vleib  %%v1,9,13\n\t"
          "vleib  %%v1,10,14\n\t"
          "vleib  %%v1,11,15\n\t"
          /* v2: permute mask duplicating the imaginary parts -> [ai0, ai0, ai1, ai1] */
          "vleib  %%v2,4,0\n\t"
          "vleib  %%v2,5,1\n\t"
          "vleib  %%v2,6,2\n\t"
          "vleib  %%v2,7,3\n\t"
          "vleib  %%v2,4,4\n\t"
          "vleib  %%v2,5,5\n\t"
          "vleib  %%v2,6,6\n\t"
          "vleib  %%v2,7,7\n\t"
          "vleib  %%v2,12,8\n\t"
          "vleib  %%v2,13,9\n\t"
          "vleib  %%v2,14,10\n\t"
          "vleib  %%v2,15,11\n\t"
          "vleib  %%v2,12,12\n\t"
          "vleib  %%v2,13,13\n\t"
          "vleib  %%v2,14,14\n\t"
          "vleib  %%v2,15,15\n\t"
          /* r1 = byte offset into the columns and y; n = pass count */
          "xgr   %%r1,%%r1\n\t"
          "srlg  %[n],%[n],1\n\t"
          "0:\n\t"
          "pfd 1,1024(%%r1,%[ap0])\n\t"
          "pfd 1,1024(%%r1,%[ap1])\n\t"
          "pfd 2,1024(%%r1,%[y])\n\t"
          /* split each column vector into real-part and imaginary-part copies */
          "vl    %%v20,0(%%r1,%[ap0])\n\t"
          "vperm %%v21,%%v20,%%v20,%%v2\n\t"
          "vperm %%v20,%%v20,%%v20,%%v1\n\t"
          "vl    %%v22,0(%%r1,%[ap1])\n\t"
          "vperm %%v23,%%v22,%%v22,%%v2\n\t"
          "vperm %%v22,%%v22,%%v22,%%v1\n\t"
          /* y += re(a)*[xr, xi] + im(a)*v18/v19, for both columns */
          "vl  %%v0,0(%%r1,%[y])\n\t"
          "vfmasb   %%v0,%%v20,%%v16,%%v0\n\t"
          "vfmasb   %%v0,%%v21,%%v18,%%v0\n\t"
          "vfmasb   %%v0,%%v22,%%v17,%%v0\n\t"
          "vfmasb   %%v0,%%v23,%%v19,%%v0\n\t"
          "vst %%v0,0(%%r1,%[y])\n\t"
          "agfi   %%r1,16\n\t"
          "brctg  %[n],0b\n\t"
          : "+m"(*(struct { FLOAT x[n * 2]; } *) y),[n] "+&r"(n)
          : [y] "a"(y), "m"(*(const struct { FLOAT x[n * 2]; } *) ap0),[ap0] "a"(ap0),
             "m"(*(const struct { FLOAT x[n * 2]; } *) ap1),[ap1] "a"(ap1),
             "m"(*(const struct { FLOAT x[4]; } *) x),[x] "a"(x)
          : "cc", "r1", "v0", "v1", "v2", "v16", "v17", "v18", "v19", "v20",
             "v21", "v22", "v23");
}
/*
 * Single-column complex GEMV update on single-precision data:
 *
 *     y[i] += ap[i] * x[0]
 *
 * n  - number of complex rows; the loop runs n >> 1 times and handles two
 *      complex entries (16 bytes) per pass, so callers are expected to pass
 *      an even n >= 2 (the counter is decremented before it is tested).
 * ap - matrix column of n complex entries.
 * x  - one complex multiplier stored as a (re, im) float pair.
 * y  - accumulator of n complex entries, updated in place.
 *
 * With neither or both of CONJ/XCONJ defined the plain complex product is
 * accumulated (re += ar*xr - ai*xi, im += ar*xi + ai*xr); otherwise the
 * matrix entry is conjugated (re += ar*xr + ai*xi, im += ar*xi - ai*xr).
 */
static void cgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) {
  __asm__(
          /* v16 = [xr, xi, xr, xi] */
          "vlrepg %%v16,0(%[x])\n\t"
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
          /* v17 = [-xi, xr, -xi, xr] */
          "vlef   %%v17,4(%[x]),0\n\t"
          "vlef   %%v17,4(%[x]),2\n\t"
          "vflcsb %%v17,%%v17\n\t"
          "vlef   %%v17,0(%[x]),1\n\t"
          "vlef   %%v17,0(%[x]),3\n\t"
#else
          /* v17 = [xi, -xr, xi, -xr] */
          "vlef   %%v17,0(%[x]),1\n\t"
          "vlef   %%v17,0(%[x]),3\n\t"
          "vflcsb %%v17,%%v17\n\t"
          "vlef   %%v17,4(%[x]),0\n\t"
          "vlef   %%v17,4(%[x]),2\n\t"
#endif
          /* v1: permute mask duplicating the real parts  -> [ar0, ar0, ar1, ar1] */
          "vleib  %%v1,0,0\n\t"
          "vleib  %%v1,1,1\n\t"
          "vleib  %%v1,2,2\n\t"
          "vleib  %%v1,3,3\n\t"
          "vleib  %%v1,0,4\n\t"
          "vleib  %%v1,1,5\n\t"
          "vleib  %%v1,2,6\n\t"
          "vleib  %%v1,3,7\n\t"
          "vleib  %%v1,8,8\n\t"
          "vleib  %%v1,9,9\n\t"
          "vleib  %%v1,10,10\n\t"
          "vleib  %%v1,11,11\n\t"
          "vleib  %%v1,8,12\n\t"
          "vleib  %%v1,9,13\n\t"
          "vleib  %%v1,10,14\n\t"
          "vleib  %%v1,11,15\n\t"
          /* v2: permute mask duplicating the imaginary parts -> [ai0, ai0, ai1, ai1] */
          "vleib  %%v2,4,0\n\t"
          "vleib  %%v2,5,1\n\t"
          "vleib  %%v2,6,2\n\t"
          "vleib  %%v2,7,3\n\t"
          "vleib  %%v2,4,4\n\t"
          "vleib  %%v2,5,5\n\t"
          "vleib  %%v2,6,6\n\t"
          "vleib  %%v2,7,7\n\t"
          "vleib  %%v2,12,8\n\t"
          "vleib  %%v2,13,9\n\t"
          "vleib  %%v2,14,10\n\t"
          "vleib  %%v2,15,11\n\t"
          "vleib  %%v2,12,12\n\t"
          "vleib  %%v2,13,13\n\t"
          "vleib  %%v2,14,14\n\t"
          "vleib  %%v2,15,15\n\t"
          /* r1 = byte offset into ap and y; n = pass count */
          "xgr   %%r1,%%r1\n\t"
          "srlg  %[n],%[n],1\n\t"
          "0:\n\t"
          "pfd 1,1024(%%r1,%[ap])\n\t"
          "pfd 2,1024(%%r1,%[y])\n\t"
          /* split the column vector into real-part and imaginary-part copies */
          "vl    %%v18,0(%%r1,%[ap])\n\t"
          "vperm %%v19,%%v18,%%v18,%%v2\n\t"
          "vperm %%v18,%%v18,%%v18,%%v1\n\t"
          /* y += re(a)*[xr, xi] + im(a)*v17 */
          "vl  %%v0,0(%%r1,%[y])\n\t"
          "vfmasb   %%v0,%%v18,%%v16,%%v0\n\t"
          "vfmasb   %%v0,%%v19,%%v17,%%v0\n\t"
          "vst %%v0,0(%%r1,%[y])\n\t"
          "agfi   %%r1,16\n\t"
          "brctg  %[n],0b\n\t"
          : "+m"(*(struct { FLOAT x[n * 2]; } *) y),[n] "+&r"(n)
          : [y] "a"(y), "m"(*(const struct { FLOAT x[n * 2]; } *) ap),[ap] "a"(ap),
             "m"(*(const struct { FLOAT x[2]; } *) x),[x] "a"(x)
          : "cc", "r1", "v0", "v1", "v2", "v16", "v17", "v18", "v19");
}
/*
 * Scaled complex accumulation on single-precision data:
 *
 *     dest[i] += alpha * src[i]          (XCONJ not defined)
 *     dest[i] += alpha * conj(src[i])    (XCONJ defined)
 *
 * n       - number of complex entries; the loop runs n >> 2 times and handles
 *           four complex entries (32 bytes) per pass, so callers are expected
 *           to pass a multiple of 4 with n >= 4 (the counter is decremented
 *           before it is tested).
 * src     - n complex entries, read only.
 * dest    - n complex entries, updated in place.
 * alpha_r - real part of the scale factor.
 * alpha_i - imaginary part of the scale factor.
 */
static void add_y_4(BLASLONG n, FLOAT *src, FLOAT *dest, FLOAT alpha_r,
                    FLOAT alpha_i) {
  __asm__(
#if !defined(XCONJ)
          /* v0 = [ar, ar, ar, ar], v1 = [-ai, ai, -ai, ai] */
          "vlrepf %%v0,%[alpha_r]\n\t"
          "vlef   %%v1,%[alpha_i],0\n\t"
          "vlef   %%v1,%[alpha_i],2\n\t"
          "vflcsb %%v1,%%v1\n\t"
          "vlef   %%v1,%[alpha_i],1\n\t"
          "vlef   %%v1,%[alpha_i],3\n\t"
#else
          /* v0 = [ar, -ar, ar, -ar], v1 = [ai, ai, ai, ai] */
          "vlef   %%v0,%[alpha_r],1\n\t"
          "vlef   %%v0,%[alpha_r],3\n\t"
          "vflcsb %%v0,%%v0\n\t"
          "vlef   %%v0,%[alpha_r],0\n\t"
          "vlef   %%v0,%[alpha_r],2\n\t"
          "vlrepf %%v1,%[alpha_i]\n\t"
#endif
          /* r1 = byte offset into src and dest; n = pass count */
          "xgr   %%r1,%%r1\n\t"
          "srlg  %[n],%[n],2\n\t"
          "0:\n\t"
          "pfd 1,1024(%%r1,%[src])\n\t"
          "pfd 2,1024(%%r1,%[dest])\n\t"
          "vl   %%v16,0(%%r1,%[src])\n\t"
          "vl   %%v17,16(%%r1,%[src])\n\t"
          "vl   %%v18,0(%%r1,%[dest])\n\t"
          "vl   %%v19,16(%%r1,%[dest])\n\t"
          /* rotating each doubleword by 32 bits swaps re and im of every entry */
          "verllg   %%v20,%%v16,32\n\t"
          "verllg   %%v21,%%v17,32\n\t"
          /* dest + src*v0, then + swapped(src)*v1 */
          "vfmasb   %%v22,%%v16,%%v0,%%v18\n\t"
          "vfmasb   %%v23,%%v17,%%v0,%%v19\n\t"
          "vfmasb   %%v22,%%v20,%%v1,%%v22\n\t"
          "vfmasb   %%v23,%%v21,%%v1,%%v23\n\t"
          "vst  %%v22,0(%%r1,%[dest])\n\t"
          "vst  %%v23,16(%%r1,%[dest])\n\t"
          "agfi   %%r1,32\n\t"
          "brctg  %[n],0b"
          : "+m"(*(struct { FLOAT x[n * 2]; } *) dest),[n] "+&r"(n)
          /*
           * alpha_r/alpha_i are substituted directly into vlef/vlrepf, so they
           * use "Q" (base + short displacement, no index) rather than a
           * general "m" operand.
           */
          : [dest] "a"(dest), "m"(*(const struct { FLOAT x[n * 2]; } *) src),
             [src] "a"(src),[alpha_r] "Q"(alpha_r),[alpha_i] "Q"(alpha_i)
          : "cc", "r1", "v0", "v1", "v16", "v17", "v18", "v19", "v20", "v21",
             "v22", "v23");
}
static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest,

View File

@ -31,6 +31,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
static void cgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y,
FLOAT *alpha) {
register FLOAT *ap0 = ap[0];
register FLOAT *ap1 = ap[1];
register FLOAT *ap2 = ap[2];
register FLOAT *ap3 = ap[3];
__asm__("vzero %%v16\n\t"
"vzero %%v17\n\t"
"vzero %%v18\n\t"
@ -154,20 +159,23 @@ static void cgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y,
"vfmasb %%v23,%%v19,%%v21,%%v23\n\t"
"vst %%v22,0(%[y])\n\t"
"vst %%v23,16(%[y])"
: "+m"(*(FLOAT (*)[8]) y),[n] "+&r"(n)
: [y] "a"(y), "m"(*(const FLOAT (*)[n * 2]) ap[0]),[ap0] "a"(ap[0]),
"m"(*(const FLOAT (*)[n * 2]) ap[1]),[ap1] "a"(ap[1]),
"m"(*(const FLOAT (*)[n * 2]) ap[2]),[ap2] "a"(ap[2]),
"m"(*(const FLOAT (*)[n * 2]) ap[3]),[ap3] "a"(ap[3]),
"m"(*(const FLOAT (*)[n * 2]) x),[x] "a"(x),
"m"(*(const FLOAT (*)[2]) alpha),[alpha] "a"(alpha)
: "cc", "r1", "v0", "v1", "v2", "v3", "v16", "v17", "v18", "v19", "v20",
"v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30",
"v31");
: "+m"(*(struct { FLOAT x[8]; } *) y),[n] "+&r"(n)
: [y] "a"(y), "m"(*(const struct { FLOAT x[n * 2]; } *) ap0),[ap0] "a"(ap0),
"m"(*(const struct { FLOAT x[n * 2]; } *) ap1),[ap1] "a"(ap1),
"m"(*(const struct { FLOAT x[n * 2]; } *) ap2),[ap2] "a"(ap2),
"m"(*(const struct { FLOAT x[n * 2]; } *) ap3),[ap3] "a"(ap3),
"m"(*(const struct { FLOAT x[n * 2]; } *) x),[x] "a"(x),
"m"(*(const struct { FLOAT x[2]; } *) alpha),[alpha] "a"(alpha)
: "cc", "r1", "v0", "v1", "v2", "v3", "v16", "v17", "v18", "v19", "v20",
"v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30",
"v31");
}
static void cgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y,
FLOAT *alpha) {
register FLOAT *ap0 = ap[0];
register FLOAT *ap1 = ap[1];
__asm__("vzero %%v16\n\t"
"vzero %%v17\n\t"
"vzero %%v18\n\t"
@ -263,13 +271,13 @@ static void cgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y,
"vfmasb %%v20,%%v16,%%v18,%%v20\n\t"
"vfmasb %%v20,%%v17,%%v19,%%v20\n\t"
"vst %%v20,0(%[y])"
: "+m"(*(FLOAT (*)[4]) y),[n] "+&r"(n)
: [y] "a"(y), "m"(*(const FLOAT (*)[n * 2]) ap[0]),[ap0] "a"(ap[0]),
"m"(*(const FLOAT (*)[n * 2]) ap[1]),[ap1] "a"(ap[1]),
"m"(*(const FLOAT (*)[n * 2]) x),[x] "a"(x),
"m"(*(const FLOAT (*)[2]) alpha),[alpha] "a"(alpha)
: "cc", "r1", "v0", "v1", "v2", "v3", "v16", "v17", "v18", "v19", "v20",
"v21", "v22", "v23");
: "+m"(*(struct { FLOAT x[4]; } *) y),[n] "+&r"(n)
: [y] "a"(y), "m"(*(const struct { FLOAT x[n * 2]; } *) ap0),[ap0] "a"(ap0),
"m"(*(const struct { FLOAT x[n * 2]; } *) ap1),[ap1] "a"(ap1),
"m"(*(const struct { FLOAT x[n * 2]; } *) x),[x] "a"(x),
"m"(*(const struct { FLOAT x[2]; } *) alpha),[alpha] "a"(alpha)
: "cc", "r1", "v0", "v1", "v2", "v3", "v16", "v17", "v18", "v19", "v20",
"v21", "v22", "v23");
}
static void cgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y,
@ -353,11 +361,11 @@ static void cgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y,
"vfmasb %%v0,%%v16,%%v18,%%v0\n\t"
"vfmasb %%v0,%%v17,%%v19,%%v0\n\t"
"vsteg %%v0,0(%[y]),0"
: "+m"(*(FLOAT (*)[2]) y),[n] "+&r"(n)
: [y] "a"(y), "m"(*(const FLOAT (*)[n * 2]) ap),[ap] "a"(ap),
"m"(*(const FLOAT (*)[n * 2]) x),[x] "a"(x),
"m"(*(const FLOAT (*)[2]) alpha),[alpha] "a"(alpha)
: "cc", "r1", "v0", "v1", "v2", "v3", "v16", "v17", "v18", "v19");
: "+m"(*(struct { FLOAT x[2]; } *) y),[n] "+&r"(n)
: [y] "a"(y), "m"(*(const struct { FLOAT x[n * 2]; } *) ap),[ap] "a"(ap),
"m"(*(const struct { FLOAT x[n * 2]; } *) x),[x] "a"(x),
"m"(*(const struct { FLOAT x[2]; } *) alpha),[alpha] "a"(alpha)
: "cc", "r1", "v0", "v1", "v2", "v3", "v16", "v17", "v18", "v19");
}
static void copy_x(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_src) {

View File

@ -29,151 +29,152 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
/*
 * Plane rotation applied element-wise to two single-precision vectors:
 *
 *     x[i] = c * x[i] + s * y[i]
 *     y[i] = c * y[i] - s * x[i]     (using the x[i] value from before the update)
 *
 * n    - number of complex entries (2 floats each); the loop runs n >> 5 times
 *        and handles 256 bytes of each vector per pass, so callers are
 *        expected to pass a multiple of 32 with n >= 32 (the counter is
 *        decremented before it is tested).
 * x, y - vectors of n complex entries, both updated in place.
 * c, s - pointers to the real rotation coefficients.
 */
static void crot_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s) {
  __asm__(
          /* v0 = c and v1 = s replicated into every element */
          "vlrepf %%v0,%[c]\n\t"
          "vlrepf %%v1,%[s]\n\t"
          "srlg %[n],%[n],5\n\t"
          "xgr   %%r1,%%r1\n\t"
          "0:\n\t"
          "pfd 2, 1024(%%r1,%[x])\n\t"
          "pfd 2, 1024(%%r1,%[y])\n\t"
          /* bytes 0..63 */
          "vl  %%v24, 0(%%r1,%[x])\n\t"
          "vl  %%v25, 16(%%r1,%[x])\n\t"
          "vl  %%v26, 32(%%r1,%[x])\n\t"
          "vl  %%v27, 48(%%r1,%[x])\n\t"
          "vl  %%v16, 0(%%r1,%[y])\n\t"
          "vl  %%v17, 16(%%r1,%[y])\n\t"
          "vl  %%v18, 32(%%r1,%[y])\n\t"
          "vl  %%v19, 48(%%r1,%[y])\n\t"
          "vfmsb %%v28,%%v24,%%v0\n\t"
          "vfmsb %%v29,%%v25,%%v0\n\t"
          "vfmsb %%v20,%%v24,%%v1\n\t"  /* yn=x*s */
          "vfmsb %%v21,%%v25,%%v1\n\t"  /* yn=x*s */
          "vfmsb %%v30,%%v26,%%v0\n\t"
          "vfmsb %%v22,%%v26,%%v1\n\t"  /* yn=x*s */
          "vfmsb %%v31,%%v27,%%v0\n\t"
          "vfmsb %%v23,%%v27,%%v1\n\t"  /* yn=x*s */
          /* 2nd parts */
          "vfmasb %%v28,%%v16,%%v1,%%v28\n\t"
          "vfmssb %%v20,%%v16,%%v0,%%v20\n\t"  /* yn=y*c-yn */
          "vfmasb %%v29,%%v17,%%v1,%%v29\n\t"
          "vfmssb %%v21,%%v17,%%v0,%%v21\n\t"  /* yn=y*c-yn */
          "vfmasb %%v30,%%v18,%%v1,%%v30\n\t"
          "vfmssb %%v22,%%v18,%%v0,%%v22\n\t"  /* yn=y*c-yn */
          "vfmasb %%v31,%%v19,%%v1,%%v31\n\t"
          "vfmssb %%v23,%%v19,%%v0,%%v23\n\t"  /* yn=y*c-yn */
          "vst  %%v28, 0(%%r1,%[x])\n\t"
          "vst  %%v29, 16(%%r1,%[x])\n\t"
          "vst  %%v30, 32(%%r1,%[x])\n\t"
          "vst  %%v31, 48(%%r1,%[x])\n\t"
          "vst  %%v20, 0(%%r1,%[y])\n\t"
          "vst  %%v21, 16(%%r1,%[y])\n\t"
          "vst  %%v22, 32(%%r1,%[y])\n\t"
          "vst  %%v23, 48(%%r1,%[y])\n\t"
          /* bytes 64..127 */
          "vl  %%v24, 64(%%r1,%[x])\n\t"
          "vl  %%v25, 80(%%r1,%[x])\n\t"
          "vl  %%v26, 96(%%r1,%[x])\n\t"
          "vl  %%v27, 112(%%r1,%[x])\n\t"
          "vl  %%v16, 64(%%r1,%[y])\n\t"
          "vl  %%v17, 80(%%r1,%[y])\n\t"
          "vl  %%v18, 96(%%r1,%[y])\n\t"
          "vl  %%v19, 112(%%r1,%[y])\n\t"
          "vfmsb %%v28,%%v24,%%v0\n\t"
          "vfmsb %%v29,%%v25,%%v0\n\t"
          "vfmsb %%v20,%%v24,%%v1\n\t"  /* yn=x*s */
          "vfmsb %%v21,%%v25,%%v1\n\t"  /* yn=x*s */
          "vfmsb %%v30,%%v26,%%v0\n\t"
          "vfmsb %%v22,%%v26,%%v1\n\t"  /* yn=x*s */
          "vfmsb %%v31,%%v27,%%v0\n\t"
          "vfmsb %%v23,%%v27,%%v1\n\t"  /* yn=x*s */
          /* 2nd parts */
          "vfmasb %%v28,%%v16,%%v1,%%v28\n\t"
          "vfmssb %%v20,%%v16,%%v0,%%v20\n\t"  /* yn=y*c-yn */
          "vfmasb %%v29,%%v17,%%v1,%%v29\n\t"
          "vfmssb %%v21,%%v17,%%v0,%%v21\n\t"  /* yn=y*c-yn */
          "vfmasb %%v30,%%v18,%%v1,%%v30\n\t"
          "vfmssb %%v22,%%v18,%%v0,%%v22\n\t"  /* yn=y*c-yn */
          "vfmasb %%v31,%%v19,%%v1,%%v31\n\t"
          "vfmssb %%v23,%%v19,%%v0,%%v23\n\t"  /* yn=y*c-yn */
          "vst  %%v28, 64(%%r1,%[x])\n\t"
          "vst  %%v29, 80(%%r1,%[x])\n\t"
          "vst  %%v30, 96(%%r1,%[x])\n\t"
          "vst  %%v31, 112(%%r1,%[x])\n\t"
          "vst  %%v20, 64(%%r1,%[y])\n\t"
          "vst  %%v21, 80(%%r1,%[y])\n\t"
          "vst  %%v22, 96(%%r1,%[y])\n\t"
          "vst  %%v23, 112(%%r1,%[y])\n\t"
          /* bytes 128..191 */
          "vl  %%v24, 128(%%r1,%[x])\n\t"
          "vl  %%v25, 144(%%r1,%[x])\n\t"
          "vl  %%v26, 160(%%r1,%[x])\n\t"
          "vl  %%v27, 176(%%r1,%[x])\n\t"
          "vl  %%v16, 128(%%r1,%[y])\n\t"
          "vl  %%v17, 144(%%r1,%[y])\n\t"
          "vl  %%v18, 160(%%r1,%[y])\n\t"
          "vl  %%v19, 176(%%r1,%[y])\n\t"
          "vfmsb %%v28,%%v24,%%v0\n\t"
          "vfmsb %%v29,%%v25,%%v0\n\t"
          "vfmsb %%v20,%%v24,%%v1\n\t"  /* yn=x*s */
          "vfmsb %%v21,%%v25,%%v1\n\t"  /* yn=x*s */
          "vfmsb %%v30,%%v26,%%v0\n\t"
          "vfmsb %%v22,%%v26,%%v1\n\t"  /* yn=x*s */
          "vfmsb %%v31,%%v27,%%v0\n\t"
          "vfmsb %%v23,%%v27,%%v1\n\t"  /* yn=x*s */
          /* 2nd parts */
          "vfmasb %%v28,%%v16,%%v1,%%v28\n\t"
          "vfmssb %%v20,%%v16,%%v0,%%v20\n\t"  /* yn=y*c-yn */
          "vfmasb %%v29,%%v17,%%v1,%%v29\n\t"
          "vfmssb %%v21,%%v17,%%v0,%%v21\n\t"  /* yn=y*c-yn */
          "vfmasb %%v30,%%v18,%%v1,%%v30\n\t"
          "vfmssb %%v22,%%v18,%%v0,%%v22\n\t"  /* yn=y*c-yn */
          "vfmasb %%v31,%%v19,%%v1,%%v31\n\t"
          "vfmssb %%v23,%%v19,%%v0,%%v23\n\t"  /* yn=y*c-yn */
          "vst  %%v28, 128(%%r1,%[x])\n\t"
          "vst  %%v29, 144(%%r1,%[x])\n\t"
          "vst  %%v30, 160(%%r1,%[x])\n\t"
          "vst  %%v31, 176(%%r1,%[x])\n\t"
          "vst  %%v20, 128(%%r1,%[y])\n\t"
          "vst  %%v21, 144(%%r1,%[y])\n\t"
          "vst  %%v22, 160(%%r1,%[y])\n\t"
          "vst  %%v23, 176(%%r1,%[y])\n\t"
          /* bytes 192..255 */
          "vl  %%v24, 192(%%r1,%[x])\n\t"
          "vl  %%v25, 208(%%r1,%[x])\n\t"
          "vl  %%v26, 224(%%r1,%[x])\n\t"
          "vl  %%v27, 240(%%r1,%[x])\n\t"
          "vl  %%v16, 192(%%r1,%[y])\n\t"
          "vl  %%v17, 208(%%r1,%[y])\n\t"
          "vl  %%v18, 224(%%r1,%[y])\n\t"
          "vl  %%v19, 240(%%r1,%[y])\n\t"
          "vfmsb %%v28,%%v24,%%v0\n\t"
          "vfmsb %%v29,%%v25,%%v0\n\t"
          "vfmsb %%v20,%%v24,%%v1\n\t"  /* yn=x*s */
          "vfmsb %%v21,%%v25,%%v1\n\t"  /* yn=x*s */
          "vfmsb %%v30,%%v26,%%v0\n\t"
          "vfmsb %%v22,%%v26,%%v1\n\t"  /* yn=x*s */
          "vfmsb %%v31,%%v27,%%v0\n\t"
          "vfmsb %%v23,%%v27,%%v1\n\t"  /* yn=x*s */
          /* 2nd parts */
          "vfmasb %%v28,%%v16,%%v1,%%v28\n\t"
          "vfmssb %%v20,%%v16,%%v0,%%v20\n\t"  /* yn=y*c-yn */
          "vfmasb %%v29,%%v17,%%v1,%%v29\n\t"
          "vfmssb %%v21,%%v17,%%v0,%%v21\n\t"  /* yn=y*c-yn */
          "vfmasb %%v30,%%v18,%%v1,%%v30\n\t"
          "vfmssb %%v22,%%v18,%%v0,%%v22\n\t"  /* yn=y*c-yn */
          "vfmasb %%v31,%%v19,%%v1,%%v31\n\t"
          "vfmssb %%v23,%%v19,%%v0,%%v23\n\t"  /* yn=y*c-yn */
          "vst  %%v28, 192(%%r1,%[x])\n\t"
          "vst  %%v29, 208(%%r1,%[x])\n\t"
          "vst  %%v30, 224(%%r1,%[x])\n\t"
          "vst  %%v31, 240(%%r1,%[x])\n\t"
          "vst  %%v20, 192(%%r1,%[y])\n\t"
          "vst  %%v21, 208(%%r1,%[y])\n\t"
          "vst  %%v22, 224(%%r1,%[y])\n\t"
          "vst  %%v23, 240(%%r1,%[y])\n\t"
          "agfi  %%r1,256\n\t"
          "brctg %[n],0b"
          : "+m"(*(struct { FLOAT x[n * 2]; } *) x),
             "+m"(*(struct { FLOAT x[n * 2]; } *) y),[n] "+&r"(n)
          /* c and s feed vlrepf directly, hence "Q" (base + short displacement) */
          : [x] "a"(x),[y] "a"(y),[c] "Q"(*c),[s] "Q"(*s)
          : "cc", "r1", "v0", "v1", "v16", "v17", "v18", "v19", "v20", "v21",
             "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30",
             "v31");
}
int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y,

View File

@ -29,171 +29,174 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
/*
 * In-place complex scaling on single-precision data: x[i] = alpha * x[i].
 *
 * n     - number of complex entries; the loop runs n >> 4 times and handles
 *         sixteen complex entries (128 bytes) per pass, so callers are
 *         expected to pass a multiple of 16 with n >= 16 (the counter is
 *         decremented before it is tested).
 * alpha - complex scale factor stored as a (re, im) float pair.
 * x     - vector of n complex entries, updated in place.
 */
static void cscal_kernel_16(BLASLONG n, FLOAT *alpha, FLOAT *x) {
  __asm__(
          /* v0 = [ar, ar, ar, ar], v1 = [-ai, ai, -ai, ai] */
          "vlrepf %%v0,0(%[alpha])\n\t"
          "vlef   %%v1,4(%[alpha]),0\n\t"
          "vlef   %%v1,4(%[alpha]),2\n\t"
          "vflcsb %%v1,%%v1\n\t"
          "vlef   %%v1,4(%[alpha]),1\n\t"
          "vlef   %%v1,4(%[alpha]),3\n\t"
          "srlg %[n],%[n],4\n\t"
          "xgr   %%r1,%%r1\n\t"
          "0:\n\t"
          "pfd 2, 1024(%%r1,%[x])\n\t"
          "vl    %%v16,0(%%r1,%[x])\n\t"
          "vl    %%v17,16(%%r1,%[x])\n\t"
          "vl    %%v18,32(%%r1,%[x])\n\t"
          "vl    %%v19,48(%%r1,%[x])\n\t"
          "vl    %%v20,64(%%r1,%[x])\n\t"
          "vl    %%v21,80(%%r1,%[x])\n\t"
          "vl    %%v22,96(%%r1,%[x])\n\t"
          "vl    %%v23,112(%%r1,%[x])\n\t"
          /* rotating each doubleword by 32 bits swaps re and im of every entry */
          "verllg %%v24,%%v16,32\n\t"
          "verllg %%v25,%%v17,32\n\t"
          "verllg %%v26,%%v18,32\n\t"
          "verllg %%v27,%%v19,32\n\t"
          "verllg %%v28,%%v20,32\n\t"
          "verllg %%v29,%%v21,32\n\t"
          "verllg %%v30,%%v22,32\n\t"
          "verllg %%v31,%%v23,32\n\t"
          /* x * ar */
          "vfmsb  %%v16,%%v16,%%v0\n\t"
          "vfmsb  %%v17,%%v17,%%v0\n\t"
          "vfmsb  %%v18,%%v18,%%v0\n\t"
          "vfmsb  %%v19,%%v19,%%v0\n\t"
          "vfmsb  %%v20,%%v20,%%v0\n\t"
          "vfmsb  %%v21,%%v21,%%v0\n\t"
          "vfmsb  %%v22,%%v22,%%v0\n\t"
          "vfmsb  %%v23,%%v23,%%v0\n\t"
          /* + swapped(x) * [-ai, ai, ...] */
          "vfmasb %%v16,%%v24,%%v1,%%v16\n\t"
          "vfmasb %%v17,%%v25,%%v1,%%v17\n\t"
          "vfmasb %%v18,%%v26,%%v1,%%v18\n\t"
          "vfmasb %%v19,%%v27,%%v1,%%v19\n\t"
          "vfmasb %%v20,%%v28,%%v1,%%v20\n\t"
          "vfmasb %%v21,%%v29,%%v1,%%v21\n\t"
          "vfmasb %%v22,%%v30,%%v1,%%v22\n\t"
          "vfmasb %%v23,%%v31,%%v1,%%v23\n\t"
          "vst    %%v16,0(%%r1,%[x])\n\t"
          "vst    %%v17,16(%%r1,%[x])\n\t"
          "vst    %%v18,32(%%r1,%[x])\n\t"
          "vst    %%v19,48(%%r1,%[x])\n\t"
          "vst    %%v20,64(%%r1,%[x])\n\t"
          "vst    %%v21,80(%%r1,%[x])\n\t"
          "vst    %%v22,96(%%r1,%[x])\n\t"
          "vst    %%v23,112(%%r1,%[x])\n\t"
          "agfi   %%r1,128\n\t"
          "brctg  %[n],0b"
          : "+m"(*(struct { FLOAT x[n * 2]; } *) x),[n] "+&r"(n)
          : [x] "a"(x), "m"(*(const struct { FLOAT x[2]; } *) alpha),
             [alpha] "a"(alpha)
          : "cc", "r1", "v0", "v1", "v16", "v17", "v18", "v19", "v20", "v21",
             "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30",
             "v31");
}
/*
 * Scale x in place by an alpha whose first FLOAT is not used: only the FLOAT
 * at byte offset 4 of alpha (call it ai) is ever loaded.
 *
 * v0 is built as (-ai, ai, -ai, ai).  Every loaded vector has each 64-bit
 * element rotated by 32 bits (swapping its two FLOAT halves) and is then
 * multiplied element-wise by v0, so a pair (a, b) becomes (-ai * b, ai * a).
 * Presumably the pairs are interleaved (re, im) values -- confirm with caller.
 *
 * n is shifted right by 4 to form the loop count; each iteration covers
 * 128 bytes of x.  The count is only tested at the bottom of the loop
 * (brctg), so a count of zero is not handled here.
 */
static void cscal_kernel_16_zero_r(BLASLONG n, FLOAT *alpha, FLOAT *x) {
  __asm__("vlef %%v0,4(%[alpha]),0\n\t"
          "vlef %%v0,4(%[alpha]),2\n\t"
          /* negate, then overwrite elements 1 and 3 with the positive value */
          "vflcsb %%v0,%%v0\n\t"
          "vlef %%v0,4(%[alpha]),1\n\t"
          "vlef %%v0,4(%[alpha]),3\n\t"
          "srlg %[n],%[n],4\n\t"
          "xgr %%r1,%%r1\n\t"
          "0:\n\t"
          "pfd 2, 1024(%%r1,%[x])\n\t"
          "vl %%v16,0(%%r1,%[x])\n\t"
          "vl %%v17,16(%%r1,%[x])\n\t"
          "vl %%v18,32(%%r1,%[x])\n\t"
          "vl %%v19,48(%%r1,%[x])\n\t"
          "vl %%v20,64(%%r1,%[x])\n\t"
          "vl %%v21,80(%%r1,%[x])\n\t"
          "vl %%v22,96(%%r1,%[x])\n\t"
          "vl %%v23,112(%%r1,%[x])\n\t"
          /* swap the two 32-bit halves of every 64-bit element */
          "verllg %%v16,%%v16,32\n\t"
          "verllg %%v17,%%v17,32\n\t"
          "verllg %%v18,%%v18,32\n\t"
          "verllg %%v19,%%v19,32\n\t"
          "verllg %%v20,%%v20,32\n\t"
          "verllg %%v21,%%v21,32\n\t"
          "verllg %%v22,%%v22,32\n\t"
          "verllg %%v23,%%v23,32\n\t"
          "vfmsb %%v16,%%v16,%%v0\n\t"
          "vfmsb %%v17,%%v17,%%v0\n\t"
          "vfmsb %%v18,%%v18,%%v0\n\t"
          "vfmsb %%v19,%%v19,%%v0\n\t"
          "vfmsb %%v20,%%v20,%%v0\n\t"
          "vfmsb %%v21,%%v21,%%v0\n\t"
          "vfmsb %%v22,%%v22,%%v0\n\t"
          "vfmsb %%v23,%%v23,%%v0\n\t"
          "vst %%v16,0(%%r1,%[x])\n\t"
          "vst %%v17,16(%%r1,%[x])\n\t"
          "vst %%v18,32(%%r1,%[x])\n\t"
          "vst %%v19,48(%%r1,%[x])\n\t"
          "vst %%v20,64(%%r1,%[x])\n\t"
          "vst %%v21,80(%%r1,%[x])\n\t"
          "vst %%v22,96(%%r1,%[x])\n\t"
          "vst %%v23,112(%%r1,%[x])\n\t"
          "agfi %%r1,128\n\t"
          "brctg %[n],0b"
          /* the struct casts tell the compiler which bytes the asm touches */
          : "+m"(*(struct { FLOAT x[n * 2]; } *) x),[n] "+&r"(n)
          : [x] "a"(x), "m"(*(const struct { FLOAT x[2]; } *) alpha),
            [alpha] "a"(alpha)
          : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22",
            "v23");
}
/*
 * Scale x in place by the first FLOAT of alpha; the second FLOAT of alpha is
 * never loaded.
 *
 * The FLOAT at alpha[0] is replicated into every element of v0 and each
 * 16-byte vector of x is multiplied element-wise by it.
 *
 * n is shifted right by 4 to form the loop count; each iteration covers
 * 128 bytes of x.  The count is only tested at the bottom of the loop
 * (brctg), so a count of zero is not handled here.
 */
static void cscal_kernel_16_zero_i(BLASLONG n, FLOAT *alpha, FLOAT *x) {
  __asm__("vlrepf %%v0,0(%[alpha])\n\t"
          "srlg %[n],%[n],4\n\t"
          "xgr %%r1,%%r1\n\t"
          "0:\n\t"
          "pfd 2, 1024(%%r1,%[x])\n\t"
          "vl %%v16,0(%%r1,%[x])\n\t"
          "vl %%v17,16(%%r1,%[x])\n\t"
          "vl %%v18,32(%%r1,%[x])\n\t"
          "vl %%v19,48(%%r1,%[x])\n\t"
          "vl %%v20,64(%%r1,%[x])\n\t"
          "vl %%v21,80(%%r1,%[x])\n\t"
          "vl %%v22,96(%%r1,%[x])\n\t"
          "vl %%v23,112(%%r1,%[x])\n\t"
          "vfmsb %%v16,%%v16,%%v0\n\t"
          "vfmsb %%v17,%%v17,%%v0\n\t"
          "vfmsb %%v18,%%v18,%%v0\n\t"
          "vfmsb %%v19,%%v19,%%v0\n\t"
          "vfmsb %%v20,%%v20,%%v0\n\t"
          "vfmsb %%v21,%%v21,%%v0\n\t"
          "vfmsb %%v22,%%v22,%%v0\n\t"
          "vfmsb %%v23,%%v23,%%v0\n\t"
          "vst %%v16,0(%%r1,%[x])\n\t"
          "vst %%v17,16(%%r1,%[x])\n\t"
          "vst %%v18,32(%%r1,%[x])\n\t"
          "vst %%v19,48(%%r1,%[x])\n\t"
          "vst %%v20,64(%%r1,%[x])\n\t"
          "vst %%v21,80(%%r1,%[x])\n\t"
          "vst %%v22,96(%%r1,%[x])\n\t"
          "vst %%v23,112(%%r1,%[x])\n\t"
          "agfi %%r1,128\n\t"
          "brctg %[n],0b"
          /* the struct casts tell the compiler which bytes the asm touches */
          : "+m"(*(struct { FLOAT x[n * 2]; } *) x),[n] "+&r"(n)
          : [x] "a"(x), "m"(*(const struct { FLOAT x[2]; } *) alpha),
            [alpha] "a"(alpha)
          : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22",
            "v23");
}
/*
 * Overwrite x with zeros, 128 bytes (eight 16-byte stores of a zeroed v0)
 * per loop iteration.
 *
 * x is declared as a pure output ("=m") because nothing is loaded from it.
 * n is shifted right by 4 to form the loop count; the count is only tested
 * at the bottom of the loop (brctg), so a count of zero is not handled here.
 */
static void cscal_kernel_16_zero(BLASLONG n, FLOAT *x) {
  __asm__("vzero %%v0\n\t"
          "srlg %[n],%[n],4\n\t"
          "xgr %%r1,%%r1\n\t"
          "0:\n\t"
          "pfd 2, 1024(%%r1,%[x])\n\t"
          "vst %%v0,0(%%r1,%[x])\n\t"
          "vst %%v0,16(%%r1,%[x])\n\t"
          "vst %%v0,32(%%r1,%[x])\n\t"
          "vst %%v0,48(%%r1,%[x])\n\t"
          "vst %%v0,64(%%r1,%[x])\n\t"
          "vst %%v0,80(%%r1,%[x])\n\t"
          "vst %%v0,96(%%r1,%[x])\n\t"
          "vst %%v0,112(%%r1,%[x])\n\t"
          "agfi %%r1,128\n\t"
          "brctg %[n],0b"
          : "=m"(*(struct { FLOAT x[n * 2]; } *) x),[n] "+&r"(n)
          : [x] "a"(x)
          : "cc", "r1", "v0");
}
static void cscal_kernel_inc_8(BLASLONG n, FLOAT *alpha, FLOAT *x,

View File

@ -29,81 +29,82 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
/*
 * Exchange the contents of x and y, 256 bytes of each per loop iteration.
 *
 * Per iteration: the 256-byte slice of x is held in v16-v31; the matching
 * slice of y is copied into x in two 128-byte halves through v0-v7; finally
 * v16-v31 are stored to y.
 *
 * n is shifted right by 5 to form the loop count.  The count is only tested
 * at the bottom of the loop (brctg), so a count of zero is not handled here.
 * Both arrays are declared as read-write memory operands of n * 2 FLOATs.
 */
static void cswap_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y) {
  __asm__("srlg %[n],%[n],5\n\t"
          "xgr %%r1,%%r1\n\t"
          "0:\n\t"
          "pfd 2, 1024(%%r1,%[x])\n\t"
          "pfd 2, 1024(%%r1,%[y])\n\t"
          /* keep the whole x slice in registers */
          "vl %%v16, 0(%%r1,%[x])\n\t"
          "vl %%v17, 16(%%r1,%[x])\n\t"
          "vl %%v18, 32(%%r1,%[x])\n\t"
          "vl %%v19, 48(%%r1,%[x])\n\t"
          "vl %%v20, 64(%%r1,%[x])\n\t"
          "vl %%v21, 80(%%r1,%[x])\n\t"
          "vl %%v22, 96(%%r1,%[x])\n\t"
          "vl %%v23, 112(%%r1,%[x])\n\t"
          "vl %%v24, 128(%%r1,%[x])\n\t"
          "vl %%v25, 144(%%r1,%[x])\n\t"
          "vl %%v26, 160(%%r1,%[x])\n\t"
          "vl %%v27, 176(%%r1,%[x])\n\t"
          "vl %%v28, 192(%%r1,%[x])\n\t"
          "vl %%v29, 208(%%r1,%[x])\n\t"
          "vl %%v30, 224(%%r1,%[x])\n\t"
          "vl %%v31, 240(%%r1,%[x])\n\t"
          /* first half of y -> x */
          "vl %%v0, 0(%%r1,%[y])\n\t"
          "vl %%v1, 16(%%r1,%[y])\n\t"
          "vl %%v2, 32(%%r1,%[y])\n\t"
          "vl %%v3, 48(%%r1,%[y])\n\t"
          "vl %%v4, 64(%%r1,%[y])\n\t"
          "vl %%v5, 80(%%r1,%[y])\n\t"
          "vl %%v6, 96(%%r1,%[y])\n\t"
          "vl %%v7, 112(%%r1,%[y])\n\t"
          "vst %%v0, 0(%%r1,%[x])\n\t"
          "vst %%v1, 16(%%r1,%[x])\n\t"
          "vst %%v2, 32(%%r1,%[x])\n\t"
          "vst %%v3, 48(%%r1,%[x])\n\t"
          "vst %%v4, 64(%%r1,%[x])\n\t"
          "vst %%v5, 80(%%r1,%[x])\n\t"
          "vst %%v6, 96(%%r1,%[x])\n\t"
          "vst %%v7, 112(%%r1,%[x])\n\t"
          /* second half of y -> x */
          "vl %%v0, 128(%%r1,%[y])\n\t"
          "vl %%v1, 144(%%r1,%[y])\n\t"
          "vl %%v2, 160(%%r1,%[y])\n\t"
          "vl %%v3, 176(%%r1,%[y])\n\t"
          "vl %%v4, 192(%%r1,%[y])\n\t"
          "vl %%v5, 208(%%r1,%[y])\n\t"
          "vl %%v6, 224(%%r1,%[y])\n\t"
          "vl %%v7, 240(%%r1,%[y])\n\t"
          "vst %%v0, 128(%%r1,%[x])\n\t"
          "vst %%v1, 144(%%r1,%[x])\n\t"
          "vst %%v2, 160(%%r1,%[x])\n\t"
          "vst %%v3, 176(%%r1,%[x])\n\t"
          "vst %%v4, 192(%%r1,%[x])\n\t"
          "vst %%v5, 208(%%r1,%[x])\n\t"
          "vst %%v6, 224(%%r1,%[x])\n\t"
          "vst %%v7, 240(%%r1,%[x])\n\t"
          /* saved x slice -> y */
          "vst %%v16, 0(%%r1,%[y])\n\t"
          "vst %%v17, 16(%%r1,%[y])\n\t"
          "vst %%v18, 32(%%r1,%[y])\n\t"
          "vst %%v19, 48(%%r1,%[y])\n\t"
          "vst %%v20, 64(%%r1,%[y])\n\t"
          "vst %%v21, 80(%%r1,%[y])\n\t"
          "vst %%v22, 96(%%r1,%[y])\n\t"
          "vst %%v23, 112(%%r1,%[y])\n\t"
          "vst %%v24, 128(%%r1,%[y])\n\t"
          "vst %%v25, 144(%%r1,%[y])\n\t"
          "vst %%v26, 160(%%r1,%[y])\n\t"
          "vst %%v27, 176(%%r1,%[y])\n\t"
          "vst %%v28, 192(%%r1,%[y])\n\t"
          "vst %%v29, 208(%%r1,%[y])\n\t"
          "vst %%v30, 224(%%r1,%[y])\n\t"
          "vst %%v31, 240(%%r1,%[y])\n\t"
          "agfi %%r1,256\n\t"
          "brctg %[n],0b"
          : "+m"(*(struct { FLOAT x[n * 2]; } *) x),
            "+m"(*(struct { FLOAT x[n * 2]; } *) y),[n] "+&r"(n)
          : [x] "a"(x),[y] "a"(y)
          : "cc", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16",
            "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26",
            "v27", "v28", "v29", "v30", "v31");
}
int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3,

View File

@ -34,51 +34,51 @@ static FLOAT damax_kernel_32(BLASLONG n, FLOAT *x) {
/*
 * Body of damax_kernel_32 (vfmaxdb variant).
 *
 * v0 is seeded with the first 16 bytes of x.  Each loop iteration loads
 * 256 bytes of x into v16-v31, reduces them pairwise with vfmaxdb down to
 * v16 and folds that into v0.  After the loop the two 64-bit elements of v0
 * are combined with wfmaxdb and lpdr writes the positive value of the result
 * to amax (f0 is read as the leading element of v0).
 *
 * NOTE(review): every vfmaxdb/wfmaxdb uses function code 8, presumably the
 * compare-by-magnitude form -- confirm against the Principles of Operation.
 *
 * n is shifted right by 5 to form the loop count; the count is only tested
 * at the bottom of the loop (brctg), so a count of zero is not handled here.
 */
FLOAT amax;
__asm__("vl %%v0,0(%[x])\n\t"
        "srlg %[n],%[n],5\n\t"
        "xgr %%r1,%%r1\n\t"
        "0:\n\t"
        "pfd 1, 1024(%%r1,%[x])\n\t"
        "vl %%v16,0(%%r1,%[x])\n\t"
        "vl %%v17,16(%%r1,%[x])\n\t"
        "vl %%v18,32(%%r1,%[x])\n\t"
        "vl %%v19,48(%%r1,%[x])\n\t"
        "vl %%v20,64(%%r1,%[x])\n\t"
        "vl %%v21,80(%%r1,%[x])\n\t"
        "vl %%v22,96(%%r1,%[x])\n\t"
        "vl %%v23,112(%%r1,%[x])\n\t"
        "vl %%v24,128(%%r1,%[x])\n\t"
        "vl %%v25,144(%%r1,%[x])\n\t"
        "vl %%v26,160(%%r1,%[x])\n\t"
        "vl %%v27,176(%%r1,%[x])\n\t"
        "vl %%v28,192(%%r1,%[x])\n\t"
        "vl %%v29,208(%%r1,%[x])\n\t"
        "vl %%v30,224(%%r1,%[x])\n\t"
        "vl %%v31,240(%%r1,%[x])\n\t"
        /* 16 -> 8 -> 4 -> 2 -> 1 vectors, then fold into the running v0 */
        "vfmaxdb %%v16,%%v16,%%v24,8\n\t"
        "vfmaxdb %%v17,%%v17,%%v25,8\n\t"
        "vfmaxdb %%v18,%%v18,%%v26,8\n\t"
        "vfmaxdb %%v19,%%v19,%%v27,8\n\t"
        "vfmaxdb %%v20,%%v20,%%v28,8\n\t"
        "vfmaxdb %%v21,%%v21,%%v29,8\n\t"
        "vfmaxdb %%v22,%%v22,%%v30,8\n\t"
        "vfmaxdb %%v23,%%v23,%%v31,8\n\t"
        "vfmaxdb %%v16,%%v16,%%v20,8\n\t"
        "vfmaxdb %%v17,%%v17,%%v21,8\n\t"
        "vfmaxdb %%v18,%%v18,%%v22,8\n\t"
        "vfmaxdb %%v19,%%v19,%%v23,8\n\t"
        "vfmaxdb %%v16,%%v16,%%v18,8\n\t"
        "vfmaxdb %%v17,%%v17,%%v19,8\n\t"
        "vfmaxdb %%v16,%%v16,%%v17,8\n\t"
        "vfmaxdb %%v0,%%v0,%%v16,8\n\t"
        "agfi %%r1, 256\n\t"
        "brctg %[n], 0b\n\t"
        "vrepg %%v16,%%v0,1\n\t"
        "wfmaxdb %%v0,%%v0,%%v16,8\n\t"
        "lpdr %[amax],%%f0"
        : [amax] "=f"(amax),[n] "+&r"(n)
        : "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x)
        : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22",
          "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31");
return amax;
}

View File

@ -34,85 +34,85 @@ static FLOAT damax_kernel_32(BLASLONG n, FLOAT *x) {
/*
 * Body of damax_kernel_32 (vflpdb / vfchdb / vsel variant).
 *
 * v0 is seeded with the positive values of the first 16 bytes of x.  Each
 * loop iteration handles 256 bytes of x as two 128-byte halves: the eight
 * vectors of a half are made positive (vflpdb) and reduced by a tournament
 * in which vfchdb builds an "a > b" mask and vsel keeps a where the mask is
 * set and b elsewhere; the winner is then merged into v0 the same way.
 * After the loop the two 64-bit elements of v0 are compared once more and
 * the result is copied from f0 (read as the leading element of v0) to amax.
 *
 * n is shifted right by 5 to form the loop count; the count is only tested
 * at the bottom of the loop (brctg), so a count of zero is not handled here.
 */
FLOAT amax;
__asm__("vl %%v0,0(%[x])\n\t"
        "vflpdb %%v0,%%v0\n\t"
        "srlg %[n],%[n],5\n\t"
        "xgr %%r1,%%r1\n\t"
        "0:\n\t"
        "pfd 1, 1024(%%r1,%[x])\n\t"
        /* first 128-byte half */
        "vl %%v16,0(%%r1,%[x])\n\t"
        "vl %%v17,16(%%r1,%[x])\n\t"
        "vl %%v18,32(%%r1,%[x])\n\t"
        "vl %%v19,48(%%r1,%[x])\n\t"
        "vl %%v20,64(%%r1,%[x])\n\t"
        "vl %%v21,80(%%r1,%[x])\n\t"
        "vl %%v22,96(%%r1,%[x])\n\t"
        "vl %%v23,112(%%r1,%[x])\n\t"
        "vflpdb %%v16, %%v16\n\t"
        "vflpdb %%v17, %%v17\n\t"
        "vflpdb %%v18, %%v18\n\t"
        "vflpdb %%v19, %%v19\n\t"
        "vflpdb %%v20, %%v20\n\t"
        "vflpdb %%v21, %%v21\n\t"
        "vflpdb %%v22, %%v22\n\t"
        "vflpdb %%v23, %%v23\n\t"
        "vfchdb %%v24,%%v16,%%v17\n\t"
        "vfchdb %%v25,%%v18,%%v19\n\t"
        "vfchdb %%v26,%%v20,%%v21\n\t"
        "vfchdb %%v27,%%v22,%%v23\n\t"
        "vsel %%v24,%%v16,%%v17,%%v24\n\t"
        "vsel %%v25,%%v18,%%v19,%%v25\n\t"
        "vsel %%v26,%%v20,%%v21,%%v26\n\t"
        "vsel %%v27,%%v22,%%v23,%%v27\n\t"
        "vfchdb %%v28,%%v24,%%v25\n\t"
        "vfchdb %%v29,%%v26,%%v27\n\t"
        "vsel %%v28,%%v24,%%v25,%%v28\n\t"
        "vsel %%v29,%%v26,%%v27,%%v29\n\t"
        "vfchdb %%v30,%%v28,%%v29\n\t"
        "vsel %%v30,%%v28,%%v29,%%v30\n\t"
        "vfchdb %%v31,%%v30,%%v0\n\t"
        "vsel %%v0,%%v30,%%v0,%%v31\n\t"
        /* second 128-byte half */
        "vl %%v16,128(%%r1,%[x])\n\t"
        "vl %%v17,144(%%r1,%[x])\n\t"
        "vl %%v18,160(%%r1,%[x])\n\t"
        "vl %%v19,176(%%r1,%[x])\n\t"
        "vl %%v20,192(%%r1,%[x])\n\t"
        "vl %%v21,208(%%r1,%[x])\n\t"
        "vl %%v22,224(%%r1,%[x])\n\t"
        "vl %%v23,240(%%r1,%[x])\n\t"
        "vflpdb %%v16, %%v16\n\t"
        "vflpdb %%v17, %%v17\n\t"
        "vflpdb %%v18, %%v18\n\t"
        "vflpdb %%v19, %%v19\n\t"
        "vflpdb %%v20, %%v20\n\t"
        "vflpdb %%v21, %%v21\n\t"
        "vflpdb %%v22, %%v22\n\t"
        "vflpdb %%v23, %%v23\n\t"
        "vfchdb %%v24,%%v16,%%v17\n\t"
        "vfchdb %%v25,%%v18,%%v19\n\t"
        "vfchdb %%v26,%%v20,%%v21\n\t"
        "vfchdb %%v27,%%v22,%%v23\n\t"
        "vsel %%v24,%%v16,%%v17,%%v24\n\t"
        "vsel %%v25,%%v18,%%v19,%%v25\n\t"
        "vsel %%v26,%%v20,%%v21,%%v26\n\t"
        "vsel %%v27,%%v22,%%v23,%%v27\n\t"
        "vfchdb %%v28,%%v24,%%v25\n\t"
        "vfchdb %%v29,%%v26,%%v27\n\t"
        "vsel %%v28,%%v24,%%v25,%%v28\n\t"
        "vsel %%v29,%%v26,%%v27,%%v29\n\t"
        "vfchdb %%v30,%%v28,%%v29\n\t"
        "vsel %%v30,%%v28,%%v29,%%v30\n\t"
        "vfchdb %%v31,%%v30,%%v0\n\t"
        "vsel %%v0,%%v30,%%v0,%%v31\n\t"
        "agfi %%r1, 256\n\t"
        "brctg %[n], 0b\n\t"
        /* combine the two elements of v0 */
        "vrepg %%v16,%%v0,1\n\t"
        "wfchdb %%v17,%%v0,%%v16\n\t"
        "vsel %%v0,%%v0,%%v16,%%v17\n\t"
        "ldr %[amax],%%f0"
        : [amax] "=f"(amax),[n] "+&r"(n)
        : "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x)
        : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22",
          "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31");
return amax;
}

View File

@ -34,51 +34,51 @@ static FLOAT damin_kernel_32(BLASLONG n, FLOAT *x) {
/*
 * Body of damin_kernel_32 (vfmindb variant).
 *
 * v0 is seeded with the first 16 bytes of x.  Each loop iteration loads
 * 256 bytes of x into v16-v31, reduces them pairwise with vfmindb down to
 * v16 and folds that into v0.  After the loop the two 64-bit elements of v0
 * are combined with wfmindb and lpdr writes the positive value of the result
 * to amin (f0 is read as the leading element of v0).
 *
 * NOTE(review): every vfmindb/wfmindb uses function code 8, presumably the
 * compare-by-magnitude form -- confirm against the Principles of Operation.
 *
 * n is shifted right by 5 to form the loop count; the count is only tested
 * at the bottom of the loop (brctg), so a count of zero is not handled here.
 */
FLOAT amin;
__asm__("vl %%v0,0(%[x])\n\t"
        "srlg %[n],%[n],5\n\t"
        "xgr %%r1,%%r1\n\t"
        "0:\n\t"
        "pfd 1, 1024(%%r1,%[x])\n\t"
        "vl %%v16,0(%%r1,%[x])\n\t"
        "vl %%v17,16(%%r1,%[x])\n\t"
        "vl %%v18,32(%%r1,%[x])\n\t"
        "vl %%v19,48(%%r1,%[x])\n\t"
        "vl %%v20,64(%%r1,%[x])\n\t"
        "vl %%v21,80(%%r1,%[x])\n\t"
        "vl %%v22,96(%%r1,%[x])\n\t"
        "vl %%v23,112(%%r1,%[x])\n\t"
        "vl %%v24,128(%%r1,%[x])\n\t"
        "vl %%v25,144(%%r1,%[x])\n\t"
        "vl %%v26,160(%%r1,%[x])\n\t"
        "vl %%v27,176(%%r1,%[x])\n\t"
        "vl %%v28,192(%%r1,%[x])\n\t"
        "vl %%v29,208(%%r1,%[x])\n\t"
        "vl %%v30,224(%%r1,%[x])\n\t"
        "vl %%v31,240(%%r1,%[x])\n\t"
        /* 16 -> 8 -> 4 -> 2 -> 1 vectors, then fold into the running v0 */
        "vfmindb %%v16,%%v16,%%v24,8\n\t"
        "vfmindb %%v17,%%v17,%%v25,8\n\t"
        "vfmindb %%v18,%%v18,%%v26,8\n\t"
        "vfmindb %%v19,%%v19,%%v27,8\n\t"
        "vfmindb %%v20,%%v20,%%v28,8\n\t"
        "vfmindb %%v21,%%v21,%%v29,8\n\t"
        "vfmindb %%v22,%%v22,%%v30,8\n\t"
        "vfmindb %%v23,%%v23,%%v31,8\n\t"
        "vfmindb %%v16,%%v16,%%v20,8\n\t"
        "vfmindb %%v17,%%v17,%%v21,8\n\t"
        "vfmindb %%v18,%%v18,%%v22,8\n\t"
        "vfmindb %%v19,%%v19,%%v23,8\n\t"
        "vfmindb %%v16,%%v16,%%v18,8\n\t"
        "vfmindb %%v17,%%v17,%%v19,8\n\t"
        "vfmindb %%v16,%%v16,%%v17,8\n\t"
        "vfmindb %%v0,%%v0,%%v16,8\n\t"
        "agfi %%r1, 256\n\t"
        "brctg %[n], 0b\n\t"
        "vrepg %%v16,%%v0,1\n\t"
        "wfmindb %%v0,%%v0,%%v16,8\n\t"
        "lpdr %[amin],%%f0"
        : [amin] "=f"(amin),[n] "+&r"(n)
        : "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x)
        : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22",
          "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31");
return amin;
}

View File

@ -34,85 +34,85 @@ static FLOAT damin_kernel_32(BLASLONG n, FLOAT *x) {
/*
 * Body of damin_kernel_32 (vflpdb / vfchdb / vsel variant).
 *
 * v0 is seeded with the positive values of the first 16 bytes of x.  Each
 * loop iteration handles 256 bytes of x as two 128-byte halves: the eight
 * vectors of a half are made positive (vflpdb) and reduced by a tournament
 * in which vfchdb builds a "b > a" mask (operands reversed relative to the
 * following vsel) and vsel keeps a where the mask is set and b elsewhere;
 * the winner is then merged into v0 the same way.  After the loop the two
 * 64-bit elements of v0 are compared once more and the result is copied from
 * f0 (read as the leading element of v0) to amin.
 *
 * n is shifted right by 5 to form the loop count; the count is only tested
 * at the bottom of the loop (brctg), so a count of zero is not handled here.
 */
FLOAT amin;
__asm__("vl %%v0,0(%[x])\n\t"
        "vflpdb %%v0,%%v0\n\t"
        "srlg %[n],%[n],5\n\t"
        "xgr %%r1,%%r1\n\t"
        "0:\n\t"
        "pfd 1, 1024(%%r1,%[x])\n\t"
        /* first 128-byte half */
        "vl %%v16,0(%%r1,%[x])\n\t"
        "vl %%v17,16(%%r1,%[x])\n\t"
        "vl %%v18,32(%%r1,%[x])\n\t"
        "vl %%v19,48(%%r1,%[x])\n\t"
        "vl %%v20,64(%%r1,%[x])\n\t"
        "vl %%v21,80(%%r1,%[x])\n\t"
        "vl %%v22,96(%%r1,%[x])\n\t"
        "vl %%v23,112(%%r1,%[x])\n\t"
        "vflpdb %%v16, %%v16\n\t"
        "vflpdb %%v17, %%v17\n\t"
        "vflpdb %%v18, %%v18\n\t"
        "vflpdb %%v19, %%v19\n\t"
        "vflpdb %%v20, %%v20\n\t"
        "vflpdb %%v21, %%v21\n\t"
        "vflpdb %%v22, %%v22\n\t"
        "vflpdb %%v23, %%v23\n\t"
        "vfchdb %%v24,%%v17,%%v16\n\t"
        "vfchdb %%v25,%%v19,%%v18\n\t"
        "vfchdb %%v26,%%v21,%%v20\n\t"
        "vfchdb %%v27,%%v23,%%v22\n\t"
        "vsel %%v24,%%v16,%%v17,%%v24\n\t"
        "vsel %%v25,%%v18,%%v19,%%v25\n\t"
        "vsel %%v26,%%v20,%%v21,%%v26\n\t"
        "vsel %%v27,%%v22,%%v23,%%v27\n\t"
        "vfchdb %%v28,%%v25,%%v24\n\t"
        "vfchdb %%v29,%%v27,%%v26\n\t"
        "vsel %%v28,%%v24,%%v25,%%v28\n\t"
        "vsel %%v29,%%v26,%%v27,%%v29\n\t"
        "vfchdb %%v30,%%v29,%%v28\n\t"
        "vsel %%v30,%%v28,%%v29,%%v30\n\t"
        "vfchdb %%v31,%%v0,%%v30\n\t"
        "vsel %%v0,%%v30,%%v0,%%v31\n\t"
        /* second 128-byte half */
        "vl %%v16,128(%%r1,%[x])\n\t"
        "vl %%v17,144(%%r1,%[x])\n\t"
        "vl %%v18,160(%%r1,%[x])\n\t"
        "vl %%v19,176(%%r1,%[x])\n\t"
        "vl %%v20,192(%%r1,%[x])\n\t"
        "vl %%v21,208(%%r1,%[x])\n\t"
        "vl %%v22,224(%%r1,%[x])\n\t"
        "vl %%v23,240(%%r1,%[x])\n\t"
        "vflpdb %%v16, %%v16\n\t"
        "vflpdb %%v17, %%v17\n\t"
        "vflpdb %%v18, %%v18\n\t"
        "vflpdb %%v19, %%v19\n\t"
        "vflpdb %%v20, %%v20\n\t"
        "vflpdb %%v21, %%v21\n\t"
        "vflpdb %%v22, %%v22\n\t"
        "vflpdb %%v23, %%v23\n\t"
        "vfchdb %%v24,%%v17,%%v16\n\t"
        "vfchdb %%v25,%%v19,%%v18\n\t"
        "vfchdb %%v26,%%v21,%%v20\n\t"
        "vfchdb %%v27,%%v23,%%v22\n\t"
        "vsel %%v24,%%v16,%%v17,%%v24\n\t"
        "vsel %%v25,%%v18,%%v19,%%v25\n\t"
        "vsel %%v26,%%v20,%%v21,%%v26\n\t"
        "vsel %%v27,%%v22,%%v23,%%v27\n\t"
        "vfchdb %%v28,%%v25,%%v24\n\t"
        "vfchdb %%v29,%%v27,%%v26\n\t"
        "vsel %%v28,%%v24,%%v25,%%v28\n\t"
        "vsel %%v29,%%v26,%%v27,%%v29\n\t"
        "vfchdb %%v30,%%v29,%%v28\n\t"
        "vsel %%v30,%%v28,%%v29,%%v30\n\t"
        "vfchdb %%v31,%%v0,%%v30\n\t"
        "vsel %%v0,%%v30,%%v0,%%v31\n\t"
        "agfi %%r1, 256\n\t"
        "brctg %[n], 0b\n\t"
        /* combine the two elements of v0 */
        "vrepg %%v16,%%v0,1\n\t"
        "wfchdb %%v17,%%v16,%%v0\n\t"
        "vsel %%v0,%%v0,%%v16,%%v17\n\t"
        "ldr %[amin],%%f0"
        : [amin] "=f"(amin),[n] "+&r"(n)
        : "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x)
        : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22",
          "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31");
return amin;
}

View File

@ -34,81 +34,81 @@ static FLOAT dasum_kernel_32(BLASLONG n, FLOAT *x) {
/*
 * Body of dasum_kernel_32.
 *
 * Eight vector accumulators (v24-v31) start at zero.  Each loop iteration
 * handles 256 bytes of x as two 128-byte halves: the eight vectors of a half
 * are made positive (vflpdb) and added to the accumulators.  After the loop
 * the accumulators are summed into v24, its two 64-bit elements are added
 * together, and element 0 is stored to asum with vsteg.
 *
 * asum uses the "Q" constraint (memory operand without index register and
 * with a short displacement, per the GCC S/390 constraint list) since the
 * operand is substituted directly into the vsteg address field.
 *
 * n is shifted right by 5 to form the loop count; the count is only tested
 * at the bottom of the loop (brctg), so a count of zero is not handled here.
 */
FLOAT asum;
__asm__("vzero %%v24\n\t"
        "vzero %%v25\n\t"
        "vzero %%v26\n\t"
        "vzero %%v27\n\t"
        "vzero %%v28\n\t"
        "vzero %%v29\n\t"
        "vzero %%v30\n\t"
        "vzero %%v31\n\t"
        "srlg %[n],%[n],5\n\t"
        "xgr %%r1,%%r1\n\t"
        "0:\n\t"
        "pfd 1, 1024(%%r1,%[x])\n\t"
        /* first 128-byte half */
        "vl %%v16, 0(%%r1,%[x])\n\t"
        "vl %%v17, 16(%%r1,%[x])\n\t"
        "vl %%v18, 32(%%r1,%[x])\n\t"
        "vl %%v19, 48(%%r1,%[x])\n\t"
        "vl %%v20, 64(%%r1,%[x])\n\t"
        "vl %%v21, 80(%%r1,%[x])\n\t"
        "vl %%v22, 96(%%r1,%[x])\n\t"
        "vl %%v23, 112(%%r1,%[x])\n\t"
        "vflpdb %%v16, %%v16\n\t"
        "vflpdb %%v17, %%v17\n\t"
        "vflpdb %%v18, %%v18\n\t"
        "vflpdb %%v19, %%v19\n\t"
        "vflpdb %%v20, %%v20\n\t"
        "vflpdb %%v21, %%v21\n\t"
        "vflpdb %%v22, %%v22\n\t"
        "vflpdb %%v23, %%v23\n\t"
        "vfadb %%v24,%%v24,%%v16\n\t"
        "vfadb %%v25,%%v25,%%v17\n\t"
        "vfadb %%v26,%%v26,%%v18\n\t"
        "vfadb %%v27,%%v27,%%v19\n\t"
        "vfadb %%v28,%%v28,%%v20\n\t"
        "vfadb %%v29,%%v29,%%v21\n\t"
        "vfadb %%v30,%%v30,%%v22\n\t"
        "vfadb %%v31,%%v31,%%v23\n\t"
        /* second 128-byte half */
        "vl %%v16, 128(%%r1,%[x])\n\t"
        "vl %%v17, 144(%%r1,%[x])\n\t"
        "vl %%v18, 160(%%r1,%[x])\n\t"
        "vl %%v19, 176(%%r1,%[x])\n\t"
        "vl %%v20, 192(%%r1,%[x])\n\t"
        "vl %%v21, 208(%%r1,%[x])\n\t"
        "vl %%v22, 224(%%r1,%[x])\n\t"
        "vl %%v23, 240(%%r1,%[x])\n\t"
        "vflpdb %%v16, %%v16\n\t"
        "vflpdb %%v17, %%v17\n\t"
        "vflpdb %%v18, %%v18\n\t"
        "vflpdb %%v19, %%v19\n\t"
        "vflpdb %%v20, %%v20\n\t"
        "vflpdb %%v21, %%v21\n\t"
        "vflpdb %%v22, %%v22\n\t"
        "vflpdb %%v23, %%v23\n\t"
        "vfadb %%v24,%%v24,%%v16\n\t"
        "vfadb %%v25,%%v25,%%v17\n\t"
        "vfadb %%v26,%%v26,%%v18\n\t"
        "vfadb %%v27,%%v27,%%v19\n\t"
        "vfadb %%v28,%%v28,%%v20\n\t"
        "vfadb %%v29,%%v29,%%v21\n\t"
        "vfadb %%v30,%%v30,%%v22\n\t"
        "vfadb %%v31,%%v31,%%v23\n\t"
        "agfi %%r1,256\n\t"
        "brctg %[n],0b\n\t"
        /* horizontal reduction of the accumulators */
        "vfadb %%v24,%%v24,%%v25\n\t"
        "vfadb %%v24,%%v24,%%v26\n\t"
        "vfadb %%v24,%%v24,%%v27\n\t"
        "vfadb %%v24,%%v24,%%v28\n\t"
        "vfadb %%v24,%%v24,%%v29\n\t"
        "vfadb %%v24,%%v24,%%v30\n\t"
        "vfadb %%v24,%%v24,%%v31\n\t"
        "vrepg %%v25,%%v24,1\n\t"
        "vfadb %%v24,%%v24,%%v25\n\t"
        "vsteg %%v24,%[asum],0"
        : [asum] "=Q"(asum),[n] "+&r"(n)
        : "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x)
        : "cc", "r1", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23",
          "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31");
return asum;
}

View File

@ -29,82 +29,82 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
/*
 * daxpy_kernel_32: y[i] = alpha * x[i] + y[i], 32 elements per loop pass.
 *
 * n     - element count; only n >> 5 full passes are executed, so the low
 *         five bits of n are ignored by this kernel.
 * x     - input vector (read only).
 * y     - vector updated in place.
 * alpha - pointer to the scalar; replicated into both lanes of v0.
 *
 * The instructions used (vlrepg/vfmadb) operate on 64-bit double-precision
 * lanes, two per 16-byte vector register.
 *
 * NOTE(review): brctg decrements the counter before testing it and the loop
 * body sits ahead of the test, so a pass count of zero (n < 32) would not
 * stop after zero passes. Callers presumably guarantee n >= 32 - confirm.
 *
 * alpha uses the "Q" constraint (no index register, short displacement) and
 * the x/y memory operands are described as n-element structs so the compiler
 * knows exactly which memory the asm reads and writes.
 */
static void daxpy_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) {
  __asm__("vlrepg %%v0,%[alpha]\n\t"
          "srlg %[n],%[n],5\n\t"
          "xgr %%r1,%%r1\n\t" /* r1 = running byte offset into x and y */
          "0:\n\t"
          "pfd 1, 1024(%%r1,%[x])\n\t"
          "pfd 2, 1024(%%r1,%[y])\n\t"
          /* first 16 elements: bytes 0..127 */
          "vl %%v16,0(%%r1,%[x])\n\t"
          "vl %%v17,16(%%r1,%[x])\n\t"
          "vl %%v18,32(%%r1,%[x])\n\t"
          "vl %%v19,48(%%r1,%[x])\n\t"
          "vl %%v20,0(%%r1,%[y])\n\t"
          "vl %%v21,16(%%r1,%[y])\n\t"
          "vl %%v22,32(%%r1,%[y])\n\t"
          "vl %%v23,48(%%r1,%[y])\n\t"
          "vl %%v24,64(%%r1,%[x])\n\t"
          "vl %%v25,80(%%r1,%[x])\n\t"
          "vl %%v26,96(%%r1,%[x])\n\t"
          "vl %%v27,112(%%r1,%[x])\n\t"
          "vl %%v28,64(%%r1,%[y])\n\t"
          "vl %%v29,80(%%r1,%[y])\n\t"
          "vl %%v30,96(%%r1,%[y])\n\t"
          "vl %%v31,112(%%r1,%[y])\n\t"
          "vfmadb %%v16,%%v0,%%v16,%%v20\n\t"
          "vfmadb %%v17,%%v0,%%v17,%%v21\n\t"
          "vfmadb %%v18,%%v0,%%v18,%%v22\n\t"
          "vfmadb %%v19,%%v0,%%v19,%%v23\n\t"
          "vfmadb %%v24,%%v0,%%v24,%%v28\n\t"
          "vfmadb %%v25,%%v0,%%v25,%%v29\n\t"
          "vfmadb %%v26,%%v0,%%v26,%%v30\n\t"
          "vfmadb %%v27,%%v0,%%v27,%%v31\n\t"
          "vst %%v16,0(%%r1,%[y])\n\t"
          "vst %%v17,16(%%r1,%[y])\n\t"
          "vst %%v18,32(%%r1,%[y])\n\t"
          "vst %%v19,48(%%r1,%[y])\n\t"
          "vst %%v24,64(%%r1,%[y])\n\t"
          "vst %%v25,80(%%r1,%[y])\n\t"
          "vst %%v26,96(%%r1,%[y])\n\t"
          "vst %%v27,112(%%r1,%[y])\n\t"
          /* second 16 elements: bytes 128..255 */
          "vl %%v16,128(%%r1,%[x])\n\t"
          "vl %%v17,144(%%r1,%[x])\n\t"
          "vl %%v18,160(%%r1,%[x])\n\t"
          "vl %%v19,176(%%r1,%[x])\n\t"
          "vl %%v20,128(%%r1,%[y])\n\t"
          "vl %%v21,144(%%r1,%[y])\n\t"
          "vl %%v22,160(%%r1,%[y])\n\t"
          "vl %%v23,176(%%r1,%[y])\n\t"
          "vl %%v24,192(%%r1,%[x])\n\t"
          "vl %%v25,208(%%r1,%[x])\n\t"
          "vl %%v26,224(%%r1,%[x])\n\t"
          "vl %%v27,240(%%r1,%[x])\n\t"
          "vl %%v28,192(%%r1,%[y])\n\t"
          "vl %%v29,208(%%r1,%[y])\n\t"
          "vl %%v30,224(%%r1,%[y])\n\t"
          "vl %%v31,240(%%r1,%[y])\n\t"
          "vfmadb %%v16,%%v0,%%v16,%%v20\n\t"
          "vfmadb %%v17,%%v0,%%v17,%%v21\n\t"
          "vfmadb %%v18,%%v0,%%v18,%%v22\n\t"
          "vfmadb %%v19,%%v0,%%v19,%%v23\n\t"
          "vfmadb %%v24,%%v0,%%v24,%%v28\n\t"
          "vfmadb %%v25,%%v0,%%v25,%%v29\n\t"
          "vfmadb %%v26,%%v0,%%v26,%%v30\n\t"
          "vfmadb %%v27,%%v0,%%v27,%%v31\n\t"
          "vst %%v16,128(%%r1,%[y])\n\t"
          "vst %%v17,144(%%r1,%[y])\n\t"
          "vst %%v18,160(%%r1,%[y])\n\t"
          "vst %%v19,176(%%r1,%[y])\n\t"
          "vst %%v24,192(%%r1,%[y])\n\t"
          "vst %%v25,208(%%r1,%[y])\n\t"
          "vst %%v26,224(%%r1,%[y])\n\t"
          "vst %%v27,240(%%r1,%[y])\n\t"
          "agfi %%r1,256\n\t"
          "brctg %[n],0b"
          : "+m"(*(struct { FLOAT x[n]; } *) y),[n] "+&r"(n)
          : [y] "a"(y), "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x),
            [alpha] "Q"(*alpha)
          : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22",
            "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31");
}
int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x,

View File

@ -29,16 +29,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
/*
 * dcopy_kernel_32: copy x to y in 256-byte chunks.
 *
 * n - element count; the kernel performs n >> 5 passes, each moving 256
 *     bytes with a single mvc, so the low five bits of n are ignored.
 * x - source, advanced by 256 bytes per pass inside the asm.
 * y - destination, advanced by 256 bytes per pass inside the asm.
 *
 * x, y and n are read-write early-clobber operands because the asm modifies
 * all three; the memory operands tell the compiler that n elements of x are
 * read and n elements of y are written.
 *
 * NOTE(review): brctg decrements before testing and the body precedes the
 * test, so a pass count of zero (n < 32) would not stop after zero passes.
 * Callers presumably guarantee n >= 32 - confirm.
 */
static void dcopy_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y) {
  __asm__("srlg %[n],%[n],5\n\t"
          "0:\n\t"
          "pfd 1, 1024(%[x])\n\t"
          "pfd 2, 1024(%[y])\n\t"
          "mvc 0(256,%[y]),0(%[x])\n\t"
          "la %[x],256(%[x])\n\t"
          "la %[y],256(%[y])\n\t"
          "brctg %[n],0b"
          : "=m"(*(struct { FLOAT x[n]; } *) y),[x] "+&a"(x),[y] "+&a"(y),[n] "+&r"(n)
          : "m"(*(const struct { FLOAT x[n]; } *) x)
          : "cc");
}
int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) {

View File

@ -31,60 +31,60 @@ static FLOAT ddot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y) {
/*
 * Dot product of x and y, 16 elements per loop pass.
 *
 * Eight vector accumulators (v0..v7) are zeroed, then each pass loads 128
 * bytes from x and from y and fuses multiply-add into the accumulators.
 * After the loop the accumulators are summed into v0, its two 64-bit lanes
 * are added together (vrepg + adbr) and the scalar lands in `dot`.
 *
 * Only n >> 4 passes run, so the low four bits of n are ignored here.
 * NOTE(review): brctg decrements before testing and the body precedes the
 * test, so a pass count of zero (n < 16) would not stop after zero passes.
 * Callers presumably guarantee n >= 16 - confirm.
 */
FLOAT dot;
__asm__("vzero %%v0\n\t"
        "vzero %%v1\n\t"
        "vzero %%v2\n\t"
        "vzero %%v3\n\t"
        "vzero %%v4\n\t"
        "vzero %%v5\n\t"
        "vzero %%v6\n\t"
        "vzero %%v7\n\t"
        "srlg %[n],%[n],4\n\t"
        "xgr %%r1,%%r1\n\t" /* r1 = running byte offset into x and y */
        "0:\n\t"
        "pfd 1,1024(%%r1,%[x])\n\t"
        "pfd 1,1024(%%r1,%[y])\n\t"
        "vl %%v16,0(%%r1,%[x])\n\t"
        "vl %%v17,16(%%r1,%[x])\n\t"
        "vl %%v18,32(%%r1,%[x])\n\t"
        "vl %%v19,48(%%r1,%[x])\n\t"
        "vl %%v20,64(%%r1,%[x])\n\t"
        "vl %%v21,80(%%r1,%[x])\n\t"
        "vl %%v22,96(%%r1,%[x])\n\t"
        "vl %%v23,112(%%r1,%[x])\n\t"
        "vl %%v24,0(%%r1,%[y])\n\t"
        "vl %%v25,16(%%r1,%[y])\n\t"
        "vl %%v26,32(%%r1,%[y])\n\t"
        "vl %%v27,48(%%r1,%[y])\n\t"
        "vl %%v28,64(%%r1,%[y])\n\t"
        "vl %%v29,80(%%r1,%[y])\n\t"
        "vl %%v30,96(%%r1,%[y])\n\t"
        "vl %%v31,112(%%r1,%[y])\n\t"
        "vfmadb %%v0,%%v16,%%v24,%%v0\n\t"
        "vfmadb %%v1,%%v17,%%v25,%%v1\n\t"
        "vfmadb %%v2,%%v18,%%v26,%%v2\n\t"
        "vfmadb %%v3,%%v19,%%v27,%%v3\n\t"
        "vfmadb %%v4,%%v20,%%v28,%%v4\n\t"
        "vfmadb %%v5,%%v21,%%v29,%%v5\n\t"
        "vfmadb %%v6,%%v22,%%v30,%%v6\n\t"
        "vfmadb %%v7,%%v23,%%v31,%%v7\n\t"
        "agfi %%r1,128\n\t"
        "brctg %[n],0b\n\t"
        /* reduce the eight accumulators, then the two lanes of v0 */
        "vfadb %%v0,%%v0,%%v1\n\t"
        "vfadb %%v0,%%v0,%%v2\n\t"
        "vfadb %%v0,%%v0,%%v3\n\t"
        "vfadb %%v0,%%v0,%%v4\n\t"
        "vfadb %%v0,%%v0,%%v5\n\t"
        "vfadb %%v0,%%v0,%%v6\n\t"
        "vfadb %%v0,%%v0,%%v7\n\t"
        "vrepg %%v1,%%v0,1\n\t"
        "adbr %%f0,%%f1\n\t"
        "ldr %[dot],%%f0"
        : [dot] "=f"(dot),[n] "+&r"(n)
        : "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x),
          "m"(*(const struct { FLOAT x[n]; } *) y),[y] "a"(y)
        : "cc", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16",
          "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26",
          "v27", "v28", "v29", "v30", "v31");
return dot;
}

View File

@ -31,324 +31,334 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
/*
 * dgemv_kernel_4x4: y[i] += alpha * (x[0]*ap[0][i] + x[1]*ap[1][i]
 *                                  + x[2]*ap[2][i] + x[3]*ap[3][i])
 *
 * n     - number of elements of y (and of each ap[j]) to update.
 * ap    - array of four pointers to the input vectors.
 * x     - four scalars, one per ap[j].
 * y     - vector updated in place.
 * alpha - pointer to the scaling factor.
 *
 * x[0..3] are each replicated into a vector register and pre-multiplied by
 * alpha (v0..v3). The main loop handles 16 elements per pass for the
 * (n & -16) part; a second loop handles the (n & 12) part four elements at
 * a time. The remaining n & 3 elements are not touched by this kernel.
 *
 * The four ap pointers are copied into locals so that the asm address
 * operands and the memory operands refer to the same values; alpha uses the
 * "Q" constraint (no index register, short displacement).
 */
static void dgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y,
                             FLOAT *alpha) {
  register FLOAT *ap0 = ap[0];
  register FLOAT *ap1 = ap[1];
  register FLOAT *ap2 = ap[2];
  register FLOAT *ap3 = ap[3];

  __asm__("vlrepg %%v0,0(%[x])\n\t"
          "vlrepg %%v1,8(%[x])\n\t"
          "vlrepg %%v2,16(%[x])\n\t"
          "vlrepg %%v3,24(%[x])\n\t"
          "vlrepg %%v4,%[alpha]\n\t"
          "vfmdb %%v0,%%v0,%%v4\n\t"
          "vfmdb %%v1,%%v1,%%v4\n\t"
          "vfmdb %%v2,%%v2,%%v4\n\t"
          "vfmdb %%v3,%%v3,%%v4\n\t"
          "xgr %%r1,%%r1\n\t" /* r1 = running byte offset */
          /* main loop: r0 = (n & -16) / 16 passes, skipped when zero */
          "lghi %%r0,-16\n\t"
          "ngr %%r0,%[n]\n\t"
          "ltgr %%r0,%%r0\n\t"
          "jz 1f\n\t"
          "srlg %%r0,%%r0,4\n\t"
          "0:\n\t"
          "pfd 1,1024(%%r1,%[ap0])\n\t"
          "pfd 1,1024(%%r1,%[ap1])\n\t"
          "pfd 1,1024(%%r1,%[ap2])\n\t"
          "pfd 1,1024(%%r1,%[ap3])\n\t"
          "pfd 2,1024(%%r1,%[y])\n\t"
          "vl %%v16,0(%%r1,%[ap0])\n\t"
          "vl %%v17,0(%%r1,%[ap1])\n\t"
          "vl %%v18,0(%%r1,%[ap2])\n\t"
          "vl %%v19,0(%%r1,%[ap3])\n\t"
          "vl %%v20,16(%%r1,%[ap0])\n\t"
          "vl %%v21,16(%%r1,%[ap1])\n\t"
          "vl %%v22,16(%%r1,%[ap2])\n\t"
          "vl %%v23,16(%%r1,%[ap3])\n\t"
          "vl %%v24,32(%%r1,%[ap0])\n\t"
          "vl %%v25,32(%%r1,%[ap1])\n\t"
          "vl %%v26,32(%%r1,%[ap2])\n\t"
          "vl %%v27,32(%%r1,%[ap3])\n\t"
          "vl %%v28,48(%%r1,%[ap0])\n\t"
          "vl %%v29,48(%%r1,%[ap1])\n\t"
          "vl %%v30,48(%%r1,%[ap2])\n\t"
          "vl %%v31,48(%%r1,%[ap3])\n\t"
          "vl %%v4,0(%%r1,%[y])\n\t"
          "vl %%v5,16(%%r1,%[y])\n\t"
          "vl %%v6,32(%%r1,%[y])\n\t"
          "vl %%v7,48(%%r1,%[y])\n\t"
          "vfmadb %%v4,%%v16,%%v0,%%v4\n\t"
          "vfmadb %%v5,%%v20,%%v0,%%v5\n\t"
          "vfmadb %%v6,%%v24,%%v0,%%v6\n\t"
          "vfmadb %%v7,%%v28,%%v0,%%v7\n\t"
          "vfmadb %%v4,%%v17,%%v1,%%v4\n\t"
          "vfmadb %%v5,%%v21,%%v1,%%v5\n\t"
          "vfmadb %%v6,%%v25,%%v1,%%v6\n\t"
          "vfmadb %%v7,%%v29,%%v1,%%v7\n\t"
          "vfmadb %%v4,%%v18,%%v2,%%v4\n\t"
          "vfmadb %%v5,%%v22,%%v2,%%v5\n\t"
          "vfmadb %%v6,%%v26,%%v2,%%v6\n\t"
          "vfmadb %%v7,%%v30,%%v2,%%v7\n\t"
          "vfmadb %%v4,%%v19,%%v3,%%v4\n\t"
          "vfmadb %%v5,%%v23,%%v3,%%v5\n\t"
          "vfmadb %%v6,%%v27,%%v3,%%v6\n\t"
          "vfmadb %%v7,%%v31,%%v3,%%v7\n\t"
          "vst %%v4,0(%%r1,%[y])\n\t"
          "vst %%v5,16(%%r1,%[y])\n\t"
          "vst %%v6,32(%%r1,%[y])\n\t"
          "vst %%v7,48(%%r1,%[y])\n\t"
          "vl %%v16,64(%%r1,%[ap0])\n\t"
          "vl %%v17,64(%%r1,%[ap1])\n\t"
          "vl %%v18,64(%%r1,%[ap2])\n\t"
          "vl %%v19,64(%%r1,%[ap3])\n\t"
          "vl %%v20,80(%%r1,%[ap0])\n\t"
          "vl %%v21,80(%%r1,%[ap1])\n\t"
          "vl %%v22,80(%%r1,%[ap2])\n\t"
          "vl %%v23,80(%%r1,%[ap3])\n\t"
          "vl %%v24,96(%%r1,%[ap0])\n\t"
          "vl %%v25,96(%%r1,%[ap1])\n\t"
          "vl %%v26,96(%%r1,%[ap2])\n\t"
          "vl %%v27,96(%%r1,%[ap3])\n\t"
          "vl %%v28,112(%%r1,%[ap0])\n\t"
          "vl %%v29,112(%%r1,%[ap1])\n\t"
          "vl %%v30,112(%%r1,%[ap2])\n\t"
          "vl %%v31,112(%%r1,%[ap3])\n\t"
          "vl %%v4,64(%%r1,%[y])\n\t"
          "vl %%v5,80(%%r1,%[y])\n\t"
          "vl %%v6,96(%%r1,%[y])\n\t"
          "vl %%v7,112(%%r1,%[y])\n\t"
          "vfmadb %%v4,%%v16,%%v0,%%v4\n\t"
          "vfmadb %%v5,%%v20,%%v0,%%v5\n\t"
          "vfmadb %%v6,%%v24,%%v0,%%v6\n\t"
          "vfmadb %%v7,%%v28,%%v0,%%v7\n\t"
          "vfmadb %%v4,%%v17,%%v1,%%v4\n\t"
          "vfmadb %%v5,%%v21,%%v1,%%v5\n\t"
          "vfmadb %%v6,%%v25,%%v1,%%v6\n\t"
          "vfmadb %%v7,%%v29,%%v1,%%v7\n\t"
          "vfmadb %%v4,%%v18,%%v2,%%v4\n\t"
          "vfmadb %%v5,%%v22,%%v2,%%v5\n\t"
          "vfmadb %%v6,%%v26,%%v2,%%v6\n\t"
          "vfmadb %%v7,%%v30,%%v2,%%v7\n\t"
          "vfmadb %%v4,%%v19,%%v3,%%v4\n\t"
          "vfmadb %%v5,%%v23,%%v3,%%v5\n\t"
          "vfmadb %%v6,%%v27,%%v3,%%v6\n\t"
          "vfmadb %%v7,%%v31,%%v3,%%v7\n\t"
          "vst %%v4,64(%%r1,%[y])\n\t"
          "vst %%v5,80(%%r1,%[y])\n\t"
          "vst %%v6,96(%%r1,%[y])\n\t"
          "vst %%v7,112(%%r1,%[y])\n\t"
          "agfi %%r1,128\n\t"
          "brctg %%r0,0b\n\t"
          "1:\n\t"
          /* tail loop: r0 = (n & 12) / 4 passes, skipped when zero */
          "lghi %%r0,12\n\t"
          "ngr %%r0,%[n]\n\t"
          "ltgr %%r0,%%r0\n\t"
          "jz 3f\n\t"
          "srlg %%r0,%%r0,2\n\t"
          "2:\n\t"
          "vl %%v16,0(%%r1,%[ap0])\n\t"
          "vl %%v17,0(%%r1,%[ap1])\n\t"
          "vl %%v18,0(%%r1,%[ap2])\n\t"
          "vl %%v19,0(%%r1,%[ap3])\n\t"
          "vl %%v20,16(%%r1,%[ap0])\n\t"
          "vl %%v21,16(%%r1,%[ap1])\n\t"
          "vl %%v22,16(%%r1,%[ap2])\n\t"
          "vl %%v23,16(%%r1,%[ap3])\n\t"
          "vl %%v4,0(%%r1,%[y])\n\t"
          "vl %%v5,16(%%r1,%[y])\n\t"
          "vfmadb %%v4,%%v16,%%v0,%%v4\n\t"
          "vfmadb %%v5,%%v20,%%v0,%%v5\n\t"
          "vfmadb %%v4,%%v17,%%v1,%%v4\n\t"
          "vfmadb %%v5,%%v21,%%v1,%%v5\n\t"
          "vfmadb %%v4,%%v18,%%v2,%%v4\n\t"
          "vfmadb %%v5,%%v22,%%v2,%%v5\n\t"
          "vfmadb %%v4,%%v19,%%v3,%%v4\n\t"
          "vfmadb %%v5,%%v23,%%v3,%%v5\n\t"
          "vst %%v4,0(%%r1,%[y])\n\t"
          "vst %%v5,16(%%r1,%[y])\n\t"
          "agfi %%r1,32\n\t"
          "brctg %%r0,2b\n\t"
          "3:\n\t"
          "nop"
          : "+m"(*(struct { FLOAT x[n]; } *) y)
          : [y] "a"(y), "m"(*(const struct { FLOAT x[n]; } *) ap0),[ap0] "a"(ap0),
            "m"(*(const struct { FLOAT x[n]; } *) ap1),[ap1] "a"(ap1),
            "m"(*(const struct { FLOAT x[n]; } *) ap2),[ap2] "a"(ap2),
            "m"(*(const struct { FLOAT x[n]; } *) ap3),[ap3] "a"(ap3),
            "m"(*(const struct { FLOAT x[4]; } *) x),[x] "a"(x),[alpha] "Q"(*alpha),
            [n] "r"(n)
          : "cc", "r0", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
            "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25",
            "v26", "v27", "v28", "v29", "v30", "v31");
}
/*
 * dgemv_kernel_4x2: y[i] += alpha * (x[0]*ap[0][i] + x[1]*ap[1][i])
 *
 * n     - number of elements of y (and of each ap[j]) to update.
 * ap    - array of two pointers to the input vectors.
 * x     - two scalars, one per ap[j].
 * y     - vector updated in place.
 * alpha - pointer to the scaling factor.
 *
 * x[0] and x[1] are replicated into v0/v1 and pre-multiplied by alpha. The
 * main loop handles 16 elements per pass for the (n & -16) part; a second
 * loop handles the (n & 12) part four elements at a time. The remaining
 * n & 3 elements are not touched by this kernel.
 *
 * v8 and v9 are used as extra y accumulators and are therefore listed in
 * the clobbers alongside the other vector registers.
 */
static void dgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y,
                             FLOAT *alpha) {
  register FLOAT *ap0 = ap[0];
  register FLOAT *ap1 = ap[1];

  __asm__("vlrepg %%v0,0(%[x])\n\t"
          "vlrepg %%v1,8(%[x])\n\t"
          "vlrepg %%v2,%[alpha]\n\t"
          "vfmdb %%v0,%%v0,%%v2\n\t"
          "vfmdb %%v1,%%v1,%%v2\n\t"
          "xgr %%r1,%%r1\n\t" /* r1 = running byte offset */
          /* main loop: r0 = (n & -16) / 16 passes, skipped when zero */
          "lghi %%r0,-16\n\t"
          "ngr %%r0,%[n]\n\t"
          "ltgr %%r0,%%r0\n\t"
          "jz 1f\n\t"
          "srlg %%r0,%%r0,4\n\t"
          "0:\n\t"
          "pfd 1,1024(%%r1,%[ap0])\n\t"
          "pfd 1,1024(%%r1,%[ap1])\n\t"
          "pfd 2,1024(%%r1,%[y])\n\t"
          "vl %%v16,0(%%r1,%[ap0])\n\t"
          "vl %%v17,0(%%r1,%[ap1])\n\t"
          "vl %%v18,16(%%r1,%[ap0])\n\t"
          "vl %%v19,16(%%r1,%[ap1])\n\t"
          "vl %%v20,32(%%r1,%[ap0])\n\t"
          "vl %%v21,32(%%r1,%[ap1])\n\t"
          "vl %%v22,48(%%r1,%[ap0])\n\t"
          "vl %%v23,48(%%r1,%[ap1])\n\t"
          "vl %%v24,64(%%r1,%[ap0])\n\t"
          "vl %%v25,64(%%r1,%[ap1])\n\t"
          "vl %%v26,80(%%r1,%[ap0])\n\t"
          "vl %%v27,80(%%r1,%[ap1])\n\t"
          "vl %%v28,96(%%r1,%[ap0])\n\t"
          "vl %%v29,96(%%r1,%[ap1])\n\t"
          "vl %%v30,112(%%r1,%[ap0])\n\t"
          "vl %%v31,112(%%r1,%[ap1])\n\t"
          "vl %%v2,0(%%r1,%[y])\n\t"
          "vl %%v3,16(%%r1,%[y])\n\t"
          "vl %%v4,32(%%r1,%[y])\n\t"
          "vl %%v5,48(%%r1,%[y])\n\t"
          "vl %%v6,64(%%r1,%[y])\n\t"
          "vl %%v7,80(%%r1,%[y])\n\t"
          "vl %%v8,96(%%r1,%[y])\n\t"
          "vl %%v9,112(%%r1,%[y])\n\t"
          "vfmadb %%v2,%%v16,%%v0,%%v2\n\t"
          "vfmadb %%v3,%%v18,%%v0,%%v3\n\t"
          "vfmadb %%v4,%%v20,%%v0,%%v4\n\t"
          "vfmadb %%v5,%%v22,%%v0,%%v5\n\t"
          "vfmadb %%v6,%%v24,%%v0,%%v6\n\t"
          "vfmadb %%v7,%%v26,%%v0,%%v7\n\t"
          "vfmadb %%v8,%%v28,%%v0,%%v8\n\t"
          "vfmadb %%v9,%%v30,%%v0,%%v9\n\t"
          "vfmadb %%v2,%%v17,%%v1,%%v2\n\t"
          "vfmadb %%v3,%%v19,%%v1,%%v3\n\t"
          "vfmadb %%v4,%%v21,%%v1,%%v4\n\t"
          "vfmadb %%v5,%%v23,%%v1,%%v5\n\t"
          "vfmadb %%v6,%%v25,%%v1,%%v6\n\t"
          "vfmadb %%v7,%%v27,%%v1,%%v7\n\t"
          "vfmadb %%v8,%%v29,%%v1,%%v8\n\t"
          "vfmadb %%v9,%%v31,%%v1,%%v9\n\t"
          "vst %%v2,0(%%r1,%[y])\n\t"
          "vst %%v3,16(%%r1,%[y])\n\t"
          "vst %%v4,32(%%r1,%[y])\n\t"
          "vst %%v5,48(%%r1,%[y])\n\t"
          "vst %%v6,64(%%r1,%[y])\n\t"
          "vst %%v7,80(%%r1,%[y])\n\t"
          "vst %%v8,96(%%r1,%[y])\n\t"
          "vst %%v9,112(%%r1,%[y])\n\t"
          "agfi %%r1,128\n\t"
          "brctg %%r0,0b\n\t"
          "1:\n\t"
          /* tail loop: r0 = (n & 12) / 4 passes, skipped when zero */
          "lghi %%r0,12\n\t"
          "ngr %%r0,%[n]\n\t"
          "ltgr %%r0,%%r0\n\t"
          "jz 3f\n\t"
          "srlg %%r0,%%r0,2\n\t"
          "2:\n\t"
          "vl %%v16,0(%%r1,%[ap0])\n\t"
          "vl %%v17,0(%%r1,%[ap1])\n\t"
          "vl %%v18,16(%%r1,%[ap0])\n\t"
          "vl %%v19,16(%%r1,%[ap1])\n\t"
          "vl %%v2,0(%%r1,%[y])\n\t"
          "vl %%v3,16(%%r1,%[y])\n\t"
          "vfmadb %%v2,%%v16,%%v0,%%v2\n\t"
          "vfmadb %%v3,%%v18,%%v0,%%v3\n\t"
          "vfmadb %%v2,%%v17,%%v1,%%v2\n\t"
          "vfmadb %%v3,%%v19,%%v1,%%v3\n\t"
          "vst %%v2,0(%%r1,%[y])\n\t"
          "vst %%v3,16(%%r1,%[y])\n\t"
          "agfi %%r1,32\n\t"
          "brctg %%r0,2b\n\t"
          "3:\n\t"
          "nop"
          : "+m"(*(struct { FLOAT x[n]; } *) y)
          : [y] "a"(y), "m"(*(const struct { FLOAT x[n]; } *) ap0),[ap0] "a"(ap0),
            "m"(*(const struct { FLOAT x[n]; } *) ap1),[ap1] "a"(ap1),
            "m"(*(const struct { FLOAT x[2]; } *) x),[x] "a"(x),[alpha] "Q"(*alpha),
            [n] "r"(n)
          : "cc", "r0", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
            "v8", "v9", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23",
            "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31");
}
/*
 * dgemv_kernel_4x1: y[i] += alpha * x[0] * a0[i]
 *
 * n     - number of elements of y (and of a0) to update.
 * a0    - input vector.
 * x     - pointer to the single scalar multiplier.
 * y     - vector updated in place.
 * alpha - pointer to the scaling factor.
 *
 * x[0] is replicated into v0 and pre-multiplied by alpha. The main loop
 * handles 16 elements per pass for the (n & -16) part; a second loop
 * handles the (n & 12) part four elements at a time. The remaining n & 3
 * elements are not touched by this kernel.
 */
static void dgemv_kernel_4x1(BLASLONG n, FLOAT *a0, FLOAT *x, FLOAT *y,
                             FLOAT *alpha) {
  __asm__("vlrepg %%v0,0(%[x])\n\t"
          "vlrepg %%v16,%[alpha]\n\t"
          "vfmdb %%v0,%%v0,%%v16\n\t"
          "xgr %%r1,%%r1\n\t" /* r1 = running byte offset */
          /* main loop: r0 = (n & -16) / 16 passes, skipped when zero */
          "lghi %%r0,-16\n\t"
          "ngr %%r0,%[n]\n\t"
          "ltgr %%r0,%%r0\n\t"
          "jz 1f\n\t"
          "srlg %%r0,%%r0,4\n\t"
          "0:\n\t"
          "pfd 1,1024(%%r1,%[a0])\n\t"
          "pfd 2,1024(%%r1,%[y])\n\t"
          "vl %%v16,0(%%r1,%[a0])\n\t"
          "vl %%v17,16(%%r1,%[a0])\n\t"
          "vl %%v18,32(%%r1,%[a0])\n\t"
          "vl %%v19,48(%%r1,%[a0])\n\t"
          "vl %%v20,64(%%r1,%[a0])\n\t"
          "vl %%v21,80(%%r1,%[a0])\n\t"
          "vl %%v22,96(%%r1,%[a0])\n\t"
          "vl %%v23,112(%%r1,%[a0])\n\t"
          "vl %%v24,0(%%r1,%[y])\n\t"
          "vl %%v25,16(%%r1,%[y])\n\t"
          "vl %%v26,32(%%r1,%[y])\n\t"
          "vl %%v27,48(%%r1,%[y])\n\t"
          "vl %%v28,64(%%r1,%[y])\n\t"
          "vl %%v29,80(%%r1,%[y])\n\t"
          "vl %%v30,96(%%r1,%[y])\n\t"
          "vl %%v31,112(%%r1,%[y])\n\t"
          "vfmadb %%v24,%%v16,%%v0,%%v24\n\t"
          "vfmadb %%v25,%%v17,%%v0,%%v25\n\t"
          "vfmadb %%v26,%%v18,%%v0,%%v26\n\t"
          "vfmadb %%v27,%%v19,%%v0,%%v27\n\t"
          "vfmadb %%v28,%%v20,%%v0,%%v28\n\t"
          "vfmadb %%v29,%%v21,%%v0,%%v29\n\t"
          "vfmadb %%v30,%%v22,%%v0,%%v30\n\t"
          "vfmadb %%v31,%%v23,%%v0,%%v31\n\t"
          "vst %%v24,0(%%r1,%[y])\n\t"
          "vst %%v25,16(%%r1,%[y])\n\t"
          "vst %%v26,32(%%r1,%[y])\n\t"
          "vst %%v27,48(%%r1,%[y])\n\t"
          "vst %%v28,64(%%r1,%[y])\n\t"
          "vst %%v29,80(%%r1,%[y])\n\t"
          "vst %%v30,96(%%r1,%[y])\n\t"
          "vst %%v31,112(%%r1,%[y])\n\t"
          "agfi %%r1,128\n\t"
          "brctg %%r0,0b\n\t"
          "1:\n\t"
          /* tail loop: r0 = (n & 12) / 4 passes, skipped when zero */
          "lghi %%r0,12\n\t"
          "ngr %%r0,%[n]\n\t"
          "ltgr %%r0,%%r0\n\t"
          "jz 3f\n\t"
          "srlg %%r0,%%r0,2\n\t"
          "2:\n\t"
          "vl %%v16,0(%%r1,%[a0])\n\t"
          "vl %%v17,16(%%r1,%[a0])\n\t"
          "vl %%v18,0(%%r1,%[y])\n\t"
          "vl %%v19,16(%%r1,%[y])\n\t"
          "vfmadb %%v18,%%v16,%%v0,%%v18\n\t"
          "vfmadb %%v19,%%v17,%%v0,%%v19\n\t"
          "vst %%v18,0(%%r1,%[y])\n\t"
          "vst %%v19,16(%%r1,%[y])\n\t"
          "agfi %%r1,32\n\t"
          "brctg %%r0,2b\n\t"
          "3:\n\t"
          "nop"
          : "+m"(*(struct { FLOAT x[n]; } *) y)
          : [y] "a"(y), "m"(*(const struct { FLOAT x[n]; } *) a0),[a0] "a"(a0),
            "m"(*(const FLOAT (*)[1]) x),[x] "a"(x),[alpha] "Q"(*alpha),
            [n] "r"(n)
          : "cc", "r0", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21",
            "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30",
            "v31");
}
static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest) {

View File

@ -30,333 +30,341 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define NBMAX 2048
/*
 * dgemv_kernel_4x4: four simultaneous dot products.
 *
 * For k = 0..3 the asm accumulates sum_i ap[k][i] * x[i] and stores the
 * result to y[k] (y is overwritten, not accumulated into: it is a pure
 * "=m" output written with std).
 *
 *   n   element count; only the parts selected by (n & -16) and (n & 12)
 *       are processed, i.e. nothing here handles n & 3 -- presumably
 *       callers always pass a multiple of 4 (TODO confirm).
 *   ap  array of four column pointers, read through ap[0]..ap[3]
 *   x   input vector
 *   y   receives the four sums at byte offsets 0, 8, 16, 24
 *
 * The 8-byte store stride and the "db" instruction forms suggest FLOAT is
 * double here (assumption; FLOAT is defined outside this view).
 *
 * NOTE(review): this listing holds the asm statement twice (see the note
 * between the two copies below). It looks like the before/after sides of a
 * diff -- confirm which copy is the intended one.
 */
static void dgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) {
/* Local copies of the column pointers; only the second operand list below
   refers to them (the first one uses ap[0]..ap[3] directly). */
register FLOAT *ap0 = ap[0];
register FLOAT *ap1 = ap[1];
register FLOAT *ap2 = ap[2];
register FLOAT *ap3 = ap[3];
/* v0..v7 are the accumulators: v0/v4 for ap0, v1/v5 for ap1, v2/v6 for ap2,
   v3/v7 for ap3. */
__asm__("vzero %%v0\n\t"
"vzero %%v1\n\t"
"vzero %%v2\n\t"
"vzero %%v3\n\t"
"vzero %%v4\n\t"
"vzero %%v5\n\t"
"vzero %%v6\n\t"
"vzero %%v7\n\t"
/* r1 = running byte offset into x and every column, starts at 0. */
"xgr %%r1,%%r1\n\t"
/* r0 = n & -16; if zero skip the unrolled loop, else r0 = n / 16 trips. */
"lghi %%r0,-16\n\t"
"ngr %%r0,%[n]\n\t"
"ltgr %%r0,%%r0\n\t"
"jz 1f\n\t"
"srlg %%r0,%%r0,4\n\t"
/* Main loop: 128 bytes of x and of each column per trip. */
"0:\n\t"
"pfd 1,1024(%%r1,%[ap0])\n\t"
"pfd 1,1024(%%r1,%[ap1])\n\t"
"pfd 1,1024(%%r1,%[ap2])\n\t"
"pfd 1,1024(%%r1,%[ap3])\n\t"
"pfd 1,1024(%%r1,%[x])\n\t"
/* v16..v23 = the eight 16-byte chunks of x for this trip. */
"vl %%v16,0(%%r1,%[x])\n\t"
"vl %%v17,16(%%r1,%[x])\n\t"
"vl %%v18,32(%%r1,%[x])\n\t"
"vl %%v19,48(%%r1,%[x])\n\t"
"vl %%v20,64(%%r1,%[x])\n\t"
"vl %%v21,80(%%r1,%[x])\n\t"
"vl %%v22,96(%%r1,%[x])\n\t"
"vl %%v23,112(%%r1,%[x])\n\t"
/* Each column chunk is loaded and immediately multiplied-and-added into
   its accumulator; even chunks go to v0..v3, odd chunks to v4..v7. */
"vl %%v24,0(%%r1,%[ap0])\n\t"
"vfmadb %%v0,%%v16,%%v24,%%v0\n\t"
"vl %%v25,0(%%r1,%[ap1])\n\t"
"vfmadb %%v1,%%v16,%%v25,%%v1\n\t"
"vl %%v26,0(%%r1,%[ap2])\n\t"
"vfmadb %%v2,%%v16,%%v26,%%v2\n\t"
"vl %%v27,0(%%r1,%[ap3])\n\t"
"vfmadb %%v3,%%v16,%%v27,%%v3\n\t"
"vl %%v28,16(%%r1,%[ap0])\n\t"
"vfmadb %%v4,%%v17,%%v28,%%v4\n\t"
"vl %%v29,16(%%r1,%[ap1])\n\t"
"vfmadb %%v5,%%v17,%%v29,%%v5\n\t"
"vl %%v30,16(%%r1,%[ap2])\n\t"
"vfmadb %%v6,%%v17,%%v30,%%v6\n\t"
"vl %%v31,16(%%r1,%[ap3])\n\t"
"vfmadb %%v7,%%v17,%%v31,%%v7\n\t"
"vl %%v24,32(%%r1,%[ap0])\n\t"
"vfmadb %%v0,%%v18,%%v24,%%v0\n\t"
"vl %%v25,32(%%r1,%[ap1])\n\t"
"vfmadb %%v1,%%v18,%%v25,%%v1\n\t"
"vl %%v26,32(%%r1,%[ap2])\n\t"
"vfmadb %%v2,%%v18,%%v26,%%v2\n\t"
"vl %%v27,32(%%r1,%[ap3])\n\t"
"vfmadb %%v3,%%v18,%%v27,%%v3\n\t"
"vl %%v28,48(%%r1,%[ap0])\n\t"
"vfmadb %%v4,%%v19,%%v28,%%v4\n\t"
"vl %%v29,48(%%r1,%[ap1])\n\t"
"vfmadb %%v5,%%v19,%%v29,%%v5\n\t"
"vl %%v30,48(%%r1,%[ap2])\n\t"
"vfmadb %%v6,%%v19,%%v30,%%v6\n\t"
"vl %%v31,48(%%r1,%[ap3])\n\t"
"vfmadb %%v7,%%v19,%%v31,%%v7\n\t"
"vl %%v24,64(%%r1,%[ap0])\n\t"
"vfmadb %%v0,%%v20,%%v24,%%v0\n\t"
"vl %%v25,64(%%r1,%[ap1])\n\t"
"vfmadb %%v1,%%v20,%%v25,%%v1\n\t"
"vl %%v26,64(%%r1,%[ap2])\n\t"
"vfmadb %%v2,%%v20,%%v26,%%v2\n\t"
"vl %%v27,64(%%r1,%[ap3])\n\t"
"vfmadb %%v3,%%v20,%%v27,%%v3\n\t"
"vl %%v28,80(%%r1,%[ap0])\n\t"
"vfmadb %%v4,%%v21,%%v28,%%v4\n\t"
"vl %%v29,80(%%r1,%[ap1])\n\t"
"vfmadb %%v5,%%v21,%%v29,%%v5\n\t"
"vl %%v30,80(%%r1,%[ap2])\n\t"
"vfmadb %%v6,%%v21,%%v30,%%v6\n\t"
"vl %%v31,80(%%r1,%[ap3])\n\t"
"vfmadb %%v7,%%v21,%%v31,%%v7\n\t"
"vl %%v24,96(%%r1,%[ap0])\n\t"
"vfmadb %%v0,%%v22,%%v24,%%v0\n\t"
"vl %%v25,96(%%r1,%[ap1])\n\t"
"vfmadb %%v1,%%v22,%%v25,%%v1\n\t"
"vl %%v26,96(%%r1,%[ap2])\n\t"
"vfmadb %%v2,%%v22,%%v26,%%v2\n\t"
"vl %%v27,96(%%r1,%[ap3])\n\t"
"vfmadb %%v3,%%v22,%%v27,%%v3\n\t"
"vl %%v28,112(%%r1,%[ap0])\n\t"
"vfmadb %%v4,%%v23,%%v28,%%v4\n\t"
"vl %%v29,112(%%r1,%[ap1])\n\t"
"vfmadb %%v5,%%v23,%%v29,%%v5\n\t"
"vl %%v30,112(%%r1,%[ap2])\n\t"
"vfmadb %%v6,%%v23,%%v30,%%v6\n\t"
"vl %%v31,112(%%r1,%[ap3])\n\t"
"vfmadb %%v7,%%v23,%%v31,%%v7\n\t"
"agfi %%r1,128\n\t"
"brctg %%r0,0b\n\t"
/* Tail: r0 = n & 12; if non-zero run (n & 12) / 4 trips of 32 bytes. */
"1:\n\t"
"lghi %%r0,12\n\t"
"ngr %%r0,%[n]\n\t"
"ltgr %%r0,%%r0\n\t"
"jz 3f\n\t"
"srlg %%r0,%%r0,2\n\t"
"2:\n\t"
"vl %%v16,0(%%r1,%[x])\n\t"
"vl %%v17,16(%%r1,%[x])\n\t"
"vl %%v24,0(%%r1,%[ap0])\n\t"
"vfmadb %%v0,%%v16,%%v24,%%v0\n\t"
"vl %%v25,0(%%r1,%[ap1])\n\t"
"vfmadb %%v1,%%v16,%%v25,%%v1\n\t"
"vl %%v26,0(%%r1,%[ap2])\n\t"
"vfmadb %%v2,%%v16,%%v26,%%v2\n\t"
"vl %%v27,0(%%r1,%[ap3])\n\t"
"vfmadb %%v3,%%v16,%%v27,%%v3\n\t"
"vl %%v28,16(%%r1,%[ap0])\n\t"
"vfmadb %%v4,%%v17,%%v28,%%v4\n\t"
"vl %%v29,16(%%r1,%[ap1])\n\t"
"vfmadb %%v5,%%v17,%%v29,%%v5\n\t"
"vl %%v30,16(%%r1,%[ap2])\n\t"
"vfmadb %%v6,%%v17,%%v30,%%v6\n\t"
"vl %%v31,16(%%r1,%[ap3])\n\t"
"vfmadb %%v7,%%v17,%%v31,%%v7\n\t"
"agfi %%r1,32\n\t"
"brctg %%r0,2b\n\t"
/* Reduction: fold the odd-chunk accumulators into v0..v3, then add the
   two elements of each vector (element 1 is replicated into v4 and added
   through the scalar f-register view) and store one result per column. */
"3:\n\t"
"vfadb %%v0,%%v0,%%v4\n\t"
"vfadb %%v1,%%v1,%%v5\n\t"
"vfadb %%v2,%%v2,%%v6\n\t"
"vfadb %%v3,%%v3,%%v7\n\t"
"vrepg %%v4,%%v0,1\n\t"
"adbr %%f0,%%f4\n\t"
"std %%f0,0(%[y])\n\t"
"vrepg %%v4,%%v1,1\n\t"
"adbr %%f1,%%f4\n\t"
"std %%f1,8(%[y])\n\t"
"vrepg %%v4,%%v2,1\n\t"
"adbr %%f2,%%f4\n\t"
"std %%f2,16(%[y])\n\t"
"vrepg %%v4,%%v3,1\n\t"
"adbr %%f3,%%f4\n\t"
"std %%f3,24(%[y])"
/* Operands of the first copy: the unnamed "m" operands describe the memory
   the asm reads/writes as arrays of n (or 4) FLOATs; the named "a" operands
   are the base addresses used by the instructions. */
: "=m"(*(FLOAT (*)[4]) y)
: [y] "a"(y), "m"(*(const FLOAT (*)[n]) ap[0]),[ap0] "a"(ap[0]),
"m"(*(const FLOAT (*)[n]) ap[1]),[ap1] "a"(ap[1]),
"m"(*(const FLOAT (*)[n]) ap[2]),[ap2] "a"(ap[2]),
"m"(*(const FLOAT (*)[n]) ap[3]),[ap3] "a"(ap[3]),
"m"(*(const FLOAT (*)[n]) x),[x] "a"(x),[n] "r"(n)
: "cc", "r0", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
"v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25",
"v26", "v27", "v28", "v29", "v30", "v31");
/* NOTE(review): everything from here to the next ");" repeats the asm body
   above, minus its opening "vzero %%v0" line. The instruction strings are
   the same; only the operand list differs (memory operands are cast to
   struct { FLOAT x[...]; } instead of to array types, and the ap0..ap3
   locals replace ap[0]..ap[3]). As written these string literals do not
   belong to any statement -- confirm which copy should remain. */
"vzero %%v1\n\t"
"vzero %%v2\n\t"
"vzero %%v3\n\t"
"vzero %%v4\n\t"
"vzero %%v5\n\t"
"vzero %%v6\n\t"
"vzero %%v7\n\t"
"xgr %%r1,%%r1\n\t"
"lghi %%r0,-16\n\t"
"ngr %%r0,%[n]\n\t"
"ltgr %%r0,%%r0\n\t"
"jz 1f\n\t"
"srlg %%r0,%%r0,4\n\t"
"0:\n\t"
"pfd 1,1024(%%r1,%[ap0])\n\t"
"pfd 1,1024(%%r1,%[ap1])\n\t"
"pfd 1,1024(%%r1,%[ap2])\n\t"
"pfd 1,1024(%%r1,%[ap3])\n\t"
"pfd 1,1024(%%r1,%[x])\n\t"
"vl %%v16,0(%%r1,%[x])\n\t"
"vl %%v17,16(%%r1,%[x])\n\t"
"vl %%v18,32(%%r1,%[x])\n\t"
"vl %%v19,48(%%r1,%[x])\n\t"
"vl %%v20,64(%%r1,%[x])\n\t"
"vl %%v21,80(%%r1,%[x])\n\t"
"vl %%v22,96(%%r1,%[x])\n\t"
"vl %%v23,112(%%r1,%[x])\n\t"
"vl %%v24,0(%%r1,%[ap0])\n\t"
"vfmadb %%v0,%%v16,%%v24,%%v0\n\t"
"vl %%v25,0(%%r1,%[ap1])\n\t"
"vfmadb %%v1,%%v16,%%v25,%%v1\n\t"
"vl %%v26,0(%%r1,%[ap2])\n\t"
"vfmadb %%v2,%%v16,%%v26,%%v2\n\t"
"vl %%v27,0(%%r1,%[ap3])\n\t"
"vfmadb %%v3,%%v16,%%v27,%%v3\n\t"
"vl %%v28,16(%%r1,%[ap0])\n\t"
"vfmadb %%v4,%%v17,%%v28,%%v4\n\t"
"vl %%v29,16(%%r1,%[ap1])\n\t"
"vfmadb %%v5,%%v17,%%v29,%%v5\n\t"
"vl %%v30,16(%%r1,%[ap2])\n\t"
"vfmadb %%v6,%%v17,%%v30,%%v6\n\t"
"vl %%v31,16(%%r1,%[ap3])\n\t"
"vfmadb %%v7,%%v17,%%v31,%%v7\n\t"
"vl %%v24,32(%%r1,%[ap0])\n\t"
"vfmadb %%v0,%%v18,%%v24,%%v0\n\t"
"vl %%v25,32(%%r1,%[ap1])\n\t"
"vfmadb %%v1,%%v18,%%v25,%%v1\n\t"
"vl %%v26,32(%%r1,%[ap2])\n\t"
"vfmadb %%v2,%%v18,%%v26,%%v2\n\t"
"vl %%v27,32(%%r1,%[ap3])\n\t"
"vfmadb %%v3,%%v18,%%v27,%%v3\n\t"
"vl %%v28,48(%%r1,%[ap0])\n\t"
"vfmadb %%v4,%%v19,%%v28,%%v4\n\t"
"vl %%v29,48(%%r1,%[ap1])\n\t"
"vfmadb %%v5,%%v19,%%v29,%%v5\n\t"
"vl %%v30,48(%%r1,%[ap2])\n\t"
"vfmadb %%v6,%%v19,%%v30,%%v6\n\t"
"vl %%v31,48(%%r1,%[ap3])\n\t"
"vfmadb %%v7,%%v19,%%v31,%%v7\n\t"
"vl %%v24,64(%%r1,%[ap0])\n\t"
"vfmadb %%v0,%%v20,%%v24,%%v0\n\t"
"vl %%v25,64(%%r1,%[ap1])\n\t"
"vfmadb %%v1,%%v20,%%v25,%%v1\n\t"
"vl %%v26,64(%%r1,%[ap2])\n\t"
"vfmadb %%v2,%%v20,%%v26,%%v2\n\t"
"vl %%v27,64(%%r1,%[ap3])\n\t"
"vfmadb %%v3,%%v20,%%v27,%%v3\n\t"
"vl %%v28,80(%%r1,%[ap0])\n\t"
"vfmadb %%v4,%%v21,%%v28,%%v4\n\t"
"vl %%v29,80(%%r1,%[ap1])\n\t"
"vfmadb %%v5,%%v21,%%v29,%%v5\n\t"
"vl %%v30,80(%%r1,%[ap2])\n\t"
"vfmadb %%v6,%%v21,%%v30,%%v6\n\t"
"vl %%v31,80(%%r1,%[ap3])\n\t"
"vfmadb %%v7,%%v21,%%v31,%%v7\n\t"
"vl %%v24,96(%%r1,%[ap0])\n\t"
"vfmadb %%v0,%%v22,%%v24,%%v0\n\t"
"vl %%v25,96(%%r1,%[ap1])\n\t"
"vfmadb %%v1,%%v22,%%v25,%%v1\n\t"
"vl %%v26,96(%%r1,%[ap2])\n\t"
"vfmadb %%v2,%%v22,%%v26,%%v2\n\t"
"vl %%v27,96(%%r1,%[ap3])\n\t"
"vfmadb %%v3,%%v22,%%v27,%%v3\n\t"
"vl %%v28,112(%%r1,%[ap0])\n\t"
"vfmadb %%v4,%%v23,%%v28,%%v4\n\t"
"vl %%v29,112(%%r1,%[ap1])\n\t"
"vfmadb %%v5,%%v23,%%v29,%%v5\n\t"
"vl %%v30,112(%%r1,%[ap2])\n\t"
"vfmadb %%v6,%%v23,%%v30,%%v6\n\t"
"vl %%v31,112(%%r1,%[ap3])\n\t"
"vfmadb %%v7,%%v23,%%v31,%%v7\n\t"
"agfi %%r1,128\n\t"
"brctg %%r0,0b\n\t"
"1:\n\t"
"lghi %%r0,12\n\t"
"ngr %%r0,%[n]\n\t"
"ltgr %%r0,%%r0\n\t"
"jz 3f\n\t"
"srlg %%r0,%%r0,2\n\t"
"2:\n\t"
"vl %%v16,0(%%r1,%[x])\n\t"
"vl %%v17,16(%%r1,%[x])\n\t"
"vl %%v24,0(%%r1,%[ap0])\n\t"
"vfmadb %%v0,%%v16,%%v24,%%v0\n\t"
"vl %%v25,0(%%r1,%[ap1])\n\t"
"vfmadb %%v1,%%v16,%%v25,%%v1\n\t"
"vl %%v26,0(%%r1,%[ap2])\n\t"
"vfmadb %%v2,%%v16,%%v26,%%v2\n\t"
"vl %%v27,0(%%r1,%[ap3])\n\t"
"vfmadb %%v3,%%v16,%%v27,%%v3\n\t"
"vl %%v28,16(%%r1,%[ap0])\n\t"
"vfmadb %%v4,%%v17,%%v28,%%v4\n\t"
"vl %%v29,16(%%r1,%[ap1])\n\t"
"vfmadb %%v5,%%v17,%%v29,%%v5\n\t"
"vl %%v30,16(%%r1,%[ap2])\n\t"
"vfmadb %%v6,%%v17,%%v30,%%v6\n\t"
"vl %%v31,16(%%r1,%[ap3])\n\t"
"vfmadb %%v7,%%v17,%%v31,%%v7\n\t"
"agfi %%r1,32\n\t"
"brctg %%r0,2b\n\t"
"3:\n\t"
"vfadb %%v0,%%v0,%%v4\n\t"
"vfadb %%v1,%%v1,%%v5\n\t"
"vfadb %%v2,%%v2,%%v6\n\t"
"vfadb %%v3,%%v3,%%v7\n\t"
"vrepg %%v4,%%v0,1\n\t"
"adbr %%f0,%%f4\n\t"
"std %%f0,0(%[y])\n\t"
"vrepg %%v4,%%v1,1\n\t"
"adbr %%f1,%%f4\n\t"
"std %%f1,8(%[y])\n\t"
"vrepg %%v4,%%v2,1\n\t"
"adbr %%f2,%%f4\n\t"
"std %%f2,16(%[y])\n\t"
"vrepg %%v4,%%v3,1\n\t"
"adbr %%f3,%%f4\n\t"
"std %%f3,24(%[y])"
/* Operands of the second copy: same roles as above, with the memory
   operands expressed as struct-wrapped arrays and the column bases taken
   from the ap0..ap3 locals. */
: "=m"(*(struct { FLOAT x[4]; } *) y)
: [y] "a"(y), "m"(*(const struct { FLOAT x[n]; } *) ap0),[ap0] "a"(ap0),
"m"(*(const struct { FLOAT x[n]; } *) ap1),[ap1] "a"(ap1),
"m"(*(const struct { FLOAT x[n]; } *) ap2),[ap2] "a"(ap2),
"m"(*(const struct { FLOAT x[n]; } *) ap3),[ap3] "a"(ap3),
"m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x),[n] "r"(n)
: "cc", "r0", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
"v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25",
"v26", "v27", "v28", "v29", "v30", "v31");
}
/*
 * dgemv_kernel_4x2: two simultaneous dot products.
 *
 * Accumulates sum_i ap[0][i] * x[i] and sum_i ap[1][i] * x[i] and stores
 * them to y[0] and y[1] (y is a pure "=m" output, written with std at byte
 * offsets 0 and 8).
 *
 *   n   element count; only the parts selected by (n & -16) and (n & 12)
 *       are processed -- nothing here handles n & 3, presumably callers
 *       pass a multiple of 4 (TODO confirm).
 *   ap  array of two column pointers
 *   x   input vector
 *   y   receives the two sums
 *
 * NOTE(review): this listing holds the asm statement twice (see the note
 * between the two copies below); confirm which copy is intended.
 */
static void dgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) {
/* Local copies of the column pointers; only the second operand list below
   refers to them. */
register FLOAT *ap0 = ap[0];
register FLOAT *ap1 = ap[1];
/* Accumulators: v0, v2, v4, v6 collect ap0 products; v1, v3, v5, v7
   collect ap1 products. */
__asm__("vzero %%v0\n\t"
"vzero %%v1\n\t"
"vzero %%v2\n\t"
"vzero %%v3\n\t"
"vzero %%v4\n\t"
"vzero %%v5\n\t"
"vzero %%v6\n\t"
"vzero %%v7\n\t"
/* r1 = running byte offset, starts at 0. */
"xgr %%r1,%%r1\n\t"
/* r0 = n & -16; if zero skip the unrolled loop, else r0 = n / 16 trips. */
"lghi %%r0,-16\n\t"
"ngr %%r0,%[n]\n\t"
"ltgr %%r0,%%r0\n\t"
"jz 1f\n\t"
"srlg %%r0,%%r0,4\n\t"
/* Main loop: 128 bytes of x and of each column per trip. */
"0:\n\t"
"pfd 1,1024(%%r1,%[ap0])\n\t"
"pfd 1,1024(%%r1,%[ap1])\n\t"
"pfd 1,1024(%%r1,%[x])\n\t"
"vl %%v16,0(%%r1,%[x])\n\t"
"vl %%v17,16(%%r1,%[x])\n\t"
"vl %%v18,32(%%r1,%[x])\n\t"
"vl %%v19,48(%%r1,%[x])\n\t"
"vl %%v20,64(%%r1,%[x])\n\t"
"vl %%v21,80(%%r1,%[x])\n\t"
"vl %%v22,96(%%r1,%[x])\n\t"
"vl %%v23,112(%%r1,%[x])\n\t"
"vl %%v24,0(%%r1,%[ap0])\n\t"
"vfmadb %%v0,%%v16,%%v24,%%v0\n\t"
"vl %%v25,0(%%r1,%[ap1])\n\t"
"vfmadb %%v1,%%v16,%%v25,%%v1\n\t"
"vl %%v26,16(%%r1,%[ap0])\n\t"
"vfmadb %%v2,%%v17,%%v26,%%v2\n\t"
"vl %%v27,16(%%r1,%[ap1])\n\t"
"vfmadb %%v3,%%v17,%%v27,%%v3\n\t"
"vl %%v28,32(%%r1,%[ap0])\n\t"
"vfmadb %%v4,%%v18,%%v28,%%v4\n\t"
"vl %%v29,32(%%r1,%[ap1])\n\t"
"vfmadb %%v5,%%v18,%%v29,%%v5\n\t"
"vl %%v30,48(%%r1,%[ap0])\n\t"
"vfmadb %%v6,%%v19,%%v30,%%v6\n\t"
"vl %%v31,48(%%r1,%[ap1])\n\t"
"vfmadb %%v7,%%v19,%%v31,%%v7\n\t"
"vl %%v24,64(%%r1,%[ap0])\n\t"
"vfmadb %%v0,%%v20,%%v24,%%v0\n\t"
"vl %%v25,64(%%r1,%[ap1])\n\t"
"vfmadb %%v1,%%v20,%%v25,%%v1\n\t"
"vl %%v26,80(%%r1,%[ap0])\n\t"
"vfmadb %%v2,%%v21,%%v26,%%v2\n\t"
"vl %%v27,80(%%r1,%[ap1])\n\t"
"vfmadb %%v3,%%v21,%%v27,%%v3\n\t"
"vl %%v28,96(%%r1,%[ap0])\n\t"
"vfmadb %%v4,%%v22,%%v28,%%v4\n\t"
"vl %%v29,96(%%r1,%[ap1])\n\t"
"vfmadb %%v5,%%v22,%%v29,%%v5\n\t"
"vl %%v30,112(%%r1,%[ap0])\n\t"
"vfmadb %%v6,%%v23,%%v30,%%v6\n\t"
"vl %%v31,112(%%r1,%[ap1])\n\t"
"vfmadb %%v7,%%v23,%%v31,%%v7\n\t"
"agfi %%r1,128\n\t"
"brctg %%r0,0b\n\t"
/* Tail: r0 = n & 12; if non-zero run (n & 12) / 4 trips of 32 bytes,
   using only accumulators v0..v3. */
"1:\n\t"
"lghi %%r0,12\n\t"
"ngr %%r0,%[n]\n\t"
"ltgr %%r0,%%r0\n\t"
"jz 3f\n\t"
"srlg %%r0,%%r0,2\n\t"
"2:\n\t"
"vl %%v16,0(%%r1,%[x])\n\t"
"vl %%v17,16(%%r1,%[x])\n\t"
"vl %%v24,0(%%r1,%[ap0])\n\t"
"vfmadb %%v0,%%v16,%%v24,%%v0\n\t"
"vl %%v25,0(%%r1,%[ap1])\n\t"
"vfmadb %%v1,%%v16,%%v25,%%v1\n\t"
"vl %%v26,16(%%r1,%[ap0])\n\t"
"vfmadb %%v2,%%v17,%%v26,%%v2\n\t"
"vl %%v27,16(%%r1,%[ap1])\n\t"
"vfmadb %%v3,%%v17,%%v27,%%v3\n\t"
"agfi %%r1,32\n\t"
"brctg %%r0,2b\n\t"
/* Reduction: fold the even accumulators into v0 and the odd ones into v1,
   then add the two elements of each vector and store one result each. */
"3:\n\t"
"vfadb %%v0,%%v0,%%v2\n\t"
"vfadb %%v0,%%v0,%%v4\n\t"
"vfadb %%v0,%%v0,%%v6\n\t"
"vfadb %%v1,%%v1,%%v3\n\t"
"vfadb %%v1,%%v1,%%v5\n\t"
"vfadb %%v1,%%v1,%%v7\n\t"
"vrepg %%v2,%%v0,1\n\t"
"adbr %%f0,%%f2\n\t"
"std %%f0,0(%[y])\n\t"
"vrepg %%v2,%%v1,1\n\t"
"adbr %%f1,%%f2\n\t"
"std %%f1,8(%[y])"
/* Operands of the first copy: unnamed "m" operands describe the memory
   touched; named "a" operands are the base addresses. */
: "=m"(*(FLOAT (*)[2]) y)
: [y] "a"(y), "m"(*(const FLOAT (*)[n]) ap[0]),[ap0] "a"(ap[0]),
"m"(*(const FLOAT (*)[n]) ap[1]),[ap1] "a"(ap[1]),
"m"(*(const FLOAT (*)[n]) x),[x] "a"(x),[n] "r"(n)
: "cc", "r0", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
"v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25",
"v26", "v27", "v28", "v29", "v30", "v31");
/* NOTE(review): everything from here to the next ");" repeats the asm body
   above, minus its opening "vzero %%v0" line; only the operand list
   differs (struct-wrapped memory operands, ap0/ap1 locals). As written
   these string literals do not belong to any statement -- confirm which
   copy should remain. */
"vzero %%v1\n\t"
"vzero %%v2\n\t"
"vzero %%v3\n\t"
"vzero %%v4\n\t"
"vzero %%v5\n\t"
"vzero %%v6\n\t"
"vzero %%v7\n\t"
"xgr %%r1,%%r1\n\t"
"lghi %%r0,-16\n\t"
"ngr %%r0,%[n]\n\t"
"ltgr %%r0,%%r0\n\t"
"jz 1f\n\t"
"srlg %%r0,%%r0,4\n\t"
"0:\n\t"
"pfd 1,1024(%%r1,%[ap0])\n\t"
"pfd 1,1024(%%r1,%[ap1])\n\t"
"pfd 1,1024(%%r1,%[x])\n\t"
"vl %%v16,0(%%r1,%[x])\n\t"
"vl %%v17,16(%%r1,%[x])\n\t"
"vl %%v18,32(%%r1,%[x])\n\t"
"vl %%v19,48(%%r1,%[x])\n\t"
"vl %%v20,64(%%r1,%[x])\n\t"
"vl %%v21,80(%%r1,%[x])\n\t"
"vl %%v22,96(%%r1,%[x])\n\t"
"vl %%v23,112(%%r1,%[x])\n\t"
"vl %%v24,0(%%r1,%[ap0])\n\t"
"vfmadb %%v0,%%v16,%%v24,%%v0\n\t"
"vl %%v25,0(%%r1,%[ap1])\n\t"
"vfmadb %%v1,%%v16,%%v25,%%v1\n\t"
"vl %%v26,16(%%r1,%[ap0])\n\t"
"vfmadb %%v2,%%v17,%%v26,%%v2\n\t"
"vl %%v27,16(%%r1,%[ap1])\n\t"
"vfmadb %%v3,%%v17,%%v27,%%v3\n\t"
"vl %%v28,32(%%r1,%[ap0])\n\t"
"vfmadb %%v4,%%v18,%%v28,%%v4\n\t"
"vl %%v29,32(%%r1,%[ap1])\n\t"
"vfmadb %%v5,%%v18,%%v29,%%v5\n\t"
"vl %%v30,48(%%r1,%[ap0])\n\t"
"vfmadb %%v6,%%v19,%%v30,%%v6\n\t"
"vl %%v31,48(%%r1,%[ap1])\n\t"
"vfmadb %%v7,%%v19,%%v31,%%v7\n\t"
"vl %%v24,64(%%r1,%[ap0])\n\t"
"vfmadb %%v0,%%v20,%%v24,%%v0\n\t"
"vl %%v25,64(%%r1,%[ap1])\n\t"
"vfmadb %%v1,%%v20,%%v25,%%v1\n\t"
"vl %%v26,80(%%r1,%[ap0])\n\t"
"vfmadb %%v2,%%v21,%%v26,%%v2\n\t"
"vl %%v27,80(%%r1,%[ap1])\n\t"
"vfmadb %%v3,%%v21,%%v27,%%v3\n\t"
"vl %%v28,96(%%r1,%[ap0])\n\t"
"vfmadb %%v4,%%v22,%%v28,%%v4\n\t"
"vl %%v29,96(%%r1,%[ap1])\n\t"
"vfmadb %%v5,%%v22,%%v29,%%v5\n\t"
"vl %%v30,112(%%r1,%[ap0])\n\t"
"vfmadb %%v6,%%v23,%%v30,%%v6\n\t"
"vl %%v31,112(%%r1,%[ap1])\n\t"
"vfmadb %%v7,%%v23,%%v31,%%v7\n\t"
"agfi %%r1,128\n\t"
"brctg %%r0,0b\n\t"
"1:\n\t"
"lghi %%r0,12\n\t"
"ngr %%r0,%[n]\n\t"
"ltgr %%r0,%%r0\n\t"
"jz 3f\n\t"
"srlg %%r0,%%r0,2\n\t"
"2:\n\t"
"vl %%v16,0(%%r1,%[x])\n\t"
"vl %%v17,16(%%r1,%[x])\n\t"
"vl %%v24,0(%%r1,%[ap0])\n\t"
"vfmadb %%v0,%%v16,%%v24,%%v0\n\t"
"vl %%v25,0(%%r1,%[ap1])\n\t"
"vfmadb %%v1,%%v16,%%v25,%%v1\n\t"
"vl %%v26,16(%%r1,%[ap0])\n\t"
"vfmadb %%v2,%%v17,%%v26,%%v2\n\t"
"vl %%v27,16(%%r1,%[ap1])\n\t"
"vfmadb %%v3,%%v17,%%v27,%%v3\n\t"
"agfi %%r1,32\n\t"
"brctg %%r0,2b\n\t"
"3:\n\t"
"vfadb %%v0,%%v0,%%v2\n\t"
"vfadb %%v0,%%v0,%%v4\n\t"
"vfadb %%v0,%%v0,%%v6\n\t"
"vfadb %%v1,%%v1,%%v3\n\t"
"vfadb %%v1,%%v1,%%v5\n\t"
"vfadb %%v1,%%v1,%%v7\n\t"
"vrepg %%v2,%%v0,1\n\t"
"adbr %%f0,%%f2\n\t"
"std %%f0,0(%[y])\n\t"
"vrepg %%v2,%%v1,1\n\t"
"adbr %%f1,%%f2\n\t"
"std %%f1,8(%[y])"
/* Operands of the second copy: same roles as above, with struct-wrapped
   memory operands and the ap0/ap1 locals as column bases. */
: "=m"(*(struct { FLOAT x[2]; } *) y)
: [y] "a"(y), "m"(*(const struct { FLOAT x[n]; } *) ap0),[ap0] "a"(ap0),
"m"(*(const struct { FLOAT x[n]; } *) ap1),[ap1] "a"(ap1),
"m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x),[n] "r"(n)
: "cc", "r0", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
"v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25",
"v26", "v27", "v28", "v29", "v30", "v31");
}
/*
 * dgemv_kernel_4x1: single dot product.
 *
 * Accumulates sum_i a0[i] * x[i] and stores it to y[0] (y is a pure "=m"
 * output, written with std at byte offset 0).
 *
 *   n   element count; only the parts selected by (n & -16) and (n & 12)
 *       are processed -- nothing here handles n & 3, presumably callers
 *       pass a multiple of 4 (TODO confirm).
 *   a0  the column
 *   x   input vector
 *   y   receives the sum
 *
 * NOTE(review): this listing holds the asm statement twice (see the note
 * between the two copies below); confirm which copy is intended.
 */
static void dgemv_kernel_4x1(BLASLONG n, FLOAT *a0, FLOAT *x, FLOAT *y) {
/* Accumulators v0..v7: one per 16-byte chunk of the 128-byte main trip. */
__asm__("vzero %%v0\n\t"
"vzero %%v1\n\t"
"vzero %%v2\n\t"
"vzero %%v3\n\t"
"vzero %%v4\n\t"
"vzero %%v5\n\t"
"vzero %%v6\n\t"
"vzero %%v7\n\t"
/* r1 = running byte offset, starts at 0. */
"xgr %%r1,%%r1\n\t"
/* r0 = n & -16; if zero skip the unrolled loop, else r0 = n / 16 trips. */
"lghi %%r0,-16\n\t"
"ngr %%r0,%[n]\n\t"
"ltgr %%r0,%%r0\n\t"
"jz 1f\n\t"
"srlg %%r0,%%r0,4\n\t"
/* Main loop: 128 bytes of x and a0 per trip. */
"0:\n\t"
"pfd 1,1024(%%r1,%[a0])\n\t"
"pfd 1,1024(%%r1,%[x])\n\t"
"vl %%v16,0(%%r1,%[x])\n\t"
"vl %%v17,16(%%r1,%[x])\n\t"
"vl %%v18,32(%%r1,%[x])\n\t"
"vl %%v19,48(%%r1,%[x])\n\t"
"vl %%v20,64(%%r1,%[x])\n\t"
"vl %%v21,80(%%r1,%[x])\n\t"
"vl %%v22,96(%%r1,%[x])\n\t"
"vl %%v23,112(%%r1,%[x])\n\t"
"vl %%v24,0(%%r1,%[a0])\n\t"
"vfmadb %%v0,%%v16,%%v24,%%v0\n\t"
"vl %%v25,16(%%r1,%[a0])\n\t"
"vfmadb %%v1,%%v17,%%v25,%%v1\n\t"
"vl %%v26,32(%%r1,%[a0])\n\t"
"vfmadb %%v2,%%v18,%%v26,%%v2\n\t"
"vl %%v27,48(%%r1,%[a0])\n\t"
"vfmadb %%v3,%%v19,%%v27,%%v3\n\t"
"vl %%v28,64(%%r1,%[a0])\n\t"
"vfmadb %%v4,%%v20,%%v28,%%v4\n\t"
"vl %%v29,80(%%r1,%[a0])\n\t"
"vfmadb %%v5,%%v21,%%v29,%%v5\n\t"
"vl %%v30,96(%%r1,%[a0])\n\t"
"vfmadb %%v6,%%v22,%%v30,%%v6\n\t"
"vl %%v31,112(%%r1,%[a0])\n\t"
"vfmadb %%v7,%%v23,%%v31,%%v7\n\t"
"agfi %%r1,128\n\t"
"brctg %%r0,0b\n\t"
/* Tail: r0 = n & 12; if non-zero run (n & 12) / 4 trips of 32 bytes,
   using only accumulators v0 and v1. */
"1:\n\t"
"lghi %%r0,12\n\t"
"ngr %%r0,%[n]\n\t"
"ltgr %%r0,%%r0\n\t"
"jz 3f\n\t"
"srlg %%r0,%%r0,2\n\t"
"2:\n\t"
"vl %%v16,0(%%r1,%[x])\n\t"
"vl %%v17,16(%%r1,%[x])\n\t"
"vl %%v24,0(%%r1,%[a0])\n\t"
"vfmadb %%v0,%%v16,%%v24,%%v0\n\t"
"vl %%v25,16(%%r1,%[a0])\n\t"
"vfmadb %%v1,%%v17,%%v25,%%v1\n\t"
"agfi %%r1,32\n\t"
"brctg %%r0,2b\n\t"
/* Reduction: sum all eight accumulators into v0, add its two elements
   together and store the scalar. */
"3:\n\t"
"vfadb %%v0,%%v0,%%v1\n\t"
"vfadb %%v0,%%v0,%%v2\n\t"
"vfadb %%v0,%%v0,%%v3\n\t"
"vfadb %%v0,%%v0,%%v4\n\t"
"vfadb %%v0,%%v0,%%v5\n\t"
"vfadb %%v0,%%v0,%%v6\n\t"
"vfadb %%v0,%%v0,%%v7\n\t"
"vrepg %%v1,%%v0,1\n\t"
"adbr %%f0,%%f1\n\t"
"std %%f0,0(%[y])"
/* Operands of the first copy: unnamed "m" operands describe the memory
   touched; named "a" operands are the base addresses. */
: "=m"(*(FLOAT (*)[1]) y)
: [y] "a"(y), "m"(*(const FLOAT (*)[n]) a0),[a0] "a"(a0),
"m"(*(const FLOAT (*)[n]) x),[x] "a"(x),[n] "r"(n)
: "cc", "r0", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
"v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25",
"v26", "v27", "v28", "v29", "v30", "v31");
/* NOTE(review): everything from here to the next ");" repeats the asm body
   above, minus its opening "vzero %%v0" line; only the input operand list
   differs (struct-wrapped memory operands). As written these string
   literals do not belong to any statement -- confirm which copy should
   remain. */
"vzero %%v1\n\t"
"vzero %%v2\n\t"
"vzero %%v3\n\t"
"vzero %%v4\n\t"
"vzero %%v5\n\t"
"vzero %%v6\n\t"
"vzero %%v7\n\t"
"xgr %%r1,%%r1\n\t"
"lghi %%r0,-16\n\t"
"ngr %%r0,%[n]\n\t"
"ltgr %%r0,%%r0\n\t"
"jz 1f\n\t"
"srlg %%r0,%%r0,4\n\t"
"0:\n\t"
"pfd 1,1024(%%r1,%[a0])\n\t"
"pfd 1,1024(%%r1,%[x])\n\t"
"vl %%v16,0(%%r1,%[x])\n\t"
"vl %%v17,16(%%r1,%[x])\n\t"
"vl %%v18,32(%%r1,%[x])\n\t"
"vl %%v19,48(%%r1,%[x])\n\t"
"vl %%v20,64(%%r1,%[x])\n\t"
"vl %%v21,80(%%r1,%[x])\n\t"
"vl %%v22,96(%%r1,%[x])\n\t"
"vl %%v23,112(%%r1,%[x])\n\t"
"vl %%v24,0(%%r1,%[a0])\n\t"
"vfmadb %%v0,%%v16,%%v24,%%v0\n\t"
"vl %%v25,16(%%r1,%[a0])\n\t"
"vfmadb %%v1,%%v17,%%v25,%%v1\n\t"
"vl %%v26,32(%%r1,%[a0])\n\t"
"vfmadb %%v2,%%v18,%%v26,%%v2\n\t"
"vl %%v27,48(%%r1,%[a0])\n\t"
"vfmadb %%v3,%%v19,%%v27,%%v3\n\t"
"vl %%v28,64(%%r1,%[a0])\n\t"
"vfmadb %%v4,%%v20,%%v28,%%v4\n\t"
"vl %%v29,80(%%r1,%[a0])\n\t"
"vfmadb %%v5,%%v21,%%v29,%%v5\n\t"
"vl %%v30,96(%%r1,%[a0])\n\t"
"vfmadb %%v6,%%v22,%%v30,%%v6\n\t"
"vl %%v31,112(%%r1,%[a0])\n\t"
"vfmadb %%v7,%%v23,%%v31,%%v7\n\t"
"agfi %%r1,128\n\t"
"brctg %%r0,0b\n\t"
"1:\n\t"
"lghi %%r0,12\n\t"
"ngr %%r0,%[n]\n\t"
"ltgr %%r0,%%r0\n\t"
"jz 3f\n\t"
"srlg %%r0,%%r0,2\n\t"
"2:\n\t"
"vl %%v16,0(%%r1,%[x])\n\t"
"vl %%v17,16(%%r1,%[x])\n\t"
"vl %%v24,0(%%r1,%[a0])\n\t"
"vfmadb %%v0,%%v16,%%v24,%%v0\n\t"
"vl %%v25,16(%%r1,%[a0])\n\t"
"vfmadb %%v1,%%v17,%%v25,%%v1\n\t"
"agfi %%r1,32\n\t"
"brctg %%r0,2b\n\t"
"3:\n\t"
"vfadb %%v0,%%v0,%%v1\n\t"
"vfadb %%v0,%%v0,%%v2\n\t"
"vfadb %%v0,%%v0,%%v3\n\t"
"vfadb %%v0,%%v0,%%v4\n\t"
"vfadb %%v0,%%v0,%%v5\n\t"
"vfadb %%v0,%%v0,%%v6\n\t"
"vfadb %%v0,%%v0,%%v7\n\t"
"vrepg %%v1,%%v0,1\n\t"
"adbr %%f0,%%f1\n\t"
"std %%f0,0(%[y])"
/* Operands of the second copy: the output keeps the array-cast form; the
   inputs use struct-wrapped arrays of n FLOATs. */
: "=m"(*(FLOAT (*)[1]) y)
: [y] "a"(y), "m"(*(const struct { FLOAT x[n]; } *) a0),[a0] "a"(a0),
"m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x),[n] "r"(n)
: "cc", "r0", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
"v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25",
"v26", "v27", "v28", "v29", "v30", "v31");
}
static void copy_x(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_src) {
@ -369,74 +377,74 @@ static void copy_x(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_src) {
/*
 * add_y_kernel_4: dest[i] += da * src[i].
 *
 * da is replicated into both elements of v0; each 16-byte chunk of dest is
 * loaded, updated with a multiply-and-add against the matching src chunk,
 * and stored back in place (dest is a "+m" read-write operand).
 *
 *   n     element count; only the parts selected by (n & -16) and
 *         (n & 12) are processed -- nothing here handles n & 3, presumably
 *         callers pass a multiple of 4 (TODO confirm).
 *   da    scalar multiplier, read from memory by vlrepg
 *   src   input vector
 *   dest  vector updated in place
 *
 * NOTE(review): this listing holds the asm statement twice (see the note
 * between the two copies below); confirm which copy is intended.
 */
static void add_y_kernel_4(BLASLONG n, FLOAT da, FLOAT *src, FLOAT *dest) {
/* v0 = da in every element. */
__asm__("vlrepg %%v0,%[da]\n\t"
/* r1 = running byte offset, starts at 0. */
"xgr %%r1,%%r1\n\t"
/* r0 = n & -16; if zero skip the unrolled loop, else r0 = n / 16 trips. */
"lghi %%r0,-16\n\t"
"ngr %%r0,%[n]\n\t"
"ltgr %%r0,%%r0\n\t"
"jz 1f\n\t"
"srlg %%r0,%%r0,4\n\t"
/* Main loop: 128 bytes of src and dest per trip. */
"0:\n\t"
"pfd 1,1024(%%r1,%[src])\n\t"
"pfd 2,1024(%%r1,%[dest])\n\t"
"vl %%v16,0(%%r1,%[src])\n\t"
"vl %%v17,16(%%r1,%[src])\n\t"
"vl %%v18,32(%%r1,%[src])\n\t"
"vl %%v19,48(%%r1,%[src])\n\t"
"vl %%v20,64(%%r1,%[src])\n\t"
"vl %%v21,80(%%r1,%[src])\n\t"
"vl %%v22,96(%%r1,%[src])\n\t"
"vl %%v23,112(%%r1,%[src])\n\t"
/* Per chunk: load dest, dest += src * da, store dest. */
"vl %%v24, 0(%%r1,%[dest])\n\t"
"vfmadb %%v24,%%v16,%%v0,%%v24\n\t"
"vst %%v24, 0(%%r1,%[dest])\n\t"
"vl %%v25, 16(%%r1,%[dest])\n\t"
"vfmadb %%v25,%%v17,%%v0,%%v25\n\t"
"vst %%v25, 16(%%r1,%[dest])\n\t"
"vl %%v26, 32(%%r1,%[dest])\n\t"
"vfmadb %%v26,%%v18,%%v0,%%v26\n\t"
"vst %%v26, 32(%%r1,%[dest])\n\t"
"vl %%v27, 48(%%r1,%[dest])\n\t"
"vfmadb %%v27,%%v19,%%v0,%%v27\n\t"
"vst %%v27, 48(%%r1,%[dest])\n\t"
"vl %%v28, 64(%%r1,%[dest])\n\t"
"vfmadb %%v28,%%v20,%%v0,%%v28\n\t"
"vst %%v28, 64(%%r1,%[dest])\n\t"
"vl %%v29, 80(%%r1,%[dest])\n\t"
"vfmadb %%v29,%%v21,%%v0,%%v29\n\t"
"vst %%v29, 80(%%r1,%[dest])\n\t"
"vl %%v30, 96(%%r1,%[dest])\n\t"
"vfmadb %%v30,%%v22,%%v0,%%v30\n\t"
"vst %%v30, 96(%%r1,%[dest])\n\t"
"vl %%v31, 112(%%r1,%[dest])\n\t"
"vfmadb %%v31,%%v23,%%v0,%%v31\n\t"
"vst %%v31, 112(%%r1,%[dest])\n\t"
"agfi %%r1,128\n\t"
"brctg %%r0,0b\n\t"
/* Tail: r0 = n & 12; if non-zero run (n & 12) / 4 trips of 32 bytes. */
"1:\n\t"
"lghi %%r0,12\n\t"
"ngr %%r0,%[n]\n\t"
"ltgr %%r0,%%r0\n\t"
"jz 3f\n\t"
"srlg %%r0,%%r0,2\n\t"
"2:\n\t"
"vl %%v16,0(%%r1,%[src])\n\t"
"vl %%v17,16(%%r1,%[src])\n\t"
"vl %%v24, 0(%%r1,%[dest])\n\t"
"vfmadb %%v24,%%v16,%%v0,%%v24\n\t"
"vst %%v24, 0(%%r1,%[dest])\n\t"
"vl %%v25, 16(%%r1,%[dest])\n\t"
"vfmadb %%v25,%%v17,%%v0,%%v25\n\t"
"vst %%v25, 16(%%r1,%[dest])\n\t"
"agfi %%r1,32\n\t"
"brctg %%r0,2b\n\t"
/* Label 3 needs an instruction to attach to; nothing is left to do. */
"3:\n\t"
"nop"
/* Operands of the first copy: da is passed with a plain "m" constraint. */
: "+m"(*(FLOAT (*)[n]) dest)
: [dest] "a"(dest),[da] "m"(da), "m"(*(const FLOAT (*)[n]) src),
[src] "a"(src),[n] "r"(n)
: "cc", "r0", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21",
"v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30",
"v31");
/* NOTE(review): everything from here to the next ");" repeats the asm body
   above, minus its opening "vlrepg" line; only the operand list differs
   (struct-wrapped memory operands, and da constrained with "Q" instead of
   "m"). As written these string literals do not belong to any statement --
   confirm which copy should remain. */
"xgr %%r1,%%r1\n\t"
"lghi %%r0,-16\n\t"
"ngr %%r0,%[n]\n\t"
"ltgr %%r0,%%r0\n\t"
"jz 1f\n\t"
"srlg %%r0,%%r0,4\n\t"
"0:\n\t"
"pfd 1,1024(%%r1,%[src])\n\t"
"pfd 2,1024(%%r1,%[dest])\n\t"
"vl %%v16,0(%%r1,%[src])\n\t"
"vl %%v17,16(%%r1,%[src])\n\t"
"vl %%v18,32(%%r1,%[src])\n\t"
"vl %%v19,48(%%r1,%[src])\n\t"
"vl %%v20,64(%%r1,%[src])\n\t"
"vl %%v21,80(%%r1,%[src])\n\t"
"vl %%v22,96(%%r1,%[src])\n\t"
"vl %%v23,112(%%r1,%[src])\n\t"
"vl %%v24, 0(%%r1,%[dest])\n\t"
"vfmadb %%v24,%%v16,%%v0,%%v24\n\t"
"vst %%v24, 0(%%r1,%[dest])\n\t"
"vl %%v25, 16(%%r1,%[dest])\n\t"
"vfmadb %%v25,%%v17,%%v0,%%v25\n\t"
"vst %%v25, 16(%%r1,%[dest])\n\t"
"vl %%v26, 32(%%r1,%[dest])\n\t"
"vfmadb %%v26,%%v18,%%v0,%%v26\n\t"
"vst %%v26, 32(%%r1,%[dest])\n\t"
"vl %%v27, 48(%%r1,%[dest])\n\t"
"vfmadb %%v27,%%v19,%%v0,%%v27\n\t"
"vst %%v27, 48(%%r1,%[dest])\n\t"
"vl %%v28, 64(%%r1,%[dest])\n\t"
"vfmadb %%v28,%%v20,%%v0,%%v28\n\t"
"vst %%v28, 64(%%r1,%[dest])\n\t"
"vl %%v29, 80(%%r1,%[dest])\n\t"
"vfmadb %%v29,%%v21,%%v0,%%v29\n\t"
"vst %%v29, 80(%%r1,%[dest])\n\t"
"vl %%v30, 96(%%r1,%[dest])\n\t"
"vfmadb %%v30,%%v22,%%v0,%%v30\n\t"
"vst %%v30, 96(%%r1,%[dest])\n\t"
"vl %%v31, 112(%%r1,%[dest])\n\t"
"vfmadb %%v31,%%v23,%%v0,%%v31\n\t"
"vst %%v31, 112(%%r1,%[dest])\n\t"
"agfi %%r1,128\n\t"
"brctg %%r0,0b\n\t"
"1:\n\t"
"lghi %%r0,12\n\t"
"ngr %%r0,%[n]\n\t"
"ltgr %%r0,%%r0\n\t"
"jz 3f\n\t"
"srlg %%r0,%%r0,2\n\t"
"2:\n\t"
"vl %%v16,0(%%r1,%[src])\n\t"
"vl %%v17,16(%%r1,%[src])\n\t"
"vl %%v24, 0(%%r1,%[dest])\n\t"
"vfmadb %%v24,%%v16,%%v0,%%v24\n\t"
"vst %%v24, 0(%%r1,%[dest])\n\t"
"vl %%v25, 16(%%r1,%[dest])\n\t"
"vfmadb %%v25,%%v17,%%v0,%%v25\n\t"
"vst %%v25, 16(%%r1,%[dest])\n\t"
"agfi %%r1,32\n\t"
"brctg %%r0,2b\n\t"
"3:\n\t"
"nop"
/* Operands of the second copy: da uses the "Q" memory constraint and the
   array operands are struct-wrapped. */
: "+m"(*(struct { FLOAT x[n]; } *) dest)
: [dest] "a"(dest),[da] "Q"(da), "m"(*(const struct { FLOAT x[n]; } *) src),
[src] "a"(src),[n] "r"(n)
: "cc", "r0", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21",
"v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30",
"v31");
}
static void add_y(BLASLONG n, FLOAT da, FLOAT *src, FLOAT *dest,
BLASLONG inc_dest) {

View File

@ -31,51 +31,51 @@ static FLOAT dmax_kernel_32(BLASLONG n, FLOAT *x) {
FLOAT max;
__asm__("vl %%v0,0(%[x])\n\t"
"srlg %[n],%[n],5\n\t"
"xgr %%r1,%%r1\n\t"
"0:\n\t"
"pfd 1, 1024(%%r1,%[x])\n\t"
"vl %%v16,0(%%r1,%[x])\n\t"
"vl %%v17,16(%%r1,%[x])\n\t"
"vl %%v18,32(%%r1,%[x])\n\t"
"vl %%v19,48(%%r1,%[x])\n\t"
"vl %%v20,64(%%r1,%[x])\n\t"
"vl %%v21,80(%%r1,%[x])\n\t"
"vl %%v22,96(%%r1,%[x])\n\t"
"vl %%v23,112(%%r1,%[x])\n\t"
"vl %%v24,128(%%r1,%[x])\n\t"
"vl %%v25,144(%%r1,%[x])\n\t"
"vl %%v26,160(%%r1,%[x])\n\t"
"vl %%v27,176(%%r1,%[x])\n\t"
"vl %%v28,192(%%r1,%[x])\n\t"
"vl %%v29,208(%%r1,%[x])\n\t"
"vl %%v30,224(%%r1,%[x])\n\t"
"vl %%v31,240(%%r1,%[x])\n\t"
"vfmaxdb %%v16,%%v16,%%v24,0\n\t"
"vfmaxdb %%v17,%%v17,%%v25,0\n\t"
"vfmaxdb %%v18,%%v18,%%v26,0\n\t"
"vfmaxdb %%v19,%%v19,%%v27,0\n\t"
"vfmaxdb %%v20,%%v20,%%v28,0\n\t"
"vfmaxdb %%v21,%%v21,%%v29,0\n\t"
"vfmaxdb %%v22,%%v22,%%v30,0\n\t"
"vfmaxdb %%v23,%%v23,%%v31,0\n\t"
"vfmaxdb %%v16,%%v16,%%v20,0\n\t"
"vfmaxdb %%v17,%%v17,%%v21,0\n\t"
"vfmaxdb %%v18,%%v18,%%v22,0\n\t"
"vfmaxdb %%v19,%%v19,%%v23,0\n\t"
"vfmaxdb %%v16,%%v16,%%v18,0\n\t"
"vfmaxdb %%v17,%%v17,%%v19,0\n\t"
"vfmaxdb %%v16,%%v16,%%v17,0\n\t"
"vfmaxdb %%v0,%%v0,%%v16,0\n\t"
"agfi %%r1, 256\n\t"
"brctg %[n], 0b\n\t"
"vrepg %%v16,%%v0,1\n\t"
"wfmaxdb %%v0,%%v0,%%v16,0\n\t"
"ldr %[max],%%f0"
: [max] "=f"(max),[n] "+&r"(n)
: "m"(*(const FLOAT (*)[n]) x),[x] "a"(x)
: "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22",
"v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31");
"srlg %[n],%[n],5\n\t"
"xgr %%r1,%%r1\n\t"
"0:\n\t"
"pfd 1, 1024(%%r1,%[x])\n\t"
"vl %%v16,0(%%r1,%[x])\n\t"
"vl %%v17,16(%%r1,%[x])\n\t"
"vl %%v18,32(%%r1,%[x])\n\t"
"vl %%v19,48(%%r1,%[x])\n\t"
"vl %%v20,64(%%r1,%[x])\n\t"
"vl %%v21,80(%%r1,%[x])\n\t"
"vl %%v22,96(%%r1,%[x])\n\t"
"vl %%v23,112(%%r1,%[x])\n\t"
"vl %%v24,128(%%r1,%[x])\n\t"
"vl %%v25,144(%%r1,%[x])\n\t"
"vl %%v26,160(%%r1,%[x])\n\t"
"vl %%v27,176(%%r1,%[x])\n\t"
"vl %%v28,192(%%r1,%[x])\n\t"
"vl %%v29,208(%%r1,%[x])\n\t"
"vl %%v30,224(%%r1,%[x])\n\t"
"vl %%v31,240(%%r1,%[x])\n\t"
"vfmaxdb %%v16,%%v16,%%v24,0\n\t"
"vfmaxdb %%v17,%%v17,%%v25,0\n\t"
"vfmaxdb %%v18,%%v18,%%v26,0\n\t"
"vfmaxdb %%v19,%%v19,%%v27,0\n\t"
"vfmaxdb %%v20,%%v20,%%v28,0\n\t"
"vfmaxdb %%v21,%%v21,%%v29,0\n\t"
"vfmaxdb %%v22,%%v22,%%v30,0\n\t"
"vfmaxdb %%v23,%%v23,%%v31,0\n\t"
"vfmaxdb %%v16,%%v16,%%v20,0\n\t"
"vfmaxdb %%v17,%%v17,%%v21,0\n\t"
"vfmaxdb %%v18,%%v18,%%v22,0\n\t"
"vfmaxdb %%v19,%%v19,%%v23,0\n\t"
"vfmaxdb %%v16,%%v16,%%v18,0\n\t"
"vfmaxdb %%v17,%%v17,%%v19,0\n\t"
"vfmaxdb %%v16,%%v16,%%v17,0\n\t"
"vfmaxdb %%v0,%%v0,%%v16,0\n\t"
"agfi %%r1, 256\n\t"
"brctg %[n], 0b\n\t"
"vrepg %%v16,%%v0,1\n\t"
"wfmaxdb %%v0,%%v0,%%v16,0\n\t"
"ldr %[max],%%f0"
: [max] "=f"(max),[n] "+&r"(n)
: "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x)
: "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22",
"v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31");
return max;
}

View File

@ -31,68 +31,68 @@ static FLOAT dmax_kernel_32(BLASLONG n, FLOAT *x) {
FLOAT max;
__asm__("vl %%v0,0(%[x])\n\t"
"srlg %[n],%[n],5\n\t"
"xgr %%r1,%%r1\n\t"
"0:\n\t"
"pfd 1, 1024(%%r1,%[x])\n\t"
"vl %%v16,0(%%r1,%[x])\n\t"
"vl %%v17,16(%%r1,%[x])\n\t"
"vl %%v18,32(%%r1,%[x])\n\t"
"vl %%v19,48(%%r1,%[x])\n\t"
"vl %%v20,64(%%r1,%[x])\n\t"
"vl %%v21,80(%%r1,%[x])\n\t"
"vl %%v22,96(%%r1,%[x])\n\t"
"vl %%v23,112(%%r1,%[x])\n\t"
"vfchdb %%v24,%%v16,%%v17\n\t"
"vfchdb %%v25,%%v18,%%v19\n\t"
"vfchdb %%v26,%%v20,%%v21\n\t"
"vfchdb %%v27,%%v22,%%v23\n\t"
"vsel %%v24,%%v16,%%v17,%%v24\n\t"
"vsel %%v25,%%v18,%%v19,%%v25\n\t"
"vsel %%v26,%%v20,%%v21,%%v26\n\t"
"vsel %%v27,%%v22,%%v23,%%v27\n\t"
"vfchdb %%v28,%%v24,%%v25\n\t"
"vfchdb %%v29,%%v26,%%v27\n\t"
"vsel %%v28,%%v24,%%v25,%%v28\n\t"
"vsel %%v29,%%v26,%%v27,%%v29\n\t"
"vfchdb %%v30,%%v28,%%v29\n\t"
"vsel %%v30,%%v28,%%v29,%%v30\n\t"
"vfchdb %%v31,%%v30,%%v0\n\t"
"vsel %%v0,%%v30,%%v0,%%v31\n\t"
"vl %%v16,128(%%r1,%[x])\n\t"
"vl %%v17,144(%%r1,%[x])\n\t"
"vl %%v18,160(%%r1,%[x])\n\t"
"vl %%v19,176(%%r1,%[x])\n\t"
"vl %%v20,192(%%r1,%[x])\n\t"
"vl %%v21,208(%%r1,%[x])\n\t"
"vl %%v22,224(%%r1,%[x])\n\t"
"vl %%v23,240(%%r1,%[x])\n\t"
"vfchdb %%v24,%%v16,%%v17\n\t"
"vfchdb %%v25,%%v18,%%v19\n\t"
"vfchdb %%v26,%%v20,%%v21\n\t"
"vfchdb %%v27,%%v22,%%v23\n\t"
"vsel %%v24,%%v16,%%v17,%%v24\n\t"
"vsel %%v25,%%v18,%%v19,%%v25\n\t"
"vsel %%v26,%%v20,%%v21,%%v26\n\t"
"vsel %%v27,%%v22,%%v23,%%v27\n\t"
"vfchdb %%v28,%%v24,%%v25\n\t"
"vfchdb %%v29,%%v26,%%v27\n\t"
"vsel %%v28,%%v24,%%v25,%%v28\n\t"
"vsel %%v29,%%v26,%%v27,%%v29\n\t"
"vfchdb %%v30,%%v28,%%v29\n\t"
"vsel %%v30,%%v28,%%v29,%%v30\n\t"
"vfchdb %%v31,%%v30,%%v0\n\t"
"vsel %%v0,%%v30,%%v0,%%v31\n\t"
"agfi %%r1, 256\n\t"
"brctg %[n], 0b\n\t"
"vrepg %%v16,%%v0,1\n\t"
"wfchdb %%v17,%%v0,%%v16\n\t"
"vsel %%v0,%%v0,%%v16,%%v17\n\t"
"ldr %[max],%%f0"
: [max] "=f"(max),[n] "+&r"(n)
: "m"(*(const FLOAT (*)[n]) x),[x] "a"(x)
: "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22",
"v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31");
"srlg %[n],%[n],5\n\t"
"xgr %%r1,%%r1\n\t"
"0:\n\t"
"pfd 1, 1024(%%r1,%[x])\n\t"
"vl %%v16,0(%%r1,%[x])\n\t"
"vl %%v17,16(%%r1,%[x])\n\t"
"vl %%v18,32(%%r1,%[x])\n\t"
"vl %%v19,48(%%r1,%[x])\n\t"
"vl %%v20,64(%%r1,%[x])\n\t"
"vl %%v21,80(%%r1,%[x])\n\t"
"vl %%v22,96(%%r1,%[x])\n\t"
"vl %%v23,112(%%r1,%[x])\n\t"
"vfchdb %%v24,%%v16,%%v17\n\t"
"vfchdb %%v25,%%v18,%%v19\n\t"
"vfchdb %%v26,%%v20,%%v21\n\t"
"vfchdb %%v27,%%v22,%%v23\n\t"
"vsel %%v24,%%v16,%%v17,%%v24\n\t"
"vsel %%v25,%%v18,%%v19,%%v25\n\t"
"vsel %%v26,%%v20,%%v21,%%v26\n\t"
"vsel %%v27,%%v22,%%v23,%%v27\n\t"
"vfchdb %%v28,%%v24,%%v25\n\t"
"vfchdb %%v29,%%v26,%%v27\n\t"
"vsel %%v28,%%v24,%%v25,%%v28\n\t"
"vsel %%v29,%%v26,%%v27,%%v29\n\t"
"vfchdb %%v30,%%v28,%%v29\n\t"
"vsel %%v30,%%v28,%%v29,%%v30\n\t"
"vfchdb %%v31,%%v30,%%v0\n\t"
"vsel %%v0,%%v30,%%v0,%%v31\n\t"
"vl %%v16,128(%%r1,%[x])\n\t"
"vl %%v17,144(%%r1,%[x])\n\t"
"vl %%v18,160(%%r1,%[x])\n\t"
"vl %%v19,176(%%r1,%[x])\n\t"
"vl %%v20,192(%%r1,%[x])\n\t"
"vl %%v21,208(%%r1,%[x])\n\t"
"vl %%v22,224(%%r1,%[x])\n\t"
"vl %%v23,240(%%r1,%[x])\n\t"
"vfchdb %%v24,%%v16,%%v17\n\t"
"vfchdb %%v25,%%v18,%%v19\n\t"
"vfchdb %%v26,%%v20,%%v21\n\t"
"vfchdb %%v27,%%v22,%%v23\n\t"
"vsel %%v24,%%v16,%%v17,%%v24\n\t"
"vsel %%v25,%%v18,%%v19,%%v25\n\t"
"vsel %%v26,%%v20,%%v21,%%v26\n\t"
"vsel %%v27,%%v22,%%v23,%%v27\n\t"
"vfchdb %%v28,%%v24,%%v25\n\t"
"vfchdb %%v29,%%v26,%%v27\n\t"
"vsel %%v28,%%v24,%%v25,%%v28\n\t"
"vsel %%v29,%%v26,%%v27,%%v29\n\t"
"vfchdb %%v30,%%v28,%%v29\n\t"
"vsel %%v30,%%v28,%%v29,%%v30\n\t"
"vfchdb %%v31,%%v30,%%v0\n\t"
"vsel %%v0,%%v30,%%v0,%%v31\n\t"
"agfi %%r1, 256\n\t"
"brctg %[n], 0b\n\t"
"vrepg %%v16,%%v0,1\n\t"
"wfchdb %%v17,%%v0,%%v16\n\t"
"vsel %%v0,%%v0,%%v16,%%v17\n\t"
"ldr %[max],%%f0"
: [max] "=f"(max),[n] "+&r"(n)
: "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x)
: "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22",
"v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31");
return max;
}

View File

@ -31,51 +31,51 @@ static FLOAT dmin_kernel_32(BLASLONG n, FLOAT *x) {
FLOAT min;
__asm__("vl %%v0,0(%[x])\n\t"
"srlg %[n],%[n],5\n\t"
"xgr %%r1,%%r1\n\t"
"0:\n\t"
"pfd 1, 1024(%%r1,%[x])\n\t"
"vl %%v16,0(%%r1,%[x])\n\t"
"vl %%v17,16(%%r1,%[x])\n\t"
"vl %%v18,32(%%r1,%[x])\n\t"
"vl %%v19,48(%%r1,%[x])\n\t"
"vl %%v20,64(%%r1,%[x])\n\t"
"vl %%v21,80(%%r1,%[x])\n\t"
"vl %%v22,96(%%r1,%[x])\n\t"
"vl %%v23,112(%%r1,%[x])\n\t"
"vl %%v24,128(%%r1,%[x])\n\t"
"vl %%v25,144(%%r1,%[x])\n\t"
"vl %%v26,160(%%r1,%[x])\n\t"
"vl %%v27,176(%%r1,%[x])\n\t"
"vl %%v28,192(%%r1,%[x])\n\t"
"vl %%v29,208(%%r1,%[x])\n\t"
"vl %%v30,224(%%r1,%[x])\n\t"
"vl %%v31,240(%%r1,%[x])\n\t"
"vfmindb %%v16,%%v16,%%v24,0\n\t"
"vfmindb %%v17,%%v17,%%v25,0\n\t"
"vfmindb %%v18,%%v18,%%v26,0\n\t"
"vfmindb %%v19,%%v19,%%v27,0\n\t"
"vfmindb %%v20,%%v20,%%v28,0\n\t"
"vfmindb %%v21,%%v21,%%v29,0\n\t"
"vfmindb %%v22,%%v22,%%v30,0\n\t"
"vfmindb %%v23,%%v23,%%v31,0\n\t"
"vfmindb %%v16,%%v16,%%v20,0\n\t"
"vfmindb %%v17,%%v17,%%v21,0\n\t"
"vfmindb %%v18,%%v18,%%v22,0\n\t"
"vfmindb %%v19,%%v19,%%v23,0\n\t"
"vfmindb %%v16,%%v16,%%v18,0\n\t"
"vfmindb %%v17,%%v17,%%v19,0\n\t"
"vfmindb %%v16,%%v16,%%v17,0\n\t"
"vfmindb %%v0,%%v0,%%v16,0\n\t"
"agfi %%r1, 256\n\t"
"brctg %[n], 0b\n\t"
"vrepg %%v16,%%v0,1\n\t"
"wfmindb %%v0,%%v0,%%v16,0\n\t"
"ldr %[min],%%f0"
: [min] "=f"(min),[n] "+&r"(n)
: "m"(*(const FLOAT (*)[n]) x),[x] "a"(x)
: "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22",
"v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31");
"srlg %[n],%[n],5\n\t"
"xgr %%r1,%%r1\n\t"
"0:\n\t"
"pfd 1, 1024(%%r1,%[x])\n\t"
"vl %%v16,0(%%r1,%[x])\n\t"
"vl %%v17,16(%%r1,%[x])\n\t"
"vl %%v18,32(%%r1,%[x])\n\t"
"vl %%v19,48(%%r1,%[x])\n\t"
"vl %%v20,64(%%r1,%[x])\n\t"
"vl %%v21,80(%%r1,%[x])\n\t"
"vl %%v22,96(%%r1,%[x])\n\t"
"vl %%v23,112(%%r1,%[x])\n\t"
"vl %%v24,128(%%r1,%[x])\n\t"
"vl %%v25,144(%%r1,%[x])\n\t"
"vl %%v26,160(%%r1,%[x])\n\t"
"vl %%v27,176(%%r1,%[x])\n\t"
"vl %%v28,192(%%r1,%[x])\n\t"
"vl %%v29,208(%%r1,%[x])\n\t"
"vl %%v30,224(%%r1,%[x])\n\t"
"vl %%v31,240(%%r1,%[x])\n\t"
"vfmindb %%v16,%%v16,%%v24,0\n\t"
"vfmindb %%v17,%%v17,%%v25,0\n\t"
"vfmindb %%v18,%%v18,%%v26,0\n\t"
"vfmindb %%v19,%%v19,%%v27,0\n\t"
"vfmindb %%v20,%%v20,%%v28,0\n\t"
"vfmindb %%v21,%%v21,%%v29,0\n\t"
"vfmindb %%v22,%%v22,%%v30,0\n\t"
"vfmindb %%v23,%%v23,%%v31,0\n\t"
"vfmindb %%v16,%%v16,%%v20,0\n\t"
"vfmindb %%v17,%%v17,%%v21,0\n\t"
"vfmindb %%v18,%%v18,%%v22,0\n\t"
"vfmindb %%v19,%%v19,%%v23,0\n\t"
"vfmindb %%v16,%%v16,%%v18,0\n\t"
"vfmindb %%v17,%%v17,%%v19,0\n\t"
"vfmindb %%v16,%%v16,%%v17,0\n\t"
"vfmindb %%v0,%%v0,%%v16,0\n\t"
"agfi %%r1, 256\n\t"
"brctg %[n], 0b\n\t"
"vrepg %%v16,%%v0,1\n\t"
"wfmindb %%v0,%%v0,%%v16,0\n\t"
"ldr %[min],%%f0"
: [min] "=f"(min),[n] "+&r"(n)
: "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x)
: "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22",
"v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31");
return min;
}

View File

/*
 * dmin_kernel_32: returns the smallest value found in x, using s390x vector
 * inline assembly that consumes 256 bytes (32 eight-byte elements) of x per
 * loop iteration.
 *
 * n: element count.  The asm shifts it right by 5 to obtain the iteration
 *    count and then runs a count-down loop, so presumably callers pass a
 *    positive multiple of 32 -- TODO confirm against the caller.
 * x: input vector; it is only read (input "m" operand).
 * Returns: the value copied out of f0 after the two lanes of v0 are reduced.
 *
 * NOTE(review): this listing is a rendered diff with the +/- markers
 * stripped.  After the shared first asm line, the pre-change asm body runs up
 * to the first `"v31");`; the post-change body follows it.  The visible
 * difference between the two is the cast on the "m" input operand:
 * `const FLOAT (*)[n]` before, `const struct { FLOAT x[n]; } *` after.
 */
@ -31,68 +31,68 @@ static FLOAT dmin_kernel_32(BLASLONG n, FLOAT *x) {
FLOAT min;
/* v0 is the running minimum, seeded with the first 16 bytes of x. */
__asm__("vl %%v0,0(%[x])\n\t"
/* n >>= 5 gives the iteration count; r1 = 0 is the byte offset into x. */
"srlg %[n],%[n],5\n\t"
"xgr %%r1,%%r1\n\t"
"0:\n\t"
/* Prefetch hint 1024 bytes ahead of the current offset. */
"pfd 1, 1024(%%r1,%[x])\n\t"
/* First half of the iteration: bytes 0..127. */
"vl %%v16,0(%%r1,%[x])\n\t"
"vl %%v17,16(%%r1,%[x])\n\t"
"vl %%v18,32(%%r1,%[x])\n\t"
"vl %%v19,48(%%r1,%[x])\n\t"
"vl %%v20,64(%%r1,%[x])\n\t"
"vl %%v21,80(%%r1,%[x])\n\t"
"vl %%v22,96(%%r1,%[x])\n\t"
"vl %%v23,112(%%r1,%[x])\n\t"
/*
 * Pairwise reduction tree 8 -> 4 -> 2 -> 1.  Each step is a compare
 * (vfchdb: mask of lanes where the 2nd operand > 3rd operand) followed by a
 * select (vsel) that keeps the smaller lane.
 */
"vfchdb %%v24,%%v17,%%v16\n\t"
"vfchdb %%v25,%%v19,%%v18\n\t"
"vfchdb %%v26,%%v21,%%v20\n\t"
"vfchdb %%v27,%%v23,%%v22\n\t"
"vsel %%v24,%%v16,%%v17,%%v24\n\t"
"vsel %%v25,%%v18,%%v19,%%v25\n\t"
"vsel %%v26,%%v20,%%v21,%%v26\n\t"
"vsel %%v27,%%v22,%%v23,%%v27\n\t"
"vfchdb %%v28,%%v25,%%v24\n\t"
"vfchdb %%v29,%%v27,%%v26\n\t"
"vsel %%v28,%%v24,%%v25,%%v28\n\t"
"vsel %%v29,%%v26,%%v27,%%v29\n\t"
"vfchdb %%v30,%%v29,%%v28\n\t"
"vsel %%v30,%%v28,%%v29,%%v30\n\t"
/* Fold the block minimum (v30) into the running minimum (v0). */
"vfchdb %%v31,%%v0,%%v30\n\t"
"vsel %%v0,%%v30,%%v0,%%v31\n\t"
/* Second half of the iteration: bytes 128..255, same reduction. */
"vl %%v16,128(%%r1,%[x])\n\t"
"vl %%v17,144(%%r1,%[x])\n\t"
"vl %%v18,160(%%r1,%[x])\n\t"
"vl %%v19,176(%%r1,%[x])\n\t"
"vl %%v20,192(%%r1,%[x])\n\t"
"vl %%v21,208(%%r1,%[x])\n\t"
"vl %%v22,224(%%r1,%[x])\n\t"
"vl %%v23,240(%%r1,%[x])\n\t"
"vfchdb %%v24,%%v17,%%v16\n\t"
"vfchdb %%v25,%%v19,%%v18\n\t"
"vfchdb %%v26,%%v21,%%v20\n\t"
"vfchdb %%v27,%%v23,%%v22\n\t"
"vsel %%v24,%%v16,%%v17,%%v24\n\t"
"vsel %%v25,%%v18,%%v19,%%v25\n\t"
"vsel %%v26,%%v20,%%v21,%%v26\n\t"
"vsel %%v27,%%v22,%%v23,%%v27\n\t"
"vfchdb %%v28,%%v25,%%v24\n\t"
"vfchdb %%v29,%%v27,%%v26\n\t"
"vsel %%v28,%%v24,%%v25,%%v28\n\t"
"vsel %%v29,%%v26,%%v27,%%v29\n\t"
"vfchdb %%v30,%%v29,%%v28\n\t"
"vsel %%v30,%%v28,%%v29,%%v30\n\t"
"vfchdb %%v31,%%v0,%%v30\n\t"
"vsel %%v0,%%v30,%%v0,%%v31\n\t"
/* Advance the byte offset by 256 and loop while the count is non-zero. */
"agfi %%r1, 256\n\t"
"brctg %[n], 0b\n\t"
/* Reduce the two lanes of v0: copy lane 1 to v16, keep the smaller one. */
"vrepg %%v16,%%v0,1\n\t"
"wfchdb %%v17,%%v16,%%v0\n\t"
"vsel %%v0,%%v0,%%v16,%%v17\n\t"
"ldr %[min],%%f0"
/*
 * Operands: n is read-write because srlg/brctg modify it; the "m" input
 * tells the compiler the asm reads the memory behind x.
 */
: [min] "=f"(min),[n] "+&r"(n)
: "m"(*(const FLOAT (*)[n]) x),[x] "a"(x)
: "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22",
"v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31");
/* Post-change side of the diff: same instruction sequence, new "m" cast. */
"srlg %[n],%[n],5\n\t"
"xgr %%r1,%%r1\n\t"
"0:\n\t"
"pfd 1, 1024(%%r1,%[x])\n\t"
"vl %%v16,0(%%r1,%[x])\n\t"
"vl %%v17,16(%%r1,%[x])\n\t"
"vl %%v18,32(%%r1,%[x])\n\t"
"vl %%v19,48(%%r1,%[x])\n\t"
"vl %%v20,64(%%r1,%[x])\n\t"
"vl %%v21,80(%%r1,%[x])\n\t"
"vl %%v22,96(%%r1,%[x])\n\t"
"vl %%v23,112(%%r1,%[x])\n\t"
"vfchdb %%v24,%%v17,%%v16\n\t"
"vfchdb %%v25,%%v19,%%v18\n\t"
"vfchdb %%v26,%%v21,%%v20\n\t"
"vfchdb %%v27,%%v23,%%v22\n\t"
"vsel %%v24,%%v16,%%v17,%%v24\n\t"
"vsel %%v25,%%v18,%%v19,%%v25\n\t"
"vsel %%v26,%%v20,%%v21,%%v26\n\t"
"vsel %%v27,%%v22,%%v23,%%v27\n\t"
"vfchdb %%v28,%%v25,%%v24\n\t"
"vfchdb %%v29,%%v27,%%v26\n\t"
"vsel %%v28,%%v24,%%v25,%%v28\n\t"
"vsel %%v29,%%v26,%%v27,%%v29\n\t"
"vfchdb %%v30,%%v29,%%v28\n\t"
"vsel %%v30,%%v28,%%v29,%%v30\n\t"
"vfchdb %%v31,%%v0,%%v30\n\t"
"vsel %%v0,%%v30,%%v0,%%v31\n\t"
"vl %%v16,128(%%r1,%[x])\n\t"
"vl %%v17,144(%%r1,%[x])\n\t"
"vl %%v18,160(%%r1,%[x])\n\t"
"vl %%v19,176(%%r1,%[x])\n\t"
"vl %%v20,192(%%r1,%[x])\n\t"
"vl %%v21,208(%%r1,%[x])\n\t"
"vl %%v22,224(%%r1,%[x])\n\t"
"vl %%v23,240(%%r1,%[x])\n\t"
"vfchdb %%v24,%%v17,%%v16\n\t"
"vfchdb %%v25,%%v19,%%v18\n\t"
"vfchdb %%v26,%%v21,%%v20\n\t"
"vfchdb %%v27,%%v23,%%v22\n\t"
"vsel %%v24,%%v16,%%v17,%%v24\n\t"
"vsel %%v25,%%v18,%%v19,%%v25\n\t"
"vsel %%v26,%%v20,%%v21,%%v26\n\t"
"vsel %%v27,%%v22,%%v23,%%v27\n\t"
"vfchdb %%v28,%%v25,%%v24\n\t"
"vfchdb %%v29,%%v27,%%v26\n\t"
"vsel %%v28,%%v24,%%v25,%%v28\n\t"
"vsel %%v29,%%v26,%%v27,%%v29\n\t"
"vfchdb %%v30,%%v29,%%v28\n\t"
"vsel %%v30,%%v28,%%v29,%%v30\n\t"
"vfchdb %%v31,%%v0,%%v30\n\t"
"vsel %%v0,%%v30,%%v0,%%v31\n\t"
"agfi %%r1, 256\n\t"
"brctg %[n], 0b\n\t"
"vrepg %%v16,%%v0,1\n\t"
"wfchdb %%v17,%%v16,%%v0\n\t"
"vsel %%v0,%%v0,%%v16,%%v17\n\t"
"ldr %[min],%%f0"
: [min] "=f"(min),[n] "+&r"(n)
: "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x)
: "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22",
"v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31");
return min;
}

View File

@ -29,151 +29,152 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
/*
 * drot_kernel_32: applies a plane rotation to x and y in place, using s390x
 * vector inline assembly that processes 256 bytes (32 eight-byte elements)
 * of each vector per loop iteration:
 *     new x = x*c + y*s
 *     new y = y*c - x*s
 *
 * n: element count.  The asm shifts it right by 5 to obtain the iteration
 *    count and then runs a count-down loop, so presumably callers pass a
 *    positive multiple of 32 -- TODO confirm against the caller.
 * x, y: vectors that are both read and written ("+m" operands).
 * c, s: pointers to the two rotation scalars; each is loaded once and
 *       replicated across a vector register before the loop.
 *
 * NOTE(review): this listing is a rendered diff with the +/- markers
 * stripped.  After the shared first asm line, the pre-change asm body runs up
 * to the first `"v31");`; the post-change body follows it.  The visible
 * differences are the cast on the "+m" operands (`FLOAT (*)[n]` before,
 * `struct { FLOAT x[n]; } *` after) and the constraint on *c and *s
 * ("m" before, "Q" after).
 */
static void drot_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s) {
/* v0 = c in every lane, v1 = s in every lane. */
__asm__("vlrepg %%v0,%[c]\n\t"
"vlrepg %%v1,%[s]\n\t"
/* n >>= 5 gives the iteration count; r1 = 0 is the byte offset. */
"srlg %[n],%[n],5\n\t"
"xgr %%r1,%%r1\n\t"
"0:\n\t"
/* Prefetch hints 1024 bytes ahead for both vectors. */
"pfd 2, 1024(%%r1,%[x])\n\t"
"pfd 2, 1024(%%r1,%[y])\n\t"
/* Quarter 1 (bytes 0..63): v24-v27 hold x, v16-v19 hold y. */
"vl %%v24, 0(%%r1,%[x])\n\t"
"vl %%v25, 16(%%r1,%[x])\n\t"
"vl %%v26, 32(%%r1,%[x])\n\t"
"vl %%v27, 48(%%r1,%[x])\n\t"
"vl %%v16, 0(%%r1,%[y])\n\t"
"vl %%v17, 16(%%r1,%[y])\n\t"
"vl %%v18, 32(%%r1,%[y])\n\t"
"vl %%v19, 48(%%r1,%[y])\n\t"
/* First products: v28-v31 = x*c (new x), v20-v23 = x*s (new y). */
"vfmdb %%v28,%%v24,%%v0\n\t"
"vfmdb %%v29,%%v25,%%v0\n\t"
"vfmdb %%v20,%%v24,%%v1\n\t" /* yn=x*s */
"vfmdb %%v21,%%v25,%%v1\n\t" /* yn=x*s */
"vfmdb %%v30,%%v26,%%v0\n\t"
"vfmdb %%v22,%%v26,%%v1\n\t" /* yn=x*s */
"vfmdb %%v31,%%v27,%%v0\n\t"
"vfmdb %%v23,%%v27,%%v1\n\t" /* yn=x*s */
/* 2nd parts */
/* Fused steps: new x = y*s + x*c, new y = y*c - x*s. */
"vfmadb %%v28,%%v16,%%v1,%%v28\n\t"
"vfmsdb %%v20,%%v16,%%v0,%%v20\n\t" /* yn=y*c-yn */
"vfmadb %%v29,%%v17,%%v1,%%v29\n\t"
"vfmsdb %%v21,%%v17,%%v0,%%v21\n\t" /* yn=y*c-yn */
"vfmadb %%v30,%%v18,%%v1,%%v30\n\t"
"vfmsdb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */
"vfmadb %%v31,%%v19,%%v1,%%v31\n\t"
"vfmsdb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */
/* Write the rotated values back over the inputs. */
"vst %%v28, 0(%%r1,%[x])\n\t"
"vst %%v29, 16(%%r1,%[x])\n\t"
"vst %%v30, 32(%%r1,%[x])\n\t"
"vst %%v31, 48(%%r1,%[x])\n\t"
"vst %%v20, 0(%%r1,%[y])\n\t"
"vst %%v21, 16(%%r1,%[y])\n\t"
"vst %%v22, 32(%%r1,%[y])\n\t"
"vst %%v23, 48(%%r1,%[y])\n\t"
/* Quarter 2 (bytes 64..127): same sequence. */
"vl %%v24, 64(%%r1,%[x])\n\t"
"vl %%v25, 80(%%r1,%[x])\n\t"
"vl %%v26, 96(%%r1,%[x])\n\t"
"vl %%v27, 112(%%r1,%[x])\n\t"
"vl %%v16, 64(%%r1,%[y])\n\t"
"vl %%v17, 80(%%r1,%[y])\n\t"
"vl %%v18, 96(%%r1,%[y])\n\t"
"vl %%v19, 112(%%r1,%[y])\n\t"
"vfmdb %%v28,%%v24,%%v0\n\t"
"vfmdb %%v29,%%v25,%%v0\n\t"
"vfmdb %%v20,%%v24,%%v1\n\t" /* yn=x*s */
"vfmdb %%v21,%%v25,%%v1\n\t" /* yn=x*s */
"vfmdb %%v30,%%v26,%%v0\n\t"
"vfmdb %%v22,%%v26,%%v1\n\t" /* yn=x*s */
"vfmdb %%v31,%%v27,%%v0\n\t"
"vfmdb %%v23,%%v27,%%v1\n\t" /* yn=x*s */
/* 2nd parts */
"vfmadb %%v28,%%v16,%%v1,%%v28\n\t"
"vfmsdb %%v20,%%v16,%%v0,%%v20\n\t" /* yn=y*c-yn */
"vfmadb %%v29,%%v17,%%v1,%%v29\n\t"
"vfmsdb %%v21,%%v17,%%v0,%%v21\n\t" /* yn=y*c-yn */
"vfmadb %%v30,%%v18,%%v1,%%v30\n\t"
"vfmsdb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */
"vfmadb %%v31,%%v19,%%v1,%%v31\n\t"
"vfmsdb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */
"vst %%v28, 64(%%r1,%[x])\n\t"
"vst %%v29, 80(%%r1,%[x])\n\t"
"vst %%v30, 96(%%r1,%[x])\n\t"
"vst %%v31, 112(%%r1,%[x])\n\t"
"vst %%v20, 64(%%r1,%[y])\n\t"
"vst %%v21, 80(%%r1,%[y])\n\t"
"vst %%v22, 96(%%r1,%[y])\n\t"
"vst %%v23, 112(%%r1,%[y])\n\t"
/* Quarter 3 (bytes 128..191): same sequence. */
"vl %%v24, 128(%%r1,%[x])\n\t"
"vl %%v25, 144(%%r1,%[x])\n\t"
"vl %%v26, 160(%%r1,%[x])\n\t"
"vl %%v27, 176(%%r1,%[x])\n\t"
"vl %%v16, 128(%%r1,%[y])\n\t"
"vl %%v17, 144(%%r1,%[y])\n\t"
"vl %%v18, 160(%%r1,%[y])\n\t"
"vl %%v19, 176(%%r1,%[y])\n\t"
"vfmdb %%v28,%%v24,%%v0\n\t"
"vfmdb %%v29,%%v25,%%v0\n\t"
"vfmdb %%v20,%%v24,%%v1\n\t" /* yn=x*s */
"vfmdb %%v21,%%v25,%%v1\n\t" /* yn=x*s */
"vfmdb %%v30,%%v26,%%v0\n\t"
"vfmdb %%v22,%%v26,%%v1\n\t" /* yn=x*s */
"vfmdb %%v31,%%v27,%%v0\n\t"
"vfmdb %%v23,%%v27,%%v1\n\t" /* yn=x*s */
/* 2nd parts */
"vfmadb %%v28,%%v16,%%v1,%%v28\n\t"
"vfmsdb %%v20,%%v16,%%v0,%%v20\n\t" /* yn=y*c-yn */
"vfmadb %%v29,%%v17,%%v1,%%v29\n\t"
"vfmsdb %%v21,%%v17,%%v0,%%v21\n\t" /* yn=y*c-yn */
"vfmadb %%v30,%%v18,%%v1,%%v30\n\t"
"vfmsdb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */
"vfmadb %%v31,%%v19,%%v1,%%v31\n\t"
"vfmsdb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */
"vst %%v28, 128(%%r1,%[x])\n\t"
"vst %%v29, 144(%%r1,%[x])\n\t"
"vst %%v30, 160(%%r1,%[x])\n\t"
"vst %%v31, 176(%%r1,%[x])\n\t"
"vst %%v20, 128(%%r1,%[y])\n\t"
"vst %%v21, 144(%%r1,%[y])\n\t"
"vst %%v22, 160(%%r1,%[y])\n\t"
"vst %%v23, 176(%%r1,%[y])\n\t"
/* Quarter 4 (bytes 192..255): same sequence. */
"vl %%v24, 192(%%r1,%[x])\n\t"
"vl %%v25, 208(%%r1,%[x])\n\t"
"vl %%v26, 224(%%r1,%[x])\n\t"
"vl %%v27, 240(%%r1,%[x])\n\t"
"vl %%v16, 192(%%r1,%[y])\n\t"
"vl %%v17, 208(%%r1,%[y])\n\t"
"vl %%v18, 224(%%r1,%[y])\n\t"
"vl %%v19, 240(%%r1,%[y])\n\t"
"vfmdb %%v28,%%v24,%%v0\n\t"
"vfmdb %%v29,%%v25,%%v0\n\t"
"vfmdb %%v20,%%v24,%%v1\n\t" /* yn=x*s */
"vfmdb %%v21,%%v25,%%v1\n\t" /* yn=x*s */
"vfmdb %%v30,%%v26,%%v0\n\t"
"vfmdb %%v22,%%v26,%%v1\n\t" /* yn=x*s */
"vfmdb %%v31,%%v27,%%v0\n\t"
"vfmdb %%v23,%%v27,%%v1\n\t" /* yn=x*s */
/* 2nd parts */
"vfmadb %%v28,%%v16,%%v1,%%v28\n\t"
"vfmsdb %%v20,%%v16,%%v0,%%v20\n\t" /* yn=y*c-yn */
"vfmadb %%v29,%%v17,%%v1,%%v29\n\t"
"vfmsdb %%v21,%%v17,%%v0,%%v21\n\t" /* yn=y*c-yn */
"vfmadb %%v30,%%v18,%%v1,%%v30\n\t"
"vfmsdb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */
"vfmadb %%v31,%%v19,%%v1,%%v31\n\t"
"vfmsdb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */
"vst %%v28, 192(%%r1,%[x])\n\t"
"vst %%v29, 208(%%r1,%[x])\n\t"
"vst %%v30, 224(%%r1,%[x])\n\t"
"vst %%v31, 240(%%r1,%[x])\n\t"
"vst %%v20, 192(%%r1,%[y])\n\t"
"vst %%v21, 208(%%r1,%[y])\n\t"
"vst %%v22, 224(%%r1,%[y])\n\t"
"vst %%v23, 240(%%r1,%[y])\n\t"
/* Advance the byte offset by 256 and loop while the count is non-zero. */
"agfi %%r1,256\n\t"
"brctg %[n],0b"
/*
 * Operands: x and y memory is read-write; n is read-write because
 * srlg/brctg modify it.
 */
: "+m"(*(FLOAT (*)[n]) x), "+m"(*(FLOAT (*)[n]) y),[n] "+&r"(n)
: [x] "a"(x),[y] "a"(y),[c] "m"(*c),[s] "m"(*s)
: "cc", "r1", "v0", "v1", "v16", "v17", "v18", "v19", "v20", "v21",
"v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30",
"v31");
/* Post-change side of the diff: same instruction sequence, new operands. */
"vlrepg %%v1,%[s]\n\t"
"srlg %[n],%[n],5\n\t"
"xgr %%r1,%%r1\n\t"
"0:\n\t"
"pfd 2, 1024(%%r1,%[x])\n\t"
"pfd 2, 1024(%%r1,%[y])\n\t"
"vl %%v24, 0(%%r1,%[x])\n\t"
"vl %%v25, 16(%%r1,%[x])\n\t"
"vl %%v26, 32(%%r1,%[x])\n\t"
"vl %%v27, 48(%%r1,%[x])\n\t"
"vl %%v16, 0(%%r1,%[y])\n\t"
"vl %%v17, 16(%%r1,%[y])\n\t"
"vl %%v18, 32(%%r1,%[y])\n\t"
"vl %%v19, 48(%%r1,%[y])\n\t"
"vfmdb %%v28,%%v24,%%v0\n\t"
"vfmdb %%v29,%%v25,%%v0\n\t"
"vfmdb %%v20,%%v24,%%v1\n\t" /* yn=x*s */
"vfmdb %%v21,%%v25,%%v1\n\t" /* yn=x*s */
"vfmdb %%v30,%%v26,%%v0\n\t"
"vfmdb %%v22,%%v26,%%v1\n\t" /* yn=x*s */
"vfmdb %%v31,%%v27,%%v0\n\t"
"vfmdb %%v23,%%v27,%%v1\n\t" /* yn=x*s */
/* 2nd parts */
"vfmadb %%v28,%%v16,%%v1,%%v28\n\t"
"vfmsdb %%v20,%%v16,%%v0,%%v20\n\t" /* yn=y*c-yn */
"vfmadb %%v29,%%v17,%%v1,%%v29\n\t"
"vfmsdb %%v21,%%v17,%%v0,%%v21\n\t" /* yn=y*c-yn */
"vfmadb %%v30,%%v18,%%v1,%%v30\n\t"
"vfmsdb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */
"vfmadb %%v31,%%v19,%%v1,%%v31\n\t"
"vfmsdb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */
"vst %%v28, 0(%%r1,%[x])\n\t"
"vst %%v29, 16(%%r1,%[x])\n\t"
"vst %%v30, 32(%%r1,%[x])\n\t"
"vst %%v31, 48(%%r1,%[x])\n\t"
"vst %%v20, 0(%%r1,%[y])\n\t"
"vst %%v21, 16(%%r1,%[y])\n\t"
"vst %%v22, 32(%%r1,%[y])\n\t"
"vst %%v23, 48(%%r1,%[y])\n\t"
"vl %%v24, 64(%%r1,%[x])\n\t"
"vl %%v25, 80(%%r1,%[x])\n\t"
"vl %%v26, 96(%%r1,%[x])\n\t"
"vl %%v27, 112(%%r1,%[x])\n\t"
"vl %%v16, 64(%%r1,%[y])\n\t"
"vl %%v17, 80(%%r1,%[y])\n\t"
"vl %%v18, 96(%%r1,%[y])\n\t"
"vl %%v19, 112(%%r1,%[y])\n\t"
"vfmdb %%v28,%%v24,%%v0\n\t"
"vfmdb %%v29,%%v25,%%v0\n\t"
"vfmdb %%v20,%%v24,%%v1\n\t" /* yn=x*s */
"vfmdb %%v21,%%v25,%%v1\n\t" /* yn=x*s */
"vfmdb %%v30,%%v26,%%v0\n\t"
"vfmdb %%v22,%%v26,%%v1\n\t" /* yn=x*s */
"vfmdb %%v31,%%v27,%%v0\n\t"
"vfmdb %%v23,%%v27,%%v1\n\t" /* yn=x*s */
/* 2nd parts */
"vfmadb %%v28,%%v16,%%v1,%%v28\n\t"
"vfmsdb %%v20,%%v16,%%v0,%%v20\n\t" /* yn=y*c-yn */
"vfmadb %%v29,%%v17,%%v1,%%v29\n\t"
"vfmsdb %%v21,%%v17,%%v0,%%v21\n\t" /* yn=y*c-yn */
"vfmadb %%v30,%%v18,%%v1,%%v30\n\t"
"vfmsdb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */
"vfmadb %%v31,%%v19,%%v1,%%v31\n\t"
"vfmsdb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */
"vst %%v28, 64(%%r1,%[x])\n\t"
"vst %%v29, 80(%%r1,%[x])\n\t"
"vst %%v30, 96(%%r1,%[x])\n\t"
"vst %%v31, 112(%%r1,%[x])\n\t"
"vst %%v20, 64(%%r1,%[y])\n\t"
"vst %%v21, 80(%%r1,%[y])\n\t"
"vst %%v22, 96(%%r1,%[y])\n\t"
"vst %%v23, 112(%%r1,%[y])\n\t"
"vl %%v24, 128(%%r1,%[x])\n\t"
"vl %%v25, 144(%%r1,%[x])\n\t"
"vl %%v26, 160(%%r1,%[x])\n\t"
"vl %%v27, 176(%%r1,%[x])\n\t"
"vl %%v16, 128(%%r1,%[y])\n\t"
"vl %%v17, 144(%%r1,%[y])\n\t"
"vl %%v18, 160(%%r1,%[y])\n\t"
"vl %%v19, 176(%%r1,%[y])\n\t"
"vfmdb %%v28,%%v24,%%v0\n\t"
"vfmdb %%v29,%%v25,%%v0\n\t"
"vfmdb %%v20,%%v24,%%v1\n\t" /* yn=x*s */
"vfmdb %%v21,%%v25,%%v1\n\t" /* yn=x*s */
"vfmdb %%v30,%%v26,%%v0\n\t"
"vfmdb %%v22,%%v26,%%v1\n\t" /* yn=x*s */
"vfmdb %%v31,%%v27,%%v0\n\t"
"vfmdb %%v23,%%v27,%%v1\n\t" /* yn=x*s */
/* 2nd parts */
"vfmadb %%v28,%%v16,%%v1,%%v28\n\t"
"vfmsdb %%v20,%%v16,%%v0,%%v20\n\t" /* yn=y*c-yn */
"vfmadb %%v29,%%v17,%%v1,%%v29\n\t"
"vfmsdb %%v21,%%v17,%%v0,%%v21\n\t" /* yn=y*c-yn */
"vfmadb %%v30,%%v18,%%v1,%%v30\n\t"
"vfmsdb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */
"vfmadb %%v31,%%v19,%%v1,%%v31\n\t"
"vfmsdb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */
"vst %%v28, 128(%%r1,%[x])\n\t"
"vst %%v29, 144(%%r1,%[x])\n\t"
"vst %%v30, 160(%%r1,%[x])\n\t"
"vst %%v31, 176(%%r1,%[x])\n\t"
"vst %%v20, 128(%%r1,%[y])\n\t"
"vst %%v21, 144(%%r1,%[y])\n\t"
"vst %%v22, 160(%%r1,%[y])\n\t"
"vst %%v23, 176(%%r1,%[y])\n\t"
"vl %%v24, 192(%%r1,%[x])\n\t"
"vl %%v25, 208(%%r1,%[x])\n\t"
"vl %%v26, 224(%%r1,%[x])\n\t"
"vl %%v27, 240(%%r1,%[x])\n\t"
"vl %%v16, 192(%%r1,%[y])\n\t"
"vl %%v17, 208(%%r1,%[y])\n\t"
"vl %%v18, 224(%%r1,%[y])\n\t"
"vl %%v19, 240(%%r1,%[y])\n\t"
"vfmdb %%v28,%%v24,%%v0\n\t"
"vfmdb %%v29,%%v25,%%v0\n\t"
"vfmdb %%v20,%%v24,%%v1\n\t" /* yn=x*s */
"vfmdb %%v21,%%v25,%%v1\n\t" /* yn=x*s */
"vfmdb %%v30,%%v26,%%v0\n\t"
"vfmdb %%v22,%%v26,%%v1\n\t" /* yn=x*s */
"vfmdb %%v31,%%v27,%%v0\n\t"
"vfmdb %%v23,%%v27,%%v1\n\t" /* yn=x*s */
/* 2nd parts */
"vfmadb %%v28,%%v16,%%v1,%%v28\n\t"
"vfmsdb %%v20,%%v16,%%v0,%%v20\n\t" /* yn=y*c-yn */
"vfmadb %%v29,%%v17,%%v1,%%v29\n\t"
"vfmsdb %%v21,%%v17,%%v0,%%v21\n\t" /* yn=y*c-yn */
"vfmadb %%v30,%%v18,%%v1,%%v30\n\t"
"vfmsdb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */
"vfmadb %%v31,%%v19,%%v1,%%v31\n\t"
"vfmsdb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */
"vst %%v28, 192(%%r1,%[x])\n\t"
"vst %%v29, 208(%%r1,%[x])\n\t"
"vst %%v30, 224(%%r1,%[x])\n\t"
"vst %%v31, 240(%%r1,%[x])\n\t"
"vst %%v20, 192(%%r1,%[y])\n\t"
"vst %%v21, 208(%%r1,%[y])\n\t"
"vst %%v22, 224(%%r1,%[y])\n\t"
"vst %%v23, 240(%%r1,%[y])\n\t"
"agfi %%r1,256\n\t"
"brctg %[n],0b"
: "+m"(*(struct { FLOAT x[n]; } *) x), "+m"(*(struct { FLOAT x[n]; } *) y),
[n] "+&r"(n)
: [x] "a"(x),[y] "a"(y),[c] "Q"(*c),[s] "Q"(*s)
: "cc", "r1", "v0", "v1", "v16", "v17", "v18", "v19", "v20", "v21",
"v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30",
"v31");
}
int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y,

View File

@ -29,61 +29,61 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
/*
 * dscal_kernel_16: scales x in place by da (x = x * da), using s390x vector
 * inline assembly that processes 128 bytes (16 eight-byte elements) per loop
 * iteration.
 *
 * n:  element count.  The asm shifts it right by 4 to obtain the iteration
 *     count and then runs a count-down loop, so presumably callers pass a
 *     positive multiple of 16 -- TODO confirm against the caller.
 * da: scale factor; it is read from memory and replicated across v0.
 * x:  vector that is both read and written ("+m" operand).
 *
 * NOTE(review): this listing is a rendered diff with the +/- markers
 * stripped.  After the shared first asm line, the pre-change asm body runs up
 * to the first `"v31");`; the post-change body follows it.  The visible
 * differences are the cast on the "+m" operand (`FLOAT (*)[n]` before,
 * `struct { FLOAT x[n]; } *` after) and the constraint on da ("m" before,
 * "Q" after).
 */
static void dscal_kernel_16(BLASLONG n, FLOAT da, FLOAT *x) {
/* v0 = da in every lane. */
__asm__("vlrepg %%v0,%[da]\n\t"
/* n >>= 4 gives the iteration count; r1 = 0 is the byte offset into x. */
"srlg %[n],%[n],4\n\t"
"xgr %%r1,%%r1\n\t"
"0:\n\t"
/* Prefetch hint 1024 bytes ahead of the current offset. */
"pfd 2, 1024(%%r1,%[x])\n\t"
/* Eight load / multiply-by-v0 / store triples, one per 16-byte chunk. */
"vl %%v24,0(%%r1,%[x])\n\t"
"vfmdb %%v24,%%v24,%%v0\n\t"
"vst %%v24,0(%%r1,%[x])\n\t"
"vl %%v25,16(%%r1,%[x])\n\t"
"vfmdb %%v25,%%v25,%%v0\n\t"
"vst %%v25,16(%%r1,%[x])\n\t"
"vl %%v26,32(%%r1,%[x])\n\t"
"vfmdb %%v26,%%v26,%%v0\n\t"
"vst %%v26,32(%%r1,%[x])\n\t"
"vl %%v27,48(%%r1,%[x])\n\t"
"vfmdb %%v27,%%v27,%%v0\n\t"
"vst %%v27,48(%%r1,%[x])\n\t"
"vl %%v28,64(%%r1,%[x])\n\t"
"vfmdb %%v28,%%v28,%%v0\n\t"
"vst %%v28,64(%%r1,%[x])\n\t"
"vl %%v29,80(%%r1,%[x])\n\t"
"vfmdb %%v29,%%v29,%%v0\n\t"
"vst %%v29,80(%%r1,%[x])\n\t"
"vl %%v30,96(%%r1,%[x])\n\t"
"vfmdb %%v30,%%v30,%%v0\n\t"
"vst %%v30,96(%%r1,%[x])\n\t"
"vl %%v31,112(%%r1,%[x])\n\t"
"vfmdb %%v31,%%v31,%%v0\n\t"
"vst %%v31,112(%%r1,%[x])\n\t"
/* Advance the byte offset by 128 and loop while the count is non-zero. */
"agfi %%r1,128\n\t"
"brctg %[n],0b"
/*
 * Operands: x memory is read-write; n is read-write because srlg/brctg
 * modify it.
 */
: "+m"(*(FLOAT (*)[n]) x),[n] "+&r"(n)
: [x] "a"(x),[da] "m"(da)
: "cc", "r1", "v0", "v24", "v25", "v26", "v27", "v28", "v29", "v30",
"v31");
/* Post-change side of the diff: same instruction sequence, new operands. */
"srlg %[n],%[n],4\n\t"
"xgr %%r1,%%r1\n\t"
"0:\n\t"
"pfd 2, 1024(%%r1,%[x])\n\t"
"vl %%v24,0(%%r1,%[x])\n\t"
"vfmdb %%v24,%%v24,%%v0\n\t"
"vst %%v24,0(%%r1,%[x])\n\t"
"vl %%v25,16(%%r1,%[x])\n\t"
"vfmdb %%v25,%%v25,%%v0\n\t"
"vst %%v25,16(%%r1,%[x])\n\t"
"vl %%v26,32(%%r1,%[x])\n\t"
"vfmdb %%v26,%%v26,%%v0\n\t"
"vst %%v26,32(%%r1,%[x])\n\t"
"vl %%v27,48(%%r1,%[x])\n\t"
"vfmdb %%v27,%%v27,%%v0\n\t"
"vst %%v27,48(%%r1,%[x])\n\t"
"vl %%v28,64(%%r1,%[x])\n\t"
"vfmdb %%v28,%%v28,%%v0\n\t"
"vst %%v28,64(%%r1,%[x])\n\t"
"vl %%v29,80(%%r1,%[x])\n\t"
"vfmdb %%v29,%%v29,%%v0\n\t"
"vst %%v29,80(%%r1,%[x])\n\t"
"vl %%v30,96(%%r1,%[x])\n\t"
"vfmdb %%v30,%%v30,%%v0\n\t"
"vst %%v30,96(%%r1,%[x])\n\t"
"vl %%v31,112(%%r1,%[x])\n\t"
"vfmdb %%v31,%%v31,%%v0\n\t"
"vst %%v31,112(%%r1,%[x])\n\t"
"agfi %%r1,128\n\t"
"brctg %[n],0b"
: "+m"(*(struct { FLOAT x[n]; } *) x),[n] "+&r"(n)
: [x] "a"(x),[da] "Q"(da)
: "cc", "r1", "v0", "v24", "v25", "v26", "v27", "v28", "v29", "v30",
"v31");
}
/*
 * dscal_kernel_16_zero: overwrites x with zeros, using s390x vector inline
 * assembly that stores 128 bytes (16 eight-byte elements) per loop
 * iteration.  x is never loaded; it is a write-only "=m" operand.
 *
 * n: element count.  The asm shifts it right by 4 to obtain the iteration
 *    count and then runs a count-down loop, so presumably callers pass a
 *    positive multiple of 16 -- TODO confirm against the caller.
 * x: destination vector.
 *
 * NOTE(review): this listing is a rendered diff with the +/- markers
 * stripped.  After the shared first asm line, the pre-change asm body runs up
 * to the first `"v0");`; the post-change body follows it.  The visible
 * difference is the cast on the "=m" operand (`FLOAT (*)[n]` before,
 * `struct { FLOAT x[n]; } *` after).
 */
static void dscal_kernel_16_zero(BLASLONG n, FLOAT *x) {
/* v0 = all zero bits; it is the only value ever stored. */
__asm__("vzero %%v0\n\t"
/* n >>= 4 gives the iteration count; r1 = 0 is the byte offset into x. */
"srlg %[n],%[n],4\n\t"
"xgr %%r1,%%r1\n\t"
"0:\n\t"
/* Prefetch hint 1024 bytes ahead of the current offset. */
"pfd 2, 1024(%%r1,%[x])\n\t"
/* Eight 16-byte stores cover the 128-byte chunk. */
"vst %%v0,0(%%r1,%[x])\n\t"
"vst %%v0,16(%%r1,%[x])\n\t"
"vst %%v0,32(%%r1,%[x])\n\t"
"vst %%v0,48(%%r1,%[x])\n\t"
"vst %%v0,64(%%r1,%[x])\n\t"
"vst %%v0,80(%%r1,%[x])\n\t"
"vst %%v0,96(%%r1,%[x])\n\t"
"vst %%v0,112(%%r1,%[x])\n\t"
/* Advance the byte offset by 128 and loop while the count is non-zero. */
"agfi %%r1,128\n\t"
"brctg %[n],0b"
: "=m"(*(FLOAT (*)[n]) x),[n] "+&r"(n)
: [x] "a"(x)
: "cc", "r1", "v0");
/* Post-change side of the diff: same instruction sequence, new "=m" cast. */
"srlg %[n],%[n],4\n\t"
"xgr %%r1,%%r1\n\t"
"0:\n\t"
"pfd 2, 1024(%%r1,%[x])\n\t"
"vst %%v0,0(%%r1,%[x])\n\t"
"vst %%v0,16(%%r1,%[x])\n\t"
"vst %%v0,32(%%r1,%[x])\n\t"
"vst %%v0,48(%%r1,%[x])\n\t"
"vst %%v0,64(%%r1,%[x])\n\t"
"vst %%v0,80(%%r1,%[x])\n\t"
"vst %%v0,96(%%r1,%[x])\n\t"
"vst %%v0,112(%%r1,%[x])\n\t"
"agfi %%r1,128\n\t"
"brctg %[n],0b"
: "=m"(*(struct { FLOAT x[n]; } *) x),[n] "+&r"(n)
: [x] "a"(x)
: "cc", "r1", "v0");
}
int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x,

View File

/*
 * dsdot_kernel_16: dot product of x and y accumulated in double precision,
 * using s390x vector inline assembly that consumes 64 bytes (16 four-byte
 * elements) of each vector per loop iteration.
 *
 * n: element count.  The asm shifts it right by 4 to obtain the iteration
 *    count and then runs a count-down loop, so presumably callers pass a
 *    positive multiple of 16 -- TODO confirm against the caller.
 * x, y: input vectors; both are only read (input "m" operands).
 * Returns: the double copied out of f0 after all partial sums are added.
 *
 * NOTE(review): this listing is a rendered diff with the +/- markers
 * stripped.  After the shared first asm line, the pre-change asm body runs up
 * to the first `"v31");`; the post-change body follows it.  The visible
 * difference is the cast on the two "m" input operands
 * (`const FLOAT (*)[n]` before, `const struct { FLOAT x[n]; } *` after).
 */
@ -31,91 +31,92 @@ static double dsdot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y) {
double dot;
/* v0-v7 are eight independent accumulators, cleared before the loop. */
__asm__("vzero %%v0\n\t"
"vzero %%v1\n\t"
"vzero %%v2\n\t"
"vzero %%v3\n\t"
"vzero %%v4\n\t"
"vzero %%v5\n\t"
"vzero %%v6\n\t"
"vzero %%v7\n\t"
/* n >>= 4 gives the iteration count; r1 = 0 is the byte offset. */
"srlg %[n],%[n],4\n\t"
"xgr %%r1,%%r1\n\t"
"0:\n\t"
/* Prefetch hints 1024 bytes ahead for both vectors. */
"pfd 1,1024(%%r1,%[x])\n\t"
"pfd 1,1024(%%r1,%[y])\n\t"
/*
 * Load two consecutive 4-byte x elements into word slots 0 and 2 of each of
 * v16-v23, leaving room for each to be widened to 8 bytes.
 */
"vlef %%v16,0(%%r1,%[x]),0\n\t"
"vlef %%v16,4(%%r1,%[x]),2\n\t"
"vlef %%v17,8(%%r1,%[x]),0\n\t"
"vlef %%v17,12(%%r1,%[x]),2\n\t"
"vlef %%v18,16(%%r1,%[x]),0\n\t"
"vlef %%v18,20(%%r1,%[x]),2\n\t"
"vlef %%v19,24(%%r1,%[x]),0\n\t"
"vlef %%v19,28(%%r1,%[x]),2\n\t"
"vlef %%v20,32(%%r1,%[x]),0\n\t"
"vlef %%v20,36(%%r1,%[x]),2\n\t"
"vlef %%v21,40(%%r1,%[x]),0\n\t"
"vlef %%v21,44(%%r1,%[x]),2\n\t"
"vlef %%v22,48(%%r1,%[x]),0\n\t"
"vlef %%v22,52(%%r1,%[x]),2\n\t"
"vlef %%v23,56(%%r1,%[x]),0\n\t"
"vlef %%v23,60(%%r1,%[x]),2\n\t"
/* Widen the x values from single to double precision in place. */
"vflls %%v16,%%v16\n\t"
"vflls %%v17,%%v17\n\t"
"vflls %%v18,%%v18\n\t"
"vflls %%v19,%%v19\n\t"
"vflls %%v20,%%v20\n\t"
"vflls %%v21,%%v21\n\t"
"vflls %%v22,%%v22\n\t"
"vflls %%v23,%%v23\n\t"
/*
 * For each pair of y elements: load, widen, then fused multiply-add into
 * the matching accumulator (acc = x * y + acc).
 */
"vlef %%v24,0(%%r1,%[y]),0\n\t"
"vlef %%v24,4(%%r1,%[y]),2\n\t"
"vflls %%v24,%%v24\n\t"
"vfmadb %%v0,%%v16,%%v24,%%v0\n\t"
"vlef %%v25,8(%%r1,%[y]),0\n\t"
"vlef %%v25,12(%%r1,%[y]),2\n\t"
"vflls %%v25,%%v25\n\t"
"vfmadb %%v1,%%v17,%%v25,%%v1\n\t"
"vlef %%v26,16(%%r1,%[y]),0\n\t"
"vlef %%v26,20(%%r1,%[y]),2\n\t"
"vflls %%v26,%%v26\n\t"
"vfmadb %%v2,%%v18,%%v26,%%v2\n\t"
"vlef %%v27,24(%%r1,%[y]),0\n\t"
"vlef %%v27,28(%%r1,%[y]),2\n\t"
"vflls %%v27,%%v27\n\t"
"vfmadb %%v3,%%v19,%%v27,%%v3\n\t"
"vlef %%v28,32(%%r1,%[y]),0\n\t"
"vlef %%v28,36(%%r1,%[y]),2\n\t"
"vflls %%v28,%%v28\n\t"
"vfmadb %%v4,%%v20,%%v28,%%v4\n\t"
"vlef %%v29,40(%%r1,%[y]),0\n\t"
"vlef %%v29,44(%%r1,%[y]),2\n\t"
"vflls %%v29,%%v29\n\t"
"vfmadb %%v5,%%v21,%%v29,%%v5\n\t"
"vlef %%v30,48(%%r1,%[y]),0\n\t"
"vlef %%v30,52(%%r1,%[y]),2\n\t"
"vflls %%v30,%%v30\n\t"
"vfmadb %%v6,%%v22,%%v30,%%v6\n\t"
"vlef %%v31,56(%%r1,%[y]),0\n\t"
"vlef %%v31,60(%%r1,%[y]),2\n\t"
"vflls %%v31,%%v31\n\t"
"vfmadb %%v7,%%v23,%%v31,%%v7\n\t"
/* Advance the byte offset by 64 and loop while the count is non-zero. */
"agfi %%r1,64\n\t"
"brctg %[n],0b\n\t"
/* Sum the eight accumulators into v0. */
"vfadb %%v0,%%v0,%%v1\n\t"
"vfadb %%v0,%%v0,%%v2\n\t"
"vfadb %%v0,%%v0,%%v3\n\t"
"vfadb %%v0,%%v0,%%v4\n\t"
"vfadb %%v0,%%v0,%%v5\n\t"
"vfadb %%v0,%%v0,%%v6\n\t"
"vfadb %%v0,%%v0,%%v7\n\t"
/* Add lane 1 of v0 to lane 0 (f0) to produce the scalar result. */
"vrepg %%v1,%%v0,1\n\t"
"adbr %%f0,%%f1\n\t"
"ldr %[dot],%%f0"
/*
 * Operands: n is read-write because srlg/brctg modify it; the two "m"
 * inputs tell the compiler the asm reads the memory behind x and y.
 */
: [dot] "=f"(dot),[n] "+&r"(n)
: "m"(*(const FLOAT (*)[n]) x),[x] "a"(x), "m"(*(const FLOAT (*)[n]) y),[y] "a"(y)
: "cc", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16",
"v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26",
"v27", "v28", "v29", "v30", "v31");
/* Post-change side of the diff: same instruction sequence, new "m" casts. */
"vzero %%v1\n\t"
"vzero %%v2\n\t"
"vzero %%v3\n\t"
"vzero %%v4\n\t"
"vzero %%v5\n\t"
"vzero %%v6\n\t"
"vzero %%v7\n\t"
"srlg %[n],%[n],4\n\t"
"xgr %%r1,%%r1\n\t"
"0:\n\t"
"pfd 1,1024(%%r1,%[x])\n\t"
"pfd 1,1024(%%r1,%[y])\n\t"
"vlef %%v16,0(%%r1,%[x]),0\n\t"
"vlef %%v16,4(%%r1,%[x]),2\n\t"
"vlef %%v17,8(%%r1,%[x]),0\n\t"
"vlef %%v17,12(%%r1,%[x]),2\n\t"
"vlef %%v18,16(%%r1,%[x]),0\n\t"
"vlef %%v18,20(%%r1,%[x]),2\n\t"
"vlef %%v19,24(%%r1,%[x]),0\n\t"
"vlef %%v19,28(%%r1,%[x]),2\n\t"
"vlef %%v20,32(%%r1,%[x]),0\n\t"
"vlef %%v20,36(%%r1,%[x]),2\n\t"
"vlef %%v21,40(%%r1,%[x]),0\n\t"
"vlef %%v21,44(%%r1,%[x]),2\n\t"
"vlef %%v22,48(%%r1,%[x]),0\n\t"
"vlef %%v22,52(%%r1,%[x]),2\n\t"
"vlef %%v23,56(%%r1,%[x]),0\n\t"
"vlef %%v23,60(%%r1,%[x]),2\n\t"
"vflls %%v16,%%v16\n\t"
"vflls %%v17,%%v17\n\t"
"vflls %%v18,%%v18\n\t"
"vflls %%v19,%%v19\n\t"
"vflls %%v20,%%v20\n\t"
"vflls %%v21,%%v21\n\t"
"vflls %%v22,%%v22\n\t"
"vflls %%v23,%%v23\n\t"
"vlef %%v24,0(%%r1,%[y]),0\n\t"
"vlef %%v24,4(%%r1,%[y]),2\n\t"
"vflls %%v24,%%v24\n\t"
"vfmadb %%v0,%%v16,%%v24,%%v0\n\t"
"vlef %%v25,8(%%r1,%[y]),0\n\t"
"vlef %%v25,12(%%r1,%[y]),2\n\t"
"vflls %%v25,%%v25\n\t"
"vfmadb %%v1,%%v17,%%v25,%%v1\n\t"
"vlef %%v26,16(%%r1,%[y]),0\n\t"
"vlef %%v26,20(%%r1,%[y]),2\n\t"
"vflls %%v26,%%v26\n\t"
"vfmadb %%v2,%%v18,%%v26,%%v2\n\t"
"vlef %%v27,24(%%r1,%[y]),0\n\t"
"vlef %%v27,28(%%r1,%[y]),2\n\t"
"vflls %%v27,%%v27\n\t"
"vfmadb %%v3,%%v19,%%v27,%%v3\n\t"
"vlef %%v28,32(%%r1,%[y]),0\n\t"
"vlef %%v28,36(%%r1,%[y]),2\n\t"
"vflls %%v28,%%v28\n\t"
"vfmadb %%v4,%%v20,%%v28,%%v4\n\t"
"vlef %%v29,40(%%r1,%[y]),0\n\t"
"vlef %%v29,44(%%r1,%[y]),2\n\t"
"vflls %%v29,%%v29\n\t"
"vfmadb %%v5,%%v21,%%v29,%%v5\n\t"
"vlef %%v30,48(%%r1,%[y]),0\n\t"
"vlef %%v30,52(%%r1,%[y]),2\n\t"
"vflls %%v30,%%v30\n\t"
"vfmadb %%v6,%%v22,%%v30,%%v6\n\t"
"vlef %%v31,56(%%r1,%[y]),0\n\t"
"vlef %%v31,60(%%r1,%[y]),2\n\t"
"vflls %%v31,%%v31\n\t"
"vfmadb %%v7,%%v23,%%v31,%%v7\n\t"
"agfi %%r1,64\n\t"
"brctg %[n],0b\n\t"
"vfadb %%v0,%%v0,%%v1\n\t"
"vfadb %%v0,%%v0,%%v2\n\t"
"vfadb %%v0,%%v0,%%v3\n\t"
"vfadb %%v0,%%v0,%%v4\n\t"
"vfadb %%v0,%%v0,%%v5\n\t"
"vfadb %%v0,%%v0,%%v6\n\t"
"vfadb %%v0,%%v0,%%v7\n\t"
"vrepg %%v1,%%v0,1\n\t"
"adbr %%f0,%%f1\n\t"
"ldr %[dot],%%f0"
: [dot] "=f"(dot),[n] "+&r"(n)
: "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x),
"m"(*(const struct { FLOAT x[n]; } *) y),[y] "a"(y)
: "cc", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16",
"v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26",
"v27", "v28", "v29", "v30", "v31");
return dot;
}

View File

@ -29,81 +29,82 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
/*
 * dswap_kernel_32: exchanges the contents of x and y, using s390x vector
 * inline assembly that moves 256 bytes (32 eight-byte elements) of each
 * vector per loop iteration.
 *
 * n: element count.  The asm shifts it right by 5 to obtain the iteration
 *    count and then runs a count-down loop, so presumably callers pass a
 *    positive multiple of 32 -- TODO confirm against the caller.
 * x, y: vectors that are both read and written ("+m" operands).
 *
 * NOTE(review): this listing is a rendered diff with the +/- markers
 * stripped.  After the shared first asm line, the pre-change asm body runs up
 * to the first `"v31");`; the post-change body follows it.  The visible
 * difference is the cast on the "+m" operands (`FLOAT (*)[n]` before,
 * `struct { FLOAT x[n]; } *` after).
 */
static void dswap_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y) {
/* n >>= 5 gives the iteration count; r1 = 0 is the byte offset. */
__asm__("srlg %[n],%[n],5\n\t"
"xgr %%r1,%%r1\n\t"
"0:\n\t"
/* Prefetch hints 1024 bytes ahead for both vectors. */
"pfd 2, 1024(%%r1,%[x])\n\t"
"pfd 2, 1024(%%r1,%[y])\n\t"
/* Save the whole 256-byte x chunk in v16-v31 before x is overwritten. */
"vl %%v16, 0(%%r1,%[x])\n\t"
"vl %%v17, 16(%%r1,%[x])\n\t"
"vl %%v18, 32(%%r1,%[x])\n\t"
"vl %%v19, 48(%%r1,%[x])\n\t"
"vl %%v20, 64(%%r1,%[x])\n\t"
"vl %%v21, 80(%%r1,%[x])\n\t"
"vl %%v22, 96(%%r1,%[x])\n\t"
"vl %%v23, 112(%%r1,%[x])\n\t"
"vl %%v24, 128(%%r1,%[x])\n\t"
"vl %%v25, 144(%%r1,%[x])\n\t"
"vl %%v26, 160(%%r1,%[x])\n\t"
"vl %%v27, 176(%%r1,%[x])\n\t"
"vl %%v28, 192(%%r1,%[x])\n\t"
"vl %%v29, 208(%%r1,%[x])\n\t"
"vl %%v30, 224(%%r1,%[x])\n\t"
"vl %%v31, 240(%%r1,%[x])\n\t"
/* Copy y -> x through v0-v7, first the low 128 bytes ... */
"vl %%v0, 0(%%r1,%[y])\n\t"
"vl %%v1, 16(%%r1,%[y])\n\t"
"vl %%v2, 32(%%r1,%[y])\n\t"
"vl %%v3, 48(%%r1,%[y])\n\t"
"vl %%v4, 64(%%r1,%[y])\n\t"
"vl %%v5, 80(%%r1,%[y])\n\t"
"vl %%v6, 96(%%r1,%[y])\n\t"
"vl %%v7, 112(%%r1,%[y])\n\t"
"vst %%v0, 0(%%r1,%[x])\n\t"
"vst %%v1, 16(%%r1,%[x])\n\t"
"vst %%v2, 32(%%r1,%[x])\n\t"
"vst %%v3, 48(%%r1,%[x])\n\t"
"vst %%v4, 64(%%r1,%[x])\n\t"
"vst %%v5, 80(%%r1,%[x])\n\t"
"vst %%v6, 96(%%r1,%[x])\n\t"
"vst %%v7, 112(%%r1,%[x])\n\t"
/* ... then the high 128 bytes, reusing the same eight registers. */
"vl %%v0, 128(%%r1,%[y])\n\t"
"vl %%v1, 144(%%r1,%[y])\n\t"
"vl %%v2, 160(%%r1,%[y])\n\t"
"vl %%v3, 176(%%r1,%[y])\n\t"
"vl %%v4, 192(%%r1,%[y])\n\t"
"vl %%v5, 208(%%r1,%[y])\n\t"
"vl %%v6, 224(%%r1,%[y])\n\t"
"vl %%v7, 240(%%r1,%[y])\n\t"
"vst %%v0, 128(%%r1,%[x])\n\t"
"vst %%v1, 144(%%r1,%[x])\n\t"
"vst %%v2, 160(%%r1,%[x])\n\t"
"vst %%v3, 176(%%r1,%[x])\n\t"
"vst %%v4, 192(%%r1,%[x])\n\t"
"vst %%v5, 208(%%r1,%[x])\n\t"
"vst %%v6, 224(%%r1,%[x])\n\t"
"vst %%v7, 240(%%r1,%[x])\n\t"
/* Finally write the saved x chunk into y. */
"vst %%v16, 0(%%r1,%[y])\n\t"
"vst %%v17, 16(%%r1,%[y])\n\t"
"vst %%v18, 32(%%r1,%[y])\n\t"
"vst %%v19, 48(%%r1,%[y])\n\t"
"vst %%v20, 64(%%r1,%[y])\n\t"
"vst %%v21, 80(%%r1,%[y])\n\t"
"vst %%v22, 96(%%r1,%[y])\n\t"
"vst %%v23, 112(%%r1,%[y])\n\t"
"vst %%v24, 128(%%r1,%[y])\n\t"
"vst %%v25, 144(%%r1,%[y])\n\t"
"vst %%v26, 160(%%r1,%[y])\n\t"
"vst %%v27, 176(%%r1,%[y])\n\t"
"vst %%v28, 192(%%r1,%[y])\n\t"
"vst %%v29, 208(%%r1,%[y])\n\t"
"vst %%v30, 224(%%r1,%[y])\n\t"
"vst %%v31, 240(%%r1,%[y])\n\t"
/* Advance the byte offset by 256 and loop while the count is non-zero. */
"agfi %%r1,256\n\t"
"brctg %[n],0b"
/*
 * Operands: x and y memory is read-write; n is read-write because
 * srlg/brctg modify it.
 */
: "+m"(*(FLOAT (*)[n]) x), "+m"(*(FLOAT (*)[n]) y),[n] "+&r"(n)
: [x] "a"(x),[y] "a"(y)
: "cc", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16",
"v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26",
"v27", "v28", "v29", "v30", "v31");
/* Post-change side of the diff: same instruction sequence, new "+m" casts. */
"xgr %%r1,%%r1\n\t"
"0:\n\t"
"pfd 2, 1024(%%r1,%[x])\n\t"
"pfd 2, 1024(%%r1,%[y])\n\t"
"vl %%v16, 0(%%r1,%[x])\n\t"
"vl %%v17, 16(%%r1,%[x])\n\t"
"vl %%v18, 32(%%r1,%[x])\n\t"
"vl %%v19, 48(%%r1,%[x])\n\t"
"vl %%v20, 64(%%r1,%[x])\n\t"
"vl %%v21, 80(%%r1,%[x])\n\t"
"vl %%v22, 96(%%r1,%[x])\n\t"
"vl %%v23, 112(%%r1,%[x])\n\t"
"vl %%v24, 128(%%r1,%[x])\n\t"
"vl %%v25, 144(%%r1,%[x])\n\t"
"vl %%v26, 160(%%r1,%[x])\n\t"
"vl %%v27, 176(%%r1,%[x])\n\t"
"vl %%v28, 192(%%r1,%[x])\n\t"
"vl %%v29, 208(%%r1,%[x])\n\t"
"vl %%v30, 224(%%r1,%[x])\n\t"
"vl %%v31, 240(%%r1,%[x])\n\t"
"vl %%v0, 0(%%r1,%[y])\n\t"
"vl %%v1, 16(%%r1,%[y])\n\t"
"vl %%v2, 32(%%r1,%[y])\n\t"
"vl %%v3, 48(%%r1,%[y])\n\t"
"vl %%v4, 64(%%r1,%[y])\n\t"
"vl %%v5, 80(%%r1,%[y])\n\t"
"vl %%v6, 96(%%r1,%[y])\n\t"
"vl %%v7, 112(%%r1,%[y])\n\t"
"vst %%v0, 0(%%r1,%[x])\n\t"
"vst %%v1, 16(%%r1,%[x])\n\t"
"vst %%v2, 32(%%r1,%[x])\n\t"
"vst %%v3, 48(%%r1,%[x])\n\t"
"vst %%v4, 64(%%r1,%[x])\n\t"
"vst %%v5, 80(%%r1,%[x])\n\t"
"vst %%v6, 96(%%r1,%[x])\n\t"
"vst %%v7, 112(%%r1,%[x])\n\t"
"vl %%v0, 128(%%r1,%[y])\n\t"
"vl %%v1, 144(%%r1,%[y])\n\t"
"vl %%v2, 160(%%r1,%[y])\n\t"
"vl %%v3, 176(%%r1,%[y])\n\t"
"vl %%v4, 192(%%r1,%[y])\n\t"
"vl %%v5, 208(%%r1,%[y])\n\t"
"vl %%v6, 224(%%r1,%[y])\n\t"
"vl %%v7, 240(%%r1,%[y])\n\t"
"vst %%v0, 128(%%r1,%[x])\n\t"
"vst %%v1, 144(%%r1,%[x])\n\t"
"vst %%v2, 160(%%r1,%[x])\n\t"
"vst %%v3, 176(%%r1,%[x])\n\t"
"vst %%v4, 192(%%r1,%[x])\n\t"
"vst %%v5, 208(%%r1,%[x])\n\t"
"vst %%v6, 224(%%r1,%[x])\n\t"
"vst %%v7, 240(%%r1,%[x])\n\t"
"vst %%v16, 0(%%r1,%[y])\n\t"
"vst %%v17, 16(%%r1,%[y])\n\t"
"vst %%v18, 32(%%r1,%[y])\n\t"
"vst %%v19, 48(%%r1,%[y])\n\t"
"vst %%v20, 64(%%r1,%[y])\n\t"
"vst %%v21, 80(%%r1,%[y])\n\t"
"vst %%v22, 96(%%r1,%[y])\n\t"
"vst %%v23, 112(%%r1,%[y])\n\t"
"vst %%v24, 128(%%r1,%[y])\n\t"
"vst %%v25, 144(%%r1,%[y])\n\t"
"vst %%v26, 160(%%r1,%[y])\n\t"
"vst %%v27, 176(%%r1,%[y])\n\t"
"vst %%v28, 192(%%r1,%[y])\n\t"
"vst %%v29, 208(%%r1,%[y])\n\t"
"vst %%v30, 224(%%r1,%[y])\n\t"
"vst %%v31, 240(%%r1,%[y])\n\t"
"agfi %%r1,256\n\t"
"brctg %[n],0b"
: "+m"(*(struct { FLOAT x[n]; } *) x), "+m"(*(struct { FLOAT x[n]; } *) y),
[n] "+&r"(n)
: [x] "a"(x),[y] "a"(y)
: "cc", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16",
"v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26",
"v27", "v28", "v29", "v30", "v31");
}
int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT *x,

View File

@ -34,191 +34,191 @@ static BLASLONG icamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amax) {
/*
 * icamax_kernel_32: vector inline-assembly search over x for the largest
 * sum of two absolute values.
 *
 * Parameters:
 *   n    - element count; only n >> 5 loop passes are executed and each pass
 *          consumes 256 bytes of x, so a remainder below 32 is not visited
 *          by this kernel.
 *   x    - input data, declared to the compiler as n * 2 FLOATs (see the
 *          "m" input operand).
 *   amax - receives the winning value.
 * Returns: the position recorded for the winning value, counted from 0.
 *
 * NOTE(review): the loads are 4-byte (vlef) and the arithmetic uses the
 * short-format vflpsb/vfasb, so FLOAT is presumably single precision and x
 * presumably holds interleaved (re, im) pairs -- confirm against the caller.
 * NOTE(review): brctg decrements before testing, so a loop count of 0
 * (n < 32) would not fall through after zero passes; callers presumably
 * guarantee n >= 32 -- confirm.
 * NOTE(review): this text is a diff hunk, so the asm statement appears twice
 * below: once with "=m" / array-pointer operands and once with "=Q" /
 * struct-wrapped operands.
 */
BLASLONG iamax;
/* Seed: gather the 4-byte values at offsets 0,8,16,24 into v0 and at
   offsets 4,12,20,28 into v1, take absolute values and add them. */
__asm__("vlef %%v0,0(%[x]),0\n\t"
"vlef %%v1,4(%[x]),0\n\t"
"vlef %%v0,8(%[x]),1\n\t"
"vlef %%v1,12(%[x]),1\n\t"
"vlef %%v0,16(%[x]),2\n\t"
"vlef %%v1,20(%[x]),2\n\t"
"vlef %%v0,24(%[x]),3\n\t"
"vlef %%v1,28(%[x]),3\n\t"
"vflpsb %%v0,%%v0\n\t"
"vflpsb %%v1,%%v1\n\t"
"vfasb %%v0,%%v0,%%v1\n\t"
/* Positions of the seed lanes as 64-bit values: v1 = {0, 2}, v2 = {1, 3}. */
"vleig %%v1,0,0\n\t"
"vleig %%v1,2,1\n\t"
"vleig %%v2,1,0\n\t"
"vleig %%v2,3,1\n\t"
/* v3 = position step added after each half pass (16); v4 = running base (0). */
"vrepig %%v3,16\n\t"
"vzero %%v4\n\t"
/* v9 = vperm byte pattern selecting bytes 0-3 and 8-11 of each source. */
"vleib %%v9,0,0\n\t"
"vleib %%v9,1,1\n\t"
"vleib %%v9,2,2\n\t"
"vleib %%v9,3,3\n\t"
"vleib %%v9,8,4\n\t"
"vleib %%v9,9,5\n\t"
"vleib %%v9,10,6\n\t"
"vleib %%v9,11,7\n\t"
"vleib %%v9,16,8\n\t"
"vleib %%v9,17,9\n\t"
"vleib %%v9,18,10\n\t"
"vleib %%v9,19,11\n\t"
"vleib %%v9,24,12\n\t"
"vleib %%v9,25,13\n\t"
"vleib %%v9,26,14\n\t"
"vleib %%v9,27,15\n\t"
/* v24..v27 = word lanes {0..3}, {4..7}, {8..11}, {12..15}: positions inside
   one 16-element half pass. */
"vleif %%v24,0,0\n\t"
"vleif %%v24,1,1\n\t"
"vleif %%v24,2,2\n\t"
"vleif %%v24,3,3\n\t"
"vleif %%v25,4,0\n\t"
"vleif %%v25,5,1\n\t"
"vleif %%v25,6,2\n\t"
"vleif %%v25,7,3\n\t"
"vleif %%v26,8,0\n\t"
"vleif %%v26,9,1\n\t"
"vleif %%v26,10,2\n\t"
"vleif %%v26,11,3\n\t"
"vleif %%v27,12,0\n\t"
"vleif %%v27,13,1\n\t"
"vleif %%v27,14,2\n\t"
"vleif %%v27,15,3\n\t"
/* Loop count = n >> 5; r1 = byte offset into x, starting at 0. */
"srlg %[n],%[n],5\n\t"
"xgr %%r1,%%r1\n\t"
"0:\n\t"
"pfd 1, 1024(%%r1,%[x])\n\t"
/* First half pass (bytes 0..127): each pair of vectors is split into two
   4-lane vectors -- vperm with v9 takes words 0 and 2 of each source; vpkg
   presumably keeps the other two words (TODO confirm in the ISA manual). */
"vl %%v16,0(%%r1,%[x])\n\t"
"vl %%v28,16(%%r1,%[x])\n\t"
"vpkg %%v17,%%v16,%%v28\n\t"
"vperm %%v16,%%v16,%%v28,%%v9\n\t"
"vl %%v18,32(%%r1,%[x])\n\t"
"vl %%v29,48(%%r1,%[x])\n\t"
"vpkg %%v19,%%v18,%%v29\n\t"
"vperm %%v18,%%v18,%%v29,%%v9\n\t"
"vl %%v20,64(%%r1,%[x])\n\t"
"vl %%v30,80(%%r1,%[x])\n\t"
"vpkg %%v21,%%v20,%%v30\n\t"
"vperm %%v20,%%v20,%%v30,%%v9\n\t"
"vl %%v22,96(%%r1,%[x])\n\t"
"vl %%v31,112(%%r1,%[x])\n\t"
"vpkg %%v23,%%v22,%%v31\n\t"
"vperm %%v22,%%v22,%%v31,%%v9\n\t"
/* Absolute values, then pairwise sums into v16..v19. */
"vflpsb %%v16, %%v16\n\t"
"vflpsb %%v17, %%v17\n\t"
"vflpsb %%v18, %%v18\n\t"
"vflpsb %%v19, %%v19\n\t"
"vflpsb %%v20, %%v20\n\t"
"vflpsb %%v21, %%v21\n\t"
"vflpsb %%v22, %%v22\n\t"
"vflpsb %%v23, %%v23\n\t"
"vfasb %%v16,%%v16,%%v17\n\t"
"vfasb %%v17,%%v18,%%v19\n\t"
"vfasb %%v18,%%v20,%%v21\n\t"
"vfasb %%v19,%%v22,%%v23\n\t"
/* Reduce v16..v19 to v16; each compare mask also selects the matching
   position vector, so v5 ends up with the positions of the kept lanes. */
"vfchesb %%v5,%%v16,%%v17\n\t"
"vfchesb %%v6,%%v18,%%v19\n\t"
"vsel %%v16,%%v16,%%v17,%%v5\n\t"
"vsel %%v5,%%v24,%%v25,%%v5\n\t"
"vsel %%v17,%%v18,%%v19,%%v6\n\t"
"vsel %%v6,%%v26,%%v27,%%v6\n\t"
"vfchesb %%v18,%%v16,%%v17\n\t"
"vsel %%v16,%%v16,%%v17,%%v18\n\t"
"vsel %%v5,%%v5,%%v6,%%v18\n\t"
/* Widen the 32-bit positions into two 64-bit vectors and add the base v4. */
"vsegf %%v6,%%v5\n\t"
"vesrlg %%v5,%%v5,32\n\t"
"vag %%v5,%%v5,%%v4\n\t"
"vag %%v6,%%v6,%%v4\n\t"
/* Merge into the running best v0 and its positions v1/v2; the mask is
   widened the same way as the positions before it is applied to them. */
"vfchesb %%v7,%%v0,%%v16\n\t"
"vsel %%v0,%%v0,%%v16,%%v7\n\t"
"vsegf %%v8,%%v7\n\t"
"vesrlg %%v7,%%v7,32\n\t"
"vsegf %%v7,%%v7\n\t"
"vsel %%v1,%%v1,%%v5,%%v7\n\t"
"vsel %%v2,%%v2,%%v6,%%v8\n\t"
/* Advance the position base by the step in v3. */
"vag %%v4,%%v4,%%v3\n\t"
/* Second half pass (bytes 128..255): same sequence as the first half. */
"vl %%v16,128(%%r1,%[x])\n\t"
"vl %%v28,144(%%r1,%[x])\n\t"
"vpkg %%v17,%%v16,%%v28\n\t"
"vperm %%v16,%%v16,%%v28,%%v9\n\t"
"vl %%v18,160(%%r1,%[x])\n\t"
"vl %%v29,176(%%r1,%[x])\n\t"
"vpkg %%v19,%%v18,%%v29\n\t"
"vperm %%v18,%%v18,%%v29,%%v9\n\t"
"vl %%v20,192(%%r1,%[x])\n\t"
"vl %%v30,208(%%r1,%[x])\n\t"
"vpkg %%v21,%%v20,%%v30\n\t"
"vperm %%v20,%%v20,%%v30,%%v9\n\t"
"vl %%v22,224(%%r1,%[x])\n\t"
"vl %%v31,240(%%r1,%[x])\n\t"
"vpkg %%v23,%%v22,%%v31\n\t"
"vperm %%v22,%%v22,%%v31,%%v9\n\t"
"vflpsb %%v16, %%v16\n\t"
"vflpsb %%v17, %%v17\n\t"
"vflpsb %%v18, %%v18\n\t"
"vflpsb %%v19, %%v19\n\t"
"vflpsb %%v20, %%v20\n\t"
"vflpsb %%v21, %%v21\n\t"
"vflpsb %%v22, %%v22\n\t"
"vflpsb %%v23, %%v23\n\t"
"vfasb %%v16,%%v16,%%v17\n\t"
"vfasb %%v17,%%v18,%%v19\n\t"
"vfasb %%v18,%%v20,%%v21\n\t"
"vfasb %%v19,%%v22,%%v23\n\t"
"vfchesb %%v5,%%v16,%%v17\n\t"
"vfchesb %%v6,%%v18,%%v19\n\t"
"vsel %%v16,%%v16,%%v17,%%v5\n\t"
"vsel %%v5,%%v24,%%v25,%%v5\n\t"
"vsel %%v17,%%v18,%%v19,%%v6\n\t"
"vsel %%v6,%%v26,%%v27,%%v6\n\t"
"vfchesb %%v18,%%v16,%%v17\n\t"
"vsel %%v16,%%v16,%%v17,%%v18\n\t"
"vsel %%v5,%%v5,%%v6,%%v18\n\t"
"vsegf %%v6,%%v5\n\t"
"vesrlg %%v5,%%v5,32\n\t"
"vag %%v5,%%v5,%%v4\n\t"
"vag %%v6,%%v6,%%v4\n\t"
"vfchesb %%v7,%%v0,%%v16\n\t"
"vsel %%v0,%%v0,%%v16,%%v7\n\t"
"vsegf %%v8,%%v7\n\t"
"vesrlg %%v7,%%v7,32\n\t"
"vsegf %%v7,%%v7\n\t"
"vsel %%v1,%%v1,%%v5,%%v7\n\t"
"vsel %%v2,%%v2,%%v6,%%v8\n\t"
"vag %%v4,%%v4,%%v3\n\t"
/* Next 256-byte pass; loop while the decremented count is nonzero. */
"agfi %%r1, 256\n\t"
"brctg %[n], 0b\n\t"
/* Lane reduction, step 1: compare each even word lane of v0 with its odd
   neighbour (shifted into place in v3). The even lane is kept when it is
   strictly greater, or when equal and its position (v1) is below v2's. */
"veslg %%v3,%%v0,32\n\t"
"vfchsb %%v4,%%v0,%%v3\n\t"
"vchlg %%v5,%%v2,%%v1\n\t"
"vfcesb %%v6,%%v0,%%v3\n\t"
"vn %%v5,%%v5,%%v6\n\t"
"vo %%v4,%%v4,%%v5\n\t"
"vsel %%v0,%%v0,%%v3,%%v4\n\t"
"vesrlg %%v4,%%v4,32\n\t"
"vsegf %%v4,%%v4\n\t"
"vsel %%v1,%%v1,%%v2,%%v4\n\t"
/* Step 2: compare the two remaining candidates (word 0 vs word 2). */
"vrepf %%v2,%%v0,2\n\t"
"vrepg %%v3,%%v1,1\n\t"
"wfcsb %%v2,%%v0\n\t"
"jne 1f\n\t"
/* Equal values: store the value and take the smaller of the two positions. */
"vstef %%v0,%[amax],0\n\t"
"vmnlg %%v0,%%v1,%%v3\n\t"
"vlgvg %[iamax],%%v0,0\n\t"
"j 2f\n\t"
"1:\n\t"
/* Unequal values: select value and position by the v2 > v0 mask. */
"wfchsb %%v4,%%v2,%%v0\n\t"
"vesrlg %%v4,%%v4,32\n\t"
"vsegf %%v4,%%v4\n\t"
"vsel %%v1,%%v3,%%v1,%%v4\n\t"
"vsel %%v0,%%v2,%%v0,%%v4\n\t"
/* NOTE(review): f0 is presumably the leading part of v0 -- confirm aliasing. */
"ste %%f0,%[amax]\n\t"
"vlgvg %[iamax],%%v1,0\n\t"
"2:\n\t"
"nop"
/* Operands as written in the first copy: "=m" for *amax and an
   array-pointer cast for the x memory input. */
: [iamax] "=r"(iamax),[amax] "=m"(*amax),[n] "+&r"(n)
: "m"(*(const FLOAT (*)[n * 2]) x),[x] "a"(x)
: "cc", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8",
"v9", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24",
"v25", "v26", "v27", "v28", "v29", "v30", "v31");
/* Second copy of the statement (other side of the diff), starting from its
   second instruction. The instruction sequence follows the same sections as
   above; the visible difference is in the operand list at the end. */
"vlef %%v1,4(%[x]),0\n\t"
"vlef %%v0,8(%[x]),1\n\t"
"vlef %%v1,12(%[x]),1\n\t"
"vlef %%v0,16(%[x]),2\n\t"
"vlef %%v1,20(%[x]),2\n\t"
"vlef %%v0,24(%[x]),3\n\t"
"vlef %%v1,28(%[x]),3\n\t"
"vflpsb %%v0,%%v0\n\t"
"vflpsb %%v1,%%v1\n\t"
"vfasb %%v0,%%v0,%%v1\n\t"
"vleig %%v1,0,0\n\t"
"vleig %%v1,2,1\n\t"
"vleig %%v2,1,0\n\t"
"vleig %%v2,3,1\n\t"
"vrepig %%v3,16\n\t"
"vzero %%v4\n\t"
"vleib %%v9,0,0\n\t"
"vleib %%v9,1,1\n\t"
"vleib %%v9,2,2\n\t"
"vleib %%v9,3,3\n\t"
"vleib %%v9,8,4\n\t"
"vleib %%v9,9,5\n\t"
"vleib %%v9,10,6\n\t"
"vleib %%v9,11,7\n\t"
"vleib %%v9,16,8\n\t"
"vleib %%v9,17,9\n\t"
"vleib %%v9,18,10\n\t"
"vleib %%v9,19,11\n\t"
"vleib %%v9,24,12\n\t"
"vleib %%v9,25,13\n\t"
"vleib %%v9,26,14\n\t"
"vleib %%v9,27,15\n\t"
"vleif %%v24,0,0\n\t"
"vleif %%v24,1,1\n\t"
"vleif %%v24,2,2\n\t"
"vleif %%v24,3,3\n\t"
"vleif %%v25,4,0\n\t"
"vleif %%v25,5,1\n\t"
"vleif %%v25,6,2\n\t"
"vleif %%v25,7,3\n\t"
"vleif %%v26,8,0\n\t"
"vleif %%v26,9,1\n\t"
"vleif %%v26,10,2\n\t"
"vleif %%v26,11,3\n\t"
"vleif %%v27,12,0\n\t"
"vleif %%v27,13,1\n\t"
"vleif %%v27,14,2\n\t"
"vleif %%v27,15,3\n\t"
"srlg %[n],%[n],5\n\t"
"xgr %%r1,%%r1\n\t"
"0:\n\t"
"pfd 1, 1024(%%r1,%[x])\n\t"
/* First half pass: bytes 0..127. */
"vl %%v16,0(%%r1,%[x])\n\t"
"vl %%v28,16(%%r1,%[x])\n\t"
"vpkg %%v17,%%v16,%%v28\n\t"
"vperm %%v16,%%v16,%%v28,%%v9\n\t"
"vl %%v18,32(%%r1,%[x])\n\t"
"vl %%v29,48(%%r1,%[x])\n\t"
"vpkg %%v19,%%v18,%%v29\n\t"
"vperm %%v18,%%v18,%%v29,%%v9\n\t"
"vl %%v20,64(%%r1,%[x])\n\t"
"vl %%v30,80(%%r1,%[x])\n\t"
"vpkg %%v21,%%v20,%%v30\n\t"
"vperm %%v20,%%v20,%%v30,%%v9\n\t"
"vl %%v22,96(%%r1,%[x])\n\t"
"vl %%v31,112(%%r1,%[x])\n\t"
"vpkg %%v23,%%v22,%%v31\n\t"
"vperm %%v22,%%v22,%%v31,%%v9\n\t"
"vflpsb %%v16, %%v16\n\t"
"vflpsb %%v17, %%v17\n\t"
"vflpsb %%v18, %%v18\n\t"
"vflpsb %%v19, %%v19\n\t"
"vflpsb %%v20, %%v20\n\t"
"vflpsb %%v21, %%v21\n\t"
"vflpsb %%v22, %%v22\n\t"
"vflpsb %%v23, %%v23\n\t"
"vfasb %%v16,%%v16,%%v17\n\t"
"vfasb %%v17,%%v18,%%v19\n\t"
"vfasb %%v18,%%v20,%%v21\n\t"
"vfasb %%v19,%%v22,%%v23\n\t"
"vfchesb %%v5,%%v16,%%v17\n\t"
"vfchesb %%v6,%%v18,%%v19\n\t"
"vsel %%v16,%%v16,%%v17,%%v5\n\t"
"vsel %%v5,%%v24,%%v25,%%v5\n\t"
"vsel %%v17,%%v18,%%v19,%%v6\n\t"
"vsel %%v6,%%v26,%%v27,%%v6\n\t"
"vfchesb %%v18,%%v16,%%v17\n\t"
"vsel %%v16,%%v16,%%v17,%%v18\n\t"
"vsel %%v5,%%v5,%%v6,%%v18\n\t"
"vsegf %%v6,%%v5\n\t"
"vesrlg %%v5,%%v5,32\n\t"
"vag %%v5,%%v5,%%v4\n\t"
"vag %%v6,%%v6,%%v4\n\t"
"vfchesb %%v7,%%v0,%%v16\n\t"
"vsel %%v0,%%v0,%%v16,%%v7\n\t"
"vsegf %%v8,%%v7\n\t"
"vesrlg %%v7,%%v7,32\n\t"
"vsegf %%v7,%%v7\n\t"
"vsel %%v1,%%v1,%%v5,%%v7\n\t"
"vsel %%v2,%%v2,%%v6,%%v8\n\t"
"vag %%v4,%%v4,%%v3\n\t"
/* Second half pass: bytes 128..255. */
"vl %%v16,128(%%r1,%[x])\n\t"
"vl %%v28,144(%%r1,%[x])\n\t"
"vpkg %%v17,%%v16,%%v28\n\t"
"vperm %%v16,%%v16,%%v28,%%v9\n\t"
"vl %%v18,160(%%r1,%[x])\n\t"
"vl %%v29,176(%%r1,%[x])\n\t"
"vpkg %%v19,%%v18,%%v29\n\t"
"vperm %%v18,%%v18,%%v29,%%v9\n\t"
"vl %%v20,192(%%r1,%[x])\n\t"
"vl %%v30,208(%%r1,%[x])\n\t"
"vpkg %%v21,%%v20,%%v30\n\t"
"vperm %%v20,%%v20,%%v30,%%v9\n\t"
"vl %%v22,224(%%r1,%[x])\n\t"
"vl %%v31,240(%%r1,%[x])\n\t"
"vpkg %%v23,%%v22,%%v31\n\t"
"vperm %%v22,%%v22,%%v31,%%v9\n\t"
"vflpsb %%v16, %%v16\n\t"
"vflpsb %%v17, %%v17\n\t"
"vflpsb %%v18, %%v18\n\t"
"vflpsb %%v19, %%v19\n\t"
"vflpsb %%v20, %%v20\n\t"
"vflpsb %%v21, %%v21\n\t"
"vflpsb %%v22, %%v22\n\t"
"vflpsb %%v23, %%v23\n\t"
"vfasb %%v16,%%v16,%%v17\n\t"
"vfasb %%v17,%%v18,%%v19\n\t"
"vfasb %%v18,%%v20,%%v21\n\t"
"vfasb %%v19,%%v22,%%v23\n\t"
"vfchesb %%v5,%%v16,%%v17\n\t"
"vfchesb %%v6,%%v18,%%v19\n\t"
"vsel %%v16,%%v16,%%v17,%%v5\n\t"
"vsel %%v5,%%v24,%%v25,%%v5\n\t"
"vsel %%v17,%%v18,%%v19,%%v6\n\t"
"vsel %%v6,%%v26,%%v27,%%v6\n\t"
"vfchesb %%v18,%%v16,%%v17\n\t"
"vsel %%v16,%%v16,%%v17,%%v18\n\t"
"vsel %%v5,%%v5,%%v6,%%v18\n\t"
"vsegf %%v6,%%v5\n\t"
"vesrlg %%v5,%%v5,32\n\t"
"vag %%v5,%%v5,%%v4\n\t"
"vag %%v6,%%v6,%%v4\n\t"
"vfchesb %%v7,%%v0,%%v16\n\t"
"vsel %%v0,%%v0,%%v16,%%v7\n\t"
"vsegf %%v8,%%v7\n\t"
"vesrlg %%v7,%%v7,32\n\t"
"vsegf %%v7,%%v7\n\t"
"vsel %%v1,%%v1,%%v5,%%v7\n\t"
"vsel %%v2,%%v2,%%v6,%%v8\n\t"
"vag %%v4,%%v4,%%v3\n\t"
"agfi %%r1, 256\n\t"
"brctg %[n], 0b\n\t"
/* Final lane reduction and result extraction. */
"veslg %%v3,%%v0,32\n\t"
"vfchsb %%v4,%%v0,%%v3\n\t"
"vchlg %%v5,%%v2,%%v1\n\t"
"vfcesb %%v6,%%v0,%%v3\n\t"
"vn %%v5,%%v5,%%v6\n\t"
"vo %%v4,%%v4,%%v5\n\t"
"vsel %%v0,%%v0,%%v3,%%v4\n\t"
"vesrlg %%v4,%%v4,32\n\t"
"vsegf %%v4,%%v4\n\t"
"vsel %%v1,%%v1,%%v2,%%v4\n\t"
"vrepf %%v2,%%v0,2\n\t"
"vrepg %%v3,%%v1,1\n\t"
"wfcsb %%v2,%%v0\n\t"
"jne 1f\n\t"
"vstef %%v0,%[amax],0\n\t"
"vmnlg %%v0,%%v1,%%v3\n\t"
"vlgvg %[iamax],%%v0,0\n\t"
"j 2f\n\t"
"1:\n\t"
"wfchsb %%v4,%%v2,%%v0\n\t"
"vesrlg %%v4,%%v4,32\n\t"
"vsegf %%v4,%%v4\n\t"
"vsel %%v1,%%v3,%%v1,%%v4\n\t"
"vsel %%v0,%%v2,%%v0,%%v4\n\t"
"ste %%f0,%[amax]\n\t"
"vlgvg %[iamax],%%v1,0\n\t"
"2:\n\t"
"nop"
/* Operands as written in the second copy: "=Q" for *amax and a
   struct-wrapped array type for the x memory input. */
: [iamax] "=r"(iamax),[amax] "=Q"(*amax),[n] "+&r"(n)
: "m"(*(const struct { FLOAT x[n * 2]; } *) x),[x] "a"(x)
: "cc", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8",
"v9", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24",
"v25", "v26", "v27", "v28", "v29", "v30", "v31");
return iamax;
}

View File

@ -34,191 +34,191 @@ static BLASLONG icamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amin) {
/*
 * icamin_kernel_32: vector inline-assembly search over x for the smallest
 * sum of two absolute values.
 *
 * Parameters:
 *   n    - element count; only n >> 5 loop passes are executed and each pass
 *          consumes 256 bytes of x, so a remainder below 32 is not visited
 *          by this kernel.
 *   x    - input data, declared to the compiler as n * 2 FLOATs (see the
 *          "m" input operand).
 *   amin - receives the winning value.
 * Returns: the position recorded for the winning value, counted from 0.
 *
 * NOTE(review): the loads are 4-byte (vlef) and the arithmetic uses the
 * short-format vflpsb/vfasb, so FLOAT is presumably single precision and x
 * presumably holds interleaved (re, im) pairs -- confirm against the caller.
 * NOTE(review): brctg decrements before testing, so a loop count of 0
 * (n < 32) would not fall through after zero passes; callers presumably
 * guarantee n >= 32 -- confirm.
 * NOTE(review): this text is a diff hunk, so the asm statement appears twice
 * below: once with "=m" / array-pointer operands and once with "=Q" /
 * struct-wrapped operands.
 */
BLASLONG iamin;
/* Seed: gather the 4-byte values at offsets 0,8,16,24 into v0 and at
   offsets 4,12,20,28 into v1, take absolute values and add them. */
__asm__("vlef %%v0,0(%[x]),0\n\t"
"vlef %%v1,4(%[x]),0\n\t"
"vlef %%v0,8(%[x]),1\n\t"
"vlef %%v1,12(%[x]),1\n\t"
"vlef %%v0,16(%[x]),2\n\t"
"vlef %%v1,20(%[x]),2\n\t"
"vlef %%v0,24(%[x]),3\n\t"
"vlef %%v1,28(%[x]),3\n\t"
"vflpsb %%v0,%%v0\n\t"
"vflpsb %%v1,%%v1\n\t"
"vfasb %%v0,%%v0,%%v1\n\t"
/* Positions of the seed lanes as 64-bit values: v1 = {0, 2}, v2 = {1, 3}. */
"vleig %%v1,0,0\n\t"
"vleig %%v1,2,1\n\t"
"vleig %%v2,1,0\n\t"
"vleig %%v2,3,1\n\t"
/* v3 = position step added after each half pass (16); v4 = running base (0). */
"vrepig %%v3,16\n\t"
"vzero %%v4\n\t"
/* v9 = vperm byte pattern selecting bytes 0-3 and 8-11 of each source. */
"vleib %%v9,0,0\n\t"
"vleib %%v9,1,1\n\t"
"vleib %%v9,2,2\n\t"
"vleib %%v9,3,3\n\t"
"vleib %%v9,8,4\n\t"
"vleib %%v9,9,5\n\t"
"vleib %%v9,10,6\n\t"
"vleib %%v9,11,7\n\t"
"vleib %%v9,16,8\n\t"
"vleib %%v9,17,9\n\t"
"vleib %%v9,18,10\n\t"
"vleib %%v9,19,11\n\t"
"vleib %%v9,24,12\n\t"
"vleib %%v9,25,13\n\t"
"vleib %%v9,26,14\n\t"
"vleib %%v9,27,15\n\t"
/* v24..v27 = word lanes {0..3}, {4..7}, {8..11}, {12..15}: positions inside
   one 16-element half pass. */
"vleif %%v24,0,0\n\t"
"vleif %%v24,1,1\n\t"
"vleif %%v24,2,2\n\t"
"vleif %%v24,3,3\n\t"
"vleif %%v25,4,0\n\t"
"vleif %%v25,5,1\n\t"
"vleif %%v25,6,2\n\t"
"vleif %%v25,7,3\n\t"
"vleif %%v26,8,0\n\t"
"vleif %%v26,9,1\n\t"
"vleif %%v26,10,2\n\t"
"vleif %%v26,11,3\n\t"
"vleif %%v27,12,0\n\t"
"vleif %%v27,13,1\n\t"
"vleif %%v27,14,2\n\t"
"vleif %%v27,15,3\n\t"
/* Loop count = n >> 5; r1 = byte offset into x, starting at 0. */
"srlg %[n],%[n],5\n\t"
"xgr %%r1,%%r1\n\t"
"0:\n\t"
"pfd 1, 1024(%%r1,%[x])\n\t"
/* First half pass (bytes 0..127): each pair of vectors is split into two
   4-lane vectors -- vperm with v9 takes words 0 and 2 of each source; vpkg
   presumably keeps the other two words (TODO confirm in the ISA manual). */
"vl %%v16,0(%%r1,%[x])\n\t"
"vl %%v28,16(%%r1,%[x])\n\t"
"vpkg %%v17,%%v16,%%v28\n\t"
"vperm %%v16,%%v16,%%v28,%%v9\n\t"
"vl %%v18,32(%%r1,%[x])\n\t"
"vl %%v29,48(%%r1,%[x])\n\t"
"vpkg %%v19,%%v18,%%v29\n\t"
"vperm %%v18,%%v18,%%v29,%%v9\n\t"
"vl %%v20,64(%%r1,%[x])\n\t"
"vl %%v30,80(%%r1,%[x])\n\t"
"vpkg %%v21,%%v20,%%v30\n\t"
"vperm %%v20,%%v20,%%v30,%%v9\n\t"
"vl %%v22,96(%%r1,%[x])\n\t"
"vl %%v31,112(%%r1,%[x])\n\t"
"vpkg %%v23,%%v22,%%v31\n\t"
"vperm %%v22,%%v22,%%v31,%%v9\n\t"
/* Absolute values, then pairwise sums into v16..v19. */
"vflpsb %%v16, %%v16\n\t"
"vflpsb %%v17, %%v17\n\t"
"vflpsb %%v18, %%v18\n\t"
"vflpsb %%v19, %%v19\n\t"
"vflpsb %%v20, %%v20\n\t"
"vflpsb %%v21, %%v21\n\t"
"vflpsb %%v22, %%v22\n\t"
"vflpsb %%v23, %%v23\n\t"
"vfasb %%v16,%%v16,%%v17\n\t"
"vfasb %%v17,%%v18,%%v19\n\t"
"vfasb %%v18,%%v20,%%v21\n\t"
"vfasb %%v19,%%v22,%%v23\n\t"
/* Reduce v16..v19 to v16. The compare operands are (second, first), so the
   mask is set where the first candidate is not larger and vsel keeps it;
   the same mask selects the matching position vector into v5. */
"vfchesb %%v5,%%v17,%%v16\n\t"
"vfchesb %%v6,%%v19,%%v18\n\t"
"vsel %%v16,%%v16,%%v17,%%v5\n\t"
"vsel %%v5,%%v24,%%v25,%%v5\n\t"
"vsel %%v17,%%v18,%%v19,%%v6\n\t"
"vsel %%v6,%%v26,%%v27,%%v6\n\t"
"vfchesb %%v18,%%v17,%%v16\n\t"
"vsel %%v16,%%v16,%%v17,%%v18\n\t"
"vsel %%v5,%%v5,%%v6,%%v18\n\t"
/* Widen the 32-bit positions into two 64-bit vectors and add the base v4. */
"vsegf %%v6,%%v5\n\t"
"vesrlg %%v5,%%v5,32\n\t"
"vag %%v5,%%v5,%%v4\n\t"
"vag %%v6,%%v6,%%v4\n\t"
/* Merge into the running best v0 (kept when v16 >= v0) and its positions
   v1/v2; the mask is widened the same way as the positions. */
"vfchesb %%v7,%%v16,%%v0\n\t"
"vsel %%v0,%%v0,%%v16,%%v7\n\t"
"vsegf %%v8,%%v7\n\t"
"vesrlg %%v7,%%v7,32\n\t"
"vsegf %%v7,%%v7\n\t"
"vsel %%v1,%%v1,%%v5,%%v7\n\t"
"vsel %%v2,%%v2,%%v6,%%v8\n\t"
/* Advance the position base by the step in v3. */
"vag %%v4,%%v4,%%v3\n\t"
/* Second half pass (bytes 128..255): same sequence as the first half. */
"vl %%v16,128(%%r1,%[x])\n\t"
"vl %%v28,144(%%r1,%[x])\n\t"
"vpkg %%v17,%%v16,%%v28\n\t"
"vperm %%v16,%%v16,%%v28,%%v9\n\t"
"vl %%v18,160(%%r1,%[x])\n\t"
"vl %%v29,176(%%r1,%[x])\n\t"
"vpkg %%v19,%%v18,%%v29\n\t"
"vperm %%v18,%%v18,%%v29,%%v9\n\t"
"vl %%v20,192(%%r1,%[x])\n\t"
"vl %%v30,208(%%r1,%[x])\n\t"
"vpkg %%v21,%%v20,%%v30\n\t"
"vperm %%v20,%%v20,%%v30,%%v9\n\t"
"vl %%v22,224(%%r1,%[x])\n\t"
"vl %%v31,240(%%r1,%[x])\n\t"
"vpkg %%v23,%%v22,%%v31\n\t"
"vperm %%v22,%%v22,%%v31,%%v9\n\t"
"vflpsb %%v16, %%v16\n\t"
"vflpsb %%v17, %%v17\n\t"
"vflpsb %%v18, %%v18\n\t"
"vflpsb %%v19, %%v19\n\t"
"vflpsb %%v20, %%v20\n\t"
"vflpsb %%v21, %%v21\n\t"
"vflpsb %%v22, %%v22\n\t"
"vflpsb %%v23, %%v23\n\t"
"vfasb %%v16,%%v16,%%v17\n\t"
"vfasb %%v17,%%v18,%%v19\n\t"
"vfasb %%v18,%%v20,%%v21\n\t"
"vfasb %%v19,%%v22,%%v23\n\t"
"vfchesb %%v5,%%v17,%%v16\n\t"
"vfchesb %%v6,%%v19,%%v18\n\t"
"vsel %%v16,%%v16,%%v17,%%v5\n\t"
"vsel %%v5,%%v24,%%v25,%%v5\n\t"
"vsel %%v17,%%v18,%%v19,%%v6\n\t"
"vsel %%v6,%%v26,%%v27,%%v6\n\t"
"vfchesb %%v18,%%v17,%%v16\n\t"
"vsel %%v16,%%v16,%%v17,%%v18\n\t"
"vsel %%v5,%%v5,%%v6,%%v18\n\t"
"vsegf %%v6,%%v5\n\t"
"vesrlg %%v5,%%v5,32\n\t"
"vag %%v5,%%v5,%%v4\n\t"
"vag %%v6,%%v6,%%v4\n\t"
"vfchesb %%v7,%%v16,%%v0\n\t"
"vsel %%v0,%%v0,%%v16,%%v7\n\t"
"vsegf %%v8,%%v7\n\t"
"vesrlg %%v7,%%v7,32\n\t"
"vsegf %%v7,%%v7\n\t"
"vsel %%v1,%%v1,%%v5,%%v7\n\t"
"vsel %%v2,%%v2,%%v6,%%v8\n\t"
"vag %%v4,%%v4,%%v3\n\t"
/* Next 256-byte pass; loop while the decremented count is nonzero. */
"agfi %%r1, 256\n\t"
"brctg %[n], 0b\n\t"
/* Lane reduction, step 1: compare each even word lane of v0 with its odd
   neighbour (shifted into place in v3). The even lane is kept when it is
   strictly smaller, or when equal and its position (v1) is below v2's. */
"veslg %%v3,%%v0,32\n\t"
"vfchsb %%v4,%%v3,%%v0\n\t"
"vchlg %%v5,%%v2,%%v1\n\t"
"vfcesb %%v6,%%v0,%%v3\n\t"
"vn %%v5,%%v5,%%v6\n\t"
"vo %%v4,%%v4,%%v5\n\t"
"vsel %%v0,%%v0,%%v3,%%v4\n\t"
"vesrlg %%v4,%%v4,32\n\t"
"vsegf %%v4,%%v4\n\t"
"vsel %%v1,%%v1,%%v2,%%v4\n\t"
/* Step 2: compare the two remaining candidates (word 0 vs word 2). */
"vrepf %%v2,%%v0,2\n\t"
"vrepg %%v3,%%v1,1\n\t"
"wfcsb %%v2,%%v0\n\t"
"jne 1f\n\t"
/* Equal values: store the value and take the smaller of the two positions. */
"vstef %%v0,%[amin],0\n\t"
"vmnlg %%v0,%%v1,%%v3\n\t"
"vlgvg %[iamin],%%v0,0\n\t"
"j 2f\n\t"
"1:\n\t"
/* Unequal values: select value and position by the v0 > v2 mask. */
"wfchsb %%v4,%%v0,%%v2\n\t"
"vesrlg %%v4,%%v4,32\n\t"
"vsegf %%v4,%%v4\n\t"
"vsel %%v1,%%v3,%%v1,%%v4\n\t"
"vsel %%v0,%%v2,%%v0,%%v4\n\t"
/* NOTE(review): f0 is presumably the leading part of v0 -- confirm aliasing. */
"ste %%f0,%[amin]\n\t"
"vlgvg %[iamin],%%v1,0\n\t"
"2:\n\t"
"nop"
/* Operands as written in the first copy: "=m" for *amin and an
   array-pointer cast for the x memory input. */
: [iamin] "=r"(iamin),[amin] "=m"(*amin),[n] "+&r"(n)
: "m"(*(const FLOAT (*)[n * 2]) x),[x] "a"(x)
: "cc", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8",
"v9", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24",
"v25", "v26", "v27", "v28", "v29", "v30", "v31");
/* Second copy of the statement (other side of the diff), starting from its
   second instruction. The instruction sequence follows the same sections as
   above; the visible difference is in the operand list at the end. */
"vlef %%v1,4(%[x]),0\n\t"
"vlef %%v0,8(%[x]),1\n\t"
"vlef %%v1,12(%[x]),1\n\t"
"vlef %%v0,16(%[x]),2\n\t"
"vlef %%v1,20(%[x]),2\n\t"
"vlef %%v0,24(%[x]),3\n\t"
"vlef %%v1,28(%[x]),3\n\t"
"vflpsb %%v0,%%v0\n\t"
"vflpsb %%v1,%%v1\n\t"
"vfasb %%v0,%%v0,%%v1\n\t"
"vleig %%v1,0,0\n\t"
"vleig %%v1,2,1\n\t"
"vleig %%v2,1,0\n\t"
"vleig %%v2,3,1\n\t"
"vrepig %%v3,16\n\t"
"vzero %%v4\n\t"
"vleib %%v9,0,0\n\t"
"vleib %%v9,1,1\n\t"
"vleib %%v9,2,2\n\t"
"vleib %%v9,3,3\n\t"
"vleib %%v9,8,4\n\t"
"vleib %%v9,9,5\n\t"
"vleib %%v9,10,6\n\t"
"vleib %%v9,11,7\n\t"
"vleib %%v9,16,8\n\t"
"vleib %%v9,17,9\n\t"
"vleib %%v9,18,10\n\t"
"vleib %%v9,19,11\n\t"
"vleib %%v9,24,12\n\t"
"vleib %%v9,25,13\n\t"
"vleib %%v9,26,14\n\t"
"vleib %%v9,27,15\n\t"
"vleif %%v24,0,0\n\t"
"vleif %%v24,1,1\n\t"
"vleif %%v24,2,2\n\t"
"vleif %%v24,3,3\n\t"
"vleif %%v25,4,0\n\t"
"vleif %%v25,5,1\n\t"
"vleif %%v25,6,2\n\t"
"vleif %%v25,7,3\n\t"
"vleif %%v26,8,0\n\t"
"vleif %%v26,9,1\n\t"
"vleif %%v26,10,2\n\t"
"vleif %%v26,11,3\n\t"
"vleif %%v27,12,0\n\t"
"vleif %%v27,13,1\n\t"
"vleif %%v27,14,2\n\t"
"vleif %%v27,15,3\n\t"
"srlg %[n],%[n],5\n\t"
"xgr %%r1,%%r1\n\t"
"0:\n\t"
"pfd 1, 1024(%%r1,%[x])\n\t"
/* First half pass: bytes 0..127. */
"vl %%v16,0(%%r1,%[x])\n\t"
"vl %%v28,16(%%r1,%[x])\n\t"
"vpkg %%v17,%%v16,%%v28\n\t"
"vperm %%v16,%%v16,%%v28,%%v9\n\t"
"vl %%v18,32(%%r1,%[x])\n\t"
"vl %%v29,48(%%r1,%[x])\n\t"
"vpkg %%v19,%%v18,%%v29\n\t"
"vperm %%v18,%%v18,%%v29,%%v9\n\t"
"vl %%v20,64(%%r1,%[x])\n\t"
"vl %%v30,80(%%r1,%[x])\n\t"
"vpkg %%v21,%%v20,%%v30\n\t"
"vperm %%v20,%%v20,%%v30,%%v9\n\t"
"vl %%v22,96(%%r1,%[x])\n\t"
"vl %%v31,112(%%r1,%[x])\n\t"
"vpkg %%v23,%%v22,%%v31\n\t"
"vperm %%v22,%%v22,%%v31,%%v9\n\t"
"vflpsb %%v16, %%v16\n\t"
"vflpsb %%v17, %%v17\n\t"
"vflpsb %%v18, %%v18\n\t"
"vflpsb %%v19, %%v19\n\t"
"vflpsb %%v20, %%v20\n\t"
"vflpsb %%v21, %%v21\n\t"
"vflpsb %%v22, %%v22\n\t"
"vflpsb %%v23, %%v23\n\t"
"vfasb %%v16,%%v16,%%v17\n\t"
"vfasb %%v17,%%v18,%%v19\n\t"
"vfasb %%v18,%%v20,%%v21\n\t"
"vfasb %%v19,%%v22,%%v23\n\t"
"vfchesb %%v5,%%v17,%%v16\n\t"
"vfchesb %%v6,%%v19,%%v18\n\t"
"vsel %%v16,%%v16,%%v17,%%v5\n\t"
"vsel %%v5,%%v24,%%v25,%%v5\n\t"
"vsel %%v17,%%v18,%%v19,%%v6\n\t"
"vsel %%v6,%%v26,%%v27,%%v6\n\t"
"vfchesb %%v18,%%v17,%%v16\n\t"
"vsel %%v16,%%v16,%%v17,%%v18\n\t"
"vsel %%v5,%%v5,%%v6,%%v18\n\t"
"vsegf %%v6,%%v5\n\t"
"vesrlg %%v5,%%v5,32\n\t"
"vag %%v5,%%v5,%%v4\n\t"
"vag %%v6,%%v6,%%v4\n\t"
"vfchesb %%v7,%%v16,%%v0\n\t"
"vsel %%v0,%%v0,%%v16,%%v7\n\t"
"vsegf %%v8,%%v7\n\t"
"vesrlg %%v7,%%v7,32\n\t"
"vsegf %%v7,%%v7\n\t"
"vsel %%v1,%%v1,%%v5,%%v7\n\t"
"vsel %%v2,%%v2,%%v6,%%v8\n\t"
"vag %%v4,%%v4,%%v3\n\t"
/* Second half pass: bytes 128..255. */
"vl %%v16,128(%%r1,%[x])\n\t"
"vl %%v28,144(%%r1,%[x])\n\t"
"vpkg %%v17,%%v16,%%v28\n\t"
"vperm %%v16,%%v16,%%v28,%%v9\n\t"
"vl %%v18,160(%%r1,%[x])\n\t"
"vl %%v29,176(%%r1,%[x])\n\t"
"vpkg %%v19,%%v18,%%v29\n\t"
"vperm %%v18,%%v18,%%v29,%%v9\n\t"
"vl %%v20,192(%%r1,%[x])\n\t"
"vl %%v30,208(%%r1,%[x])\n\t"
"vpkg %%v21,%%v20,%%v30\n\t"
"vperm %%v20,%%v20,%%v30,%%v9\n\t"
"vl %%v22,224(%%r1,%[x])\n\t"
"vl %%v31,240(%%r1,%[x])\n\t"
"vpkg %%v23,%%v22,%%v31\n\t"
"vperm %%v22,%%v22,%%v31,%%v9\n\t"
"vflpsb %%v16, %%v16\n\t"
"vflpsb %%v17, %%v17\n\t"
"vflpsb %%v18, %%v18\n\t"
"vflpsb %%v19, %%v19\n\t"
"vflpsb %%v20, %%v20\n\t"
"vflpsb %%v21, %%v21\n\t"
"vflpsb %%v22, %%v22\n\t"
"vflpsb %%v23, %%v23\n\t"
"vfasb %%v16,%%v16,%%v17\n\t"
"vfasb %%v17,%%v18,%%v19\n\t"
"vfasb %%v18,%%v20,%%v21\n\t"
"vfasb %%v19,%%v22,%%v23\n\t"
"vfchesb %%v5,%%v17,%%v16\n\t"
"vfchesb %%v6,%%v19,%%v18\n\t"
"vsel %%v16,%%v16,%%v17,%%v5\n\t"
"vsel %%v5,%%v24,%%v25,%%v5\n\t"
"vsel %%v17,%%v18,%%v19,%%v6\n\t"
"vsel %%v6,%%v26,%%v27,%%v6\n\t"
"vfchesb %%v18,%%v17,%%v16\n\t"
"vsel %%v16,%%v16,%%v17,%%v18\n\t"
"vsel %%v5,%%v5,%%v6,%%v18\n\t"
"vsegf %%v6,%%v5\n\t"
"vesrlg %%v5,%%v5,32\n\t"
"vag %%v5,%%v5,%%v4\n\t"
"vag %%v6,%%v6,%%v4\n\t"
"vfchesb %%v7,%%v16,%%v0\n\t"
"vsel %%v0,%%v0,%%v16,%%v7\n\t"
"vsegf %%v8,%%v7\n\t"
"vesrlg %%v7,%%v7,32\n\t"
"vsegf %%v7,%%v7\n\t"
"vsel %%v1,%%v1,%%v5,%%v7\n\t"
"vsel %%v2,%%v2,%%v6,%%v8\n\t"
"vag %%v4,%%v4,%%v3\n\t"
"agfi %%r1, 256\n\t"
"brctg %[n], 0b\n\t"
/* Final lane reduction and result extraction. */
"veslg %%v3,%%v0,32\n\t"
"vfchsb %%v4,%%v3,%%v0\n\t"
"vchlg %%v5,%%v2,%%v1\n\t"
"vfcesb %%v6,%%v0,%%v3\n\t"
"vn %%v5,%%v5,%%v6\n\t"
"vo %%v4,%%v4,%%v5\n\t"
"vsel %%v0,%%v0,%%v3,%%v4\n\t"
"vesrlg %%v4,%%v4,32\n\t"
"vsegf %%v4,%%v4\n\t"
"vsel %%v1,%%v1,%%v2,%%v4\n\t"
"vrepf %%v2,%%v0,2\n\t"
"vrepg %%v3,%%v1,1\n\t"
"wfcsb %%v2,%%v0\n\t"
"jne 1f\n\t"
"vstef %%v0,%[amin],0\n\t"
"vmnlg %%v0,%%v1,%%v3\n\t"
"vlgvg %[iamin],%%v0,0\n\t"
"j 2f\n\t"
"1:\n\t"
"wfchsb %%v4,%%v0,%%v2\n\t"
"vesrlg %%v4,%%v4,32\n\t"
"vsegf %%v4,%%v4\n\t"
"vsel %%v1,%%v3,%%v1,%%v4\n\t"
"vsel %%v0,%%v2,%%v0,%%v4\n\t"
"ste %%f0,%[amin]\n\t"
"vlgvg %[iamin],%%v1,0\n\t"
"2:\n\t"
"nop"
/* Operands as written in the second copy: "=Q" for *amin and a
   struct-wrapped array type for the x memory input. */
: [iamin] "=r"(iamin),[amin] "=Q"(*amin),[n] "+&r"(n)
: "m"(*(const struct { FLOAT x[n * 2]; } *) x),[x] "a"(x)
: "cc", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8",
"v9", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24",
"v25", "v26", "v27", "v28", "v29", "v30", "v31");
return iamin;
}

View File

@ -34,138 +34,138 @@ static BLASLONG idamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amax) {
/*
 * idamax_kernel_32: vector inline-assembly search over x for the largest
 * absolute value.
 *
 * Parameters:
 *   n    - element count; only n >> 5 loop passes are executed and each pass
 *          consumes 256 bytes of x, so a remainder below 32 is not visited
 *          by this kernel.
 *   x    - input data, declared to the compiler as n FLOATs (see the "m"
 *          input operand).
 *   amax - receives the winning absolute value.
 * Returns: the position recorded for the winning value, counted from 0.
 *
 * NOTE(review): the arithmetic uses the long-format vflpdb/vfchedb and the
 * result is stored with vsteg/std, so FLOAT is presumably double precision
 * (two lanes per vector) -- confirm against the build configuration.
 * NOTE(review): brctg decrements before testing, so a loop count of 0
 * (n < 32) would not fall through after zero passes; callers presumably
 * guarantee n >= 32 -- confirm.
 * NOTE(review): this text is a diff hunk, so the asm statement appears twice
 * below: once with "=m" / array-pointer operands and once with "=Q" /
 * struct-wrapped operands.
 */
BLASLONG iamax;
/* Seed: absolute values of the first 16 bytes of x, at positions {0, 1}. */
__asm__("vl %%v0,0(%[x])\n\t"
"vflpdb %%v0,%%v0\n\t"
"vleig %%v1,0,0\n\t"
"vleig %%v1,1,1\n\t"
/* v2 = position step added after each half pass (16); v3 = running base (0). */
"vrepig %%v2,16\n\t"
"vzero %%v3\n\t"
/* v24..v31 = position pairs {0,1}, {2,3}, ..., {14,15} inside one half pass. */
"vleig %%v24,0,0\n\t"
"vleig %%v24,1,1\n\t"
"vleig %%v25,2,0\n\t"
"vleig %%v25,3,1\n\t"
"vleig %%v26,4,0\n\t"
"vleig %%v26,5,1\n\t"
"vleig %%v27,6,0\n\t"
"vleig %%v27,7,1\n\t"
"vleig %%v28,8,0\n\t"
"vleig %%v28,9,1\n\t"
"vleig %%v29,10,0\n\t"
"vleig %%v29,11,1\n\t"
"vleig %%v30,12,0\n\t"
"vleig %%v30,13,1\n\t"
"vleig %%v31,14,0\n\t"
"vleig %%v31,15,1\n\t"
/* Loop count = n >> 5; r1 = byte offset into x, starting at 0. */
"srlg %[n],%[n],5\n\t"
"xgr %%r1,%%r1\n\t"
"0:\n\t"
"pfd 1, 1024(%%r1,%[x])\n\t"
/* First half pass: load bytes 0..127 and take absolute values. */
"vl %%v16,0(%%r1,%[x])\n\t"
"vl %%v17,16(%%r1,%[x])\n\t"
"vl %%v18,32(%%r1,%[x])\n\t"
"vl %%v19,48(%%r1,%[x])\n\t"
"vl %%v20,64(%%r1,%[x])\n\t"
"vl %%v21,80(%%r1,%[x])\n\t"
"vl %%v22,96(%%r1,%[x])\n\t"
"vl %%v23,112(%%r1,%[x])\n\t"
"vflpdb %%v16, %%v16\n\t"
"vflpdb %%v17, %%v17\n\t"
"vflpdb %%v18, %%v18\n\t"
"vflpdb %%v19, %%v19\n\t"
"vflpdb %%v20, %%v20\n\t"
"vflpdb %%v21, %%v21\n\t"
"vflpdb %%v22, %%v22\n\t"
"vflpdb %%v23, %%v23\n\t"
/* Round 1 (8 -> 4 vectors): the first operand is kept where it is >= the
   second; the same mask selects the matching position vector. */
"vfchedb %%v4,%%v16,%%v17\n\t"
"vfchedb %%v5,%%v18,%%v19\n\t"
"vfchedb %%v6,%%v20,%%v21\n\t"
"vfchedb %%v7,%%v22,%%v23\n\t"
"vsel %%v16,%%v16,%%v17,%%v4\n\t"
"vsel %%v4,%%v24,%%v25,%%v4\n\t"
"vsel %%v17,%%v18,%%v19,%%v5\n\t"
"vsel %%v5,%%v26,%%v27,%%v5\n\t"
"vsel %%v18,%%v20,%%v21,%%v6\n\t"
"vsel %%v6,%%v28,%%v29,%%v6\n\t"
"vsel %%v19,%%v22,%%v23,%%v7\n\t"
"vsel %%v7,%%v30,%%v31,%%v7\n\t"
/* Round 2 (4 -> 2 vectors). */
"vfchedb %%v20,%%v16,%%v17\n\t"
"vfchedb %%v21,%%v18,%%v19\n\t"
"vsel %%v16,%%v16,%%v17,%%v20\n\t"
"vsel %%v4,%%v4,%%v5,%%v20\n\t"
"vsel %%v17,%%v18,%%v19,%%v21\n\t"
"vsel %%v5,%%v6,%%v7,%%v21\n\t"
/* Round 3 (2 -> 1): v16 = values, v4 = their positions within the half pass. */
"vfchedb %%v18,%%v16,%%v17\n\t"
"vsel %%v16,%%v16,%%v17,%%v18\n\t"
"vsel %%v4,%%v4,%%v5,%%v18\n\t"
/* Add the running base, then merge into the running best v0 / positions v1. */
"vag %%v4,%%v4,%%v3\n\t"
"vfchedb %%v5,%%v0,%%v16\n\t"
"vsel %%v0,%%v0,%%v16,%%v5\n\t"
"vsel %%v1,%%v1,%%v4,%%v5\n\t"
/* Advance the position base by the step in v2. */
"vag %%v3,%%v3,%%v2\n\t"
/* Second half pass (bytes 128..255): same sequence as the first half. */
"vl %%v16,128(%%r1,%[x])\n\t"
"vl %%v17,144(%%r1,%[x])\n\t"
"vl %%v18,160(%%r1,%[x])\n\t"
"vl %%v19,176(%%r1,%[x])\n\t"
"vl %%v20,192(%%r1,%[x])\n\t"
"vl %%v21,208(%%r1,%[x])\n\t"
"vl %%v22,224(%%r1,%[x])\n\t"
"vl %%v23,240(%%r1,%[x])\n\t"
"vflpdb %%v16, %%v16\n\t"
"vflpdb %%v17, %%v17\n\t"
"vflpdb %%v18, %%v18\n\t"
"vflpdb %%v19, %%v19\n\t"
"vflpdb %%v20, %%v20\n\t"
"vflpdb %%v21, %%v21\n\t"
"vflpdb %%v22, %%v22\n\t"
"vflpdb %%v23, %%v23\n\t"
"vfchedb %%v4,%%v16,%%v17\n\t"
"vfchedb %%v5,%%v18,%%v19\n\t"
"vfchedb %%v6,%%v20,%%v21\n\t"
"vfchedb %%v7,%%v22,%%v23\n\t"
"vsel %%v16,%%v16,%%v17,%%v4\n\t"
"vsel %%v4,%%v24,%%v25,%%v4\n\t"
"vsel %%v17,%%v18,%%v19,%%v5\n\t"
"vsel %%v5,%%v26,%%v27,%%v5\n\t"
"vsel %%v18,%%v20,%%v21,%%v6\n\t"
"vsel %%v6,%%v28,%%v29,%%v6\n\t"
"vsel %%v19,%%v22,%%v23,%%v7\n\t"
"vsel %%v7,%%v30,%%v31,%%v7\n\t"
"vfchedb %%v20,%%v16,%%v17\n\t"
"vfchedb %%v21,%%v18,%%v19\n\t"
"vsel %%v16,%%v16,%%v17,%%v20\n\t"
"vsel %%v4,%%v4,%%v5,%%v20\n\t"
"vsel %%v17,%%v18,%%v19,%%v21\n\t"
"vsel %%v5,%%v6,%%v7,%%v21\n\t"
"vfchedb %%v18,%%v16,%%v17\n\t"
"vsel %%v16,%%v16,%%v17,%%v18\n\t"
"vsel %%v4,%%v4,%%v5,%%v18\n\t"
"vag %%v4,%%v4,%%v3\n\t"
"vfchedb %%v5,%%v0,%%v16\n\t"
"vsel %%v0,%%v0,%%v16,%%v5\n\t"
"vsel %%v1,%%v1,%%v4,%%v5\n\t"
"vag %%v3,%%v3,%%v2\n\t"
/* Next 256-byte pass; loop while the decremented count is nonzero. */
"agfi %%r1, 256\n\t"
"brctg %[n], 0b\n\t"
/* Lane reduction: compare lane 1 (replicated into v2/v3) with lane 0. */
"vrepg %%v2,%%v0,1\n\t"
"vrepg %%v3,%%v1,1\n\t"
"wfcdb %%v2,%%v0\n\t"
"jne 1f\n\t"
/* Equal values: store the value and take the smaller of the two positions. */
"vsteg %%v0,%[amax],0\n\t"
"vmnlg %%v0,%%v1,%%v3\n\t"
"vlgvg %[iamax],%%v0,0\n\t"
"j 2f\n\t"
"1:\n\t"
/* Unequal values: select value and position by the v2 > v0 mask. */
"wfchdb %%v4,%%v2,%%v0\n\t"
"vsel %%v1,%%v3,%%v1,%%v4\n\t"
"vsel %%v0,%%v2,%%v0,%%v4\n\t"
/* NOTE(review): f0 is presumably the leading lane of v0 -- confirm aliasing. */
"std %%f0,%[amax]\n\t"
"vlgvg %[iamax],%%v1,0\n\t"
"2:\n\t"
"nop"
/* Operands as written in the first copy: "=m" for *amax and an
   array-pointer cast for the x memory input. */
: [iamax] "=r"(iamax),[amax] "=m"(*amax),[n] "+&r"(n)
: "m"(*(const FLOAT (*)[n]) x),[x] "a"(x)
: "cc", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16",
"v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26",
"v27", "v28", "v29", "v30", "v31");
/* Second copy of the statement (other side of the diff), starting from its
   second instruction. The instruction sequence follows the same sections as
   above; the visible difference is in the operand list at the end. */
"vflpdb %%v0,%%v0\n\t"
"vleig %%v1,0,0\n\t"
"vleig %%v1,1,1\n\t"
"vrepig %%v2,16\n\t"
"vzero %%v3\n\t"
"vleig %%v24,0,0\n\t"
"vleig %%v24,1,1\n\t"
"vleig %%v25,2,0\n\t"
"vleig %%v25,3,1\n\t"
"vleig %%v26,4,0\n\t"
"vleig %%v26,5,1\n\t"
"vleig %%v27,6,0\n\t"
"vleig %%v27,7,1\n\t"
"vleig %%v28,8,0\n\t"
"vleig %%v28,9,1\n\t"
"vleig %%v29,10,0\n\t"
"vleig %%v29,11,1\n\t"
"vleig %%v30,12,0\n\t"
"vleig %%v30,13,1\n\t"
"vleig %%v31,14,0\n\t"
"vleig %%v31,15,1\n\t"
"srlg %[n],%[n],5\n\t"
"xgr %%r1,%%r1\n\t"
"0:\n\t"
"pfd 1, 1024(%%r1,%[x])\n\t"
/* First half pass: bytes 0..127. */
"vl %%v16,0(%%r1,%[x])\n\t"
"vl %%v17,16(%%r1,%[x])\n\t"
"vl %%v18,32(%%r1,%[x])\n\t"
"vl %%v19,48(%%r1,%[x])\n\t"
"vl %%v20,64(%%r1,%[x])\n\t"
"vl %%v21,80(%%r1,%[x])\n\t"
"vl %%v22,96(%%r1,%[x])\n\t"
"vl %%v23,112(%%r1,%[x])\n\t"
"vflpdb %%v16, %%v16\n\t"
"vflpdb %%v17, %%v17\n\t"
"vflpdb %%v18, %%v18\n\t"
"vflpdb %%v19, %%v19\n\t"
"vflpdb %%v20, %%v20\n\t"
"vflpdb %%v21, %%v21\n\t"
"vflpdb %%v22, %%v22\n\t"
"vflpdb %%v23, %%v23\n\t"
"vfchedb %%v4,%%v16,%%v17\n\t"
"vfchedb %%v5,%%v18,%%v19\n\t"
"vfchedb %%v6,%%v20,%%v21\n\t"
"vfchedb %%v7,%%v22,%%v23\n\t"
"vsel %%v16,%%v16,%%v17,%%v4\n\t"
"vsel %%v4,%%v24,%%v25,%%v4\n\t"
"vsel %%v17,%%v18,%%v19,%%v5\n\t"
"vsel %%v5,%%v26,%%v27,%%v5\n\t"
"vsel %%v18,%%v20,%%v21,%%v6\n\t"
"vsel %%v6,%%v28,%%v29,%%v6\n\t"
"vsel %%v19,%%v22,%%v23,%%v7\n\t"
"vsel %%v7,%%v30,%%v31,%%v7\n\t"
"vfchedb %%v20,%%v16,%%v17\n\t"
"vfchedb %%v21,%%v18,%%v19\n\t"
"vsel %%v16,%%v16,%%v17,%%v20\n\t"
"vsel %%v4,%%v4,%%v5,%%v20\n\t"
"vsel %%v17,%%v18,%%v19,%%v21\n\t"
"vsel %%v5,%%v6,%%v7,%%v21\n\t"
"vfchedb %%v18,%%v16,%%v17\n\t"
"vsel %%v16,%%v16,%%v17,%%v18\n\t"
"vsel %%v4,%%v4,%%v5,%%v18\n\t"
"vag %%v4,%%v4,%%v3\n\t"
"vfchedb %%v5,%%v0,%%v16\n\t"
"vsel %%v0,%%v0,%%v16,%%v5\n\t"
"vsel %%v1,%%v1,%%v4,%%v5\n\t"
"vag %%v3,%%v3,%%v2\n\t"
/* Second half pass: bytes 128..255. */
"vl %%v16,128(%%r1,%[x])\n\t"
"vl %%v17,144(%%r1,%[x])\n\t"
"vl %%v18,160(%%r1,%[x])\n\t"
"vl %%v19,176(%%r1,%[x])\n\t"
"vl %%v20,192(%%r1,%[x])\n\t"
"vl %%v21,208(%%r1,%[x])\n\t"
"vl %%v22,224(%%r1,%[x])\n\t"
"vl %%v23,240(%%r1,%[x])\n\t"
"vflpdb %%v16, %%v16\n\t"
"vflpdb %%v17, %%v17\n\t"
"vflpdb %%v18, %%v18\n\t"
"vflpdb %%v19, %%v19\n\t"
"vflpdb %%v20, %%v20\n\t"
"vflpdb %%v21, %%v21\n\t"
"vflpdb %%v22, %%v22\n\t"
"vflpdb %%v23, %%v23\n\t"
"vfchedb %%v4,%%v16,%%v17\n\t"
"vfchedb %%v5,%%v18,%%v19\n\t"
"vfchedb %%v6,%%v20,%%v21\n\t"
"vfchedb %%v7,%%v22,%%v23\n\t"
"vsel %%v16,%%v16,%%v17,%%v4\n\t"
"vsel %%v4,%%v24,%%v25,%%v4\n\t"
"vsel %%v17,%%v18,%%v19,%%v5\n\t"
"vsel %%v5,%%v26,%%v27,%%v5\n\t"
"vsel %%v18,%%v20,%%v21,%%v6\n\t"
"vsel %%v6,%%v28,%%v29,%%v6\n\t"
"vsel %%v19,%%v22,%%v23,%%v7\n\t"
"vsel %%v7,%%v30,%%v31,%%v7\n\t"
"vfchedb %%v20,%%v16,%%v17\n\t"
"vfchedb %%v21,%%v18,%%v19\n\t"
"vsel %%v16,%%v16,%%v17,%%v20\n\t"
"vsel %%v4,%%v4,%%v5,%%v20\n\t"
"vsel %%v17,%%v18,%%v19,%%v21\n\t"
"vsel %%v5,%%v6,%%v7,%%v21\n\t"
"vfchedb %%v18,%%v16,%%v17\n\t"
"vsel %%v16,%%v16,%%v17,%%v18\n\t"
"vsel %%v4,%%v4,%%v5,%%v18\n\t"
"vag %%v4,%%v4,%%v3\n\t"
"vfchedb %%v5,%%v0,%%v16\n\t"
"vsel %%v0,%%v0,%%v16,%%v5\n\t"
"vsel %%v1,%%v1,%%v4,%%v5\n\t"
"vag %%v3,%%v3,%%v2\n\t"
"agfi %%r1, 256\n\t"
"brctg %[n], 0b\n\t"
/* Final lane reduction and result extraction. */
"vrepg %%v2,%%v0,1\n\t"
"vrepg %%v3,%%v1,1\n\t"
"wfcdb %%v2,%%v0\n\t"
"jne 1f\n\t"
"vsteg %%v0,%[amax],0\n\t"
"vmnlg %%v0,%%v1,%%v3\n\t"
"vlgvg %[iamax],%%v0,0\n\t"
"j 2f\n\t"
"1:\n\t"
"wfchdb %%v4,%%v2,%%v0\n\t"
"vsel %%v1,%%v3,%%v1,%%v4\n\t"
"vsel %%v0,%%v2,%%v0,%%v4\n\t"
"std %%f0,%[amax]\n\t"
"vlgvg %[iamax],%%v1,0\n\t"
"2:\n\t"
"nop"
/* Operands as written in the second copy: "=Q" for *amax and a
   struct-wrapped array type for the x memory input. */
: [iamax] "=r"(iamax),[amax] "=Q"(*amax),[n] "+&r"(n)
: "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x)
: "cc", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16",
"v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26",
"v27", "v28", "v29", "v30", "v31");
return iamax;
}

View File

/*
 * idamin_kernel_32: s390x vector inline-asm kernel over doubles.
 *
 * Keeps, per 64-bit lane, the smallest absolute value seen so far (v0) and
 * the element index it came from (v1), then reduces the two lanes, stores
 * the winning value through amin and returns its index.
 *
 * The loop count is n >> 5 and every iteration consumes 256 bytes of x
 * (32 doubles) in two halves of 16.
 * NOTE(review): brctg is the only loop test, so n is presumably expected to
 * be a positive multiple of 32 - confirm against the caller.
 *
 * NOTE(review): this span looks like a rendered diff rather than plain C.
 * The first line carries a hunk header in front of the signature, and the
 * asm body appears twice below; the two copies differ only in their
 * output/input constraint lines.
 */
@ -34,138 +34,138 @@ static BLASLONG idamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amin) {
BLASLONG iamin;
/* v0 = absolute values of the first two elements, v1 = their indices {0,1}. */
__asm__("vl %%v0,0(%[x])\n\t"
"vflpdb %%v0,%%v0\n\t"
"vleig %%v1,0,0\n\t"
"vleig %%v1,1,1\n\t"
/* v2 = index step of 16 per half iteration, v3 = running index base. */
"vrepig %%v2,16\n\t"
"vzero %%v3\n\t"
/* v24..v31 = relative indices {0,1},{2,3},...,{14,15} of the 8 loaded vectors. */
"vleig %%v24,0,0\n\t"
"vleig %%v24,1,1\n\t"
"vleig %%v25,2,0\n\t"
"vleig %%v25,3,1\n\t"
"vleig %%v26,4,0\n\t"
"vleig %%v26,5,1\n\t"
"vleig %%v27,6,0\n\t"
"vleig %%v27,7,1\n\t"
"vleig %%v28,8,0\n\t"
"vleig %%v28,9,1\n\t"
"vleig %%v29,10,0\n\t"
"vleig %%v29,11,1\n\t"
"vleig %%v30,12,0\n\t"
"vleig %%v30,13,1\n\t"
"vleig %%v31,14,0\n\t"
"vleig %%v31,15,1\n\t"
/* Loop count = n / 32; r1 = byte offset into x. */
"srlg %[n],%[n],5\n\t"
"xgr %%r1,%%r1\n\t"
"0:\n\t"
"pfd 1, 1024(%%r1,%[x])\n\t"
/* First half: load 16 doubles and take absolute values. */
"vl %%v16,0(%%r1,%[x])\n\t"
"vl %%v17,16(%%r1,%[x])\n\t"
"vl %%v18,32(%%r1,%[x])\n\t"
"vl %%v19,48(%%r1,%[x])\n\t"
"vl %%v20,64(%%r1,%[x])\n\t"
"vl %%v21,80(%%r1,%[x])\n\t"
"vl %%v22,96(%%r1,%[x])\n\t"
"vl %%v23,112(%%r1,%[x])\n\t"
"vflpdb %%v16, %%v16\n\t"
"vflpdb %%v17, %%v17\n\t"
"vflpdb %%v18, %%v18\n\t"
"vflpdb %%v19, %%v19\n\t"
"vflpdb %%v20, %%v20\n\t"
"vflpdb %%v21, %%v21\n\t"
"vflpdb %%v22, %%v22\n\t"
"vflpdb %%v23, %%v23\n\t"
/*
 * Tournament reduction 8 -> 4 -> 2 -> 1 vectors. Each compare mask
 * (second operand >= first operand of the pair) selects the first vector
 * of the pair, i.e. the smaller value, and the same mask selects the
 * matching index vector.
 */
"vfchedb %%v4,%%v17,%%v16\n\t"
"vfchedb %%v5,%%v19,%%v18\n\t"
"vfchedb %%v6,%%v21,%%v20\n\t"
"vfchedb %%v7,%%v23,%%v22\n\t"
"vsel %%v16,%%v16,%%v17,%%v4\n\t"
"vsel %%v4,%%v24,%%v25,%%v4\n\t"
"vsel %%v17,%%v18,%%v19,%%v5\n\t"
"vsel %%v5,%%v26,%%v27,%%v5\n\t"
"vsel %%v18,%%v20,%%v21,%%v6\n\t"
"vsel %%v6,%%v28,%%v29,%%v6\n\t"
"vsel %%v19,%%v22,%%v23,%%v7\n\t"
"vsel %%v7,%%v30,%%v31,%%v7\n\t"
"vfchedb %%v20,%%v17,%%v16\n\t"
"vfchedb %%v21,%%v19,%%v18\n\t"
"vsel %%v16,%%v16,%%v17,%%v20\n\t"
"vsel %%v4,%%v4,%%v5,%%v20\n\t"
"vsel %%v17,%%v18,%%v19,%%v21\n\t"
"vsel %%v5,%%v6,%%v7,%%v21\n\t"
"vfchedb %%v18,%%v17,%%v16\n\t"
"vsel %%v16,%%v16,%%v17,%%v18\n\t"
"vsel %%v4,%%v4,%%v5,%%v18\n\t"
/* Make the winning indices absolute, merge into v0/v1, advance the base. */
"vag %%v4,%%v4,%%v3\n\t"
"vfchedb %%v5,%%v16,%%v0\n\t"
"vsel %%v0,%%v0,%%v16,%%v5\n\t"
"vsel %%v1,%%v1,%%v4,%%v5\n\t"
"vag %%v3,%%v3,%%v2\n\t"
/* Second half: same sequence on the next 16 doubles (byte offsets 128..240). */
"vl %%v16,128(%%r1,%[x])\n\t"
"vl %%v17,144(%%r1,%[x])\n\t"
"vl %%v18,160(%%r1,%[x])\n\t"
"vl %%v19,176(%%r1,%[x])\n\t"
"vl %%v20,192(%%r1,%[x])\n\t"
"vl %%v21,208(%%r1,%[x])\n\t"
"vl %%v22,224(%%r1,%[x])\n\t"
"vl %%v23,240(%%r1,%[x])\n\t"
"vflpdb %%v16, %%v16\n\t"
"vflpdb %%v17, %%v17\n\t"
"vflpdb %%v18, %%v18\n\t"
"vflpdb %%v19, %%v19\n\t"
"vflpdb %%v20, %%v20\n\t"
"vflpdb %%v21, %%v21\n\t"
"vflpdb %%v22, %%v22\n\t"
"vflpdb %%v23, %%v23\n\t"
"vfchedb %%v4,%%v17,%%v16\n\t"
"vfchedb %%v5,%%v19,%%v18\n\t"
"vfchedb %%v6,%%v21,%%v20\n\t"
"vfchedb %%v7,%%v23,%%v22\n\t"
"vsel %%v16,%%v16,%%v17,%%v4\n\t"
"vsel %%v4,%%v24,%%v25,%%v4\n\t"
"vsel %%v17,%%v18,%%v19,%%v5\n\t"
"vsel %%v5,%%v26,%%v27,%%v5\n\t"
"vsel %%v18,%%v20,%%v21,%%v6\n\t"
"vsel %%v6,%%v28,%%v29,%%v6\n\t"
"vsel %%v19,%%v22,%%v23,%%v7\n\t"
"vsel %%v7,%%v30,%%v31,%%v7\n\t"
"vfchedb %%v20,%%v17,%%v16\n\t"
"vfchedb %%v21,%%v19,%%v18\n\t"
"vsel %%v16,%%v16,%%v17,%%v20\n\t"
"vsel %%v4,%%v4,%%v5,%%v20\n\t"
"vsel %%v17,%%v18,%%v19,%%v21\n\t"
"vsel %%v5,%%v6,%%v7,%%v21\n\t"
"vfchedb %%v18,%%v17,%%v16\n\t"
"vsel %%v16,%%v16,%%v17,%%v18\n\t"
"vsel %%v4,%%v4,%%v5,%%v18\n\t"
"vag %%v4,%%v4,%%v3\n\t"
"vfchedb %%v5,%%v16,%%v0\n\t"
"vsel %%v0,%%v0,%%v16,%%v5\n\t"
"vsel %%v1,%%v1,%%v4,%%v5\n\t"
"vag %%v3,%%v3,%%v2\n\t"
"agfi %%r1, 256\n\t"
"brctg %[n], 0b\n\t"
/*
 * Final reduction of the two lanes of v0/v1. v2/v3 = lane 1 broadcast.
 * Equal values: store the value and take the smaller index (vmnlg).
 * Otherwise: pick lane 1 when lane 0 is the larger value.
 */
"vrepg %%v2,%%v0,1\n\t"
"vrepg %%v3,%%v1,1\n\t"
"wfcdb %%v2,%%v0\n\t"
"jne 1f\n\t"
"vsteg %%v0,%[amin],0\n\t"
"vmnlg %%v0,%%v1,%%v3\n\t"
"vlgvg %[iamin],%%v0,0\n\t"
"j 2f\n\t"
"1:\n\t"
"wfchdb %%v4,%%v0,%%v2\n\t"
"vsel %%v1,%%v3,%%v1,%%v4\n\t"
"vsel %%v0,%%v2,%%v0,%%v4\n\t"
"std %%f0,%[amin]\n\t"
"vlgvg %[iamin],%%v1,0\n\t"
"2:\n\t"
"nop"
: [iamin] "=r"(iamin),[amin] "=m"(*amin),[n] "+&r"(n)
: "m"(*(const FLOAT (*)[n]) x),[x] "a"(x)
: "cc", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16",
"v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26",
"v27", "v28", "v29", "v30", "v31");
/*
 * NOTE(review): second copy of the same asm body starts here (its opening
 * vl line is not repeated). It ends with the Q output constraint and the
 * struct-wrapped memory input instead of the m / array-pointer forms above.
 */
"vflpdb %%v0,%%v0\n\t"
"vleig %%v1,0,0\n\t"
"vleig %%v1,1,1\n\t"
"vrepig %%v2,16\n\t"
"vzero %%v3\n\t"
"vleig %%v24,0,0\n\t"
"vleig %%v24,1,1\n\t"
"vleig %%v25,2,0\n\t"
"vleig %%v25,3,1\n\t"
"vleig %%v26,4,0\n\t"
"vleig %%v26,5,1\n\t"
"vleig %%v27,6,0\n\t"
"vleig %%v27,7,1\n\t"
"vleig %%v28,8,0\n\t"
"vleig %%v28,9,1\n\t"
"vleig %%v29,10,0\n\t"
"vleig %%v29,11,1\n\t"
"vleig %%v30,12,0\n\t"
"vleig %%v30,13,1\n\t"
"vleig %%v31,14,0\n\t"
"vleig %%v31,15,1\n\t"
"srlg %[n],%[n],5\n\t"
"xgr %%r1,%%r1\n\t"
"0:\n\t"
"pfd 1, 1024(%%r1,%[x])\n\t"
"vl %%v16,0(%%r1,%[x])\n\t"
"vl %%v17,16(%%r1,%[x])\n\t"
"vl %%v18,32(%%r1,%[x])\n\t"
"vl %%v19,48(%%r1,%[x])\n\t"
"vl %%v20,64(%%r1,%[x])\n\t"
"vl %%v21,80(%%r1,%[x])\n\t"
"vl %%v22,96(%%r1,%[x])\n\t"
"vl %%v23,112(%%r1,%[x])\n\t"
"vflpdb %%v16, %%v16\n\t"
"vflpdb %%v17, %%v17\n\t"
"vflpdb %%v18, %%v18\n\t"
"vflpdb %%v19, %%v19\n\t"
"vflpdb %%v20, %%v20\n\t"
"vflpdb %%v21, %%v21\n\t"
"vflpdb %%v22, %%v22\n\t"
"vflpdb %%v23, %%v23\n\t"
"vfchedb %%v4,%%v17,%%v16\n\t"
"vfchedb %%v5,%%v19,%%v18\n\t"
"vfchedb %%v6,%%v21,%%v20\n\t"
"vfchedb %%v7,%%v23,%%v22\n\t"
"vsel %%v16,%%v16,%%v17,%%v4\n\t"
"vsel %%v4,%%v24,%%v25,%%v4\n\t"
"vsel %%v17,%%v18,%%v19,%%v5\n\t"
"vsel %%v5,%%v26,%%v27,%%v5\n\t"
"vsel %%v18,%%v20,%%v21,%%v6\n\t"
"vsel %%v6,%%v28,%%v29,%%v6\n\t"
"vsel %%v19,%%v22,%%v23,%%v7\n\t"
"vsel %%v7,%%v30,%%v31,%%v7\n\t"
"vfchedb %%v20,%%v17,%%v16\n\t"
"vfchedb %%v21,%%v19,%%v18\n\t"
"vsel %%v16,%%v16,%%v17,%%v20\n\t"
"vsel %%v4,%%v4,%%v5,%%v20\n\t"
"vsel %%v17,%%v18,%%v19,%%v21\n\t"
"vsel %%v5,%%v6,%%v7,%%v21\n\t"
"vfchedb %%v18,%%v17,%%v16\n\t"
"vsel %%v16,%%v16,%%v17,%%v18\n\t"
"vsel %%v4,%%v4,%%v5,%%v18\n\t"
"vag %%v4,%%v4,%%v3\n\t"
"vfchedb %%v5,%%v16,%%v0\n\t"
"vsel %%v0,%%v0,%%v16,%%v5\n\t"
"vsel %%v1,%%v1,%%v4,%%v5\n\t"
"vag %%v3,%%v3,%%v2\n\t"
"vl %%v16,128(%%r1,%[x])\n\t"
"vl %%v17,144(%%r1,%[x])\n\t"
"vl %%v18,160(%%r1,%[x])\n\t"
"vl %%v19,176(%%r1,%[x])\n\t"
"vl %%v20,192(%%r1,%[x])\n\t"
"vl %%v21,208(%%r1,%[x])\n\t"
"vl %%v22,224(%%r1,%[x])\n\t"
"vl %%v23,240(%%r1,%[x])\n\t"
"vflpdb %%v16, %%v16\n\t"
"vflpdb %%v17, %%v17\n\t"
"vflpdb %%v18, %%v18\n\t"
"vflpdb %%v19, %%v19\n\t"
"vflpdb %%v20, %%v20\n\t"
"vflpdb %%v21, %%v21\n\t"
"vflpdb %%v22, %%v22\n\t"
"vflpdb %%v23, %%v23\n\t"
"vfchedb %%v4,%%v17,%%v16\n\t"
"vfchedb %%v5,%%v19,%%v18\n\t"
"vfchedb %%v6,%%v21,%%v20\n\t"
"vfchedb %%v7,%%v23,%%v22\n\t"
"vsel %%v16,%%v16,%%v17,%%v4\n\t"
"vsel %%v4,%%v24,%%v25,%%v4\n\t"
"vsel %%v17,%%v18,%%v19,%%v5\n\t"
"vsel %%v5,%%v26,%%v27,%%v5\n\t"
"vsel %%v18,%%v20,%%v21,%%v6\n\t"
"vsel %%v6,%%v28,%%v29,%%v6\n\t"
"vsel %%v19,%%v22,%%v23,%%v7\n\t"
"vsel %%v7,%%v30,%%v31,%%v7\n\t"
"vfchedb %%v20,%%v17,%%v16\n\t"
"vfchedb %%v21,%%v19,%%v18\n\t"
"vsel %%v16,%%v16,%%v17,%%v20\n\t"
"vsel %%v4,%%v4,%%v5,%%v20\n\t"
"vsel %%v17,%%v18,%%v19,%%v21\n\t"
"vsel %%v5,%%v6,%%v7,%%v21\n\t"
"vfchedb %%v18,%%v17,%%v16\n\t"
"vsel %%v16,%%v16,%%v17,%%v18\n\t"
"vsel %%v4,%%v4,%%v5,%%v18\n\t"
"vag %%v4,%%v4,%%v3\n\t"
"vfchedb %%v5,%%v16,%%v0\n\t"
"vsel %%v0,%%v0,%%v16,%%v5\n\t"
"vsel %%v1,%%v1,%%v4,%%v5\n\t"
"vag %%v3,%%v3,%%v2\n\t"
"agfi %%r1, 256\n\t"
"brctg %[n], 0b\n\t"
"vrepg %%v2,%%v0,1\n\t"
"vrepg %%v3,%%v1,1\n\t"
"wfcdb %%v2,%%v0\n\t"
"jne 1f\n\t"
"vsteg %%v0,%[amin],0\n\t"
"vmnlg %%v0,%%v1,%%v3\n\t"
"vlgvg %[iamin],%%v0,0\n\t"
"j 2f\n\t"
"1:\n\t"
"wfchdb %%v4,%%v0,%%v2\n\t"
"vsel %%v1,%%v3,%%v1,%%v4\n\t"
"vsel %%v0,%%v2,%%v0,%%v4\n\t"
"std %%f0,%[amin]\n\t"
"vlgvg %[iamin],%%v1,0\n\t"
"2:\n\t"
"nop"
: [iamin] "=r"(iamin),[amin] "=Q"(*amin),[n] "+&r"(n)
: "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x)
: "cc", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16",
"v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26",
"v27", "v28", "v29", "v30", "v31");
return iamin;
}

View File

/*
 * idmax_kernel_32: s390x vector inline-asm kernel over doubles.
 *
 * Keeps, per 64-bit lane, the largest value seen so far (v0; no absolute
 * value is taken) and the element index it came from (v1), then reduces
 * the two lanes, stores the winning value through max and returns its index.
 *
 * The loop count is n >> 5 and every iteration consumes 256 bytes of x
 * (32 doubles) in two halves of 16.
 * NOTE(review): brctg is the only loop test, so n is presumably expected to
 * be a positive multiple of 32 - confirm against the caller.
 *
 * NOTE(review): this span looks like a rendered diff rather than plain C.
 * The first line carries a hunk header in front of the signature, and the
 * asm body appears twice below; the two copies differ only in their
 * output/input constraint lines.
 */
@ -31,121 +31,121 @@ static BLASLONG idmax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *max) {
BLASLONG imax;
/* v0 = first two elements, v1 = their indices {0,1}. */
__asm__("vl %%v0,0(%[x])\n\t"
"vleig %%v1,0,0\n\t"
"vleig %%v1,1,1\n\t"
/* v2 = index step of 16 per half iteration, v3 = running index base. */
"vrepig %%v2,16\n\t"
"vzero %%v3\n\t"
/* v24..v31 = relative indices {0,1},{2,3},...,{14,15} of the 8 loaded vectors. */
"vleig %%v24,0,0\n\t"
"vleig %%v24,1,1\n\t"
"vleig %%v25,2,0\n\t"
"vleig %%v25,3,1\n\t"
"vleig %%v26,4,0\n\t"
"vleig %%v26,5,1\n\t"
"vleig %%v27,6,0\n\t"
"vleig %%v27,7,1\n\t"
"vleig %%v28,8,0\n\t"
"vleig %%v28,9,1\n\t"
"vleig %%v29,10,0\n\t"
"vleig %%v29,11,1\n\t"
"vleig %%v30,12,0\n\t"
"vleig %%v30,13,1\n\t"
"vleig %%v31,14,0\n\t"
"vleig %%v31,15,1\n\t"
/* Loop count = n / 32; r1 = byte offset into x. */
"srlg %[n],%[n],5\n\t"
"xgr %%r1,%%r1\n\t"
"0:\n\t"
"pfd 1, 1024(%%r1,%[x])\n\t"
/* First half: load 16 doubles. */
"vl %%v16,0(%%r1,%[x])\n\t"
"vl %%v17,16(%%r1,%[x])\n\t"
"vl %%v18,32(%%r1,%[x])\n\t"
"vl %%v19,48(%%r1,%[x])\n\t"
"vl %%v20,64(%%r1,%[x])\n\t"
"vl %%v21,80(%%r1,%[x])\n\t"
"vl %%v22,96(%%r1,%[x])\n\t"
"vl %%v23,112(%%r1,%[x])\n\t"
/*
 * Tournament reduction 8 -> 4 -> 2 -> 1 vectors. Each compare mask
 * (first operand >= second operand of the pair) selects the first vector
 * of the pair, i.e. the larger value, and the same mask selects the
 * matching index vector.
 */
"vfchedb %%v4,%%v16,%%v17\n\t"
"vfchedb %%v5,%%v18,%%v19\n\t"
"vfchedb %%v6,%%v20,%%v21\n\t"
"vfchedb %%v7,%%v22,%%v23\n\t"
"vsel %%v16,%%v16,%%v17,%%v4\n\t"
"vsel %%v4,%%v24,%%v25,%%v4\n\t"
"vsel %%v17,%%v18,%%v19,%%v5\n\t"
"vsel %%v5,%%v26,%%v27,%%v5\n\t"
"vsel %%v18,%%v20,%%v21,%%v6\n\t"
"vsel %%v6,%%v28,%%v29,%%v6\n\t"
"vsel %%v19,%%v22,%%v23,%%v7\n\t"
"vsel %%v7,%%v30,%%v31,%%v7\n\t"
"vfchedb %%v20,%%v16,%%v17\n\t"
"vfchedb %%v21,%%v18,%%v19\n\t"
"vsel %%v16,%%v16,%%v17,%%v20\n\t"
"vsel %%v4,%%v4,%%v5,%%v20\n\t"
"vsel %%v17,%%v18,%%v19,%%v21\n\t"
"vsel %%v5,%%v6,%%v7,%%v21\n\t"
"vfchedb %%v18,%%v16,%%v17\n\t"
"vsel %%v16,%%v16,%%v17,%%v18\n\t"
"vsel %%v4,%%v4,%%v5,%%v18\n\t"
/* Make the winning indices absolute, merge into v0/v1, advance the base. */
"vag %%v4,%%v4,%%v3\n\t"
"vfchedb %%v5,%%v0,%%v16\n\t"
"vsel %%v0,%%v0,%%v16,%%v5\n\t"
"vsel %%v1,%%v1,%%v4,%%v5\n\t"
"vag %%v3,%%v3,%%v2\n\t"
/* Second half: same sequence on the next 16 doubles (byte offsets 128..240). */
"vl %%v16,128(%%r1,%[x])\n\t"
"vl %%v17,144(%%r1,%[x])\n\t"
"vl %%v18,160(%%r1,%[x])\n\t"
"vl %%v19,176(%%r1,%[x])\n\t"
"vl %%v20,192(%%r1,%[x])\n\t"
"vl %%v21,208(%%r1,%[x])\n\t"
"vl %%v22,224(%%r1,%[x])\n\t"
"vl %%v23,240(%%r1,%[x])\n\t"
"vfchedb %%v4,%%v16,%%v17\n\t"
"vfchedb %%v5,%%v18,%%v19\n\t"
"vfchedb %%v6,%%v20,%%v21\n\t"
"vfchedb %%v7,%%v22,%%v23\n\t"
"vsel %%v16,%%v16,%%v17,%%v4\n\t"
"vsel %%v4,%%v24,%%v25,%%v4\n\t"
"vsel %%v17,%%v18,%%v19,%%v5\n\t"
"vsel %%v5,%%v26,%%v27,%%v5\n\t"
"vsel %%v18,%%v20,%%v21,%%v6\n\t"
"vsel %%v6,%%v28,%%v29,%%v6\n\t"
"vsel %%v19,%%v22,%%v23,%%v7\n\t"
"vsel %%v7,%%v30,%%v31,%%v7\n\t"
"vfchedb %%v20,%%v16,%%v17\n\t"
"vfchedb %%v21,%%v18,%%v19\n\t"
"vsel %%v16,%%v16,%%v17,%%v20\n\t"
"vsel %%v4,%%v4,%%v5,%%v20\n\t"
"vsel %%v17,%%v18,%%v19,%%v21\n\t"
"vsel %%v5,%%v6,%%v7,%%v21\n\t"
"vfchedb %%v18,%%v16,%%v17\n\t"
"vsel %%v16,%%v16,%%v17,%%v18\n\t"
"vsel %%v4,%%v4,%%v5,%%v18\n\t"
"vag %%v4,%%v4,%%v3\n\t"
"vfchedb %%v5,%%v0,%%v16\n\t"
"vsel %%v0,%%v0,%%v16,%%v5\n\t"
"vsel %%v1,%%v1,%%v4,%%v5\n\t"
"vag %%v3,%%v3,%%v2\n\t"
"agfi %%r1, 256\n\t"
"brctg %[n], 0b\n\t"
/*
 * Final reduction of the two lanes of v0/v1. v2/v3 = lane 1 broadcast.
 * Equal values: store the value and take the smaller index (vmnlg).
 * Otherwise: pick lane 1 when it holds the larger value.
 */
"vrepg %%v2,%%v0,1\n\t"
"vrepg %%v3,%%v1,1\n\t"
"wfcdb %%v2,%%v0\n\t"
"jne 1f\n\t"
"vsteg %%v0,%[max],0\n\t"
"vmnlg %%v0,%%v1,%%v3\n\t"
"vlgvg %[imax],%%v0,0\n\t"
"j 2f\n\t"
"1:\n\t"
"wfchdb %%v4,%%v2,%%v0\n\t"
"vsel %%v1,%%v3,%%v1,%%v4\n\t"
"vsel %%v0,%%v2,%%v0,%%v4\n\t"
"std %%f0,%[max]\n\t"
"vlgvg %[imax],%%v1,0\n\t"
"2:\n\t"
"nop"
: [imax] "=r"(imax),[max] "=m"(*max),[n] "+&r"(n)
: "m"(*(const FLOAT (*)[n]) x),[x] "a"(x)
: "cc", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16",
"v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26",
"v27", "v28", "v29", "v30", "v31");
/*
 * NOTE(review): second copy of the same asm body starts here (its opening
 * vl line is not repeated). It ends with the Q output constraint and the
 * struct-wrapped memory input instead of the m / array-pointer forms above.
 */
"vleig %%v1,0,0\n\t"
"vleig %%v1,1,1\n\t"
"vrepig %%v2,16\n\t"
"vzero %%v3\n\t"
"vleig %%v24,0,0\n\t"
"vleig %%v24,1,1\n\t"
"vleig %%v25,2,0\n\t"
"vleig %%v25,3,1\n\t"
"vleig %%v26,4,0\n\t"
"vleig %%v26,5,1\n\t"
"vleig %%v27,6,0\n\t"
"vleig %%v27,7,1\n\t"
"vleig %%v28,8,0\n\t"
"vleig %%v28,9,1\n\t"
"vleig %%v29,10,0\n\t"
"vleig %%v29,11,1\n\t"
"vleig %%v30,12,0\n\t"
"vleig %%v30,13,1\n\t"
"vleig %%v31,14,0\n\t"
"vleig %%v31,15,1\n\t"
"srlg %[n],%[n],5\n\t"
"xgr %%r1,%%r1\n\t"
"0:\n\t"
"pfd 1, 1024(%%r1,%[x])\n\t"
"vl %%v16,0(%%r1,%[x])\n\t"
"vl %%v17,16(%%r1,%[x])\n\t"
"vl %%v18,32(%%r1,%[x])\n\t"
"vl %%v19,48(%%r1,%[x])\n\t"
"vl %%v20,64(%%r1,%[x])\n\t"
"vl %%v21,80(%%r1,%[x])\n\t"
"vl %%v22,96(%%r1,%[x])\n\t"
"vl %%v23,112(%%r1,%[x])\n\t"
"vfchedb %%v4,%%v16,%%v17\n\t"
"vfchedb %%v5,%%v18,%%v19\n\t"
"vfchedb %%v6,%%v20,%%v21\n\t"
"vfchedb %%v7,%%v22,%%v23\n\t"
"vsel %%v16,%%v16,%%v17,%%v4\n\t"
"vsel %%v4,%%v24,%%v25,%%v4\n\t"
"vsel %%v17,%%v18,%%v19,%%v5\n\t"
"vsel %%v5,%%v26,%%v27,%%v5\n\t"
"vsel %%v18,%%v20,%%v21,%%v6\n\t"
"vsel %%v6,%%v28,%%v29,%%v6\n\t"
"vsel %%v19,%%v22,%%v23,%%v7\n\t"
"vsel %%v7,%%v30,%%v31,%%v7\n\t"
"vfchedb %%v20,%%v16,%%v17\n\t"
"vfchedb %%v21,%%v18,%%v19\n\t"
"vsel %%v16,%%v16,%%v17,%%v20\n\t"
"vsel %%v4,%%v4,%%v5,%%v20\n\t"
"vsel %%v17,%%v18,%%v19,%%v21\n\t"
"vsel %%v5,%%v6,%%v7,%%v21\n\t"
"vfchedb %%v18,%%v16,%%v17\n\t"
"vsel %%v16,%%v16,%%v17,%%v18\n\t"
"vsel %%v4,%%v4,%%v5,%%v18\n\t"
"vag %%v4,%%v4,%%v3\n\t"
"vfchedb %%v5,%%v0,%%v16\n\t"
"vsel %%v0,%%v0,%%v16,%%v5\n\t"
"vsel %%v1,%%v1,%%v4,%%v5\n\t"
"vag %%v3,%%v3,%%v2\n\t"
"vl %%v16,128(%%r1,%[x])\n\t"
"vl %%v17,144(%%r1,%[x])\n\t"
"vl %%v18,160(%%r1,%[x])\n\t"
"vl %%v19,176(%%r1,%[x])\n\t"
"vl %%v20,192(%%r1,%[x])\n\t"
"vl %%v21,208(%%r1,%[x])\n\t"
"vl %%v22,224(%%r1,%[x])\n\t"
"vl %%v23,240(%%r1,%[x])\n\t"
"vfchedb %%v4,%%v16,%%v17\n\t"
"vfchedb %%v5,%%v18,%%v19\n\t"
"vfchedb %%v6,%%v20,%%v21\n\t"
"vfchedb %%v7,%%v22,%%v23\n\t"
"vsel %%v16,%%v16,%%v17,%%v4\n\t"
"vsel %%v4,%%v24,%%v25,%%v4\n\t"
"vsel %%v17,%%v18,%%v19,%%v5\n\t"
"vsel %%v5,%%v26,%%v27,%%v5\n\t"
"vsel %%v18,%%v20,%%v21,%%v6\n\t"
"vsel %%v6,%%v28,%%v29,%%v6\n\t"
"vsel %%v19,%%v22,%%v23,%%v7\n\t"
"vsel %%v7,%%v30,%%v31,%%v7\n\t"
"vfchedb %%v20,%%v16,%%v17\n\t"
"vfchedb %%v21,%%v18,%%v19\n\t"
"vsel %%v16,%%v16,%%v17,%%v20\n\t"
"vsel %%v4,%%v4,%%v5,%%v20\n\t"
"vsel %%v17,%%v18,%%v19,%%v21\n\t"
"vsel %%v5,%%v6,%%v7,%%v21\n\t"
"vfchedb %%v18,%%v16,%%v17\n\t"
"vsel %%v16,%%v16,%%v17,%%v18\n\t"
"vsel %%v4,%%v4,%%v5,%%v18\n\t"
"vag %%v4,%%v4,%%v3\n\t"
"vfchedb %%v5,%%v0,%%v16\n\t"
"vsel %%v0,%%v0,%%v16,%%v5\n\t"
"vsel %%v1,%%v1,%%v4,%%v5\n\t"
"vag %%v3,%%v3,%%v2\n\t"
"agfi %%r1, 256\n\t"
"brctg %[n], 0b\n\t"
"vrepg %%v2,%%v0,1\n\t"
"vrepg %%v3,%%v1,1\n\t"
"wfcdb %%v2,%%v0\n\t"
"jne 1f\n\t"
"vsteg %%v0,%[max],0\n\t"
"vmnlg %%v0,%%v1,%%v3\n\t"
"vlgvg %[imax],%%v0,0\n\t"
"j 2f\n\t"
"1:\n\t"
"wfchdb %%v4,%%v2,%%v0\n\t"
"vsel %%v1,%%v3,%%v1,%%v4\n\t"
"vsel %%v0,%%v2,%%v0,%%v4\n\t"
"std %%f0,%[max]\n\t"
"vlgvg %[imax],%%v1,0\n\t"
"2:\n\t"
"nop"
: [imax] "=r"(imax),[max] "=Q"(*max),[n] "+&r"(n)
: "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x)
: "cc", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16",
"v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26",
"v27", "v28", "v29", "v30", "v31");
return imax;
}

View File

/*
 * idmin_kernel_32: s390x vector inline-asm kernel over doubles.
 *
 * Keeps, per 64-bit lane, the smallest value seen so far (v0; no absolute
 * value is taken) and the element index it came from (v1), then reduces
 * the two lanes, stores the winning value through min and returns its index.
 *
 * The loop count is n >> 5 and every iteration consumes 256 bytes of x
 * (32 doubles) in two halves of 16.
 * NOTE(review): brctg is the only loop test, so n is presumably expected to
 * be a positive multiple of 32 - confirm against the caller.
 *
 * NOTE(review): this span looks like a rendered diff rather than plain C.
 * The first line carries a hunk header in front of the signature, and the
 * asm body appears twice below; the two copies differ only in their
 * output/input constraint lines.
 */
@ -31,121 +31,121 @@ static BLASLONG idmin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *min) {
BLASLONG imin;
/* v0 = first two elements, v1 = their indices {0,1}. */
__asm__("vl %%v0,0(%[x])\n\t"
"vleig %%v1,0,0\n\t"
"vleig %%v1,1,1\n\t"
/* v2 = index step of 16 per half iteration, v3 = running index base. */
"vrepig %%v2,16\n\t"
"vzero %%v3\n\t"
/* v24..v31 = relative indices {0,1},{2,3},...,{14,15} of the 8 loaded vectors. */
"vleig %%v24,0,0\n\t"
"vleig %%v24,1,1\n\t"
"vleig %%v25,2,0\n\t"
"vleig %%v25,3,1\n\t"
"vleig %%v26,4,0\n\t"
"vleig %%v26,5,1\n\t"
"vleig %%v27,6,0\n\t"
"vleig %%v27,7,1\n\t"
"vleig %%v28,8,0\n\t"
"vleig %%v28,9,1\n\t"
"vleig %%v29,10,0\n\t"
"vleig %%v29,11,1\n\t"
"vleig %%v30,12,0\n\t"
"vleig %%v30,13,1\n\t"
"vleig %%v31,14,0\n\t"
"vleig %%v31,15,1\n\t"
/* Loop count = n / 32; r1 = byte offset into x. */
"srlg %[n],%[n],5\n\t"
"xgr %%r1,%%r1\n\t"
"0:\n\t"
"pfd 1, 1024(%%r1,%[x])\n\t"
/* First half: load 16 doubles. */
"vl %%v16,0(%%r1,%[x])\n\t"
"vl %%v17,16(%%r1,%[x])\n\t"
"vl %%v18,32(%%r1,%[x])\n\t"
"vl %%v19,48(%%r1,%[x])\n\t"
"vl %%v20,64(%%r1,%[x])\n\t"
"vl %%v21,80(%%r1,%[x])\n\t"
"vl %%v22,96(%%r1,%[x])\n\t"
"vl %%v23,112(%%r1,%[x])\n\t"
/*
 * Tournament reduction 8 -> 4 -> 2 -> 1 vectors. Each compare mask
 * (second operand >= first operand of the pair) selects the first vector
 * of the pair, i.e. the smaller value, and the same mask selects the
 * matching index vector.
 */
"vfchedb %%v4,%%v17,%%v16\n\t"
"vfchedb %%v5,%%v19,%%v18\n\t"
"vfchedb %%v6,%%v21,%%v20\n\t"
"vfchedb %%v7,%%v23,%%v22\n\t"
"vsel %%v16,%%v16,%%v17,%%v4\n\t"
"vsel %%v4,%%v24,%%v25,%%v4\n\t"
"vsel %%v17,%%v18,%%v19,%%v5\n\t"
"vsel %%v5,%%v26,%%v27,%%v5\n\t"
"vsel %%v18,%%v20,%%v21,%%v6\n\t"
"vsel %%v6,%%v28,%%v29,%%v6\n\t"
"vsel %%v19,%%v22,%%v23,%%v7\n\t"
"vsel %%v7,%%v30,%%v31,%%v7\n\t"
"vfchedb %%v20,%%v17,%%v16\n\t"
"vfchedb %%v21,%%v19,%%v18\n\t"
"vsel %%v16,%%v16,%%v17,%%v20\n\t"
"vsel %%v4,%%v4,%%v5,%%v20\n\t"
"vsel %%v17,%%v18,%%v19,%%v21\n\t"
"vsel %%v5,%%v6,%%v7,%%v21\n\t"
"vfchedb %%v18,%%v17,%%v16\n\t"
"vsel %%v16,%%v16,%%v17,%%v18\n\t"
"vsel %%v4,%%v4,%%v5,%%v18\n\t"
/* Make the winning indices absolute, merge into v0/v1, advance the base. */
"vag %%v4,%%v4,%%v3\n\t"
"vfchedb %%v5,%%v16,%%v0\n\t"
"vsel %%v0,%%v0,%%v16,%%v5\n\t"
"vsel %%v1,%%v1,%%v4,%%v5\n\t"
"vag %%v3,%%v3,%%v2\n\t"
/* Second half: same sequence on the next 16 doubles (byte offsets 128..240). */
"vl %%v16,128(%%r1,%[x])\n\t"
"vl %%v17,144(%%r1,%[x])\n\t"
"vl %%v18,160(%%r1,%[x])\n\t"
"vl %%v19,176(%%r1,%[x])\n\t"
"vl %%v20,192(%%r1,%[x])\n\t"
"vl %%v21,208(%%r1,%[x])\n\t"
"vl %%v22,224(%%r1,%[x])\n\t"
"vl %%v23,240(%%r1,%[x])\n\t"
"vfchedb %%v4,%%v17,%%v16\n\t"
"vfchedb %%v5,%%v19,%%v18\n\t"
"vfchedb %%v6,%%v21,%%v20\n\t"
"vfchedb %%v7,%%v23,%%v22\n\t"
"vsel %%v16,%%v16,%%v17,%%v4\n\t"
"vsel %%v4,%%v24,%%v25,%%v4\n\t"
"vsel %%v17,%%v18,%%v19,%%v5\n\t"
"vsel %%v5,%%v26,%%v27,%%v5\n\t"
"vsel %%v18,%%v20,%%v21,%%v6\n\t"
"vsel %%v6,%%v28,%%v29,%%v6\n\t"
"vsel %%v19,%%v22,%%v23,%%v7\n\t"
"vsel %%v7,%%v30,%%v31,%%v7\n\t"
"vfchedb %%v20,%%v17,%%v16\n\t"
"vfchedb %%v21,%%v19,%%v18\n\t"
"vsel %%v16,%%v16,%%v17,%%v20\n\t"
"vsel %%v4,%%v4,%%v5,%%v20\n\t"
"vsel %%v17,%%v18,%%v19,%%v21\n\t"
"vsel %%v5,%%v6,%%v7,%%v21\n\t"
"vfchedb %%v18,%%v17,%%v16\n\t"
"vsel %%v16,%%v16,%%v17,%%v18\n\t"
"vsel %%v4,%%v4,%%v5,%%v18\n\t"
"vag %%v4,%%v4,%%v3\n\t"
"vfchedb %%v5,%%v16,%%v0\n\t"
"vsel %%v0,%%v0,%%v16,%%v5\n\t"
"vsel %%v1,%%v1,%%v4,%%v5\n\t"
"vag %%v3,%%v3,%%v2\n\t"
"agfi %%r1, 256\n\t"
"brctg %[n], 0b\n\t"
/*
 * Final reduction of the two lanes of v0/v1. v2/v3 = lane 1 broadcast.
 * Equal values: store the value and take the smaller index (vmnlg).
 * Otherwise: pick lane 1 when lane 0 is the larger value.
 */
"vrepg %%v2,%%v0,1\n\t"
"vrepg %%v3,%%v1,1\n\t"
"wfcdb %%v2,%%v0\n\t"
"jne 1f\n\t"
"vsteg %%v0,%[min],0\n\t"
"vmnlg %%v0,%%v1,%%v3\n\t"
"vlgvg %[imin],%%v0,0\n\t"
"j 2f\n\t"
"1:\n\t"
"wfchdb %%v4,%%v0,%%v2\n\t"
"vsel %%v1,%%v3,%%v1,%%v4\n\t"
"vsel %%v0,%%v2,%%v0,%%v4\n\t"
"std %%f0,%[min]\n\t"
"vlgvg %[imin],%%v1,0\n\t"
"2:\n\t"
"nop"
: [imin] "=r"(imin),[min] "=m"(*min),[n] "+&r"(n)
: "m"(*(const FLOAT (*)[n]) x),[x] "a"(x)
: "cc", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16",
"v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26",
"v27", "v28", "v29", "v30", "v31");
/*
 * NOTE(review): second copy of the same asm body starts here (its opening
 * vl line is not repeated). It ends with the Q output constraint and the
 * struct-wrapped memory input instead of the m / array-pointer forms above.
 */
"vleig %%v1,0,0\n\t"
"vleig %%v1,1,1\n\t"
"vrepig %%v2,16\n\t"
"vzero %%v3\n\t"
"vleig %%v24,0,0\n\t"
"vleig %%v24,1,1\n\t"
"vleig %%v25,2,0\n\t"
"vleig %%v25,3,1\n\t"
"vleig %%v26,4,0\n\t"
"vleig %%v26,5,1\n\t"
"vleig %%v27,6,0\n\t"
"vleig %%v27,7,1\n\t"
"vleig %%v28,8,0\n\t"
"vleig %%v28,9,1\n\t"
"vleig %%v29,10,0\n\t"
"vleig %%v29,11,1\n\t"
"vleig %%v30,12,0\n\t"
"vleig %%v30,13,1\n\t"
"vleig %%v31,14,0\n\t"
"vleig %%v31,15,1\n\t"
"srlg %[n],%[n],5\n\t"
"xgr %%r1,%%r1\n\t"
"0:\n\t"
"pfd 1, 1024(%%r1,%[x])\n\t"
"vl %%v16,0(%%r1,%[x])\n\t"
"vl %%v17,16(%%r1,%[x])\n\t"
"vl %%v18,32(%%r1,%[x])\n\t"
"vl %%v19,48(%%r1,%[x])\n\t"
"vl %%v20,64(%%r1,%[x])\n\t"
"vl %%v21,80(%%r1,%[x])\n\t"
"vl %%v22,96(%%r1,%[x])\n\t"
"vl %%v23,112(%%r1,%[x])\n\t"
"vfchedb %%v4,%%v17,%%v16\n\t"
"vfchedb %%v5,%%v19,%%v18\n\t"
"vfchedb %%v6,%%v21,%%v20\n\t"
"vfchedb %%v7,%%v23,%%v22\n\t"
"vsel %%v16,%%v16,%%v17,%%v4\n\t"
"vsel %%v4,%%v24,%%v25,%%v4\n\t"
"vsel %%v17,%%v18,%%v19,%%v5\n\t"
"vsel %%v5,%%v26,%%v27,%%v5\n\t"
"vsel %%v18,%%v20,%%v21,%%v6\n\t"
"vsel %%v6,%%v28,%%v29,%%v6\n\t"
"vsel %%v19,%%v22,%%v23,%%v7\n\t"
"vsel %%v7,%%v30,%%v31,%%v7\n\t"
"vfchedb %%v20,%%v17,%%v16\n\t"
"vfchedb %%v21,%%v19,%%v18\n\t"
"vsel %%v16,%%v16,%%v17,%%v20\n\t"
"vsel %%v4,%%v4,%%v5,%%v20\n\t"
"vsel %%v17,%%v18,%%v19,%%v21\n\t"
"vsel %%v5,%%v6,%%v7,%%v21\n\t"
"vfchedb %%v18,%%v17,%%v16\n\t"
"vsel %%v16,%%v16,%%v17,%%v18\n\t"
"vsel %%v4,%%v4,%%v5,%%v18\n\t"
"vag %%v4,%%v4,%%v3\n\t"
"vfchedb %%v5,%%v16,%%v0\n\t"
"vsel %%v0,%%v0,%%v16,%%v5\n\t"
"vsel %%v1,%%v1,%%v4,%%v5\n\t"
"vag %%v3,%%v3,%%v2\n\t"
"vl %%v16,128(%%r1,%[x])\n\t"
"vl %%v17,144(%%r1,%[x])\n\t"
"vl %%v18,160(%%r1,%[x])\n\t"
"vl %%v19,176(%%r1,%[x])\n\t"
"vl %%v20,192(%%r1,%[x])\n\t"
"vl %%v21,208(%%r1,%[x])\n\t"
"vl %%v22,224(%%r1,%[x])\n\t"
"vl %%v23,240(%%r1,%[x])\n\t"
"vfchedb %%v4,%%v17,%%v16\n\t"
"vfchedb %%v5,%%v19,%%v18\n\t"
"vfchedb %%v6,%%v21,%%v20\n\t"
"vfchedb %%v7,%%v23,%%v22\n\t"
"vsel %%v16,%%v16,%%v17,%%v4\n\t"
"vsel %%v4,%%v24,%%v25,%%v4\n\t"
"vsel %%v17,%%v18,%%v19,%%v5\n\t"
"vsel %%v5,%%v26,%%v27,%%v5\n\t"
"vsel %%v18,%%v20,%%v21,%%v6\n\t"
"vsel %%v6,%%v28,%%v29,%%v6\n\t"
"vsel %%v19,%%v22,%%v23,%%v7\n\t"
"vsel %%v7,%%v30,%%v31,%%v7\n\t"
"vfchedb %%v20,%%v17,%%v16\n\t"
"vfchedb %%v21,%%v19,%%v18\n\t"
"vsel %%v16,%%v16,%%v17,%%v20\n\t"
"vsel %%v4,%%v4,%%v5,%%v20\n\t"
"vsel %%v17,%%v18,%%v19,%%v21\n\t"
"vsel %%v5,%%v6,%%v7,%%v21\n\t"
"vfchedb %%v18,%%v17,%%v16\n\t"
"vsel %%v16,%%v16,%%v17,%%v18\n\t"
"vsel %%v4,%%v4,%%v5,%%v18\n\t"
"vag %%v4,%%v4,%%v3\n\t"
"vfchedb %%v5,%%v16,%%v0\n\t"
"vsel %%v0,%%v0,%%v16,%%v5\n\t"
"vsel %%v1,%%v1,%%v4,%%v5\n\t"
"vag %%v3,%%v3,%%v2\n\t"
"agfi %%r1, 256\n\t"
"brctg %[n], 0b\n\t"
"vrepg %%v2,%%v0,1\n\t"
"vrepg %%v3,%%v1,1\n\t"
"wfcdb %%v2,%%v0\n\t"
"jne 1f\n\t"
"vsteg %%v0,%[min],0\n\t"
"vmnlg %%v0,%%v1,%%v3\n\t"
"vlgvg %[imin],%%v0,0\n\t"
"j 2f\n\t"
"1:\n\t"
"wfchdb %%v4,%%v0,%%v2\n\t"
"vsel %%v1,%%v3,%%v1,%%v4\n\t"
"vsel %%v0,%%v2,%%v0,%%v4\n\t"
"std %%f0,%[min]\n\t"
"vlgvg %[imin],%%v1,0\n\t"
"2:\n\t"
"nop"
: [imin] "=r"(imin),[min] "=Q"(*min),[n] "+&r"(n)
: "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x)
: "cc", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16",
"v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26",
"v27", "v28", "v29", "v30", "v31");
return imin;
}

View File

/*
 * isamax_kernel_64: s390x vector inline-asm kernel over single-precision
 * floats.
 *
 * Keeps, per 32-bit lane, the largest absolute value seen so far (v0) and
 * the element index it came from. Indices are 64-bit, so they are split
 * over two registers: v1 for float lanes 0 and 2, v2 for float lanes 1
 * and 3. After the loop the four lanes are reduced, the winning value is
 * stored through amax and its index is returned.
 *
 * The loop count is n >> 6 and every iteration consumes 256 bytes of x
 * (64 floats) in two halves of 32.
 * NOTE(review): brctg is the only loop test, so n is presumably expected to
 * be a positive multiple of 64 - confirm against the caller.
 *
 * Fix: v3 is written by the asm (index step, later reused as scratch) but
 * was missing from the clobber list; it is now declared in both lists.
 *
 * NOTE(review): this span looks like a rendered diff rather than plain C.
 * The first line carries a hunk header in front of the signature, and the
 * asm body appears twice below; the two copies differ only in their
 * output/input constraint lines.
 */
@ -34,182 +34,182 @@ static BLASLONG isamax_kernel_64(BLASLONG n, FLOAT *x, FLOAT *amax) {
BLASLONG iamax;
/* v0 = absolute values of the first four elements. */
__asm__("vl %%v0,0(%[x])\n\t"
"vflpsb %%v0,%%v0\n\t"
/* v1 = indices {0,2} (even float lanes), v2 = indices {1,3} (odd lanes). */
"vleig %%v1,0,0\n\t"
"vleig %%v1,2,1\n\t"
"vleig %%v2,1,0\n\t"
"vleig %%v2,3,1\n\t"
/* v3 = index step of 32 per half iteration, v4 = running index base. */
"vrepig %%v3,32\n\t"
"vzero %%v4\n\t"
/* v24..v31 = 32-bit relative indices 0..31 of the 8 loaded vectors. */
"vleif %%v24,0,0\n\t"
"vleif %%v24,1,1\n\t"
"vleif %%v24,2,2\n\t"
"vleif %%v24,3,3\n\t"
"vleif %%v25,4,0\n\t"
"vleif %%v25,5,1\n\t"
"vleif %%v25,6,2\n\t"
"vleif %%v25,7,3\n\t"
"vleif %%v26,8,0\n\t"
"vleif %%v26,9,1\n\t"
"vleif %%v26,10,2\n\t"
"vleif %%v26,11,3\n\t"
"vleif %%v27,12,0\n\t"
"vleif %%v27,13,1\n\t"
"vleif %%v27,14,2\n\t"
"vleif %%v27,15,3\n\t"
"vleif %%v28,16,0\n\t"
"vleif %%v28,17,1\n\t"
"vleif %%v28,18,2\n\t"
"vleif %%v28,19,3\n\t"
"vleif %%v29,20,0\n\t"
"vleif %%v29,21,1\n\t"
"vleif %%v29,22,2\n\t"
"vleif %%v29,23,3\n\t"
"vleif %%v30,24,0\n\t"
"vleif %%v30,25,1\n\t"
"vleif %%v30,26,2\n\t"
"vleif %%v30,27,3\n\t"
"vleif %%v31,28,0\n\t"
"vleif %%v31,29,1\n\t"
"vleif %%v31,30,2\n\t"
"vleif %%v31,31,3\n\t"
/* Loop count = n / 64; r1 = byte offset into x. */
"srlg %[n],%[n],6\n\t"
"xgr %%r1,%%r1\n\t"
"0:\n\t"
"pfd 1, 1024(%%r1,%[x])\n\t"
/* First half: load 32 floats and take absolute values. */
"vl %%v16,0(%%r1,%[x])\n\t"
"vl %%v17,16(%%r1,%[x])\n\t"
"vl %%v18,32(%%r1,%[x])\n\t"
"vl %%v19,48(%%r1,%[x])\n\t"
"vl %%v20,64(%%r1,%[x])\n\t"
"vl %%v21,80(%%r1,%[x])\n\t"
"vl %%v22,96(%%r1,%[x])\n\t"
"vl %%v23,112(%%r1,%[x])\n\t"
"vflpsb %%v16, %%v16\n\t"
"vflpsb %%v17, %%v17\n\t"
"vflpsb %%v18, %%v18\n\t"
"vflpsb %%v19, %%v19\n\t"
"vflpsb %%v20, %%v20\n\t"
"vflpsb %%v21, %%v21\n\t"
"vflpsb %%v22, %%v22\n\t"
"vflpsb %%v23, %%v23\n\t"
/*
 * Tournament reduction 8 -> 4 -> 2 -> 1 vectors. Each compare mask
 * (first operand >= second operand of the pair) selects the first vector
 * of the pair, i.e. the larger value, and the same mask selects the
 * matching index vector.
 */
"vfchesb %%v5,%%v16,%%v17\n\t"
"vfchesb %%v6,%%v18,%%v19\n\t"
"vfchesb %%v7,%%v20,%%v21\n\t"
"vfchesb %%v8,%%v22,%%v23\n\t"
"vsel %%v16,%%v16,%%v17,%%v5\n\t"
"vsel %%v5,%%v24,%%v25,%%v5\n\t"
"vsel %%v17,%%v18,%%v19,%%v6\n\t"
"vsel %%v6,%%v26,%%v27,%%v6\n\t"
"vsel %%v18,%%v20,%%v21,%%v7\n\t"
"vsel %%v7,%%v28,%%v29,%%v7\n\t"
"vsel %%v19,%%v22,%%v23,%%v8\n\t"
"vsel %%v8,%%v30,%%v31,%%v8\n\t"
"vfchesb %%v20,%%v16,%%v17\n\t"
"vfchesb %%v21,%%v18,%%v19\n\t"
"vsel %%v16,%%v16,%%v17,%%v20\n\t"
"vsel %%v5,%%v5,%%v6,%%v20\n\t"
"vsel %%v17,%%v18,%%v19,%%v21\n\t"
"vsel %%v6,%%v7,%%v8,%%v21\n\t"
"vfchesb %%v18,%%v16,%%v17\n\t"
"vsel %%v16,%%v16,%%v17,%%v18\n\t"
"vsel %%v5,%%v5,%%v6,%%v18\n\t"
/*
 * Widen the 32-bit winning indices to 64 bits (v6 = odd lanes, v5 = even
 * lanes), make them absolute, widen the compare mask the same way, and
 * merge values into v0 and indices into v1/v2. Then advance the base.
 */
"vsegf %%v6,%%v5\n\t"
"vesrlg %%v5,%%v5,32\n\t"
"vag %%v5,%%v5,%%v4\n\t"
"vag %%v6,%%v6,%%v4\n\t"
"vfchesb %%v7,%%v0,%%v16\n\t"
"vsel %%v0,%%v0,%%v16,%%v7\n\t"
"vsegf %%v8,%%v7\n\t"
"vesrlg %%v7,%%v7,32\n\t"
"vsegf %%v7,%%v7\n\t"
"vsel %%v1,%%v1,%%v5,%%v7\n\t"
"vsel %%v2,%%v2,%%v6,%%v8\n\t"
"vag %%v4,%%v4,%%v3\n\t"
/* Second half: same sequence on the next 32 floats (byte offsets 128..240). */
"vl %%v16,128(%%r1,%[x])\n\t"
"vl %%v17,144(%%r1,%[x])\n\t"
"vl %%v18,160(%%r1,%[x])\n\t"
"vl %%v19,176(%%r1,%[x])\n\t"
"vl %%v20,192(%%r1,%[x])\n\t"
"vl %%v21,208(%%r1,%[x])\n\t"
"vl %%v22,224(%%r1,%[x])\n\t"
"vl %%v23,240(%%r1,%[x])\n\t"
"vflpsb %%v16, %%v16\n\t"
"vflpsb %%v17, %%v17\n\t"
"vflpsb %%v18, %%v18\n\t"
"vflpsb %%v19, %%v19\n\t"
"vflpsb %%v20, %%v20\n\t"
"vflpsb %%v21, %%v21\n\t"
"vflpsb %%v22, %%v22\n\t"
"vflpsb %%v23, %%v23\n\t"
"vfchesb %%v5,%%v16,%%v17\n\t"
"vfchesb %%v6,%%v18,%%v19\n\t"
"vfchesb %%v7,%%v20,%%v21\n\t"
"vfchesb %%v8,%%v22,%%v23\n\t"
"vsel %%v16,%%v16,%%v17,%%v5\n\t"
"vsel %%v5,%%v24,%%v25,%%v5\n\t"
"vsel %%v17,%%v18,%%v19,%%v6\n\t"
"vsel %%v6,%%v26,%%v27,%%v6\n\t"
"vsel %%v18,%%v20,%%v21,%%v7\n\t"
"vsel %%v7,%%v28,%%v29,%%v7\n\t"
"vsel %%v19,%%v22,%%v23,%%v8\n\t"
"vsel %%v8,%%v30,%%v31,%%v8\n\t"
"vfchesb %%v20,%%v16,%%v17\n\t"
"vfchesb %%v21,%%v18,%%v19\n\t"
"vsel %%v16,%%v16,%%v17,%%v20\n\t"
"vsel %%v5,%%v5,%%v6,%%v20\n\t"
"vsel %%v17,%%v18,%%v19,%%v21\n\t"
"vsel %%v6,%%v7,%%v8,%%v21\n\t"
"vfchesb %%v18,%%v16,%%v17\n\t"
"vsel %%v16,%%v16,%%v17,%%v18\n\t"
"vsel %%v5,%%v5,%%v6,%%v18\n\t"
"vsegf %%v6,%%v5\n\t"
"vesrlg %%v5,%%v5,32\n\t"
"vag %%v5,%%v5,%%v4\n\t"
"vag %%v6,%%v6,%%v4\n\t"
"vfchesb %%v7,%%v0,%%v16\n\t"
"vsel %%v0,%%v0,%%v16,%%v7\n\t"
"vsegf %%v8,%%v7\n\t"
"vesrlg %%v7,%%v7,32\n\t"
"vsegf %%v7,%%v7\n\t"
"vsel %%v1,%%v1,%%v5,%%v7\n\t"
"vsel %%v2,%%v2,%%v6,%%v8\n\t"
"vag %%v4,%%v4,%%v3\n\t"
"agfi %%r1, 256\n\t"
"brctg %[n], 0b\n\t"
/*
 * Reduce 4 lanes to 2: compare even lanes against odd lanes (odd lanes
 * shifted into even position in v3). The even lane wins when it is
 * larger, or when the values are equal and its index is the smaller one.
 */
"veslg %%v3,%%v0,32\n\t"
"vfchsb %%v4,%%v0,%%v3\n\t"
"vchlg %%v5,%%v2,%%v1\n\t"
"vfcesb %%v6,%%v0,%%v3\n\t"
"vn %%v5,%%v5,%%v6\n\t"
"vo %%v4,%%v4,%%v5\n\t"
"vsel %%v0,%%v0,%%v3,%%v4\n\t"
"vesrlg %%v4,%%v4,32\n\t"
"vsegf %%v4,%%v4\n\t"
"vsel %%v1,%%v1,%%v2,%%v4\n\t"
/*
 * Reduce 2 lanes to 1. v2/v3 = second candidate broadcast.
 * Equal values: store the value and take the smaller index (vmnlg).
 * Otherwise: pick the second candidate when it holds the larger value.
 */
"vrepf %%v2,%%v0,2\n\t"
"vrepg %%v3,%%v1,1\n\t"
"wfcsb %%v2,%%v0\n\t"
"jne 1f\n\t"
"vstef %%v0,%[amax],0\n\t"
"vmnlg %%v0,%%v1,%%v3\n\t"
"vlgvg %[iamax],%%v0,0\n\t"
"j 2f\n\t"
"1:\n\t"
"wfchsb %%v4,%%v2,%%v0\n\t"
"vesrlg %%v4,%%v4,32\n\t"
"vsegf %%v4,%%v4\n\t"
"vsel %%v1,%%v3,%%v1,%%v4\n\t"
"vsel %%v0,%%v2,%%v0,%%v4\n\t"
"ste %%f0,%[amax]\n\t"
"vlgvg %[iamax],%%v1,0\n\t"
"2:\n\t"
"nop"
: [iamax] "=r"(iamax),[amax] "=m"(*amax),[n] "+&r"(n)
: "m"(*(const FLOAT (*)[n]) x),[x] "a"(x)
: "cc", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v16",
"v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26",
"v27", "v28", "v29", "v30", "v31");
/*
 * NOTE(review): second copy of the same asm body starts here (its opening
 * vl line is not repeated). It ends with the Q output constraint and the
 * struct-wrapped memory input instead of the m / array-pointer forms above.
 */
"vflpsb %%v0,%%v0\n\t"
"vleig %%v1,0,0\n\t"
"vleig %%v1,2,1\n\t"
"vleig %%v2,1,0\n\t"
"vleig %%v2,3,1\n\t"
"vrepig %%v3,32\n\t"
"vzero %%v4\n\t"
"vleif %%v24,0,0\n\t"
"vleif %%v24,1,1\n\t"
"vleif %%v24,2,2\n\t"
"vleif %%v24,3,3\n\t"
"vleif %%v25,4,0\n\t"
"vleif %%v25,5,1\n\t"
"vleif %%v25,6,2\n\t"
"vleif %%v25,7,3\n\t"
"vleif %%v26,8,0\n\t"
"vleif %%v26,9,1\n\t"
"vleif %%v26,10,2\n\t"
"vleif %%v26,11,3\n\t"
"vleif %%v27,12,0\n\t"
"vleif %%v27,13,1\n\t"
"vleif %%v27,14,2\n\t"
"vleif %%v27,15,3\n\t"
"vleif %%v28,16,0\n\t"
"vleif %%v28,17,1\n\t"
"vleif %%v28,18,2\n\t"
"vleif %%v28,19,3\n\t"
"vleif %%v29,20,0\n\t"
"vleif %%v29,21,1\n\t"
"vleif %%v29,22,2\n\t"
"vleif %%v29,23,3\n\t"
"vleif %%v30,24,0\n\t"
"vleif %%v30,25,1\n\t"
"vleif %%v30,26,2\n\t"
"vleif %%v30,27,3\n\t"
"vleif %%v31,28,0\n\t"
"vleif %%v31,29,1\n\t"
"vleif %%v31,30,2\n\t"
"vleif %%v31,31,3\n\t"
"srlg %[n],%[n],6\n\t"
"xgr %%r1,%%r1\n\t"
"0:\n\t"
"pfd 1, 1024(%%r1,%[x])\n\t"
"vl %%v16,0(%%r1,%[x])\n\t"
"vl %%v17,16(%%r1,%[x])\n\t"
"vl %%v18,32(%%r1,%[x])\n\t"
"vl %%v19,48(%%r1,%[x])\n\t"
"vl %%v20,64(%%r1,%[x])\n\t"
"vl %%v21,80(%%r1,%[x])\n\t"
"vl %%v22,96(%%r1,%[x])\n\t"
"vl %%v23,112(%%r1,%[x])\n\t"
"vflpsb %%v16, %%v16\n\t"
"vflpsb %%v17, %%v17\n\t"
"vflpsb %%v18, %%v18\n\t"
"vflpsb %%v19, %%v19\n\t"
"vflpsb %%v20, %%v20\n\t"
"vflpsb %%v21, %%v21\n\t"
"vflpsb %%v22, %%v22\n\t"
"vflpsb %%v23, %%v23\n\t"
"vfchesb %%v5,%%v16,%%v17\n\t"
"vfchesb %%v6,%%v18,%%v19\n\t"
"vfchesb %%v7,%%v20,%%v21\n\t"
"vfchesb %%v8,%%v22,%%v23\n\t"
"vsel %%v16,%%v16,%%v17,%%v5\n\t"
"vsel %%v5,%%v24,%%v25,%%v5\n\t"
"vsel %%v17,%%v18,%%v19,%%v6\n\t"
"vsel %%v6,%%v26,%%v27,%%v6\n\t"
"vsel %%v18,%%v20,%%v21,%%v7\n\t"
"vsel %%v7,%%v28,%%v29,%%v7\n\t"
"vsel %%v19,%%v22,%%v23,%%v8\n\t"
"vsel %%v8,%%v30,%%v31,%%v8\n\t"
"vfchesb %%v20,%%v16,%%v17\n\t"
"vfchesb %%v21,%%v18,%%v19\n\t"
"vsel %%v16,%%v16,%%v17,%%v20\n\t"
"vsel %%v5,%%v5,%%v6,%%v20\n\t"
"vsel %%v17,%%v18,%%v19,%%v21\n\t"
"vsel %%v6,%%v7,%%v8,%%v21\n\t"
"vfchesb %%v18,%%v16,%%v17\n\t"
"vsel %%v16,%%v16,%%v17,%%v18\n\t"
"vsel %%v5,%%v5,%%v6,%%v18\n\t"
"vsegf %%v6,%%v5\n\t"
"vesrlg %%v5,%%v5,32\n\t"
"vag %%v5,%%v5,%%v4\n\t"
"vag %%v6,%%v6,%%v4\n\t"
"vfchesb %%v7,%%v0,%%v16\n\t"
"vsel %%v0,%%v0,%%v16,%%v7\n\t"
"vsegf %%v8,%%v7\n\t"
"vesrlg %%v7,%%v7,32\n\t"
"vsegf %%v7,%%v7\n\t"
"vsel %%v1,%%v1,%%v5,%%v7\n\t"
"vsel %%v2,%%v2,%%v6,%%v8\n\t"
"vag %%v4,%%v4,%%v3\n\t"
"vl %%v16,128(%%r1,%[x])\n\t"
"vl %%v17,144(%%r1,%[x])\n\t"
"vl %%v18,160(%%r1,%[x])\n\t"
"vl %%v19,176(%%r1,%[x])\n\t"
"vl %%v20,192(%%r1,%[x])\n\t"
"vl %%v21,208(%%r1,%[x])\n\t"
"vl %%v22,224(%%r1,%[x])\n\t"
"vl %%v23,240(%%r1,%[x])\n\t"
"vflpsb %%v16, %%v16\n\t"
"vflpsb %%v17, %%v17\n\t"
"vflpsb %%v18, %%v18\n\t"
"vflpsb %%v19, %%v19\n\t"
"vflpsb %%v20, %%v20\n\t"
"vflpsb %%v21, %%v21\n\t"
"vflpsb %%v22, %%v22\n\t"
"vflpsb %%v23, %%v23\n\t"
"vfchesb %%v5,%%v16,%%v17\n\t"
"vfchesb %%v6,%%v18,%%v19\n\t"
"vfchesb %%v7,%%v20,%%v21\n\t"
"vfchesb %%v8,%%v22,%%v23\n\t"
"vsel %%v16,%%v16,%%v17,%%v5\n\t"
"vsel %%v5,%%v24,%%v25,%%v5\n\t"
"vsel %%v17,%%v18,%%v19,%%v6\n\t"
"vsel %%v6,%%v26,%%v27,%%v6\n\t"
"vsel %%v18,%%v20,%%v21,%%v7\n\t"
"vsel %%v7,%%v28,%%v29,%%v7\n\t"
"vsel %%v19,%%v22,%%v23,%%v8\n\t"
"vsel %%v8,%%v30,%%v31,%%v8\n\t"
"vfchesb %%v20,%%v16,%%v17\n\t"
"vfchesb %%v21,%%v18,%%v19\n\t"
"vsel %%v16,%%v16,%%v17,%%v20\n\t"
"vsel %%v5,%%v5,%%v6,%%v20\n\t"
"vsel %%v17,%%v18,%%v19,%%v21\n\t"
"vsel %%v6,%%v7,%%v8,%%v21\n\t"
"vfchesb %%v18,%%v16,%%v17\n\t"
"vsel %%v16,%%v16,%%v17,%%v18\n\t"
"vsel %%v5,%%v5,%%v6,%%v18\n\t"
"vsegf %%v6,%%v5\n\t"
"vesrlg %%v5,%%v5,32\n\t"
"vag %%v5,%%v5,%%v4\n\t"
"vag %%v6,%%v6,%%v4\n\t"
"vfchesb %%v7,%%v0,%%v16\n\t"
"vsel %%v0,%%v0,%%v16,%%v7\n\t"
"vsegf %%v8,%%v7\n\t"
"vesrlg %%v7,%%v7,32\n\t"
"vsegf %%v7,%%v7\n\t"
"vsel %%v1,%%v1,%%v5,%%v7\n\t"
"vsel %%v2,%%v2,%%v6,%%v8\n\t"
"vag %%v4,%%v4,%%v3\n\t"
"agfi %%r1, 256\n\t"
"brctg %[n], 0b\n\t"
"veslg %%v3,%%v0,32\n\t"
"vfchsb %%v4,%%v0,%%v3\n\t"
"vchlg %%v5,%%v2,%%v1\n\t"
"vfcesb %%v6,%%v0,%%v3\n\t"
"vn %%v5,%%v5,%%v6\n\t"
"vo %%v4,%%v4,%%v5\n\t"
"vsel %%v0,%%v0,%%v3,%%v4\n\t"
"vesrlg %%v4,%%v4,32\n\t"
"vsegf %%v4,%%v4\n\t"
"vsel %%v1,%%v1,%%v2,%%v4\n\t"
"vrepf %%v2,%%v0,2\n\t"
"vrepg %%v3,%%v1,1\n\t"
"wfcsb %%v2,%%v0\n\t"
"jne 1f\n\t"
"vstef %%v0,%[amax],0\n\t"
"vmnlg %%v0,%%v1,%%v3\n\t"
"vlgvg %[iamax],%%v0,0\n\t"
"j 2f\n\t"
"1:\n\t"
"wfchsb %%v4,%%v2,%%v0\n\t"
"vesrlg %%v4,%%v4,32\n\t"
"vsegf %%v4,%%v4\n\t"
"vsel %%v1,%%v3,%%v1,%%v4\n\t"
"vsel %%v0,%%v2,%%v0,%%v4\n\t"
"ste %%f0,%[amax]\n\t"
"vlgvg %[iamax],%%v1,0\n\t"
"2:\n\t"
"nop"
: [iamax] "=r"(iamax),[amax] "=Q"(*amax),[n] "+&r"(n)
: "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x)
: "cc", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v16",
"v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26",
"v27", "v28", "v29", "v30", "v31");
return iamax;
}

View File

@ -34,182 +34,182 @@ static BLASLONG isamin_kernel_64(BLASLONG n, FLOAT *x, FLOAT *amin) {
/*
 * isamin_kernel_64: s390x vector-facility inline-assembly kernel.
 * Walks x in blocks of 64 single-precision values (the loop count is n >> 6
 * and every iteration consumes 256 bytes), takes absolute values, and tracks
 * the smallest value together with its zero-based element index.
 * The winning value is stored through amin; the index is returned in iamin.
 *
 * NOTE(review): this text is a rendered diff hunk. The instruction sequence
 * appears twice below and the two copies differ in the operand-constraint
 * lines ("=m" vs "=Q" for *amin, array-pointer cast vs struct cast for the
 * memory input). Presumably the first copy is the pre-commit text and the
 * second the post-commit text; the opening __asm__ line is shown only once.
 * NOTE(review): the loop is bottom-tested (brctg), so n < 64 would not give
 * a zero-trip loop; presumably callers guarantee n >= 64 - confirm.
 */
BLASLONG iamin;
/* Setup: v0 = |x[0..3]| is the running per-lane minimum. */
__asm__("vl %%v0,0(%[x])\n\t"
"vflpsb %%v0,%%v0\n\t"
/* Running indices as doublewords: v1 = {0,2} (even lanes), v2 = {1,3} (odd lanes). */
"vleig %%v1,0,0\n\t"
"vleig %%v1,2,1\n\t"
"vleig %%v2,1,0\n\t"
"vleig %%v2,3,1\n\t"
/* v3 = index step added after each 32-value half, v4 = running index base (0). */
"vrepig %%v3,32\n\t"
"vzero %%v4\n\t"
/* v24..v31 = constant word tables holding the in-block indices 0..31. */
"vleif %%v24,0,0\n\t"
"vleif %%v24,1,1\n\t"
"vleif %%v24,2,2\n\t"
"vleif %%v24,3,3\n\t"
"vleif %%v25,4,0\n\t"
"vleif %%v25,5,1\n\t"
"vleif %%v25,6,2\n\t"
"vleif %%v25,7,3\n\t"
"vleif %%v26,8,0\n\t"
"vleif %%v26,9,1\n\t"
"vleif %%v26,10,2\n\t"
"vleif %%v26,11,3\n\t"
"vleif %%v27,12,0\n\t"
"vleif %%v27,13,1\n\t"
"vleif %%v27,14,2\n\t"
"vleif %%v27,15,3\n\t"
"vleif %%v28,16,0\n\t"
"vleif %%v28,17,1\n\t"
"vleif %%v28,18,2\n\t"
"vleif %%v28,19,3\n\t"
"vleif %%v29,20,0\n\t"
"vleif %%v29,21,1\n\t"
"vleif %%v29,22,2\n\t"
"vleif %%v29,23,3\n\t"
"vleif %%v30,24,0\n\t"
"vleif %%v30,25,1\n\t"
"vleif %%v30,26,2\n\t"
"vleif %%v30,27,3\n\t"
"vleif %%v31,28,0\n\t"
"vleif %%v31,29,1\n\t"
"vleif %%v31,30,2\n\t"
"vleif %%v31,31,3\n\t"
/* Loop count = n / 64; r1 = byte offset into x, starting at 0. */
"srlg %[n],%[n],6\n\t"
"xgr %%r1,%%r1\n\t"
"0:\n\t"
/* First half: prefetch ahead, load 32 values (128 bytes), take absolute values. */
"pfd 1, 1024(%%r1,%[x])\n\t"
"vl %%v16,0(%%r1,%[x])\n\t"
"vl %%v17,16(%%r1,%[x])\n\t"
"vl %%v18,32(%%r1,%[x])\n\t"
"vl %%v19,48(%%r1,%[x])\n\t"
"vl %%v20,64(%%r1,%[x])\n\t"
"vl %%v21,80(%%r1,%[x])\n\t"
"vl %%v22,96(%%r1,%[x])\n\t"
"vl %%v23,112(%%r1,%[x])\n\t"
"vflpsb %%v16, %%v16\n\t"
"vflpsb %%v17, %%v17\n\t"
"vflpsb %%v18, %%v18\n\t"
"vflpsb %%v19, %%v19\n\t"
"vflpsb %%v20, %%v20\n\t"
"vflpsb %%v21, %%v21\n\t"
"vflpsb %%v22, %%v22\n\t"
"vflpsb %%v23, %%v23\n\t"
/* Pairwise reduction tree: each >= mask keeps the smaller value (vsel on the
   data) and the matching index (vsel on the index tables); on a tie the
   first vsel source, i.e. the lower index, is kept. */
"vfchesb %%v5,%%v17,%%v16\n\t"
"vfchesb %%v6,%%v19,%%v18\n\t"
"vfchesb %%v7,%%v21,%%v20\n\t"
"vfchesb %%v8,%%v23,%%v22\n\t"
"vsel %%v16,%%v16,%%v17,%%v5\n\t"
"vsel %%v5,%%v24,%%v25,%%v5\n\t"
"vsel %%v17,%%v18,%%v19,%%v6\n\t"
"vsel %%v6,%%v26,%%v27,%%v6\n\t"
"vsel %%v18,%%v20,%%v21,%%v7\n\t"
"vsel %%v7,%%v28,%%v29,%%v7\n\t"
"vsel %%v19,%%v22,%%v23,%%v8\n\t"
"vsel %%v8,%%v30,%%v31,%%v8\n\t"
"vfchesb %%v20,%%v17,%%v16\n\t"
"vfchesb %%v21,%%v19,%%v18\n\t"
"vsel %%v16,%%v16,%%v17,%%v20\n\t"
"vsel %%v5,%%v5,%%v6,%%v20\n\t"
"vsel %%v17,%%v18,%%v19,%%v21\n\t"
"vsel %%v6,%%v7,%%v8,%%v21\n\t"
"vfchesb %%v18,%%v17,%%v16\n\t"
"vsel %%v16,%%v16,%%v17,%%v18\n\t"
"vsel %%v5,%%v5,%%v6,%%v18\n\t"
/* Widen the four word indices to doublewords (odd lanes -> v6, even lanes -> v5)
   and add the running index base v4. */
"vsegf %%v6,%%v5\n\t"
"vesrlg %%v5,%%v5,32\n\t"
"vag %%v5,%%v5,%%v4\n\t"
"vag %%v6,%%v6,%%v4\n\t"
/* Merge into the running minimum: keep v0 where the block value is >= v0,
   otherwise take the block value; widen the mask to update v1/v2 likewise. */
"vfchesb %%v7,%%v16,%%v0\n\t"
"vsel %%v0,%%v0,%%v16,%%v7\n\t"
"vsegf %%v8,%%v7\n\t"
"vesrlg %%v7,%%v7,32\n\t"
"vsegf %%v7,%%v7\n\t"
"vsel %%v1,%%v1,%%v5,%%v7\n\t"
"vsel %%v2,%%v2,%%v6,%%v8\n\t"
"vag %%v4,%%v4,%%v3\n\t"
/* Second half: identical sequence for the next 32 values (bytes 128..255). */
"vl %%v16,128(%%r1,%[x])\n\t"
"vl %%v17,144(%%r1,%[x])\n\t"
"vl %%v18,160(%%r1,%[x])\n\t"
"vl %%v19,176(%%r1,%[x])\n\t"
"vl %%v20,192(%%r1,%[x])\n\t"
"vl %%v21,208(%%r1,%[x])\n\t"
"vl %%v22,224(%%r1,%[x])\n\t"
"vl %%v23,240(%%r1,%[x])\n\t"
"vflpsb %%v16, %%v16\n\t"
"vflpsb %%v17, %%v17\n\t"
"vflpsb %%v18, %%v18\n\t"
"vflpsb %%v19, %%v19\n\t"
"vflpsb %%v20, %%v20\n\t"
"vflpsb %%v21, %%v21\n\t"
"vflpsb %%v22, %%v22\n\t"
"vflpsb %%v23, %%v23\n\t"
"vfchesb %%v5,%%v17,%%v16\n\t"
"vfchesb %%v6,%%v19,%%v18\n\t"
"vfchesb %%v7,%%v21,%%v20\n\t"
"vfchesb %%v8,%%v23,%%v22\n\t"
"vsel %%v16,%%v16,%%v17,%%v5\n\t"
"vsel %%v5,%%v24,%%v25,%%v5\n\t"
"vsel %%v17,%%v18,%%v19,%%v6\n\t"
"vsel %%v6,%%v26,%%v27,%%v6\n\t"
"vsel %%v18,%%v20,%%v21,%%v7\n\t"
"vsel %%v7,%%v28,%%v29,%%v7\n\t"
"vsel %%v19,%%v22,%%v23,%%v8\n\t"
"vsel %%v8,%%v30,%%v31,%%v8\n\t"
"vfchesb %%v20,%%v17,%%v16\n\t"
"vfchesb %%v21,%%v19,%%v18\n\t"
"vsel %%v16,%%v16,%%v17,%%v20\n\t"
"vsel %%v5,%%v5,%%v6,%%v20\n\t"
"vsel %%v17,%%v18,%%v19,%%v21\n\t"
"vsel %%v6,%%v7,%%v8,%%v21\n\t"
"vfchesb %%v18,%%v17,%%v16\n\t"
"vsel %%v16,%%v16,%%v17,%%v18\n\t"
"vsel %%v5,%%v5,%%v6,%%v18\n\t"
"vsegf %%v6,%%v5\n\t"
"vesrlg %%v5,%%v5,32\n\t"
"vag %%v5,%%v5,%%v4\n\t"
"vag %%v6,%%v6,%%v4\n\t"
"vfchesb %%v7,%%v16,%%v0\n\t"
"vsel %%v0,%%v0,%%v16,%%v7\n\t"
"vsegf %%v8,%%v7\n\t"
"vesrlg %%v7,%%v7,32\n\t"
"vsegf %%v7,%%v7\n\t"
"vsel %%v1,%%v1,%%v5,%%v7\n\t"
"vsel %%v2,%%v2,%%v6,%%v8\n\t"
"vag %%v4,%%v4,%%v3\n\t"
/* Advance 256 bytes; brctg decrements the count and loops while it is non-zero. */
"agfi %%r1, 256\n\t"
"brctg %[n], 0b\n\t"
/* Horizontal reduction, step 1: compare the odd lanes (shifted into v3)
   against the even lanes; equal values resolve to the smaller index. */
"veslg %%v3,%%v0,32\n\t"
"vfchsb %%v4,%%v3,%%v0\n\t"
"vchlg %%v5,%%v2,%%v1\n\t"
"vfcesb %%v6,%%v0,%%v3\n\t"
"vn %%v5,%%v5,%%v6\n\t"
"vo %%v4,%%v4,%%v5\n\t"
"vsel %%v0,%%v0,%%v3,%%v4\n\t"
"vesrlg %%v4,%%v4,32\n\t"
"vsegf %%v4,%%v4\n\t"
"vsel %%v1,%%v1,%%v2,%%v4\n\t"
/* Step 2: compare the lane-2 candidate (replicated into v2/v3) with lane 0. */
"vrepf %%v2,%%v0,2\n\t"
"vrepg %%v3,%%v1,1\n\t"
"wfcsb %%v2,%%v0\n\t"
"jne 1f\n\t"
/* Equal values: store the value and return the smaller of the two indices. */
"vstef %%v0,%[amin],0\n\t"
"vmnlg %%v0,%%v1,%%v3\n\t"
"vlgvg %[iamin],%%v0,0\n\t"
"j 2f\n\t"
"1:\n\t"
/* Different values: if lane 0 is greater, take the lane-2 value and index. */
"wfchsb %%v4,%%v0,%%v2\n\t"
"vesrlg %%v4,%%v4,32\n\t"
"vsegf %%v4,%%v4\n\t"
"vsel %%v1,%%v3,%%v1,%%v4\n\t"
"vsel %%v0,%%v2,%%v0,%%v4\n\t"
"ste %%f0,%[amin]\n\t"
"vlgvg %[iamin],%%v1,0\n\t"
"2:\n\t"
"nop"
/* Outputs: iamin in a register, *amin as a memory operand, n read-write and
   early-clobbered. Inputs: the unnamed "m" operand describes all n values of
   x as read; x itself is passed in an address register. */
: [iamin] "=r"(iamin),[amin] "=m"(*amin),[n] "+&r"(n)
: "m"(*(const FLOAT (*)[n]) x),[x] "a"(x)
/* NOTE(review): v3 is written above (vrepig, veslg, vrepg) but is not named
   in this clobber list. */
: "cc", "r1", "v0", "v1", "v2", "v4", "v5", "v6", "v7", "v8", "v16",
"v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26",
"v27", "v28", "v29", "v30", "v31");
/* Second copy of the same instruction sequence starts here, continuing from
   the shared opening __asm__ line; only the constraint lines at its end differ. */
"vflpsb %%v0,%%v0\n\t"
"vleig %%v1,0,0\n\t"
"vleig %%v1,2,1\n\t"
"vleig %%v2,1,0\n\t"
"vleig %%v2,3,1\n\t"
"vrepig %%v3,32\n\t"
"vzero %%v4\n\t"
"vleif %%v24,0,0\n\t"
"vleif %%v24,1,1\n\t"
"vleif %%v24,2,2\n\t"
"vleif %%v24,3,3\n\t"
"vleif %%v25,4,0\n\t"
"vleif %%v25,5,1\n\t"
"vleif %%v25,6,2\n\t"
"vleif %%v25,7,3\n\t"
"vleif %%v26,8,0\n\t"
"vleif %%v26,9,1\n\t"
"vleif %%v26,10,2\n\t"
"vleif %%v26,11,3\n\t"
"vleif %%v27,12,0\n\t"
"vleif %%v27,13,1\n\t"
"vleif %%v27,14,2\n\t"
"vleif %%v27,15,3\n\t"
"vleif %%v28,16,0\n\t"
"vleif %%v28,17,1\n\t"
"vleif %%v28,18,2\n\t"
"vleif %%v28,19,3\n\t"
"vleif %%v29,20,0\n\t"
"vleif %%v29,21,1\n\t"
"vleif %%v29,22,2\n\t"
"vleif %%v29,23,3\n\t"
"vleif %%v30,24,0\n\t"
"vleif %%v30,25,1\n\t"
"vleif %%v30,26,2\n\t"
"vleif %%v30,27,3\n\t"
"vleif %%v31,28,0\n\t"
"vleif %%v31,29,1\n\t"
"vleif %%v31,30,2\n\t"
"vleif %%v31,31,3\n\t"
"srlg %[n],%[n],6\n\t"
"xgr %%r1,%%r1\n\t"
"0:\n\t"
/* Loop body, first half (bytes 0..127 of the current block). */
"pfd 1, 1024(%%r1,%[x])\n\t"
"vl %%v16,0(%%r1,%[x])\n\t"
"vl %%v17,16(%%r1,%[x])\n\t"
"vl %%v18,32(%%r1,%[x])\n\t"
"vl %%v19,48(%%r1,%[x])\n\t"
"vl %%v20,64(%%r1,%[x])\n\t"
"vl %%v21,80(%%r1,%[x])\n\t"
"vl %%v22,96(%%r1,%[x])\n\t"
"vl %%v23,112(%%r1,%[x])\n\t"
"vflpsb %%v16, %%v16\n\t"
"vflpsb %%v17, %%v17\n\t"
"vflpsb %%v18, %%v18\n\t"
"vflpsb %%v19, %%v19\n\t"
"vflpsb %%v20, %%v20\n\t"
"vflpsb %%v21, %%v21\n\t"
"vflpsb %%v22, %%v22\n\t"
"vflpsb %%v23, %%v23\n\t"
"vfchesb %%v5,%%v17,%%v16\n\t"
"vfchesb %%v6,%%v19,%%v18\n\t"
"vfchesb %%v7,%%v21,%%v20\n\t"
"vfchesb %%v8,%%v23,%%v22\n\t"
"vsel %%v16,%%v16,%%v17,%%v5\n\t"
"vsel %%v5,%%v24,%%v25,%%v5\n\t"
"vsel %%v17,%%v18,%%v19,%%v6\n\t"
"vsel %%v6,%%v26,%%v27,%%v6\n\t"
"vsel %%v18,%%v20,%%v21,%%v7\n\t"
"vsel %%v7,%%v28,%%v29,%%v7\n\t"
"vsel %%v19,%%v22,%%v23,%%v8\n\t"
"vsel %%v8,%%v30,%%v31,%%v8\n\t"
"vfchesb %%v20,%%v17,%%v16\n\t"
"vfchesb %%v21,%%v19,%%v18\n\t"
"vsel %%v16,%%v16,%%v17,%%v20\n\t"
"vsel %%v5,%%v5,%%v6,%%v20\n\t"
"vsel %%v17,%%v18,%%v19,%%v21\n\t"
"vsel %%v6,%%v7,%%v8,%%v21\n\t"
"vfchesb %%v18,%%v17,%%v16\n\t"
"vsel %%v16,%%v16,%%v17,%%v18\n\t"
"vsel %%v5,%%v5,%%v6,%%v18\n\t"
"vsegf %%v6,%%v5\n\t"
"vesrlg %%v5,%%v5,32\n\t"
"vag %%v5,%%v5,%%v4\n\t"
"vag %%v6,%%v6,%%v4\n\t"
"vfchesb %%v7,%%v16,%%v0\n\t"
"vsel %%v0,%%v0,%%v16,%%v7\n\t"
"vsegf %%v8,%%v7\n\t"
"vesrlg %%v7,%%v7,32\n\t"
"vsegf %%v7,%%v7\n\t"
"vsel %%v1,%%v1,%%v5,%%v7\n\t"
"vsel %%v2,%%v2,%%v6,%%v8\n\t"
"vag %%v4,%%v4,%%v3\n\t"
/* Loop body, second half (bytes 128..255 of the current block). */
"vl %%v16,128(%%r1,%[x])\n\t"
"vl %%v17,144(%%r1,%[x])\n\t"
"vl %%v18,160(%%r1,%[x])\n\t"
"vl %%v19,176(%%r1,%[x])\n\t"
"vl %%v20,192(%%r1,%[x])\n\t"
"vl %%v21,208(%%r1,%[x])\n\t"
"vl %%v22,224(%%r1,%[x])\n\t"
"vl %%v23,240(%%r1,%[x])\n\t"
"vflpsb %%v16, %%v16\n\t"
"vflpsb %%v17, %%v17\n\t"
"vflpsb %%v18, %%v18\n\t"
"vflpsb %%v19, %%v19\n\t"
"vflpsb %%v20, %%v20\n\t"
"vflpsb %%v21, %%v21\n\t"
"vflpsb %%v22, %%v22\n\t"
"vflpsb %%v23, %%v23\n\t"
"vfchesb %%v5,%%v17,%%v16\n\t"
"vfchesb %%v6,%%v19,%%v18\n\t"
"vfchesb %%v7,%%v21,%%v20\n\t"
"vfchesb %%v8,%%v23,%%v22\n\t"
"vsel %%v16,%%v16,%%v17,%%v5\n\t"
"vsel %%v5,%%v24,%%v25,%%v5\n\t"
"vsel %%v17,%%v18,%%v19,%%v6\n\t"
"vsel %%v6,%%v26,%%v27,%%v6\n\t"
"vsel %%v18,%%v20,%%v21,%%v7\n\t"
"vsel %%v7,%%v28,%%v29,%%v7\n\t"
"vsel %%v19,%%v22,%%v23,%%v8\n\t"
"vsel %%v8,%%v30,%%v31,%%v8\n\t"
"vfchesb %%v20,%%v17,%%v16\n\t"
"vfchesb %%v21,%%v19,%%v18\n\t"
"vsel %%v16,%%v16,%%v17,%%v20\n\t"
"vsel %%v5,%%v5,%%v6,%%v20\n\t"
"vsel %%v17,%%v18,%%v19,%%v21\n\t"
"vsel %%v6,%%v7,%%v8,%%v21\n\t"
"vfchesb %%v18,%%v17,%%v16\n\t"
"vsel %%v16,%%v16,%%v17,%%v18\n\t"
"vsel %%v5,%%v5,%%v6,%%v18\n\t"
"vsegf %%v6,%%v5\n\t"
"vesrlg %%v5,%%v5,32\n\t"
"vag %%v5,%%v5,%%v4\n\t"
"vag %%v6,%%v6,%%v4\n\t"
"vfchesb %%v7,%%v16,%%v0\n\t"
"vsel %%v0,%%v0,%%v16,%%v7\n\t"
"vsegf %%v8,%%v7\n\t"
"vesrlg %%v7,%%v7,32\n\t"
"vsegf %%v7,%%v7\n\t"
"vsel %%v1,%%v1,%%v5,%%v7\n\t"
"vsel %%v2,%%v2,%%v6,%%v8\n\t"
"vag %%v4,%%v4,%%v3\n\t"
"agfi %%r1, 256\n\t"
"brctg %[n], 0b\n\t"
/* Horizontal reduction of the four lanes down to one value and index. */
"veslg %%v3,%%v0,32\n\t"
"vfchsb %%v4,%%v3,%%v0\n\t"
"vchlg %%v5,%%v2,%%v1\n\t"
"vfcesb %%v6,%%v0,%%v3\n\t"
"vn %%v5,%%v5,%%v6\n\t"
"vo %%v4,%%v4,%%v5\n\t"
"vsel %%v0,%%v0,%%v3,%%v4\n\t"
"vesrlg %%v4,%%v4,32\n\t"
"vsegf %%v4,%%v4\n\t"
"vsel %%v1,%%v1,%%v2,%%v4\n\t"
"vrepf %%v2,%%v0,2\n\t"
"vrepg %%v3,%%v1,1\n\t"
"wfcsb %%v2,%%v0\n\t"
"jne 1f\n\t"
"vstef %%v0,%[amin],0\n\t"
"vmnlg %%v0,%%v1,%%v3\n\t"
"vlgvg %[iamin],%%v0,0\n\t"
"j 2f\n\t"
"1:\n\t"
"wfchsb %%v4,%%v0,%%v2\n\t"
"vesrlg %%v4,%%v4,32\n\t"
"vsegf %%v4,%%v4\n\t"
"vsel %%v1,%%v3,%%v1,%%v4\n\t"
"vsel %%v0,%%v2,%%v0,%%v4\n\t"
"ste %%f0,%[amin]\n\t"
"vlgvg %[iamin],%%v1,0\n\t"
"2:\n\t"
"nop"
/* This copy uses the "Q" memory constraint for *amin and describes x through
   a struct wrapping FLOAT x[n]; the remaining operands match the first copy.
   NOTE(review): v3 is also missing from this clobber list. */
: [iamin] "=r"(iamin),[amin] "=Q"(*amin),[n] "+&r"(n)
: "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x)
: "cc", "r1", "v0", "v1", "v2", "v4", "v5", "v6", "v7", "v8", "v16",
"v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26",
"v27", "v28", "v29", "v30", "v31");
return iamin;
}

View File

@ -31,165 +31,165 @@ static BLASLONG ismax_kernel_64(BLASLONG n, FLOAT *x, FLOAT *max) {
/*
 * ismax_kernel_64: s390x vector-facility inline-assembly kernel.
 * Walks x in blocks of 64 single-precision values (the loop count is n >> 6
 * and every iteration consumes 256 bytes) and tracks the largest value
 * together with its zero-based element index. Values are compared as loaded;
 * no absolute value is taken in this sequence.
 * The winning value is stored through max; the index is returned in imax.
 *
 * NOTE(review): this text is a rendered diff hunk. The instruction sequence
 * appears twice below and the two copies differ in the operand-constraint
 * lines ("=m" vs "=Q" for *max, array-pointer cast vs struct cast for the
 * memory input). Presumably the first copy is the pre-commit text and the
 * second the post-commit text; the opening __asm__ line is shown only once.
 * NOTE(review): the loop is bottom-tested (brctg), so n < 64 would not give
 * a zero-trip loop; presumably callers guarantee n >= 64 - confirm.
 */
BLASLONG imax;
/* Setup: v0 = x[0..3] is the running per-lane maximum. */
__asm__("vl %%v0,0(%[x])\n\t"
/* Running indices as doublewords: v1 = {0,2} (even lanes), v2 = {1,3} (odd lanes). */
"vleig %%v1,0,0\n\t"
"vleig %%v1,2,1\n\t"
"vleig %%v2,1,0\n\t"
"vleig %%v2,3,1\n\t"
/* v3 = index step added after each 32-value half, v4 = running index base (0). */
"vrepig %%v3,32\n\t"
"vzero %%v4\n\t"
/* v24..v31 = constant word tables holding the in-block indices 0..31. */
"vleif %%v24,0,0\n\t"
"vleif %%v24,1,1\n\t"
"vleif %%v24,2,2\n\t"
"vleif %%v24,3,3\n\t"
"vleif %%v25,4,0\n\t"
"vleif %%v25,5,1\n\t"
"vleif %%v25,6,2\n\t"
"vleif %%v25,7,3\n\t"
"vleif %%v26,8,0\n\t"
"vleif %%v26,9,1\n\t"
"vleif %%v26,10,2\n\t"
"vleif %%v26,11,3\n\t"
"vleif %%v27,12,0\n\t"
"vleif %%v27,13,1\n\t"
"vleif %%v27,14,2\n\t"
"vleif %%v27,15,3\n\t"
"vleif %%v28,16,0\n\t"
"vleif %%v28,17,1\n\t"
"vleif %%v28,18,2\n\t"
"vleif %%v28,19,3\n\t"
"vleif %%v29,20,0\n\t"
"vleif %%v29,21,1\n\t"
"vleif %%v29,22,2\n\t"
"vleif %%v29,23,3\n\t"
"vleif %%v30,24,0\n\t"
"vleif %%v30,25,1\n\t"
"vleif %%v30,26,2\n\t"
"vleif %%v30,27,3\n\t"
"vleif %%v31,28,0\n\t"
"vleif %%v31,29,1\n\t"
"vleif %%v31,30,2\n\t"
"vleif %%v31,31,3\n\t"
/* Loop count = n / 64; r1 = byte offset into x, starting at 0. */
"srlg %[n],%[n],6\n\t"
"xgr %%r1,%%r1\n\t"
"0:\n\t"
/* First half: prefetch ahead and load 32 values (128 bytes). */
"pfd 1, 1024(%%r1,%[x])\n\t"
"vl %%v16,0(%%r1,%[x])\n\t"
"vl %%v17,16(%%r1,%[x])\n\t"
"vl %%v18,32(%%r1,%[x])\n\t"
"vl %%v19,48(%%r1,%[x])\n\t"
"vl %%v20,64(%%r1,%[x])\n\t"
"vl %%v21,80(%%r1,%[x])\n\t"
"vl %%v22,96(%%r1,%[x])\n\t"
"vl %%v23,112(%%r1,%[x])\n\t"
/* Pairwise reduction tree: each >= mask keeps the larger value (vsel on the
   data) and the matching index (vsel on the index tables); on a tie the
   first vsel source, i.e. the lower index, is kept. */
"vfchesb %%v5,%%v16,%%v17\n\t"
"vfchesb %%v6,%%v18,%%v19\n\t"
"vfchesb %%v7,%%v20,%%v21\n\t"
"vfchesb %%v8,%%v22,%%v23\n\t"
"vsel %%v16,%%v16,%%v17,%%v5\n\t"
"vsel %%v5,%%v24,%%v25,%%v5\n\t"
"vsel %%v17,%%v18,%%v19,%%v6\n\t"
"vsel %%v6,%%v26,%%v27,%%v6\n\t"
"vsel %%v18,%%v20,%%v21,%%v7\n\t"
"vsel %%v7,%%v28,%%v29,%%v7\n\t"
"vsel %%v19,%%v22,%%v23,%%v8\n\t"
"vsel %%v8,%%v30,%%v31,%%v8\n\t"
"vfchesb %%v20,%%v16,%%v17\n\t"
"vfchesb %%v21,%%v18,%%v19\n\t"
"vsel %%v16,%%v16,%%v17,%%v20\n\t"
"vsel %%v5,%%v5,%%v6,%%v20\n\t"
"vsel %%v17,%%v18,%%v19,%%v21\n\t"
"vsel %%v6,%%v7,%%v8,%%v21\n\t"
"vfchesb %%v18,%%v16,%%v17\n\t"
"vsel %%v16,%%v16,%%v17,%%v18\n\t"
"vsel %%v5,%%v5,%%v6,%%v18\n\t"
/* Widen the four word indices to doublewords (odd lanes -> v6, even lanes -> v5)
   and add the running index base v4. */
"vsegf %%v6,%%v5\n\t"
"vesrlg %%v5,%%v5,32\n\t"
"vag %%v5,%%v5,%%v4\n\t"
"vag %%v6,%%v6,%%v4\n\t"
/* Merge into the running maximum: keep v0 where v0 is >= the block value,
   otherwise take the block value; widen the mask to update v1/v2 likewise. */
"vfchesb %%v7,%%v0,%%v16\n\t"
"vsel %%v0,%%v0,%%v16,%%v7\n\t"
"vsegf %%v8,%%v7\n\t"
"vesrlg %%v7,%%v7,32\n\t"
"vsegf %%v7,%%v7\n\t"
"vsel %%v1,%%v1,%%v5,%%v7\n\t"
"vsel %%v2,%%v2,%%v6,%%v8\n\t"
"vag %%v4,%%v4,%%v3\n\t"
/* Second half: identical sequence for the next 32 values (bytes 128..255). */
"vl %%v16,128(%%r1,%[x])\n\t"
"vl %%v17,144(%%r1,%[x])\n\t"
"vl %%v18,160(%%r1,%[x])\n\t"
"vl %%v19,176(%%r1,%[x])\n\t"
"vl %%v20,192(%%r1,%[x])\n\t"
"vl %%v21,208(%%r1,%[x])\n\t"
"vl %%v22,224(%%r1,%[x])\n\t"
"vl %%v23,240(%%r1,%[x])\n\t"
"vfchesb %%v5,%%v16,%%v17\n\t"
"vfchesb %%v6,%%v18,%%v19\n\t"
"vfchesb %%v7,%%v20,%%v21\n\t"
"vfchesb %%v8,%%v22,%%v23\n\t"
"vsel %%v16,%%v16,%%v17,%%v5\n\t"
"vsel %%v5,%%v24,%%v25,%%v5\n\t"
"vsel %%v17,%%v18,%%v19,%%v6\n\t"
"vsel %%v6,%%v26,%%v27,%%v6\n\t"
"vsel %%v18,%%v20,%%v21,%%v7\n\t"
"vsel %%v7,%%v28,%%v29,%%v7\n\t"
"vsel %%v19,%%v22,%%v23,%%v8\n\t"
"vsel %%v8,%%v30,%%v31,%%v8\n\t"
"vfchesb %%v20,%%v16,%%v17\n\t"
"vfchesb %%v21,%%v18,%%v19\n\t"
"vsel %%v16,%%v16,%%v17,%%v20\n\t"
"vsel %%v5,%%v5,%%v6,%%v20\n\t"
"vsel %%v17,%%v18,%%v19,%%v21\n\t"
"vsel %%v6,%%v7,%%v8,%%v21\n\t"
"vfchesb %%v18,%%v16,%%v17\n\t"
"vsel %%v16,%%v16,%%v17,%%v18\n\t"
"vsel %%v5,%%v5,%%v6,%%v18\n\t"
"vsegf %%v6,%%v5\n\t"
"vesrlg %%v5,%%v5,32\n\t"
"vag %%v5,%%v5,%%v4\n\t"
"vag %%v6,%%v6,%%v4\n\t"
"vfchesb %%v7,%%v0,%%v16\n\t"
"vsel %%v0,%%v0,%%v16,%%v7\n\t"
"vsegf %%v8,%%v7\n\t"
"vesrlg %%v7,%%v7,32\n\t"
"vsegf %%v7,%%v7\n\t"
"vsel %%v1,%%v1,%%v5,%%v7\n\t"
"vsel %%v2,%%v2,%%v6,%%v8\n\t"
"vag %%v4,%%v4,%%v3\n\t"
/* Advance 256 bytes; brctg decrements the count and loops while it is non-zero. */
"agfi %%r1, 256\n\t"
"brctg %[n], 0b\n\t"
/* Horizontal reduction, step 1: compare the even lanes against the odd lanes
   (shifted into v3); equal values resolve to the smaller index. */
"veslg %%v3,%%v0,32\n\t"
"vfchsb %%v4,%%v0,%%v3\n\t"
"vchlg %%v5,%%v2,%%v1\n\t"
"vfcesb %%v6,%%v0,%%v3\n\t"
"vn %%v5,%%v5,%%v6\n\t"
"vo %%v4,%%v4,%%v5\n\t"
"vsel %%v0,%%v0,%%v3,%%v4\n\t"
"vesrlg %%v4,%%v4,32\n\t"
"vsegf %%v4,%%v4\n\t"
"vsel %%v1,%%v1,%%v2,%%v4\n\t"
/* Step 2: compare the lane-2 candidate (replicated into v2/v3) with lane 0. */
"vrepf %%v2,%%v0,2\n\t"
"vrepg %%v3,%%v1,1\n\t"
"wfcsb %%v2,%%v0\n\t"
"jne 1f\n\t"
/* Equal values: store the value and return the smaller of the two indices. */
"vstef %%v0,%[max],0\n\t"
"vmnlg %%v0,%%v1,%%v3\n\t"
"vlgvg %[imax],%%v0,0\n\t"
"j 2f\n\t"
"1:\n\t"
/* Different values: if the lane-2 value is greater, take it and its index. */
"wfchsb %%v4,%%v2,%%v0\n\t"
"vesrlg %%v4,%%v4,32\n\t"
"vsegf %%v4,%%v4\n\t"
"vsel %%v1,%%v3,%%v1,%%v4\n\t"
"vsel %%v0,%%v2,%%v0,%%v4\n\t"
"ste %%f0,%[max]\n\t"
"vlgvg %[imax],%%v1,0\n\t"
"2:\n\t"
"nop"
/* Outputs: imax in a register, *max as a memory operand, n read-write and
   early-clobbered. Inputs: the unnamed "m" operand describes all n values of
   x as read; x itself is passed in an address register. */
: [imax] "=r"(imax),[max] "=m"(*max),[n] "+&r"(n)
: "m"(*(const FLOAT (*)[n]) x),[x] "a"(x)
/* NOTE(review): v3 is written above (vrepig, veslg, vrepg) but is not named
   in this clobber list. */
: "cc", "r1", "v0", "v1", "v2", "v4", "v5", "v6", "v7", "v8", "v16",
"v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26",
"v27", "v28", "v29", "v30", "v31");
/* Second copy of the same instruction sequence starts here, continuing from
   the shared opening __asm__ line; only the constraint lines at its end differ. */
"vleig %%v1,0,0\n\t"
"vleig %%v1,2,1\n\t"
"vleig %%v2,1,0\n\t"
"vleig %%v2,3,1\n\t"
"vrepig %%v3,32\n\t"
"vzero %%v4\n\t"
"vleif %%v24,0,0\n\t"
"vleif %%v24,1,1\n\t"
"vleif %%v24,2,2\n\t"
"vleif %%v24,3,3\n\t"
"vleif %%v25,4,0\n\t"
"vleif %%v25,5,1\n\t"
"vleif %%v25,6,2\n\t"
"vleif %%v25,7,3\n\t"
"vleif %%v26,8,0\n\t"
"vleif %%v26,9,1\n\t"
"vleif %%v26,10,2\n\t"
"vleif %%v26,11,3\n\t"
"vleif %%v27,12,0\n\t"
"vleif %%v27,13,1\n\t"
"vleif %%v27,14,2\n\t"
"vleif %%v27,15,3\n\t"
"vleif %%v28,16,0\n\t"
"vleif %%v28,17,1\n\t"
"vleif %%v28,18,2\n\t"
"vleif %%v28,19,3\n\t"
"vleif %%v29,20,0\n\t"
"vleif %%v29,21,1\n\t"
"vleif %%v29,22,2\n\t"
"vleif %%v29,23,3\n\t"
"vleif %%v30,24,0\n\t"
"vleif %%v30,25,1\n\t"
"vleif %%v30,26,2\n\t"
"vleif %%v30,27,3\n\t"
"vleif %%v31,28,0\n\t"
"vleif %%v31,29,1\n\t"
"vleif %%v31,30,2\n\t"
"vleif %%v31,31,3\n\t"
"srlg %[n],%[n],6\n\t"
"xgr %%r1,%%r1\n\t"
"0:\n\t"
/* Loop body, first half (bytes 0..127 of the current block). */
"pfd 1, 1024(%%r1,%[x])\n\t"
"vl %%v16,0(%%r1,%[x])\n\t"
"vl %%v17,16(%%r1,%[x])\n\t"
"vl %%v18,32(%%r1,%[x])\n\t"
"vl %%v19,48(%%r1,%[x])\n\t"
"vl %%v20,64(%%r1,%[x])\n\t"
"vl %%v21,80(%%r1,%[x])\n\t"
"vl %%v22,96(%%r1,%[x])\n\t"
"vl %%v23,112(%%r1,%[x])\n\t"
"vfchesb %%v5,%%v16,%%v17\n\t"
"vfchesb %%v6,%%v18,%%v19\n\t"
"vfchesb %%v7,%%v20,%%v21\n\t"
"vfchesb %%v8,%%v22,%%v23\n\t"
"vsel %%v16,%%v16,%%v17,%%v5\n\t"
"vsel %%v5,%%v24,%%v25,%%v5\n\t"
"vsel %%v17,%%v18,%%v19,%%v6\n\t"
"vsel %%v6,%%v26,%%v27,%%v6\n\t"
"vsel %%v18,%%v20,%%v21,%%v7\n\t"
"vsel %%v7,%%v28,%%v29,%%v7\n\t"
"vsel %%v19,%%v22,%%v23,%%v8\n\t"
"vsel %%v8,%%v30,%%v31,%%v8\n\t"
"vfchesb %%v20,%%v16,%%v17\n\t"
"vfchesb %%v21,%%v18,%%v19\n\t"
"vsel %%v16,%%v16,%%v17,%%v20\n\t"
"vsel %%v5,%%v5,%%v6,%%v20\n\t"
"vsel %%v17,%%v18,%%v19,%%v21\n\t"
"vsel %%v6,%%v7,%%v8,%%v21\n\t"
"vfchesb %%v18,%%v16,%%v17\n\t"
"vsel %%v16,%%v16,%%v17,%%v18\n\t"
"vsel %%v5,%%v5,%%v6,%%v18\n\t"
"vsegf %%v6,%%v5\n\t"
"vesrlg %%v5,%%v5,32\n\t"
"vag %%v5,%%v5,%%v4\n\t"
"vag %%v6,%%v6,%%v4\n\t"
"vfchesb %%v7,%%v0,%%v16\n\t"
"vsel %%v0,%%v0,%%v16,%%v7\n\t"
"vsegf %%v8,%%v7\n\t"
"vesrlg %%v7,%%v7,32\n\t"
"vsegf %%v7,%%v7\n\t"
"vsel %%v1,%%v1,%%v5,%%v7\n\t"
"vsel %%v2,%%v2,%%v6,%%v8\n\t"
"vag %%v4,%%v4,%%v3\n\t"
/* Loop body, second half (bytes 128..255 of the current block). */
"vl %%v16,128(%%r1,%[x])\n\t"
"vl %%v17,144(%%r1,%[x])\n\t"
"vl %%v18,160(%%r1,%[x])\n\t"
"vl %%v19,176(%%r1,%[x])\n\t"
"vl %%v20,192(%%r1,%[x])\n\t"
"vl %%v21,208(%%r1,%[x])\n\t"
"vl %%v22,224(%%r1,%[x])\n\t"
"vl %%v23,240(%%r1,%[x])\n\t"
"vfchesb %%v5,%%v16,%%v17\n\t"
"vfchesb %%v6,%%v18,%%v19\n\t"
"vfchesb %%v7,%%v20,%%v21\n\t"
"vfchesb %%v8,%%v22,%%v23\n\t"
"vsel %%v16,%%v16,%%v17,%%v5\n\t"
"vsel %%v5,%%v24,%%v25,%%v5\n\t"
"vsel %%v17,%%v18,%%v19,%%v6\n\t"
"vsel %%v6,%%v26,%%v27,%%v6\n\t"
"vsel %%v18,%%v20,%%v21,%%v7\n\t"
"vsel %%v7,%%v28,%%v29,%%v7\n\t"
"vsel %%v19,%%v22,%%v23,%%v8\n\t"
"vsel %%v8,%%v30,%%v31,%%v8\n\t"
"vfchesb %%v20,%%v16,%%v17\n\t"
"vfchesb %%v21,%%v18,%%v19\n\t"
"vsel %%v16,%%v16,%%v17,%%v20\n\t"
"vsel %%v5,%%v5,%%v6,%%v20\n\t"
"vsel %%v17,%%v18,%%v19,%%v21\n\t"
"vsel %%v6,%%v7,%%v8,%%v21\n\t"
"vfchesb %%v18,%%v16,%%v17\n\t"
"vsel %%v16,%%v16,%%v17,%%v18\n\t"
"vsel %%v5,%%v5,%%v6,%%v18\n\t"
"vsegf %%v6,%%v5\n\t"
"vesrlg %%v5,%%v5,32\n\t"
"vag %%v5,%%v5,%%v4\n\t"
"vag %%v6,%%v6,%%v4\n\t"
"vfchesb %%v7,%%v0,%%v16\n\t"
"vsel %%v0,%%v0,%%v16,%%v7\n\t"
"vsegf %%v8,%%v7\n\t"
"vesrlg %%v7,%%v7,32\n\t"
"vsegf %%v7,%%v7\n\t"
"vsel %%v1,%%v1,%%v5,%%v7\n\t"
"vsel %%v2,%%v2,%%v6,%%v8\n\t"
"vag %%v4,%%v4,%%v3\n\t"
"agfi %%r1, 256\n\t"
"brctg %[n], 0b\n\t"
/* Horizontal reduction of the four lanes down to one value and index. */
"veslg %%v3,%%v0,32\n\t"
"vfchsb %%v4,%%v0,%%v3\n\t"
"vchlg %%v5,%%v2,%%v1\n\t"
"vfcesb %%v6,%%v0,%%v3\n\t"
"vn %%v5,%%v5,%%v6\n\t"
"vo %%v4,%%v4,%%v5\n\t"
"vsel %%v0,%%v0,%%v3,%%v4\n\t"
"vesrlg %%v4,%%v4,32\n\t"
"vsegf %%v4,%%v4\n\t"
"vsel %%v1,%%v1,%%v2,%%v4\n\t"
"vrepf %%v2,%%v0,2\n\t"
"vrepg %%v3,%%v1,1\n\t"
"wfcsb %%v2,%%v0\n\t"
"jne 1f\n\t"
"vstef %%v0,%[max],0\n\t"
"vmnlg %%v0,%%v1,%%v3\n\t"
"vlgvg %[imax],%%v0,0\n\t"
"j 2f\n\t"
"1:\n\t"
"wfchsb %%v4,%%v2,%%v0\n\t"
"vesrlg %%v4,%%v4,32\n\t"
"vsegf %%v4,%%v4\n\t"
"vsel %%v1,%%v3,%%v1,%%v4\n\t"
"vsel %%v0,%%v2,%%v0,%%v4\n\t"
"ste %%f0,%[max]\n\t"
"vlgvg %[imax],%%v1,0\n\t"
"2:\n\t"
"nop"
/* This copy uses the "Q" memory constraint for *max and describes x through
   a struct wrapping FLOAT x[n]; the remaining operands match the first copy.
   NOTE(review): v3 is also missing from this clobber list. */
: [imax] "=r"(imax),[max] "=Q"(*max),[n] "+&r"(n)
: "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x)
: "cc", "r1", "v0", "v1", "v2", "v4", "v5", "v6", "v7", "v8", "v16",
"v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26",
"v27", "v28", "v29", "v30", "v31");
return imax;
}

View File

@ -31,165 +31,165 @@ static BLASLONG ismin_kernel_64(BLASLONG n, FLOAT *x, FLOAT *min) {
/*
 * ismin_kernel_64: s390x vector-facility inline-assembly kernel.
 * Walks x in blocks of 64 single-precision values (the loop count is n >> 6
 * and every iteration consumes 256 bytes) and tracks the smallest value
 * together with its zero-based element index. Values are compared as loaded;
 * no absolute value is taken in this sequence.
 * The winning value is stored through min; the index is returned in imin.
 *
 * NOTE(review): this text is a rendered diff hunk. The instruction sequence
 * appears twice below and the two copies differ in the operand-constraint
 * lines ("=m" vs "=Q" for *min, array-pointer cast vs struct cast for the
 * memory input). Presumably the first copy is the pre-commit text and the
 * second the post-commit text; the opening __asm__ line is shown only once.
 * NOTE(review): the loop is bottom-tested (brctg), so n < 64 would not give
 * a zero-trip loop; presumably callers guarantee n >= 64 - confirm.
 */
BLASLONG imin;
/* Setup: v0 = x[0..3] is the running per-lane minimum. */
__asm__("vl %%v0,0(%[x])\n\t"
/* Running indices as doublewords: v1 = {0,2} (even lanes), v2 = {1,3} (odd lanes). */
"vleig %%v1,0,0\n\t"
"vleig %%v1,2,1\n\t"
"vleig %%v2,1,0\n\t"
"vleig %%v2,3,1\n\t"
/* v3 = index step added after each 32-value half, v4 = running index base (0). */
"vrepig %%v3,32\n\t"
"vzero %%v4\n\t"
/* v24..v31 = constant word tables holding the in-block indices 0..31. */
"vleif %%v24,0,0\n\t"
"vleif %%v24,1,1\n\t"
"vleif %%v24,2,2\n\t"
"vleif %%v24,3,3\n\t"
"vleif %%v25,4,0\n\t"
"vleif %%v25,5,1\n\t"
"vleif %%v25,6,2\n\t"
"vleif %%v25,7,3\n\t"
"vleif %%v26,8,0\n\t"
"vleif %%v26,9,1\n\t"
"vleif %%v26,10,2\n\t"
"vleif %%v26,11,3\n\t"
"vleif %%v27,12,0\n\t"
"vleif %%v27,13,1\n\t"
"vleif %%v27,14,2\n\t"
"vleif %%v27,15,3\n\t"
"vleif %%v28,16,0\n\t"
"vleif %%v28,17,1\n\t"
"vleif %%v28,18,2\n\t"
"vleif %%v28,19,3\n\t"
"vleif %%v29,20,0\n\t"
"vleif %%v29,21,1\n\t"
"vleif %%v29,22,2\n\t"
"vleif %%v29,23,3\n\t"
"vleif %%v30,24,0\n\t"
"vleif %%v30,25,1\n\t"
"vleif %%v30,26,2\n\t"
"vleif %%v30,27,3\n\t"
"vleif %%v31,28,0\n\t"
"vleif %%v31,29,1\n\t"
"vleif %%v31,30,2\n\t"
"vleif %%v31,31,3\n\t"
/* Loop count = n / 64; r1 = byte offset into x, starting at 0. */
"srlg %[n],%[n],6\n\t"
"xgr %%r1,%%r1\n\t"
"0:\n\t"
/* First half: prefetch ahead and load 32 values (128 bytes). */
"pfd 1, 1024(%%r1,%[x])\n\t"
"vl %%v16,0(%%r1,%[x])\n\t"
"vl %%v17,16(%%r1,%[x])\n\t"
"vl %%v18,32(%%r1,%[x])\n\t"
"vl %%v19,48(%%r1,%[x])\n\t"
"vl %%v20,64(%%r1,%[x])\n\t"
"vl %%v21,80(%%r1,%[x])\n\t"
"vl %%v22,96(%%r1,%[x])\n\t"
"vl %%v23,112(%%r1,%[x])\n\t"
/* Pairwise reduction tree: each >= mask keeps the smaller value (vsel on the
   data) and the matching index (vsel on the index tables); on a tie the
   first vsel source, i.e. the lower index, is kept. */
"vfchesb %%v5,%%v17,%%v16\n\t"
"vfchesb %%v6,%%v19,%%v18\n\t"
"vfchesb %%v7,%%v21,%%v20\n\t"
"vfchesb %%v8,%%v23,%%v22\n\t"
"vsel %%v16,%%v16,%%v17,%%v5\n\t"
"vsel %%v5,%%v24,%%v25,%%v5\n\t"
"vsel %%v17,%%v18,%%v19,%%v6\n\t"
"vsel %%v6,%%v26,%%v27,%%v6\n\t"
"vsel %%v18,%%v20,%%v21,%%v7\n\t"
"vsel %%v7,%%v28,%%v29,%%v7\n\t"
"vsel %%v19,%%v22,%%v23,%%v8\n\t"
"vsel %%v8,%%v30,%%v31,%%v8\n\t"
"vfchesb %%v20,%%v17,%%v16\n\t"
"vfchesb %%v21,%%v19,%%v18\n\t"
"vsel %%v16,%%v16,%%v17,%%v20\n\t"
"vsel %%v5,%%v5,%%v6,%%v20\n\t"
"vsel %%v17,%%v18,%%v19,%%v21\n\t"
"vsel %%v6,%%v7,%%v8,%%v21\n\t"
"vfchesb %%v18,%%v17,%%v16\n\t"
"vsel %%v16,%%v16,%%v17,%%v18\n\t"
"vsel %%v5,%%v5,%%v6,%%v18\n\t"
/* Widen the four word indices to doublewords (odd lanes -> v6, even lanes -> v5)
   and add the running index base v4. */
"vsegf %%v6,%%v5\n\t"
"vesrlg %%v5,%%v5,32\n\t"
"vag %%v5,%%v5,%%v4\n\t"
"vag %%v6,%%v6,%%v4\n\t"
/* Merge into the running minimum: keep v0 where the block value is >= v0,
   otherwise take the block value; widen the mask to update v1/v2 likewise. */
"vfchesb %%v7,%%v16,%%v0\n\t"
"vsel %%v0,%%v0,%%v16,%%v7\n\t"
"vsegf %%v8,%%v7\n\t"
"vesrlg %%v7,%%v7,32\n\t"
"vsegf %%v7,%%v7\n\t"
"vsel %%v1,%%v1,%%v5,%%v7\n\t"
"vsel %%v2,%%v2,%%v6,%%v8\n\t"
"vag %%v4,%%v4,%%v3\n\t"
/* Second half: identical sequence for the next 32 values (bytes 128..255). */
"vl %%v16,128(%%r1,%[x])\n\t"
"vl %%v17,144(%%r1,%[x])\n\t"
"vl %%v18,160(%%r1,%[x])\n\t"
"vl %%v19,176(%%r1,%[x])\n\t"
"vl %%v20,192(%%r1,%[x])\n\t"
"vl %%v21,208(%%r1,%[x])\n\t"
"vl %%v22,224(%%r1,%[x])\n\t"
"vl %%v23,240(%%r1,%[x])\n\t"
"vfchesb %%v5,%%v17,%%v16\n\t"
"vfchesb %%v6,%%v19,%%v18\n\t"
"vfchesb %%v7,%%v21,%%v20\n\t"
"vfchesb %%v8,%%v23,%%v22\n\t"
"vsel %%v16,%%v16,%%v17,%%v5\n\t"
"vsel %%v5,%%v24,%%v25,%%v5\n\t"
"vsel %%v17,%%v18,%%v19,%%v6\n\t"
"vsel %%v6,%%v26,%%v27,%%v6\n\t"
"vsel %%v18,%%v20,%%v21,%%v7\n\t"
"vsel %%v7,%%v28,%%v29,%%v7\n\t"
"vsel %%v19,%%v22,%%v23,%%v8\n\t"
"vsel %%v8,%%v30,%%v31,%%v8\n\t"
"vfchesb %%v20,%%v17,%%v16\n\t"
"vfchesb %%v21,%%v19,%%v18\n\t"
"vsel %%v16,%%v16,%%v17,%%v20\n\t"
"vsel %%v5,%%v5,%%v6,%%v20\n\t"
"vsel %%v17,%%v18,%%v19,%%v21\n\t"
"vsel %%v6,%%v7,%%v8,%%v21\n\t"
"vfchesb %%v18,%%v17,%%v16\n\t"
"vsel %%v16,%%v16,%%v17,%%v18\n\t"
"vsel %%v5,%%v5,%%v6,%%v18\n\t"
"vsegf %%v6,%%v5\n\t"
"vesrlg %%v5,%%v5,32\n\t"
"vag %%v5,%%v5,%%v4\n\t"
"vag %%v6,%%v6,%%v4\n\t"
"vfchesb %%v7,%%v16,%%v0\n\t"
"vsel %%v0,%%v0,%%v16,%%v7\n\t"
"vsegf %%v8,%%v7\n\t"
"vesrlg %%v7,%%v7,32\n\t"
"vsegf %%v7,%%v7\n\t"
"vsel %%v1,%%v1,%%v5,%%v7\n\t"
"vsel %%v2,%%v2,%%v6,%%v8\n\t"
"vag %%v4,%%v4,%%v3\n\t"
/* Advance 256 bytes; brctg decrements the count and loops while it is non-zero. */
"agfi %%r1, 256\n\t"
"brctg %[n], 0b\n\t"
/* Horizontal reduction, step 1: compare the odd lanes (shifted into v3)
   against the even lanes; equal values resolve to the smaller index. */
"veslg %%v3,%%v0,32\n\t"
"vfchsb %%v4,%%v3,%%v0\n\t"
"vchlg %%v5,%%v2,%%v1\n\t"
"vfcesb %%v6,%%v0,%%v3\n\t"
"vn %%v5,%%v5,%%v6\n\t"
"vo %%v4,%%v4,%%v5\n\t"
"vsel %%v0,%%v0,%%v3,%%v4\n\t"
"vesrlg %%v4,%%v4,32\n\t"
"vsegf %%v4,%%v4\n\t"
"vsel %%v1,%%v1,%%v2,%%v4\n\t"
/* Step 2: compare the lane-2 candidate (replicated into v2/v3) with lane 0. */
"vrepf %%v2,%%v0,2\n\t"
"vrepg %%v3,%%v1,1\n\t"
"wfcsb %%v2,%%v0\n\t"
"jne 1f\n\t"
/* Equal values: store the value and return the smaller of the two indices. */
"vstef %%v0,%[min],0\n\t"
"vmnlg %%v0,%%v1,%%v3\n\t"
"vlgvg %[imin],%%v0,0\n\t"
"j 2f\n\t"
"1:\n\t"
/* Different values: if lane 0 is greater, take the lane-2 value and index. */
"wfchsb %%v4,%%v0,%%v2\n\t"
"vesrlg %%v4,%%v4,32\n\t"
"vsegf %%v4,%%v4\n\t"
"vsel %%v1,%%v3,%%v1,%%v4\n\t"
"vsel %%v0,%%v2,%%v0,%%v4\n\t"
"ste %%f0,%[min]\n\t"
"vlgvg %[imin],%%v1,0\n\t"
"2:\n\t"
"nop"
/* Outputs: imin in a register, *min as a memory operand, n read-write and
   early-clobbered. Inputs: the unnamed "m" operand describes all n values of
   x as read; x itself is passed in an address register. */
: [imin] "=r"(imin),[min] "=m"(*min),[n] "+&r"(n)
: "m"(*(const FLOAT (*)[n]) x),[x] "a"(x)
/* NOTE(review): v3 is written above (vrepig, veslg, vrepg) but is not named
   in this clobber list. */
: "cc", "r1", "v0", "v1", "v2", "v4", "v5", "v6", "v7", "v8", "v16",
"v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26",
"v27", "v28", "v29", "v30", "v31");
/* Second copy of the same instruction sequence starts here, continuing from
   the shared opening __asm__ line; only the constraint lines at its end differ. */
"vleig %%v1,0,0\n\t"
"vleig %%v1,2,1\n\t"
"vleig %%v2,1,0\n\t"
"vleig %%v2,3,1\n\t"
"vrepig %%v3,32\n\t"
"vzero %%v4\n\t"
"vleif %%v24,0,0\n\t"
"vleif %%v24,1,1\n\t"
"vleif %%v24,2,2\n\t"
"vleif %%v24,3,3\n\t"
"vleif %%v25,4,0\n\t"
"vleif %%v25,5,1\n\t"
"vleif %%v25,6,2\n\t"
"vleif %%v25,7,3\n\t"
"vleif %%v26,8,0\n\t"
"vleif %%v26,9,1\n\t"
"vleif %%v26,10,2\n\t"
"vleif %%v26,11,3\n\t"
"vleif %%v27,12,0\n\t"
"vleif %%v27,13,1\n\t"
"vleif %%v27,14,2\n\t"
"vleif %%v27,15,3\n\t"
"vleif %%v28,16,0\n\t"
"vleif %%v28,17,1\n\t"
"vleif %%v28,18,2\n\t"
"vleif %%v28,19,3\n\t"
"vleif %%v29,20,0\n\t"
"vleif %%v29,21,1\n\t"
"vleif %%v29,22,2\n\t"
"vleif %%v29,23,3\n\t"
"vleif %%v30,24,0\n\t"
"vleif %%v30,25,1\n\t"
"vleif %%v30,26,2\n\t"
"vleif %%v30,27,3\n\t"
"vleif %%v31,28,0\n\t"
"vleif %%v31,29,1\n\t"
"vleif %%v31,30,2\n\t"
"vleif %%v31,31,3\n\t"
"srlg %[n],%[n],6\n\t"
"xgr %%r1,%%r1\n\t"
"0:\n\t"
/* Loop body, first half (bytes 0..127 of the current block). */
"pfd 1, 1024(%%r1,%[x])\n\t"
"vl %%v16,0(%%r1,%[x])\n\t"
"vl %%v17,16(%%r1,%[x])\n\t"
"vl %%v18,32(%%r1,%[x])\n\t"
"vl %%v19,48(%%r1,%[x])\n\t"
"vl %%v20,64(%%r1,%[x])\n\t"
"vl %%v21,80(%%r1,%[x])\n\t"
"vl %%v22,96(%%r1,%[x])\n\t"
"vl %%v23,112(%%r1,%[x])\n\t"
"vfchesb %%v5,%%v17,%%v16\n\t"
"vfchesb %%v6,%%v19,%%v18\n\t"
"vfchesb %%v7,%%v21,%%v20\n\t"
"vfchesb %%v8,%%v23,%%v22\n\t"
"vsel %%v16,%%v16,%%v17,%%v5\n\t"
"vsel %%v5,%%v24,%%v25,%%v5\n\t"
"vsel %%v17,%%v18,%%v19,%%v6\n\t"
"vsel %%v6,%%v26,%%v27,%%v6\n\t"
"vsel %%v18,%%v20,%%v21,%%v7\n\t"
"vsel %%v7,%%v28,%%v29,%%v7\n\t"
"vsel %%v19,%%v22,%%v23,%%v8\n\t"
"vsel %%v8,%%v30,%%v31,%%v8\n\t"
"vfchesb %%v20,%%v17,%%v16\n\t"
"vfchesb %%v21,%%v19,%%v18\n\t"
"vsel %%v16,%%v16,%%v17,%%v20\n\t"
"vsel %%v5,%%v5,%%v6,%%v20\n\t"
"vsel %%v17,%%v18,%%v19,%%v21\n\t"
"vsel %%v6,%%v7,%%v8,%%v21\n\t"
"vfchesb %%v18,%%v17,%%v16\n\t"
"vsel %%v16,%%v16,%%v17,%%v18\n\t"
"vsel %%v5,%%v5,%%v6,%%v18\n\t"
"vsegf %%v6,%%v5\n\t"
"vesrlg %%v5,%%v5,32\n\t"
"vag %%v5,%%v5,%%v4\n\t"
"vag %%v6,%%v6,%%v4\n\t"
"vfchesb %%v7,%%v16,%%v0\n\t"
"vsel %%v0,%%v0,%%v16,%%v7\n\t"
"vsegf %%v8,%%v7\n\t"
"vesrlg %%v7,%%v7,32\n\t"
"vsegf %%v7,%%v7\n\t"
"vsel %%v1,%%v1,%%v5,%%v7\n\t"
"vsel %%v2,%%v2,%%v6,%%v8\n\t"
"vag %%v4,%%v4,%%v3\n\t"
/* Loop body, second half (bytes 128..255 of the current block). */
"vl %%v16,128(%%r1,%[x])\n\t"
"vl %%v17,144(%%r1,%[x])\n\t"
"vl %%v18,160(%%r1,%[x])\n\t"
"vl %%v19,176(%%r1,%[x])\n\t"
"vl %%v20,192(%%r1,%[x])\n\t"
"vl %%v21,208(%%r1,%[x])\n\t"
"vl %%v22,224(%%r1,%[x])\n\t"
"vl %%v23,240(%%r1,%[x])\n\t"
"vfchesb %%v5,%%v17,%%v16\n\t"
"vfchesb %%v6,%%v19,%%v18\n\t"
"vfchesb %%v7,%%v21,%%v20\n\t"
"vfchesb %%v8,%%v23,%%v22\n\t"
"vsel %%v16,%%v16,%%v17,%%v5\n\t"
"vsel %%v5,%%v24,%%v25,%%v5\n\t"
"vsel %%v17,%%v18,%%v19,%%v6\n\t"
"vsel %%v6,%%v26,%%v27,%%v6\n\t"
"vsel %%v18,%%v20,%%v21,%%v7\n\t"
"vsel %%v7,%%v28,%%v29,%%v7\n\t"
"vsel %%v19,%%v22,%%v23,%%v8\n\t"
"vsel %%v8,%%v30,%%v31,%%v8\n\t"
"vfchesb %%v20,%%v17,%%v16\n\t"
"vfchesb %%v21,%%v19,%%v18\n\t"
"vsel %%v16,%%v16,%%v17,%%v20\n\t"
"vsel %%v5,%%v5,%%v6,%%v20\n\t"
"vsel %%v17,%%v18,%%v19,%%v21\n\t"
"vsel %%v6,%%v7,%%v8,%%v21\n\t"
"vfchesb %%v18,%%v17,%%v16\n\t"
"vsel %%v16,%%v16,%%v17,%%v18\n\t"
"vsel %%v5,%%v5,%%v6,%%v18\n\t"
"vsegf %%v6,%%v5\n\t"
"vesrlg %%v5,%%v5,32\n\t"
"vag %%v5,%%v5,%%v4\n\t"
"vag %%v6,%%v6,%%v4\n\t"
"vfchesb %%v7,%%v16,%%v0\n\t"
"vsel %%v0,%%v0,%%v16,%%v7\n\t"
"vsegf %%v8,%%v7\n\t"
"vesrlg %%v7,%%v7,32\n\t"
"vsegf %%v7,%%v7\n\t"
"vsel %%v1,%%v1,%%v5,%%v7\n\t"
"vsel %%v2,%%v2,%%v6,%%v8\n\t"
"vag %%v4,%%v4,%%v3\n\t"
"agfi %%r1, 256\n\t"
"brctg %[n], 0b\n\t"
/* Horizontal reduction of the four lanes down to one value and index. */
"veslg %%v3,%%v0,32\n\t"
"vfchsb %%v4,%%v3,%%v0\n\t"
"vchlg %%v5,%%v2,%%v1\n\t"
"vfcesb %%v6,%%v0,%%v3\n\t"
"vn %%v5,%%v5,%%v6\n\t"
"vo %%v4,%%v4,%%v5\n\t"
"vsel %%v0,%%v0,%%v3,%%v4\n\t"
"vesrlg %%v4,%%v4,32\n\t"
"vsegf %%v4,%%v4\n\t"
"vsel %%v1,%%v1,%%v2,%%v4\n\t"
"vrepf %%v2,%%v0,2\n\t"
"vrepg %%v3,%%v1,1\n\t"
"wfcsb %%v2,%%v0\n\t"
"jne 1f\n\t"
"vstef %%v0,%[min],0\n\t"
"vmnlg %%v0,%%v1,%%v3\n\t"
"vlgvg %[imin],%%v0,0\n\t"
"j 2f\n\t"
"1:\n\t"
"wfchsb %%v4,%%v0,%%v2\n\t"
"vesrlg %%v4,%%v4,32\n\t"
"vsegf %%v4,%%v4\n\t"
"vsel %%v1,%%v3,%%v1,%%v4\n\t"
"vsel %%v0,%%v2,%%v0,%%v4\n\t"
"ste %%f0,%[min]\n\t"
"vlgvg %[imin],%%v1,0\n\t"
"2:\n\t"
"nop"
/* This copy uses the "Q" memory constraint for *min and describes x through
   a struct wrapping FLOAT x[n]; the remaining operands match the first copy.
   NOTE(review): v3 is also missing from this clobber list. */
: [imin] "=r"(imin),[min] "=Q"(*min),[n] "+&r"(n)
: "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x)
: "cc", "r1", "v0", "v1", "v2", "v4", "v5", "v6", "v7", "v8", "v16",
"v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26",
"v27", "v28", "v29", "v30", "v31");
return imin;
}

View File

@ -34,134 +34,134 @@ static BLASLONG izamax_kernel_16(BLASLONG n, FLOAT *x, FLOAT *amax) {
/*
 * izamax_kernel_16: s390x vector-facility inline-assembly kernel.
 * Treats x as n consecutive pairs of double-precision values (the memory
 * input is declared as FLOAT[n * 2]); presumably each pair is the real and
 * imaginary part of one complex element. For every pair it forms
 * |first| + |second| and tracks the largest such sum together with its
 * zero-based element index, 16 elements (256 bytes) per loop iteration
 * (the loop count is n >> 4).
 * The winning sum is stored through amax; the index is returned in iamax.
 *
 * NOTE(review): this text is a rendered diff hunk. The instruction sequence
 * appears twice below and the two copies differ in the operand-constraint
 * lines ("=m" vs "=Q" for *amax, array-pointer cast vs struct cast for the
 * memory input). Presumably the first copy is the pre-commit text and the
 * second the post-commit text; the opening __asm__ line is shown only once.
 * NOTE(review): the loop is bottom-tested (brctg), so n < 16 would not give
 * a zero-trip loop; presumably callers guarantee n >= 16 - confirm.
 */
BLASLONG iamax;
/* Setup: gather the first members of elements 0 and 1 into v0 and the second
   members into v1, then v0 = |v0| + |v1| is the running per-lane maximum. */
__asm__("vleg %%v0,0(%[x]),0\n\t"
"vleg %%v1,8(%[x]),0\n\t"
"vleg %%v0,16(%[x]),1\n\t"
"vleg %%v1,24(%[x]),1\n\t"
"vflpdb %%v0,%%v0\n\t"
"vflpdb %%v1,%%v1\n\t"
"vfadb %%v0,%%v0,%%v1\n\t"
/* v1 = running indices {0,1}; v2 = index step per 8-element half;
   v3 = running index base (0). */
"vleig %%v1,0,0\n\t"
"vleig %%v1,1,1\n\t"
"vrepig %%v2,8\n\t"
"vzero %%v3\n\t"
/* v24..v27 = constant doubleword tables holding the in-block indices 0..7. */
"vleig %%v24,0,0\n\t"
"vleig %%v24,1,1\n\t"
"vleig %%v25,2,0\n\t"
"vleig %%v25,3,1\n\t"
"vleig %%v26,4,0\n\t"
"vleig %%v26,5,1\n\t"
"vleig %%v27,6,0\n\t"
"vleig %%v27,7,1\n\t"
/* Loop count = n / 16; r1 = byte offset into x, starting at 0. */
"srlg %[n],%[n],4\n\t"
"xgr %%r1,%%r1\n\t"
"0:\n\t"
/* First half: prefetch ahead and de-interleave 8 elements (128 bytes);
   even-numbered registers receive the first member of each pair,
   odd-numbered registers the second. */
"pfd 1, 1024(%%r1,%[x])\n\t"
"vleg %%v16,0(%%r1,%[x]),0\n\t"
"vleg %%v17,8(%%r1,%[x]),0\n\t"
"vleg %%v16,16(%%r1,%[x]),1\n\t"
"vleg %%v17,24(%%r1,%[x]),1\n\t"
"vleg %%v18,32(%%r1,%[x]),0\n\t"
"vleg %%v19,40(%%r1,%[x]),0\n\t"
"vleg %%v18,48(%%r1,%[x]),1\n\t"
"vleg %%v19,56(%%r1,%[x]),1\n\t"
"vleg %%v20,64(%%r1,%[x]),0\n\t"
"vleg %%v21,72(%%r1,%[x]),0\n\t"
"vleg %%v20,80(%%r1,%[x]),1\n\t"
"vleg %%v21,88(%%r1,%[x]),1\n\t"
"vleg %%v22,96(%%r1,%[x]),0\n\t"
"vleg %%v23,104(%%r1,%[x]),0\n\t"
"vleg %%v22,112(%%r1,%[x]),1\n\t"
"vleg %%v23,120(%%r1,%[x]),1\n\t"
/* Absolute values, then per-element sums in v16..v19. */
"vflpdb %%v16, %%v16\n\t"
"vflpdb %%v17, %%v17\n\t"
"vflpdb %%v18, %%v18\n\t"
"vflpdb %%v19, %%v19\n\t"
"vflpdb %%v20, %%v20\n\t"
"vflpdb %%v21, %%v21\n\t"
"vflpdb %%v22, %%v22\n\t"
"vflpdb %%v23, %%v23\n\t"
"vfadb %%v16,%%v16,%%v17\n\t"
"vfadb %%v17,%%v18,%%v19\n\t"
"vfadb %%v18,%%v20,%%v21\n\t"
"vfadb %%v19,%%v22,%%v23\n\t"
/* Pairwise reduction tree: each >= mask keeps the larger sum (vsel on the
   data) and the matching index (vsel on the index tables); on a tie the
   first vsel source, i.e. the lower index, is kept. */
"vfchedb %%v4,%%v16,%%v17\n\t"
"vfchedb %%v5,%%v18,%%v19\n\t"
"vsel %%v16,%%v16,%%v17,%%v4\n\t"
"vsel %%v4,%%v24,%%v25,%%v4\n\t"
"vsel %%v17,%%v18,%%v19,%%v5\n\t"
"vsel %%v5,%%v26,%%v27,%%v5\n\t"
"vfchedb %%v18,%%v16,%%v17\n\t"
"vsel %%v16,%%v16,%%v17,%%v18\n\t"
"vsel %%v4,%%v4,%%v5,%%v18\n\t"
/* Add the running index base, merge into the running maximum (v0 is kept
   where v0 >= block value), then advance the base by 8. */
"vag %%v4,%%v4,%%v3\n\t"
"vfchedb %%v5,%%v0,%%v16\n\t"
"vsel %%v0,%%v0,%%v16,%%v5\n\t"
"vsel %%v1,%%v1,%%v4,%%v5\n\t"
"vag %%v3,%%v3,%%v2\n\t"
/* Second half: identical sequence for the next 8 elements (bytes 128..255). */
"vleg %%v16,128(%%r1,%[x]),0\n\t"
"vleg %%v17,136(%%r1,%[x]),0\n\t"
"vleg %%v16,144(%%r1,%[x]),1\n\t"
"vleg %%v17,152(%%r1,%[x]),1\n\t"
"vleg %%v18,160(%%r1,%[x]),0\n\t"
"vleg %%v19,168(%%r1,%[x]),0\n\t"
"vleg %%v18,176(%%r1,%[x]),1\n\t"
"vleg %%v19,184(%%r1,%[x]),1\n\t"
"vleg %%v20,192(%%r1,%[x]),0\n\t"
"vleg %%v21,200(%%r1,%[x]),0\n\t"
"vleg %%v20,208(%%r1,%[x]),1\n\t"
"vleg %%v21,216(%%r1,%[x]),1\n\t"
"vleg %%v22,224(%%r1,%[x]),0\n\t"
"vleg %%v23,232(%%r1,%[x]),0\n\t"
"vleg %%v22,240(%%r1,%[x]),1\n\t"
"vleg %%v23,248(%%r1,%[x]),1\n\t"
"vflpdb %%v16, %%v16\n\t"
"vflpdb %%v17, %%v17\n\t"
"vflpdb %%v18, %%v18\n\t"
"vflpdb %%v19, %%v19\n\t"
"vflpdb %%v20, %%v20\n\t"
"vflpdb %%v21, %%v21\n\t"
"vflpdb %%v22, %%v22\n\t"
"vflpdb %%v23, %%v23\n\t"
"vfadb %%v16,%%v16,%%v17\n\t"
"vfadb %%v17,%%v18,%%v19\n\t"
"vfadb %%v18,%%v20,%%v21\n\t"
"vfadb %%v19,%%v22,%%v23\n\t"
"vfchedb %%v4,%%v16,%%v17\n\t"
"vfchedb %%v5,%%v18,%%v19\n\t"
"vsel %%v16,%%v16,%%v17,%%v4\n\t"
"vsel %%v4,%%v24,%%v25,%%v4\n\t"
"vsel %%v17,%%v18,%%v19,%%v5\n\t"
"vsel %%v5,%%v26,%%v27,%%v5\n\t"
"vfchedb %%v18,%%v16,%%v17\n\t"
"vsel %%v16,%%v16,%%v17,%%v18\n\t"
"vsel %%v4,%%v4,%%v5,%%v18\n\t"
"vag %%v4,%%v4,%%v3\n\t"
"vfchedb %%v5,%%v0,%%v16\n\t"
"vsel %%v0,%%v0,%%v16,%%v5\n\t"
"vsel %%v1,%%v1,%%v4,%%v5\n\t"
"vag %%v3,%%v3,%%v2\n\t"
/* Advance 256 bytes; brctg decrements the count and loops while it is non-zero. */
"agfi %%r1, 256\n\t"
"brctg %[n], 0b\n\t"
/* Horizontal reduction: compare the lane-1 candidate (replicated into v2/v3)
   with lane 0. */
"vrepg %%v2,%%v0,1\n\t"
"vrepg %%v3,%%v1,1\n\t"
"wfcdb %%v2,%%v0\n\t"
"jne 1f\n\t"
/* Equal sums: store the sum and return the smaller of the two indices. */
"vsteg %%v0,%[amax],0\n\t"
"vmnlg %%v0,%%v1,%%v3\n\t"
"vlgvg %[iamax],%%v0,0\n\t"
"j 2f\n\t"
"1:\n\t"
/* Different sums: if the lane-1 sum is greater, take it and its index. */
"wfchdb %%v4,%%v2,%%v0\n\t"
"vsel %%v1,%%v3,%%v1,%%v4\n\t"
"vsel %%v0,%%v2,%%v0,%%v4\n\t"
"std %%f0,%[amax]\n\t"
"vlgvg %[iamax],%%v1,0\n\t"
"2:\n\t"
"nop"
/* Outputs: iamax in a register, *amax as a memory operand, n read-write and
   early-clobbered. Inputs: the unnamed "m" operand describes all n * 2 values
   of x as read; x itself is passed in an address register. */
: [iamax] "=r"(iamax),[amax] "=m"(*amax),[n] "+&r"(n)
: "m"(*(const FLOAT (*)[n * 2]) x),[x] "a"(x)
: "cc", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v16", "v17", "v18",
"v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27");
/* Second copy of the same instruction sequence starts here, continuing from
   the shared opening __asm__ line; only the constraint lines at its end differ. */
"vleg %%v1,8(%[x]),0\n\t"
"vleg %%v0,16(%[x]),1\n\t"
"vleg %%v1,24(%[x]),1\n\t"
"vflpdb %%v0,%%v0\n\t"
"vflpdb %%v1,%%v1\n\t"
"vfadb %%v0,%%v0,%%v1\n\t"
"vleig %%v1,0,0\n\t"
"vleig %%v1,1,1\n\t"
"vrepig %%v2,8\n\t"
"vzero %%v3\n\t"
"vleig %%v24,0,0\n\t"
"vleig %%v24,1,1\n\t"
"vleig %%v25,2,0\n\t"
"vleig %%v25,3,1\n\t"
"vleig %%v26,4,0\n\t"
"vleig %%v26,5,1\n\t"
"vleig %%v27,6,0\n\t"
"vleig %%v27,7,1\n\t"
"srlg %[n],%[n],4\n\t"
"xgr %%r1,%%r1\n\t"
"0:\n\t"
/* Loop body, first half (bytes 0..127 of the current block). */
"pfd 1, 1024(%%r1,%[x])\n\t"
"vleg %%v16,0(%%r1,%[x]),0\n\t"
"vleg %%v17,8(%%r1,%[x]),0\n\t"
"vleg %%v16,16(%%r1,%[x]),1\n\t"
"vleg %%v17,24(%%r1,%[x]),1\n\t"
"vleg %%v18,32(%%r1,%[x]),0\n\t"
"vleg %%v19,40(%%r1,%[x]),0\n\t"
"vleg %%v18,48(%%r1,%[x]),1\n\t"
"vleg %%v19,56(%%r1,%[x]),1\n\t"
"vleg %%v20,64(%%r1,%[x]),0\n\t"
"vleg %%v21,72(%%r1,%[x]),0\n\t"
"vleg %%v20,80(%%r1,%[x]),1\n\t"
"vleg %%v21,88(%%r1,%[x]),1\n\t"
"vleg %%v22,96(%%r1,%[x]),0\n\t"
"vleg %%v23,104(%%r1,%[x]),0\n\t"
"vleg %%v22,112(%%r1,%[x]),1\n\t"
"vleg %%v23,120(%%r1,%[x]),1\n\t"
"vflpdb %%v16, %%v16\n\t"
"vflpdb %%v17, %%v17\n\t"
"vflpdb %%v18, %%v18\n\t"
"vflpdb %%v19, %%v19\n\t"
"vflpdb %%v20, %%v20\n\t"
"vflpdb %%v21, %%v21\n\t"
"vflpdb %%v22, %%v22\n\t"
"vflpdb %%v23, %%v23\n\t"
"vfadb %%v16,%%v16,%%v17\n\t"
"vfadb %%v17,%%v18,%%v19\n\t"
"vfadb %%v18,%%v20,%%v21\n\t"
"vfadb %%v19,%%v22,%%v23\n\t"
"vfchedb %%v4,%%v16,%%v17\n\t"
"vfchedb %%v5,%%v18,%%v19\n\t"
"vsel %%v16,%%v16,%%v17,%%v4\n\t"
"vsel %%v4,%%v24,%%v25,%%v4\n\t"
"vsel %%v17,%%v18,%%v19,%%v5\n\t"
"vsel %%v5,%%v26,%%v27,%%v5\n\t"
"vfchedb %%v18,%%v16,%%v17\n\t"
"vsel %%v16,%%v16,%%v17,%%v18\n\t"
"vsel %%v4,%%v4,%%v5,%%v18\n\t"
"vag %%v4,%%v4,%%v3\n\t"
"vfchedb %%v5,%%v0,%%v16\n\t"
"vsel %%v0,%%v0,%%v16,%%v5\n\t"
"vsel %%v1,%%v1,%%v4,%%v5\n\t"
"vag %%v3,%%v3,%%v2\n\t"
/* Loop body, second half (bytes 128..255 of the current block). */
"vleg %%v16,128(%%r1,%[x]),0\n\t"
"vleg %%v17,136(%%r1,%[x]),0\n\t"
"vleg %%v16,144(%%r1,%[x]),1\n\t"
"vleg %%v17,152(%%r1,%[x]),1\n\t"
"vleg %%v18,160(%%r1,%[x]),0\n\t"
"vleg %%v19,168(%%r1,%[x]),0\n\t"
"vleg %%v18,176(%%r1,%[x]),1\n\t"
"vleg %%v19,184(%%r1,%[x]),1\n\t"
"vleg %%v20,192(%%r1,%[x]),0\n\t"
"vleg %%v21,200(%%r1,%[x]),0\n\t"
"vleg %%v20,208(%%r1,%[x]),1\n\t"
"vleg %%v21,216(%%r1,%[x]),1\n\t"
"vleg %%v22,224(%%r1,%[x]),0\n\t"
"vleg %%v23,232(%%r1,%[x]),0\n\t"
"vleg %%v22,240(%%r1,%[x]),1\n\t"
"vleg %%v23,248(%%r1,%[x]),1\n\t"
"vflpdb %%v16, %%v16\n\t"
"vflpdb %%v17, %%v17\n\t"
"vflpdb %%v18, %%v18\n\t"
"vflpdb %%v19, %%v19\n\t"
"vflpdb %%v20, %%v20\n\t"
"vflpdb %%v21, %%v21\n\t"
"vflpdb %%v22, %%v22\n\t"
"vflpdb %%v23, %%v23\n\t"
"vfadb %%v16,%%v16,%%v17\n\t"
"vfadb %%v17,%%v18,%%v19\n\t"
"vfadb %%v18,%%v20,%%v21\n\t"
"vfadb %%v19,%%v22,%%v23\n\t"
"vfchedb %%v4,%%v16,%%v17\n\t"
"vfchedb %%v5,%%v18,%%v19\n\t"
"vsel %%v16,%%v16,%%v17,%%v4\n\t"
"vsel %%v4,%%v24,%%v25,%%v4\n\t"
"vsel %%v17,%%v18,%%v19,%%v5\n\t"
"vsel %%v5,%%v26,%%v27,%%v5\n\t"
"vfchedb %%v18,%%v16,%%v17\n\t"
"vsel %%v16,%%v16,%%v17,%%v18\n\t"
"vsel %%v4,%%v4,%%v5,%%v18\n\t"
"vag %%v4,%%v4,%%v3\n\t"
"vfchedb %%v5,%%v0,%%v16\n\t"
"vsel %%v0,%%v0,%%v16,%%v5\n\t"
"vsel %%v1,%%v1,%%v4,%%v5\n\t"
"vag %%v3,%%v3,%%v2\n\t"
"agfi %%r1, 256\n\t"
"brctg %[n], 0b\n\t"
/* Horizontal reduction of the two lanes down to one sum and index. */
"vrepg %%v2,%%v0,1\n\t"
"vrepg %%v3,%%v1,1\n\t"
"wfcdb %%v2,%%v0\n\t"
"jne 1f\n\t"
"vsteg %%v0,%[amax],0\n\t"
"vmnlg %%v0,%%v1,%%v3\n\t"
"vlgvg %[iamax],%%v0,0\n\t"
"j 2f\n\t"
"1:\n\t"
"wfchdb %%v4,%%v2,%%v0\n\t"
"vsel %%v1,%%v3,%%v1,%%v4\n\t"
"vsel %%v0,%%v2,%%v0,%%v4\n\t"
"std %%f0,%[amax]\n\t"
"vlgvg %[iamax],%%v1,0\n\t"
"2:\n\t"
"nop"
/* This copy uses the "Q" memory constraint for *amax and describes x through
   a struct wrapping FLOAT x[n * 2]; the remaining operands match the first copy. */
: [iamax] "=r"(iamax),[amax] "=Q"(*amax),[n] "+&r"(n)
: "m"(*(const struct { FLOAT x[n * 2]; } *) x),[x] "a"(x)
: "cc", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v16", "v17", "v18",
"v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27");
return iamax;
}

View File

@ -34,134 +34,134 @@ static BLASLONG izamin_kernel_16(BLASLONG n, FLOAT *x, FLOAT *amin) {
BLASLONG iamin;
__asm__("vleg %%v0,0(%[x]),0\n\t"
"vleg %%v1,8(%[x]),0\n\t"
"vleg %%v0,16(%[x]),1\n\t"
"vleg %%v1,24(%[x]),1\n\t"
"vflpdb %%v0,%%v0\n\t"
"vflpdb %%v1,%%v1\n\t"
"vfadb %%v0,%%v0,%%v1\n\t"
"vleig %%v1,0,0\n\t"
"vleig %%v1,1,1\n\t"
"vrepig %%v2,8\n\t"
"vzero %%v3\n\t"
"vleig %%v24,0,0\n\t"
"vleig %%v24,1,1\n\t"
"vleig %%v25,2,0\n\t"
"vleig %%v25,3,1\n\t"
"vleig %%v26,4,0\n\t"
"vleig %%v26,5,1\n\t"
"vleig %%v27,6,0\n\t"
"vleig %%v27,7,1\n\t"
"srlg %[n],%[n],4\n\t"
"xgr %%r1,%%r1\n\t"
"0:\n\t"
"pfd 1, 1024(%%r1,%[x])\n\t"
"vleg %%v16,0(%%r1,%[x]),0\n\t"
"vleg %%v17,8(%%r1,%[x]),0\n\t"
"vleg %%v16,16(%%r1,%[x]),1\n\t"
"vleg %%v17,24(%%r1,%[x]),1\n\t"
"vleg %%v18,32(%%r1,%[x]),0\n\t"
"vleg %%v19,40(%%r1,%[x]),0\n\t"
"vleg %%v18,48(%%r1,%[x]),1\n\t"
"vleg %%v19,56(%%r1,%[x]),1\n\t"
"vleg %%v20,64(%%r1,%[x]),0\n\t"
"vleg %%v21,72(%%r1,%[x]),0\n\t"
"vleg %%v20,80(%%r1,%[x]),1\n\t"
"vleg %%v21,88(%%r1,%[x]),1\n\t"
"vleg %%v22,96(%%r1,%[x]),0\n\t"
"vleg %%v23,104(%%r1,%[x]),0\n\t"
"vleg %%v22,112(%%r1,%[x]),1\n\t"
"vleg %%v23,120(%%r1,%[x]),1\n\t"
"vflpdb %%v16, %%v16\n\t"
"vflpdb %%v17, %%v17\n\t"
"vflpdb %%v18, %%v18\n\t"
"vflpdb %%v19, %%v19\n\t"
"vflpdb %%v20, %%v20\n\t"
"vflpdb %%v21, %%v21\n\t"
"vflpdb %%v22, %%v22\n\t"
"vflpdb %%v23, %%v23\n\t"
"vfadb %%v16,%%v16,%%v17\n\t"
"vfadb %%v17,%%v18,%%v19\n\t"
"vfadb %%v18,%%v20,%%v21\n\t"
"vfadb %%v19,%%v22,%%v23\n\t"
"vfchedb %%v4,%%v17,%%v16\n\t"
"vfchedb %%v5,%%v19,%%v18\n\t"
"vsel %%v16,%%v16,%%v17,%%v4\n\t"
"vsel %%v4,%%v24,%%v25,%%v4\n\t"
"vsel %%v17,%%v18,%%v19,%%v5\n\t"
"vsel %%v5,%%v26,%%v27,%%v5\n\t"
"vfchedb %%v18,%%v17,%%v16\n\t"
"vsel %%v16,%%v16,%%v17,%%v18\n\t"
"vsel %%v4,%%v4,%%v5,%%v18\n\t"
"vag %%v4,%%v4,%%v3\n\t"
"vfchedb %%v5,%%v16,%%v0\n\t"
"vsel %%v0,%%v0,%%v16,%%v5\n\t"
"vsel %%v1,%%v1,%%v4,%%v5\n\t"
"vag %%v3,%%v3,%%v2\n\t"
"vleg %%v16,128(%%r1,%[x]),0\n\t"
"vleg %%v17,136(%%r1,%[x]),0\n\t"
"vleg %%v16,144(%%r1,%[x]),1\n\t"
"vleg %%v17,152(%%r1,%[x]),1\n\t"
"vleg %%v18,160(%%r1,%[x]),0\n\t"
"vleg %%v19,168(%%r1,%[x]),0\n\t"
"vleg %%v18,176(%%r1,%[x]),1\n\t"
"vleg %%v19,184(%%r1,%[x]),1\n\t"
"vleg %%v20,192(%%r1,%[x]),0\n\t"
"vleg %%v21,200(%%r1,%[x]),0\n\t"
"vleg %%v20,208(%%r1,%[x]),1\n\t"
"vleg %%v21,216(%%r1,%[x]),1\n\t"
"vleg %%v22,224(%%r1,%[x]),0\n\t"
"vleg %%v23,232(%%r1,%[x]),0\n\t"
"vleg %%v22,240(%%r1,%[x]),1\n\t"
"vleg %%v23,248(%%r1,%[x]),1\n\t"
"vflpdb %%v16, %%v16\n\t"
"vflpdb %%v17, %%v17\n\t"
"vflpdb %%v18, %%v18\n\t"
"vflpdb %%v19, %%v19\n\t"
"vflpdb %%v20, %%v20\n\t"
"vflpdb %%v21, %%v21\n\t"
"vflpdb %%v22, %%v22\n\t"
"vflpdb %%v23, %%v23\n\t"
"vfadb %%v16,%%v16,%%v17\n\t"
"vfadb %%v17,%%v18,%%v19\n\t"
"vfadb %%v18,%%v20,%%v21\n\t"
"vfadb %%v19,%%v22,%%v23\n\t"
"vfchedb %%v4,%%v17,%%v16\n\t"
"vfchedb %%v5,%%v19,%%v18\n\t"
"vsel %%v16,%%v16,%%v17,%%v4\n\t"
"vsel %%v4,%%v24,%%v25,%%v4\n\t"
"vsel %%v17,%%v18,%%v19,%%v5\n\t"
"vsel %%v5,%%v26,%%v27,%%v5\n\t"
"vfchedb %%v18,%%v17,%%v16\n\t"
"vsel %%v16,%%v16,%%v17,%%v18\n\t"
"vsel %%v4,%%v4,%%v5,%%v18\n\t"
"vag %%v4,%%v4,%%v3\n\t"
"vfchedb %%v5,%%v16,%%v0\n\t"
"vsel %%v0,%%v0,%%v16,%%v5\n\t"
"vsel %%v1,%%v1,%%v4,%%v5\n\t"
"vag %%v3,%%v3,%%v2\n\t"
"agfi %%r1, 256\n\t"
"brctg %[n], 0b\n\t"
"vrepg %%v2,%%v0,1\n\t"
"vrepg %%v3,%%v1,1\n\t"
"wfcdb %%v2,%%v0\n\t"
"jne 1f\n\t"
"vsteg %%v0,%[amin],0\n\t"
"vmnlg %%v0,%%v1,%%v3\n\t"
"vlgvg %[iamin],%%v0,0\n\t"
"j 2f\n\t"
"1:\n\t"
"wfchdb %%v4,%%v0,%%v2\n\t"
"vsel %%v1,%%v3,%%v1,%%v4\n\t"
"vsel %%v0,%%v2,%%v0,%%v4\n\t"
"std %%f0,%[amin]\n\t"
"vlgvg %[iamin],%%v1,0\n\t"
"2:\n\t"
"nop"
: [iamin] "=r"(iamin),[amin] "=m"(*amin),[n] "+&r"(n)
: "m"(*(const FLOAT (*)[n * 2]) x),[x] "a"(x)
: "cc", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v16", "v17", "v18",
"v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27");
"vleg %%v1,8(%[x]),0\n\t"
"vleg %%v0,16(%[x]),1\n\t"
"vleg %%v1,24(%[x]),1\n\t"
"vflpdb %%v0,%%v0\n\t"
"vflpdb %%v1,%%v1\n\t"
"vfadb %%v0,%%v0,%%v1\n\t"
"vleig %%v1,0,0\n\t"
"vleig %%v1,1,1\n\t"
"vrepig %%v2,8\n\t"
"vzero %%v3\n\t"
"vleig %%v24,0,0\n\t"
"vleig %%v24,1,1\n\t"
"vleig %%v25,2,0\n\t"
"vleig %%v25,3,1\n\t"
"vleig %%v26,4,0\n\t"
"vleig %%v26,5,1\n\t"
"vleig %%v27,6,0\n\t"
"vleig %%v27,7,1\n\t"
"srlg %[n],%[n],4\n\t"
"xgr %%r1,%%r1\n\t"
"0:\n\t"
"pfd 1, 1024(%%r1,%[x])\n\t"
"vleg %%v16,0(%%r1,%[x]),0\n\t"
"vleg %%v17,8(%%r1,%[x]),0\n\t"
"vleg %%v16,16(%%r1,%[x]),1\n\t"
"vleg %%v17,24(%%r1,%[x]),1\n\t"
"vleg %%v18,32(%%r1,%[x]),0\n\t"
"vleg %%v19,40(%%r1,%[x]),0\n\t"
"vleg %%v18,48(%%r1,%[x]),1\n\t"
"vleg %%v19,56(%%r1,%[x]),1\n\t"
"vleg %%v20,64(%%r1,%[x]),0\n\t"
"vleg %%v21,72(%%r1,%[x]),0\n\t"
"vleg %%v20,80(%%r1,%[x]),1\n\t"
"vleg %%v21,88(%%r1,%[x]),1\n\t"
"vleg %%v22,96(%%r1,%[x]),0\n\t"
"vleg %%v23,104(%%r1,%[x]),0\n\t"
"vleg %%v22,112(%%r1,%[x]),1\n\t"
"vleg %%v23,120(%%r1,%[x]),1\n\t"
"vflpdb %%v16, %%v16\n\t"
"vflpdb %%v17, %%v17\n\t"
"vflpdb %%v18, %%v18\n\t"
"vflpdb %%v19, %%v19\n\t"
"vflpdb %%v20, %%v20\n\t"
"vflpdb %%v21, %%v21\n\t"
"vflpdb %%v22, %%v22\n\t"
"vflpdb %%v23, %%v23\n\t"
"vfadb %%v16,%%v16,%%v17\n\t"
"vfadb %%v17,%%v18,%%v19\n\t"
"vfadb %%v18,%%v20,%%v21\n\t"
"vfadb %%v19,%%v22,%%v23\n\t"
"vfchedb %%v4,%%v17,%%v16\n\t"
"vfchedb %%v5,%%v19,%%v18\n\t"
"vsel %%v16,%%v16,%%v17,%%v4\n\t"
"vsel %%v4,%%v24,%%v25,%%v4\n\t"
"vsel %%v17,%%v18,%%v19,%%v5\n\t"
"vsel %%v5,%%v26,%%v27,%%v5\n\t"
"vfchedb %%v18,%%v17,%%v16\n\t"
"vsel %%v16,%%v16,%%v17,%%v18\n\t"
"vsel %%v4,%%v4,%%v5,%%v18\n\t"
"vag %%v4,%%v4,%%v3\n\t"
"vfchedb %%v5,%%v16,%%v0\n\t"
"vsel %%v0,%%v0,%%v16,%%v5\n\t"
"vsel %%v1,%%v1,%%v4,%%v5\n\t"
"vag %%v3,%%v3,%%v2\n\t"
"vleg %%v16,128(%%r1,%[x]),0\n\t"
"vleg %%v17,136(%%r1,%[x]),0\n\t"
"vleg %%v16,144(%%r1,%[x]),1\n\t"
"vleg %%v17,152(%%r1,%[x]),1\n\t"
"vleg %%v18,160(%%r1,%[x]),0\n\t"
"vleg %%v19,168(%%r1,%[x]),0\n\t"
"vleg %%v18,176(%%r1,%[x]),1\n\t"
"vleg %%v19,184(%%r1,%[x]),1\n\t"
"vleg %%v20,192(%%r1,%[x]),0\n\t"
"vleg %%v21,200(%%r1,%[x]),0\n\t"
"vleg %%v20,208(%%r1,%[x]),1\n\t"
"vleg %%v21,216(%%r1,%[x]),1\n\t"
"vleg %%v22,224(%%r1,%[x]),0\n\t"
"vleg %%v23,232(%%r1,%[x]),0\n\t"
"vleg %%v22,240(%%r1,%[x]),1\n\t"
"vleg %%v23,248(%%r1,%[x]),1\n\t"
"vflpdb %%v16, %%v16\n\t"
"vflpdb %%v17, %%v17\n\t"
"vflpdb %%v18, %%v18\n\t"
"vflpdb %%v19, %%v19\n\t"
"vflpdb %%v20, %%v20\n\t"
"vflpdb %%v21, %%v21\n\t"
"vflpdb %%v22, %%v22\n\t"
"vflpdb %%v23, %%v23\n\t"
"vfadb %%v16,%%v16,%%v17\n\t"
"vfadb %%v17,%%v18,%%v19\n\t"
"vfadb %%v18,%%v20,%%v21\n\t"
"vfadb %%v19,%%v22,%%v23\n\t"
"vfchedb %%v4,%%v17,%%v16\n\t"
"vfchedb %%v5,%%v19,%%v18\n\t"
"vsel %%v16,%%v16,%%v17,%%v4\n\t"
"vsel %%v4,%%v24,%%v25,%%v4\n\t"
"vsel %%v17,%%v18,%%v19,%%v5\n\t"
"vsel %%v5,%%v26,%%v27,%%v5\n\t"
"vfchedb %%v18,%%v17,%%v16\n\t"
"vsel %%v16,%%v16,%%v17,%%v18\n\t"
"vsel %%v4,%%v4,%%v5,%%v18\n\t"
"vag %%v4,%%v4,%%v3\n\t"
"vfchedb %%v5,%%v16,%%v0\n\t"
"vsel %%v0,%%v0,%%v16,%%v5\n\t"
"vsel %%v1,%%v1,%%v4,%%v5\n\t"
"vag %%v3,%%v3,%%v2\n\t"
"agfi %%r1, 256\n\t"
"brctg %[n], 0b\n\t"
"vrepg %%v2,%%v0,1\n\t"
"vrepg %%v3,%%v1,1\n\t"
"wfcdb %%v2,%%v0\n\t"
"jne 1f\n\t"
"vsteg %%v0,%[amin],0\n\t"
"vmnlg %%v0,%%v1,%%v3\n\t"
"vlgvg %[iamin],%%v0,0\n\t"
"j 2f\n\t"
"1:\n\t"
"wfchdb %%v4,%%v0,%%v2\n\t"
"vsel %%v1,%%v3,%%v1,%%v4\n\t"
"vsel %%v0,%%v2,%%v0,%%v4\n\t"
"std %%f0,%[amin]\n\t"
"vlgvg %[iamin],%%v1,0\n\t"
"2:\n\t"
"nop"
: [iamin] "=r"(iamin),[amin] "=Q"(*amin),[n] "+&r"(n)
: "m"(*(const struct { FLOAT x[n * 2]; } *) x),[x] "a"(x)
: "cc", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v16", "v17", "v18",
"v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27");
return iamin;
}

View File

@ -34,53 +34,53 @@ static FLOAT samax_kernel_64(BLASLONG n, FLOAT *x) {
FLOAT amax;
__asm__("vl %%v0,0(%[x])\n\t"
"srlg %[n],%[n],6\n\t"
"xgr %%r1,%%r1\n\t"
"0:\n\t"
"pfd 1, 1024(%%r1,%[x])\n\t"
"vl %%v16,0(%%r1,%[x])\n\t"
"vl %%v17,16(%%r1,%[x])\n\t"
"vl %%v18,32(%%r1,%[x])\n\t"
"vl %%v19,48(%%r1,%[x])\n\t"
"vl %%v20,64(%%r1,%[x])\n\t"
"vl %%v21,80(%%r1,%[x])\n\t"
"vl %%v22,96(%%r1,%[x])\n\t"
"vl %%v23,112(%%r1,%[x])\n\t"
"vl %%v24,128(%%r1,%[x])\n\t"
"vl %%v25,144(%%r1,%[x])\n\t"
"vl %%v26,160(%%r1,%[x])\n\t"
"vl %%v27,176(%%r1,%[x])\n\t"
"vl %%v28,192(%%r1,%[x])\n\t"
"vl %%v29,208(%%r1,%[x])\n\t"
"vl %%v30,224(%%r1,%[x])\n\t"
"vl %%v31,240(%%r1,%[x])\n\t"
"vfmaxsb %%v16,%%v16,%%v24,8\n\t"
"vfmaxsb %%v17,%%v17,%%v25,8\n\t"
"vfmaxsb %%v18,%%v18,%%v26,8\n\t"
"vfmaxsb %%v19,%%v19,%%v27,8\n\t"
"vfmaxsb %%v20,%%v20,%%v28,8\n\t"
"vfmaxsb %%v21,%%v21,%%v29,8\n\t"
"vfmaxsb %%v22,%%v22,%%v30,8\n\t"
"vfmaxsb %%v23,%%v23,%%v31,8\n\t"
"vfmaxsb %%v16,%%v16,%%v20,8\n\t"
"vfmaxsb %%v17,%%v17,%%v21,8\n\t"
"vfmaxsb %%v18,%%v18,%%v22,8\n\t"
"vfmaxsb %%v19,%%v19,%%v23,8\n\t"
"vfmaxsb %%v16,%%v16,%%v18,8\n\t"
"vfmaxsb %%v17,%%v17,%%v19,8\n\t"
"vfmaxsb %%v16,%%v16,%%v17,8\n\t"
"vfmaxsb %%v0,%%v0,%%v16,8\n\t"
"agfi %%r1, 256\n\t"
"brctg %[n], 0b\n\t"
"veslg %%v16,%%v0,32\n\t"
"vfmaxsb %%v0,%%v0,%%v16,8\n\t"
"vrepf %%v16,%%v0,2\n\t"
"wfmaxsb %%v0,%%v0,%%v16,8\n\t"
"lper %[amax],%%f0"
: [amax] "=f"(amax),[n] "+&r"(n)
: "m"(*(const FLOAT (*)[n]) x),[x] "a"(x)
: "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22",
"v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31");
"srlg %[n],%[n],6\n\t"
"xgr %%r1,%%r1\n\t"
"0:\n\t"
"pfd 1, 1024(%%r1,%[x])\n\t"
"vl %%v16,0(%%r1,%[x])\n\t"
"vl %%v17,16(%%r1,%[x])\n\t"
"vl %%v18,32(%%r1,%[x])\n\t"
"vl %%v19,48(%%r1,%[x])\n\t"
"vl %%v20,64(%%r1,%[x])\n\t"
"vl %%v21,80(%%r1,%[x])\n\t"
"vl %%v22,96(%%r1,%[x])\n\t"
"vl %%v23,112(%%r1,%[x])\n\t"
"vl %%v24,128(%%r1,%[x])\n\t"
"vl %%v25,144(%%r1,%[x])\n\t"
"vl %%v26,160(%%r1,%[x])\n\t"
"vl %%v27,176(%%r1,%[x])\n\t"
"vl %%v28,192(%%r1,%[x])\n\t"
"vl %%v29,208(%%r1,%[x])\n\t"
"vl %%v30,224(%%r1,%[x])\n\t"
"vl %%v31,240(%%r1,%[x])\n\t"
"vfmaxsb %%v16,%%v16,%%v24,8\n\t"
"vfmaxsb %%v17,%%v17,%%v25,8\n\t"
"vfmaxsb %%v18,%%v18,%%v26,8\n\t"
"vfmaxsb %%v19,%%v19,%%v27,8\n\t"
"vfmaxsb %%v20,%%v20,%%v28,8\n\t"
"vfmaxsb %%v21,%%v21,%%v29,8\n\t"
"vfmaxsb %%v22,%%v22,%%v30,8\n\t"
"vfmaxsb %%v23,%%v23,%%v31,8\n\t"
"vfmaxsb %%v16,%%v16,%%v20,8\n\t"
"vfmaxsb %%v17,%%v17,%%v21,8\n\t"
"vfmaxsb %%v18,%%v18,%%v22,8\n\t"
"vfmaxsb %%v19,%%v19,%%v23,8\n\t"
"vfmaxsb %%v16,%%v16,%%v18,8\n\t"
"vfmaxsb %%v17,%%v17,%%v19,8\n\t"
"vfmaxsb %%v16,%%v16,%%v17,8\n\t"
"vfmaxsb %%v0,%%v0,%%v16,8\n\t"
"agfi %%r1, 256\n\t"
"brctg %[n], 0b\n\t"
"veslg %%v16,%%v0,32\n\t"
"vfmaxsb %%v0,%%v0,%%v16,8\n\t"
"vrepf %%v16,%%v0,2\n\t"
"wfmaxsb %%v0,%%v0,%%v16,8\n\t"
"lper %[amax],%%f0"
: [amax] "=f"(amax),[n] "+&r"(n)
: "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x)
: "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22",
"v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31");
return amax;
}

View File

@ -34,53 +34,53 @@ static FLOAT samin_kernel_64(BLASLONG n, FLOAT *x) {
FLOAT amin;
__asm__("vl %%v0,0(%[x])\n\t"
"srlg %[n],%[n],6\n\t"
"xgr %%r1,%%r1\n\t"
"0:\n\t"
"pfd 1, 1024(%%r1,%[x])\n\t"
"vl %%v16,0(%%r1,%[x])\n\t"
"vl %%v17,16(%%r1,%[x])\n\t"
"vl %%v18,32(%%r1,%[x])\n\t"
"vl %%v19,48(%%r1,%[x])\n\t"
"vl %%v20,64(%%r1,%[x])\n\t"
"vl %%v21,80(%%r1,%[x])\n\t"
"vl %%v22,96(%%r1,%[x])\n\t"
"vl %%v23,112(%%r1,%[x])\n\t"
"vl %%v24,128(%%r1,%[x])\n\t"
"vl %%v25,144(%%r1,%[x])\n\t"
"vl %%v26,160(%%r1,%[x])\n\t"
"vl %%v27,176(%%r1,%[x])\n\t"
"vl %%v28,192(%%r1,%[x])\n\t"
"vl %%v29,208(%%r1,%[x])\n\t"
"vl %%v30,224(%%r1,%[x])\n\t"
"vl %%v31,240(%%r1,%[x])\n\t"
"vfminsb %%v16,%%v16,%%v24,8\n\t"
"vfminsb %%v17,%%v17,%%v25,8\n\t"
"vfminsb %%v18,%%v18,%%v26,8\n\t"
"vfminsb %%v19,%%v19,%%v27,8\n\t"
"vfminsb %%v20,%%v20,%%v28,8\n\t"
"vfminsb %%v21,%%v21,%%v29,8\n\t"
"vfminsb %%v22,%%v22,%%v30,8\n\t"
"vfminsb %%v23,%%v23,%%v31,8\n\t"
"vfminsb %%v16,%%v16,%%v20,8\n\t"
"vfminsb %%v17,%%v17,%%v21,8\n\t"
"vfminsb %%v18,%%v18,%%v22,8\n\t"
"vfminsb %%v19,%%v19,%%v23,8\n\t"
"vfminsb %%v16,%%v16,%%v18,8\n\t"
"vfminsb %%v17,%%v17,%%v19,8\n\t"
"vfminsb %%v16,%%v16,%%v17,8\n\t"
"vfminsb %%v0,%%v0,%%v16,8\n\t"
"agfi %%r1, 256\n\t"
"brctg %[n], 0b\n\t"
"veslg %%v16,%%v0,32\n\t"
"vfminsb %%v0,%%v0,%%v16,8\n\t"
"vrepf %%v16,%%v0,2\n\t"
"wfminsb %%v0,%%v0,%%v16,8\n\t"
"lper %[amin],%%f0"
: [amin] "=f"(amin),[n] "+&r"(n)
: "m"(*(const FLOAT (*)[n]) x),[x] "a"(x)
: "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22",
"v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31");
"srlg %[n],%[n],6\n\t"
"xgr %%r1,%%r1\n\t"
"0:\n\t"
"pfd 1, 1024(%%r1,%[x])\n\t"
"vl %%v16,0(%%r1,%[x])\n\t"
"vl %%v17,16(%%r1,%[x])\n\t"
"vl %%v18,32(%%r1,%[x])\n\t"
"vl %%v19,48(%%r1,%[x])\n\t"
"vl %%v20,64(%%r1,%[x])\n\t"
"vl %%v21,80(%%r1,%[x])\n\t"
"vl %%v22,96(%%r1,%[x])\n\t"
"vl %%v23,112(%%r1,%[x])\n\t"
"vl %%v24,128(%%r1,%[x])\n\t"
"vl %%v25,144(%%r1,%[x])\n\t"
"vl %%v26,160(%%r1,%[x])\n\t"
"vl %%v27,176(%%r1,%[x])\n\t"
"vl %%v28,192(%%r1,%[x])\n\t"
"vl %%v29,208(%%r1,%[x])\n\t"
"vl %%v30,224(%%r1,%[x])\n\t"
"vl %%v31,240(%%r1,%[x])\n\t"
"vfminsb %%v16,%%v16,%%v24,8\n\t"
"vfminsb %%v17,%%v17,%%v25,8\n\t"
"vfminsb %%v18,%%v18,%%v26,8\n\t"
"vfminsb %%v19,%%v19,%%v27,8\n\t"
"vfminsb %%v20,%%v20,%%v28,8\n\t"
"vfminsb %%v21,%%v21,%%v29,8\n\t"
"vfminsb %%v22,%%v22,%%v30,8\n\t"
"vfminsb %%v23,%%v23,%%v31,8\n\t"
"vfminsb %%v16,%%v16,%%v20,8\n\t"
"vfminsb %%v17,%%v17,%%v21,8\n\t"
"vfminsb %%v18,%%v18,%%v22,8\n\t"
"vfminsb %%v19,%%v19,%%v23,8\n\t"
"vfminsb %%v16,%%v16,%%v18,8\n\t"
"vfminsb %%v17,%%v17,%%v19,8\n\t"
"vfminsb %%v16,%%v16,%%v17,8\n\t"
"vfminsb %%v0,%%v0,%%v16,8\n\t"
"agfi %%r1, 256\n\t"
"brctg %[n], 0b\n\t"
"veslg %%v16,%%v0,32\n\t"
"vfminsb %%v0,%%v0,%%v16,8\n\t"
"vrepf %%v16,%%v0,2\n\t"
"wfminsb %%v0,%%v0,%%v16,8\n\t"
"lper %[amin],%%f0"
: [amin] "=f"(amin),[n] "+&r"(n)
: "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x)
: "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22",
"v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31");
return amin;
}

View File

@ -34,83 +34,83 @@ static FLOAT sasum_kernel_64(BLASLONG n, FLOAT *x) {
FLOAT asum;
__asm__("vzero %%v24\n\t"
"vzero %%v25\n\t"
"vzero %%v26\n\t"
"vzero %%v27\n\t"
"vzero %%v28\n\t"
"vzero %%v29\n\t"
"vzero %%v30\n\t"
"vzero %%v31\n\t"
"srlg %[n],%[n],6\n\t"
"xgr %%r1,%%r1\n\t"
"0:\n\t"
"pfd 1, 1024(%%r1,%[x])\n\t"
"vl %%v16, 0(%%r1,%[x])\n\t"
"vl %%v17, 16(%%r1,%[x])\n\t"
"vl %%v18, 32(%%r1,%[x])\n\t"
"vl %%v19, 48(%%r1,%[x])\n\t"
"vl %%v20, 64(%%r1,%[x])\n\t"
"vl %%v21, 80(%%r1,%[x])\n\t"
"vl %%v22, 96(%%r1,%[x])\n\t"
"vl %%v23, 112(%%r1,%[x])\n\t"
"vflpsb %%v16, %%v16\n\t"
"vflpsb %%v17, %%v17\n\t"
"vflpsb %%v18, %%v18\n\t"
"vflpsb %%v19, %%v19\n\t"
"vflpsb %%v20, %%v20\n\t"
"vflpsb %%v21, %%v21\n\t"
"vflpsb %%v22, %%v22\n\t"
"vflpsb %%v23, %%v23\n\t"
"vfasb %%v24,%%v24,%%v16\n\t"
"vfasb %%v25,%%v25,%%v17\n\t"
"vfasb %%v26,%%v26,%%v18\n\t"
"vfasb %%v27,%%v27,%%v19\n\t"
"vfasb %%v28,%%v28,%%v20\n\t"
"vfasb %%v29,%%v29,%%v21\n\t"
"vfasb %%v30,%%v30,%%v22\n\t"
"vfasb %%v31,%%v31,%%v23\n\t"
"vl %%v16, 128(%%r1,%[x])\n\t"
"vl %%v17, 144(%%r1,%[x])\n\t"
"vl %%v18, 160(%%r1,%[x])\n\t"
"vl %%v19, 176(%%r1,%[x])\n\t"
"vl %%v20, 192(%%r1,%[x])\n\t"
"vl %%v21, 208(%%r1,%[x])\n\t"
"vl %%v22, 224(%%r1,%[x])\n\t"
"vl %%v23, 240(%%r1,%[x])\n\t"
"vflpsb %%v16, %%v16\n\t"
"vflpsb %%v17, %%v17\n\t"
"vflpsb %%v18, %%v18\n\t"
"vflpsb %%v19, %%v19\n\t"
"vflpsb %%v20, %%v20\n\t"
"vflpsb %%v21, %%v21\n\t"
"vflpsb %%v22, %%v22\n\t"
"vflpsb %%v23, %%v23\n\t"
"vfasb %%v24,%%v24,%%v16\n\t"
"vfasb %%v25,%%v25,%%v17\n\t"
"vfasb %%v26,%%v26,%%v18\n\t"
"vfasb %%v27,%%v27,%%v19\n\t"
"vfasb %%v28,%%v28,%%v20\n\t"
"vfasb %%v29,%%v29,%%v21\n\t"
"vfasb %%v30,%%v30,%%v22\n\t"
"vfasb %%v31,%%v31,%%v23\n\t"
"agfi %%r1,256\n\t"
"brctg %[n],0b\n\t"
"vfasb %%v24,%%v24,%%v25\n\t"
"vfasb %%v24,%%v24,%%v26\n\t"
"vfasb %%v24,%%v24,%%v27\n\t"
"vfasb %%v24,%%v24,%%v28\n\t"
"vfasb %%v24,%%v24,%%v29\n\t"
"vfasb %%v24,%%v24,%%v30\n\t"
"vfasb %%v24,%%v24,%%v31\n\t"
"veslg %%v25,%%v24,32\n\t"
"vfasb %%v24,%%v24,%%v25\n\t"
"vrepf %%v25,%%v24,2\n\t"
"vfasb %%v24,%%v24,%%v25\n\t"
"vstef %%v24,%[asum],0"
: [asum] "=m"(asum),[n] "+&r"(n)
: "m"(*(const FLOAT (*)[n]) x),[x] "a"(x)
: "cc", "r1", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23",
"v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31");
"vzero %%v25\n\t"
"vzero %%v26\n\t"
"vzero %%v27\n\t"
"vzero %%v28\n\t"
"vzero %%v29\n\t"
"vzero %%v30\n\t"
"vzero %%v31\n\t"
"srlg %[n],%[n],6\n\t"
"xgr %%r1,%%r1\n\t"
"0:\n\t"
"pfd 1, 1024(%%r1,%[x])\n\t"
"vl %%v16, 0(%%r1,%[x])\n\t"
"vl %%v17, 16(%%r1,%[x])\n\t"
"vl %%v18, 32(%%r1,%[x])\n\t"
"vl %%v19, 48(%%r1,%[x])\n\t"
"vl %%v20, 64(%%r1,%[x])\n\t"
"vl %%v21, 80(%%r1,%[x])\n\t"
"vl %%v22, 96(%%r1,%[x])\n\t"
"vl %%v23, 112(%%r1,%[x])\n\t"
"vflpsb %%v16, %%v16\n\t"
"vflpsb %%v17, %%v17\n\t"
"vflpsb %%v18, %%v18\n\t"
"vflpsb %%v19, %%v19\n\t"
"vflpsb %%v20, %%v20\n\t"
"vflpsb %%v21, %%v21\n\t"
"vflpsb %%v22, %%v22\n\t"
"vflpsb %%v23, %%v23\n\t"
"vfasb %%v24,%%v24,%%v16\n\t"
"vfasb %%v25,%%v25,%%v17\n\t"
"vfasb %%v26,%%v26,%%v18\n\t"
"vfasb %%v27,%%v27,%%v19\n\t"
"vfasb %%v28,%%v28,%%v20\n\t"
"vfasb %%v29,%%v29,%%v21\n\t"
"vfasb %%v30,%%v30,%%v22\n\t"
"vfasb %%v31,%%v31,%%v23\n\t"
"vl %%v16, 128(%%r1,%[x])\n\t"
"vl %%v17, 144(%%r1,%[x])\n\t"
"vl %%v18, 160(%%r1,%[x])\n\t"
"vl %%v19, 176(%%r1,%[x])\n\t"
"vl %%v20, 192(%%r1,%[x])\n\t"
"vl %%v21, 208(%%r1,%[x])\n\t"
"vl %%v22, 224(%%r1,%[x])\n\t"
"vl %%v23, 240(%%r1,%[x])\n\t"
"vflpsb %%v16, %%v16\n\t"
"vflpsb %%v17, %%v17\n\t"
"vflpsb %%v18, %%v18\n\t"
"vflpsb %%v19, %%v19\n\t"
"vflpsb %%v20, %%v20\n\t"
"vflpsb %%v21, %%v21\n\t"
"vflpsb %%v22, %%v22\n\t"
"vflpsb %%v23, %%v23\n\t"
"vfasb %%v24,%%v24,%%v16\n\t"
"vfasb %%v25,%%v25,%%v17\n\t"
"vfasb %%v26,%%v26,%%v18\n\t"
"vfasb %%v27,%%v27,%%v19\n\t"
"vfasb %%v28,%%v28,%%v20\n\t"
"vfasb %%v29,%%v29,%%v21\n\t"
"vfasb %%v30,%%v30,%%v22\n\t"
"vfasb %%v31,%%v31,%%v23\n\t"
"agfi %%r1,256\n\t"
"brctg %[n],0b\n\t"
"vfasb %%v24,%%v24,%%v25\n\t"
"vfasb %%v24,%%v24,%%v26\n\t"
"vfasb %%v24,%%v24,%%v27\n\t"
"vfasb %%v24,%%v24,%%v28\n\t"
"vfasb %%v24,%%v24,%%v29\n\t"
"vfasb %%v24,%%v24,%%v30\n\t"
"vfasb %%v24,%%v24,%%v31\n\t"
"veslg %%v25,%%v24,32\n\t"
"vfasb %%v24,%%v24,%%v25\n\t"
"vrepf %%v25,%%v24,2\n\t"
"vfasb %%v24,%%v24,%%v25\n\t"
"vstef %%v24,%[asum],0"
: [asum] "=Q"(asum),[n] "+&r"(n)
: "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x)
: "cc", "r1", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23",
"v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31");
return asum;
}

View File

@ -29,82 +29,82 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
/*
 * y[i] = *alpha * x[i] + y[i] for a single-precision vector.
 *
 * n     - element count; shifted right by 6 on entry, so every loop pass
 *         updates 64 elements (256 bytes), processed as two halves of 32.
 * x     - input vector (read only); the memory operand covers n FLOATs.
 * y     - vector updated in place; the memory operand covers n FLOATs.
 * alpha - pointer to the scale factor, replicated across v0 once up front.
 *
 * NOTE(review): the loop is bottom-tested (brctg), so n >> 6 must be at
 * least 1 - presumably guaranteed by the caller; confirm.
 */
static void saxpy_kernel_64(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) {
  __asm__("vlrepf %%v0,%[alpha]\n\t"
          /* n = number of 64-element passes, r1 = byte offset into x and y */
          "srlg %[n],%[n],6\n\t"
          "xgr %%r1,%%r1\n\t"
          "0:\n\t"
          "pfd 1, 1024(%%r1,%[x])\n\t"
          "pfd 2, 1024(%%r1,%[y])\n\t"
          /* first half: x into v16..v19/v24..v27, y into v20..v23/v28..v31 */
          "vl %%v16,0(%%r1,%[x])\n\t"
          "vl %%v17,16(%%r1,%[x])\n\t"
          "vl %%v18,32(%%r1,%[x])\n\t"
          "vl %%v19,48(%%r1,%[x])\n\t"
          "vl %%v20,0(%%r1,%[y])\n\t"
          "vl %%v21,16(%%r1,%[y])\n\t"
          "vl %%v22,32(%%r1,%[y])\n\t"
          "vl %%v23,48(%%r1,%[y])\n\t"
          "vl %%v24,64(%%r1,%[x])\n\t"
          "vl %%v25,80(%%r1,%[x])\n\t"
          "vl %%v26,96(%%r1,%[x])\n\t"
          "vl %%v27,112(%%r1,%[x])\n\t"
          "vl %%v28,64(%%r1,%[y])\n\t"
          "vl %%v29,80(%%r1,%[y])\n\t"
          "vl %%v30,96(%%r1,%[y])\n\t"
          "vl %%v31,112(%%r1,%[y])\n\t"
          /* fused multiply-add: alpha * x + y */
          "vfmasb %%v16,%%v0,%%v16,%%v20\n\t"
          "vfmasb %%v17,%%v0,%%v17,%%v21\n\t"
          "vfmasb %%v18,%%v0,%%v18,%%v22\n\t"
          "vfmasb %%v19,%%v0,%%v19,%%v23\n\t"
          "vfmasb %%v24,%%v0,%%v24,%%v28\n\t"
          "vfmasb %%v25,%%v0,%%v25,%%v29\n\t"
          "vfmasb %%v26,%%v0,%%v26,%%v30\n\t"
          "vfmasb %%v27,%%v0,%%v27,%%v31\n\t"
          "vst %%v16,0(%%r1,%[y])\n\t"
          "vst %%v17,16(%%r1,%[y])\n\t"
          "vst %%v18,32(%%r1,%[y])\n\t"
          "vst %%v19,48(%%r1,%[y])\n\t"
          "vst %%v24,64(%%r1,%[y])\n\t"
          "vst %%v25,80(%%r1,%[y])\n\t"
          "vst %%v26,96(%%r1,%[y])\n\t"
          "vst %%v27,112(%%r1,%[y])\n\t"
          /* second half: same sequence for the next 32 elements */
          "vl %%v16,128(%%r1,%[x])\n\t"
          "vl %%v17,144(%%r1,%[x])\n\t"
          "vl %%v18,160(%%r1,%[x])\n\t"
          "vl %%v19,176(%%r1,%[x])\n\t"
          "vl %%v20,128(%%r1,%[y])\n\t"
          "vl %%v21,144(%%r1,%[y])\n\t"
          "vl %%v22,160(%%r1,%[y])\n\t"
          "vl %%v23,176(%%r1,%[y])\n\t"
          "vl %%v24,192(%%r1,%[x])\n\t"
          "vl %%v25,208(%%r1,%[x])\n\t"
          "vl %%v26,224(%%r1,%[x])\n\t"
          "vl %%v27,240(%%r1,%[x])\n\t"
          "vl %%v28,192(%%r1,%[y])\n\t"
          "vl %%v29,208(%%r1,%[y])\n\t"
          "vl %%v30,224(%%r1,%[y])\n\t"
          "vl %%v31,240(%%r1,%[y])\n\t"
          "vfmasb %%v16,%%v0,%%v16,%%v20\n\t"
          "vfmasb %%v17,%%v0,%%v17,%%v21\n\t"
          "vfmasb %%v18,%%v0,%%v18,%%v22\n\t"
          "vfmasb %%v19,%%v0,%%v19,%%v23\n\t"
          "vfmasb %%v24,%%v0,%%v24,%%v28\n\t"
          "vfmasb %%v25,%%v0,%%v25,%%v29\n\t"
          "vfmasb %%v26,%%v0,%%v26,%%v30\n\t"
          "vfmasb %%v27,%%v0,%%v27,%%v31\n\t"
          "vst %%v16,128(%%r1,%[y])\n\t"
          "vst %%v17,144(%%r1,%[y])\n\t"
          "vst %%v18,160(%%r1,%[y])\n\t"
          "vst %%v19,176(%%r1,%[y])\n\t"
          "vst %%v24,192(%%r1,%[y])\n\t"
          "vst %%v25,208(%%r1,%[y])\n\t"
          "vst %%v26,224(%%r1,%[y])\n\t"
          "vst %%v27,240(%%r1,%[y])\n\t"
          "agfi %%r1,256\n\t"
          "brctg %[n],0b"
          /* "Q" keeps *alpha at a short-displacement address without an
             index register, which vlrepf can encode */
          : "+m"(*(struct { FLOAT x[n]; } *) y),[n] "+&r"(n)
          : [y] "a"(y), "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x),
            [alpha] "Q"(*alpha)
          : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22",
            "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31");
}
int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x,

View File

@ -29,16 +29,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
/*
 * Copy a single-precision vector from x to y in 256-byte chunks.
 *
 * n - element count; shifted right by 6 on entry, so every loop pass copies
 *     one 256-byte chunk (64 elements) with a single mvc.
 * x - source; the memory operand covers n FLOATs.
 * y - destination; the memory operand covers n FLOATs.
 *
 * Both pointers are advanced inside the asm, hence the "+&a" read-write
 * operands; the caller's copies are unaffected because they are passed by
 * value.
 *
 * NOTE(review): the loop is bottom-tested (brctg), so n >> 6 must be at
 * least 1 - presumably guaranteed by the caller; confirm.
 */
static void scopy_kernel_64(BLASLONG n, FLOAT *x, FLOAT *y) {
  __asm__("srlg %[n],%[n],6\n\t"
          "0:\n\t"
          "pfd 1, 1024(%[x])\n\t"
          "pfd 2, 1024(%[y])\n\t"
          "mvc 0(256,%[y]),0(%[x])\n\t"
          "la %[x],256(%[x])\n\t"
          "la %[y],256(%[y])\n\t"
          "brctg %[n],0b"
          : "=m"(*(struct { FLOAT x[n]; } *) y),[x] "+&a"(x),[y] "+&a"(y),
            [n] "+&r"(n)
          : "m"(*(const struct { FLOAT x[n]; } *) x)
          : "cc");
}
int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) {

View File

@ -31,64 +31,64 @@ static FLOAT sdot_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y) {
FLOAT dot;
__asm__("vzero %%v0\n\t"
"vzero %%v1\n\t"
"vzero %%v2\n\t"
"vzero %%v3\n\t"
"vzero %%v4\n\t"
"vzero %%v5\n\t"
"vzero %%v6\n\t"
"vzero %%v7\n\t"
"srlg %[n],%[n],5\n\t"
"xgr %%r1,%%r1\n\t"
"0:\n\t"
"pfd 1,1024(%%r1,%[x])\n\t"
"pfd 1,1024(%%r1,%[y])\n\t"
"vl %%v16,0(%%r1,%[x])\n\t"
"vl %%v17,16(%%r1,%[x])\n\t"
"vl %%v18,32(%%r1,%[x])\n\t"
"vl %%v19,48(%%r1,%[x])\n\t"
"vl %%v20,64(%%r1,%[x])\n\t"
"vl %%v21,80(%%r1,%[x])\n\t"
"vl %%v22,96(%%r1,%[x])\n\t"
"vl %%v23,112(%%r1,%[x])\n\t"
"vl %%v24,0(%%r1,%[y])\n\t"
"vl %%v25,16(%%r1,%[y])\n\t"
"vl %%v26,32(%%r1,%[y])\n\t"
"vl %%v27,48(%%r1,%[y])\n\t"
"vl %%v28,64(%%r1,%[y])\n\t"
"vl %%v29,80(%%r1,%[y])\n\t"
"vl %%v30,96(%%r1,%[y])\n\t"
"vl %%v31,112(%%r1,%[y])\n\t"
"vfmasb %%v0,%%v16,%%v24,%%v0\n\t"
"vfmasb %%v1,%%v17,%%v25,%%v1\n\t"
"vfmasb %%v2,%%v18,%%v26,%%v2\n\t"
"vfmasb %%v3,%%v19,%%v27,%%v3\n\t"
"vfmasb %%v4,%%v20,%%v28,%%v4\n\t"
"vfmasb %%v5,%%v21,%%v29,%%v5\n\t"
"vfmasb %%v6,%%v22,%%v30,%%v6\n\t"
"vfmasb %%v7,%%v23,%%v31,%%v7\n\t"
"agfi %%r1,128\n\t"
"brctg %[n],0b\n\t"
"vfasb %%v0,%%v0,%%v1\n\t"
"vfasb %%v0,%%v0,%%v2\n\t"
"vfasb %%v0,%%v0,%%v3\n\t"
"vfasb %%v0,%%v0,%%v4\n\t"
"vfasb %%v0,%%v0,%%v5\n\t"
"vfasb %%v0,%%v0,%%v6\n\t"
"vfasb %%v0,%%v0,%%v7\n\t"
"vrepf %%v1,%%v0,1\n\t"
"vrepf %%v2,%%v0,2\n\t"
"vrepf %%v3,%%v0,3\n\t"
"aebr %%f0,%%f1\n\t"
"aebr %%f0,%%f2\n\t"
"aebr %%f0,%%f3\n\t"
"ler %[dot],%%f0"
: [dot] "=f"(dot),[n] "+&r"(n)
: "m"(*(const FLOAT (*)[n]) x),[x] "a"(x), "m"(*(const FLOAT (*)[n]) y),
[y] "a"(y)
: "cc", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16",
"v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26",
"v27", "v28", "v29", "v30", "v31");
"vzero %%v1\n\t"
"vzero %%v2\n\t"
"vzero %%v3\n\t"
"vzero %%v4\n\t"
"vzero %%v5\n\t"
"vzero %%v6\n\t"
"vzero %%v7\n\t"
"srlg %[n],%[n],5\n\t"
"xgr %%r1,%%r1\n\t"
"0:\n\t"
"pfd 1,1024(%%r1,%[x])\n\t"
"pfd 1,1024(%%r1,%[y])\n\t"
"vl %%v16,0(%%r1,%[x])\n\t"
"vl %%v17,16(%%r1,%[x])\n\t"
"vl %%v18,32(%%r1,%[x])\n\t"
"vl %%v19,48(%%r1,%[x])\n\t"
"vl %%v20,64(%%r1,%[x])\n\t"
"vl %%v21,80(%%r1,%[x])\n\t"
"vl %%v22,96(%%r1,%[x])\n\t"
"vl %%v23,112(%%r1,%[x])\n\t"
"vl %%v24,0(%%r1,%[y])\n\t"
"vl %%v25,16(%%r1,%[y])\n\t"
"vl %%v26,32(%%r1,%[y])\n\t"
"vl %%v27,48(%%r1,%[y])\n\t"
"vl %%v28,64(%%r1,%[y])\n\t"
"vl %%v29,80(%%r1,%[y])\n\t"
"vl %%v30,96(%%r1,%[y])\n\t"
"vl %%v31,112(%%r1,%[y])\n\t"
"vfmasb %%v0,%%v16,%%v24,%%v0\n\t"
"vfmasb %%v1,%%v17,%%v25,%%v1\n\t"
"vfmasb %%v2,%%v18,%%v26,%%v2\n\t"
"vfmasb %%v3,%%v19,%%v27,%%v3\n\t"
"vfmasb %%v4,%%v20,%%v28,%%v4\n\t"
"vfmasb %%v5,%%v21,%%v29,%%v5\n\t"
"vfmasb %%v6,%%v22,%%v30,%%v6\n\t"
"vfmasb %%v7,%%v23,%%v31,%%v7\n\t"
"agfi %%r1,128\n\t"
"brctg %[n],0b\n\t"
"vfasb %%v0,%%v0,%%v1\n\t"
"vfasb %%v0,%%v0,%%v2\n\t"
"vfasb %%v0,%%v0,%%v3\n\t"
"vfasb %%v0,%%v0,%%v4\n\t"
"vfasb %%v0,%%v0,%%v5\n\t"
"vfasb %%v0,%%v0,%%v6\n\t"
"vfasb %%v0,%%v0,%%v7\n\t"
"vrepf %%v1,%%v0,1\n\t"
"vrepf %%v2,%%v0,2\n\t"
"vrepf %%v3,%%v0,3\n\t"
"aebr %%f0,%%f1\n\t"
"aebr %%f0,%%f2\n\t"
"aebr %%f0,%%f3\n\t"
"ler %[dot],%%f0"
: [dot] "=f"(dot),[n] "+&r"(n)
: "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x),
"m"(*(const struct { FLOAT x[n]; } *) y),[y] "a"(y)
: "cc", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16",
"v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26",
"v27", "v28", "v29", "v30", "v31");
return dot;
}

View File

@ -31,304 +31,314 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
static void sgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y,
FLOAT *alpha) {
register FLOAT *ap0 = ap[0];
register FLOAT *ap1 = ap[1];
register FLOAT *ap2 = ap[2];
register FLOAT *ap3 = ap[3];
__asm__("vlrepf %%v0,0(%[x])\n\t"
"vlrepf %%v1,4(%[x])\n\t"
"vlrepf %%v2,8(%[x])\n\t"
"vlrepf %%v3,12(%[x])\n\t"
"vlrepf %%v4,%[alpha]\n\t"
"vfmsb %%v0,%%v0,%%v4\n\t"
"vfmsb %%v1,%%v1,%%v4\n\t"
"vfmsb %%v2,%%v2,%%v4\n\t"
"vfmsb %%v3,%%v3,%%v4\n\t"
"xgr %%r1,%%r1\n\t"
"lghi %%r0,-32\n\t"
"ngr %%r0,%[n]\n\t"
"ltgr %%r0,%%r0\n\t"
"jz 1f\n\t"
"srlg %%r0,%%r0,5\n\t"
"0:\n\t"
"pfd 1,1024(%%r1,%[ap0])\n\t"
"pfd 1,1024(%%r1,%[ap1])\n\t"
"pfd 1,1024(%%r1,%[ap2])\n\t"
"pfd 1,1024(%%r1,%[ap3])\n\t"
"pfd 2,1024(%%r1,%[y])\n\t"
"vl %%v16,0(%%r1,%[ap0])\n\t"
"vl %%v17,0(%%r1,%[ap1])\n\t"
"vl %%v18,0(%%r1,%[ap2])\n\t"
"vl %%v19,0(%%r1,%[ap3])\n\t"
"vl %%v20,16(%%r1,%[ap0])\n\t"
"vl %%v21,16(%%r1,%[ap1])\n\t"
"vl %%v22,16(%%r1,%[ap2])\n\t"
"vl %%v23,16(%%r1,%[ap3])\n\t"
"vl %%v24,32(%%r1,%[ap0])\n\t"
"vl %%v25,32(%%r1,%[ap1])\n\t"
"vl %%v26,32(%%r1,%[ap2])\n\t"
"vl %%v27,32(%%r1,%[ap3])\n\t"
"vl %%v28,48(%%r1,%[ap0])\n\t"
"vl %%v29,48(%%r1,%[ap1])\n\t"
"vl %%v30,48(%%r1,%[ap2])\n\t"
"vl %%v31,48(%%r1,%[ap3])\n\t"
"vl %%v4,0(%%r1,%[y])\n\t"
"vl %%v5,16(%%r1,%[y])\n\t"
"vl %%v6,32(%%r1,%[y])\n\t"
"vl %%v7,48(%%r1,%[y])\n\t"
"vfmasb %%v4,%%v16,%%v0,%%v4\n\t"
"vfmasb %%v5,%%v20,%%v0,%%v5\n\t"
"vfmasb %%v6,%%v24,%%v0,%%v6\n\t"
"vfmasb %%v7,%%v28,%%v0,%%v7\n\t"
"vfmasb %%v4,%%v17,%%v1,%%v4\n\t"
"vfmasb %%v5,%%v21,%%v1,%%v5\n\t"
"vfmasb %%v6,%%v25,%%v1,%%v6\n\t"
"vfmasb %%v7,%%v29,%%v1,%%v7\n\t"
"vfmasb %%v4,%%v18,%%v2,%%v4\n\t"
"vfmasb %%v5,%%v22,%%v2,%%v5\n\t"
"vfmasb %%v6,%%v26,%%v2,%%v6\n\t"
"vfmasb %%v7,%%v30,%%v2,%%v7\n\t"
"vfmasb %%v4,%%v19,%%v3,%%v4\n\t"
"vfmasb %%v5,%%v23,%%v3,%%v5\n\t"
"vfmasb %%v6,%%v27,%%v3,%%v6\n\t"
"vfmasb %%v7,%%v31,%%v3,%%v7\n\t"
"vst %%v4,0(%%r1,%[y])\n\t"
"vst %%v5,16(%%r1,%[y])\n\t"
"vst %%v6,32(%%r1,%[y])\n\t"
"vst %%v7,48(%%r1,%[y])\n\t"
"vl %%v16,64(%%r1,%[ap0])\n\t"
"vl %%v17,64(%%r1,%[ap1])\n\t"
"vl %%v18,64(%%r1,%[ap2])\n\t"
"vl %%v19,64(%%r1,%[ap3])\n\t"
"vl %%v20,80(%%r1,%[ap0])\n\t"
"vl %%v21,80(%%r1,%[ap1])\n\t"
"vl %%v22,80(%%r1,%[ap2])\n\t"
"vl %%v23,80(%%r1,%[ap3])\n\t"
"vl %%v24,96(%%r1,%[ap0])\n\t"
"vl %%v25,96(%%r1,%[ap1])\n\t"
"vl %%v26,96(%%r1,%[ap2])\n\t"
"vl %%v27,96(%%r1,%[ap3])\n\t"
"vl %%v28,112(%%r1,%[ap0])\n\t"
"vl %%v29,112(%%r1,%[ap1])\n\t"
"vl %%v30,112(%%r1,%[ap2])\n\t"
"vl %%v31,112(%%r1,%[ap3])\n\t"
"vl %%v4,64(%%r1,%[y])\n\t"
"vl %%v5,80(%%r1,%[y])\n\t"
"vl %%v6,96(%%r1,%[y])\n\t"
"vl %%v7,112(%%r1,%[y])\n\t"
"vfmasb %%v4,%%v16,%%v0,%%v4\n\t"
"vfmasb %%v5,%%v20,%%v0,%%v5\n\t"
"vfmasb %%v6,%%v24,%%v0,%%v6\n\t"
"vfmasb %%v7,%%v28,%%v0,%%v7\n\t"
"vfmasb %%v4,%%v17,%%v1,%%v4\n\t"
"vfmasb %%v5,%%v21,%%v1,%%v5\n\t"
"vfmasb %%v6,%%v25,%%v1,%%v6\n\t"
"vfmasb %%v7,%%v29,%%v1,%%v7\n\t"
"vfmasb %%v4,%%v18,%%v2,%%v4\n\t"
"vfmasb %%v5,%%v22,%%v2,%%v5\n\t"
"vfmasb %%v6,%%v26,%%v2,%%v6\n\t"
"vfmasb %%v7,%%v30,%%v2,%%v7\n\t"
"vfmasb %%v4,%%v19,%%v3,%%v4\n\t"
"vfmasb %%v5,%%v23,%%v3,%%v5\n\t"
"vfmasb %%v6,%%v27,%%v3,%%v6\n\t"
"vfmasb %%v7,%%v31,%%v3,%%v7\n\t"
"vst %%v4,64(%%r1,%[y])\n\t"
"vst %%v5,80(%%r1,%[y])\n\t"
"vst %%v6,96(%%r1,%[y])\n\t"
"vst %%v7,112(%%r1,%[y])\n\t"
"agfi %%r1,128\n\t"
"brctg %%r0,0b\n\t"
"1:\n\t"
"lghi %%r0,28\n\t"
"ngr %%r0,%[n]\n\t"
"ltgr %%r0,%%r0\n\t"
"jz 3f\n\t"
"srlg %%r0,%%r0,2\n\t"
"2:\n\t"
"vl %%v16,0(%%r1,%[ap0])\n\t"
"vl %%v17,0(%%r1,%[ap1])\n\t"
"vl %%v18,0(%%r1,%[ap2])\n\t"
"vl %%v19,0(%%r1,%[ap3])\n\t"
"vl %%v4,0(%%r1,%[y])\n\t"
"vfmasb %%v4,%%v16,%%v0,%%v4\n\t"
"vfmasb %%v4,%%v17,%%v1,%%v4\n\t"
"vfmasb %%v4,%%v18,%%v2,%%v4\n\t"
"vfmasb %%v4,%%v19,%%v3,%%v4\n\t"
"vst %%v4,0(%%r1,%[y])\n\t"
"agfi %%r1,16\n\t"
"brctg %%r0,2b\n\t"
"3:\n\t"
"nop"
: "+m"(*(FLOAT (*)[n]) y)
: [y] "a"(y), "m"(*(const FLOAT (*)[n]) ap[0]),[ap0] "a"(ap[0]),
"m"(*(const FLOAT (*)[n]) ap[1]),[ap1] "a"(ap[1]),
"m"(*(const FLOAT (*)[n]) ap[2]),[ap2] "a"(ap[2]),
"m"(*(const FLOAT (*)[n]) ap[3]),[ap3] "a"(ap[3]),
"m"(*(const FLOAT (*)[4]) x),[x] "a"(x),[alpha] "m"(*alpha),[n] "r"(n)
: "cc", "r0", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
"v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25",
"v26", "v27", "v28", "v29", "v30", "v31");
"vlrepf %%v1,4(%[x])\n\t"
"vlrepf %%v2,8(%[x])\n\t"
"vlrepf %%v3,12(%[x])\n\t"
"vlrepf %%v4,%[alpha]\n\t"
"vfmsb %%v0,%%v0,%%v4\n\t"
"vfmsb %%v1,%%v1,%%v4\n\t"
"vfmsb %%v2,%%v2,%%v4\n\t"
"vfmsb %%v3,%%v3,%%v4\n\t"
"xgr %%r1,%%r1\n\t"
"lghi %%r0,-32\n\t"
"ngr %%r0,%[n]\n\t"
"ltgr %%r0,%%r0\n\t"
"jz 1f\n\t"
"srlg %%r0,%%r0,5\n\t"
"0:\n\t"
"pfd 1,1024(%%r1,%[ap0])\n\t"
"pfd 1,1024(%%r1,%[ap1])\n\t"
"pfd 1,1024(%%r1,%[ap2])\n\t"
"pfd 1,1024(%%r1,%[ap3])\n\t"
"pfd 2,1024(%%r1,%[y])\n\t"
"vl %%v16,0(%%r1,%[ap0])\n\t"
"vl %%v17,0(%%r1,%[ap1])\n\t"
"vl %%v18,0(%%r1,%[ap2])\n\t"
"vl %%v19,0(%%r1,%[ap3])\n\t"
"vl %%v20,16(%%r1,%[ap0])\n\t"
"vl %%v21,16(%%r1,%[ap1])\n\t"
"vl %%v22,16(%%r1,%[ap2])\n\t"
"vl %%v23,16(%%r1,%[ap3])\n\t"
"vl %%v24,32(%%r1,%[ap0])\n\t"
"vl %%v25,32(%%r1,%[ap1])\n\t"
"vl %%v26,32(%%r1,%[ap2])\n\t"
"vl %%v27,32(%%r1,%[ap3])\n\t"
"vl %%v28,48(%%r1,%[ap0])\n\t"
"vl %%v29,48(%%r1,%[ap1])\n\t"
"vl %%v30,48(%%r1,%[ap2])\n\t"
"vl %%v31,48(%%r1,%[ap3])\n\t"
"vl %%v4,0(%%r1,%[y])\n\t"
"vl %%v5,16(%%r1,%[y])\n\t"
"vl %%v6,32(%%r1,%[y])\n\t"
"vl %%v7,48(%%r1,%[y])\n\t"
"vfmasb %%v4,%%v16,%%v0,%%v4\n\t"
"vfmasb %%v5,%%v20,%%v0,%%v5\n\t"
"vfmasb %%v6,%%v24,%%v0,%%v6\n\t"
"vfmasb %%v7,%%v28,%%v0,%%v7\n\t"
"vfmasb %%v4,%%v17,%%v1,%%v4\n\t"
"vfmasb %%v5,%%v21,%%v1,%%v5\n\t"
"vfmasb %%v6,%%v25,%%v1,%%v6\n\t"
"vfmasb %%v7,%%v29,%%v1,%%v7\n\t"
"vfmasb %%v4,%%v18,%%v2,%%v4\n\t"
"vfmasb %%v5,%%v22,%%v2,%%v5\n\t"
"vfmasb %%v6,%%v26,%%v2,%%v6\n\t"
"vfmasb %%v7,%%v30,%%v2,%%v7\n\t"
"vfmasb %%v4,%%v19,%%v3,%%v4\n\t"
"vfmasb %%v5,%%v23,%%v3,%%v5\n\t"
"vfmasb %%v6,%%v27,%%v3,%%v6\n\t"
"vfmasb %%v7,%%v31,%%v3,%%v7\n\t"
"vst %%v4,0(%%r1,%[y])\n\t"
"vst %%v5,16(%%r1,%[y])\n\t"
"vst %%v6,32(%%r1,%[y])\n\t"
"vst %%v7,48(%%r1,%[y])\n\t"
"vl %%v16,64(%%r1,%[ap0])\n\t"
"vl %%v17,64(%%r1,%[ap1])\n\t"
"vl %%v18,64(%%r1,%[ap2])\n\t"
"vl %%v19,64(%%r1,%[ap3])\n\t"
"vl %%v20,80(%%r1,%[ap0])\n\t"
"vl %%v21,80(%%r1,%[ap1])\n\t"
"vl %%v22,80(%%r1,%[ap2])\n\t"
"vl %%v23,80(%%r1,%[ap3])\n\t"
"vl %%v24,96(%%r1,%[ap0])\n\t"
"vl %%v25,96(%%r1,%[ap1])\n\t"
"vl %%v26,96(%%r1,%[ap2])\n\t"
"vl %%v27,96(%%r1,%[ap3])\n\t"
"vl %%v28,112(%%r1,%[ap0])\n\t"
"vl %%v29,112(%%r1,%[ap1])\n\t"
"vl %%v30,112(%%r1,%[ap2])\n\t"
"vl %%v31,112(%%r1,%[ap3])\n\t"
"vl %%v4,64(%%r1,%[y])\n\t"
"vl %%v5,80(%%r1,%[y])\n\t"
"vl %%v6,96(%%r1,%[y])\n\t"
"vl %%v7,112(%%r1,%[y])\n\t"
"vfmasb %%v4,%%v16,%%v0,%%v4\n\t"
"vfmasb %%v5,%%v20,%%v0,%%v5\n\t"
"vfmasb %%v6,%%v24,%%v0,%%v6\n\t"
"vfmasb %%v7,%%v28,%%v0,%%v7\n\t"
"vfmasb %%v4,%%v17,%%v1,%%v4\n\t"
"vfmasb %%v5,%%v21,%%v1,%%v5\n\t"
"vfmasb %%v6,%%v25,%%v1,%%v6\n\t"
"vfmasb %%v7,%%v29,%%v1,%%v7\n\t"
"vfmasb %%v4,%%v18,%%v2,%%v4\n\t"
"vfmasb %%v5,%%v22,%%v2,%%v5\n\t"
"vfmasb %%v6,%%v26,%%v2,%%v6\n\t"
"vfmasb %%v7,%%v30,%%v2,%%v7\n\t"
"vfmasb %%v4,%%v19,%%v3,%%v4\n\t"
"vfmasb %%v5,%%v23,%%v3,%%v5\n\t"
"vfmasb %%v6,%%v27,%%v3,%%v6\n\t"
"vfmasb %%v7,%%v31,%%v3,%%v7\n\t"
"vst %%v4,64(%%r1,%[y])\n\t"
"vst %%v5,80(%%r1,%[y])\n\t"
"vst %%v6,96(%%r1,%[y])\n\t"
"vst %%v7,112(%%r1,%[y])\n\t"
"agfi %%r1,128\n\t"
"brctg %%r0,0b\n\t"
"1:\n\t"
"lghi %%r0,28\n\t"
"ngr %%r0,%[n]\n\t"
"ltgr %%r0,%%r0\n\t"
"jz 3f\n\t"
"srlg %%r0,%%r0,2\n\t"
"2:\n\t"
"vl %%v16,0(%%r1,%[ap0])\n\t"
"vl %%v17,0(%%r1,%[ap1])\n\t"
"vl %%v18,0(%%r1,%[ap2])\n\t"
"vl %%v19,0(%%r1,%[ap3])\n\t"
"vl %%v4,0(%%r1,%[y])\n\t"
"vfmasb %%v4,%%v16,%%v0,%%v4\n\t"
"vfmasb %%v4,%%v17,%%v1,%%v4\n\t"
"vfmasb %%v4,%%v18,%%v2,%%v4\n\t"
"vfmasb %%v4,%%v19,%%v3,%%v4\n\t"
"vst %%v4,0(%%r1,%[y])\n\t"
"agfi %%r1,16\n\t"
"brctg %%r0,2b\n\t"
"3:\n\t"
"nop"
: "+m"(*(struct { FLOAT x[n]; } *) y)
: [y] "a"(y), "m"(*(const struct { FLOAT x[n]; } *) ap0),[ap0] "a"(ap0),
"m"(*(const struct { FLOAT x[n]; } *) ap1),[ap1] "a"(ap1),
"m"(*(const struct { FLOAT x[n]; } *) ap2),[ap2] "a"(ap2),
"m"(*(const struct { FLOAT x[n]; } *) ap3),[ap3] "a"(ap3),
"m"(*(const struct { FLOAT x[4]; } *) x),[x] "a"(x),[alpha] "Q"(*alpha),
[n] "r"(n)
: "cc", "r0", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
"v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25",
"v26", "v27", "v28", "v29", "v30", "v31");
}
/*
 * sgemv_kernel_4x2: single-precision in-place update of y from two vectors.
 *
 * For every processed index i the asm performs, as two chained fused
 * multiply-adds (vfmasb):
 *     y[i] = ap[0][i] * (x[0] * alpha) + y[i]
 *     y[i] = ap[1][i] * (x[1] * alpha) + y[i]
 *
 * Parameters:
 *   n     - element count. It is masked with -32 (main loop, 32 floats per
 *           iteration) and with 28 (tail loop, 4 floats per iteration), so
 *           the two low bits of n are ignored; presumably callers pass a
 *           multiple of 4 - TODO confirm against the caller.
 *   ap    - ap[0] and ap[1] are the two input vectors that are read.
 *   x     - two scalars, x[0] and x[1], each replicated across a vector.
 *   y     - read and written in place ("+m" operand).
 *   alpha - pointer to the scale factor applied to x[0] and x[1] up front.
 *
 * Clobbers r0 (loop counter), r1 (byte offset) and the vector registers
 * named in the clobber list.
 */
static void sgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y,
FLOAT *alpha) {
register FLOAT *ap0 = ap[0];
register FLOAT *ap1 = ap[1];
__asm__("vlrepf %%v0,0(%[x])\n\t"
"vlrepf %%v1,4(%[x])\n\t"
"vlrepf %%v2,%[alpha]\n\t"
"vfmsb %%v0,%%v0,%%v2\n\t"
"vfmsb %%v1,%%v1,%%v2\n\t"
/* r1 = 0 is the running byte offset; r0 = (n & -32) >> 5 main-loop trips */
"xgr %%r1,%%r1\n\t"
"lghi %%r0,-32\n\t"
"ngr %%r0,%[n]\n\t"
"ltgr %%r0,%%r0\n\t"
"jz 1f\n\t"
"srlg %%r0,%%r0,5\n\t"
/* main loop: 8 vectors (128 bytes) of ap0, ap1 and y per iteration */
"0:\n\t"
"pfd 1,1024(%%r1,%[ap0])\n\t"
"pfd 1,1024(%%r1,%[ap1])\n\t"
"pfd 2,1024(%%r1,%[y])\n\t"
"vl %%v16,0(%%r1,%[ap0])\n\t"
"vl %%v17,0(%%r1,%[ap1])\n\t"
"vl %%v18,16(%%r1,%[ap0])\n\t"
"vl %%v19,16(%%r1,%[ap1])\n\t"
"vl %%v20,32(%%r1,%[ap0])\n\t"
"vl %%v21,32(%%r1,%[ap1])\n\t"
"vl %%v22,48(%%r1,%[ap0])\n\t"
"vl %%v23,48(%%r1,%[ap1])\n\t"
"vl %%v24,64(%%r1,%[ap0])\n\t"
"vl %%v25,64(%%r1,%[ap1])\n\t"
"vl %%v26,80(%%r1,%[ap0])\n\t"
"vl %%v27,80(%%r1,%[ap1])\n\t"
"vl %%v28,96(%%r1,%[ap0])\n\t"
"vl %%v29,96(%%r1,%[ap1])\n\t"
"vl %%v30,112(%%r1,%[ap0])\n\t"
"vl %%v31,112(%%r1,%[ap1])\n\t"
"vl %%v2,0(%%r1,%[y])\n\t"
"vl %%v3,16(%%r1,%[y])\n\t"
"vl %%v4,32(%%r1,%[y])\n\t"
"vl %%v5,48(%%r1,%[y])\n\t"
"vl %%v6,64(%%r1,%[y])\n\t"
"vl %%v7,80(%%r1,%[y])\n\t"
"vl %%v8,96(%%r1,%[y])\n\t"
"vl %%v9,112(%%r1,%[y])\n\t"
"vfmasb %%v2,%%v16,%%v0,%%v2\n\t"
"vfmasb %%v3,%%v18,%%v0,%%v3\n\t"
"vfmasb %%v4,%%v20,%%v0,%%v4\n\t"
"vfmasb %%v5,%%v22,%%v0,%%v5\n\t"
"vfmasb %%v6,%%v24,%%v0,%%v6\n\t"
"vfmasb %%v7,%%v26,%%v0,%%v7\n\t"
"vfmasb %%v8,%%v28,%%v0,%%v8\n\t"
"vfmasb %%v9,%%v30,%%v0,%%v9\n\t"
"vfmasb %%v2,%%v17,%%v1,%%v2\n\t"
"vfmasb %%v3,%%v19,%%v1,%%v3\n\t"
"vfmasb %%v4,%%v21,%%v1,%%v4\n\t"
"vfmasb %%v5,%%v23,%%v1,%%v5\n\t"
"vfmasb %%v6,%%v25,%%v1,%%v6\n\t"
"vfmasb %%v7,%%v27,%%v1,%%v7\n\t"
"vfmasb %%v8,%%v29,%%v1,%%v8\n\t"
"vfmasb %%v9,%%v31,%%v1,%%v9\n\t"
"vst %%v2,0(%%r1,%[y])\n\t"
"vst %%v3,16(%%r1,%[y])\n\t"
"vst %%v4,32(%%r1,%[y])\n\t"
"vst %%v5,48(%%r1,%[y])\n\t"
"vst %%v6,64(%%r1,%[y])\n\t"
"vst %%v7,80(%%r1,%[y])\n\t"
"vst %%v8,96(%%r1,%[y])\n\t"
"vst %%v9,112(%%r1,%[y])\n\t"
"agfi %%r1,128\n\t"
"brctg %%r0,0b\n\t"
/* tail: r0 = (n & 28) >> 2 trips, one vector (4 floats) per trip */
"1:\n\t"
"lghi %%r0,28\n\t"
"ngr %%r0,%[n]\n\t"
"ltgr %%r0,%%r0\n\t"
"jz 3f\n\t"
"srlg %%r0,%%r0,2\n\t"
"2:\n\t"
"vl %%v16,0(%%r1,%[ap0])\n\t"
"vl %%v17,0(%%r1,%[ap1])\n\t"
"vl %%v2,0(%%r1,%[y])\n\t"
"vfmasb %%v2,%%v16,%%v0,%%v2\n\t"
"vfmasb %%v2,%%v17,%%v1,%%v2\n\t"
"vst %%v2,0(%%r1,%[y])\n\t"
"agfi %%r1,16\n\t"
"brctg %%r0,2b\n\t"
"3:\n\t"
"nop"
: "+m"(*(FLOAT (*)[n]) y)
: [y] "a"(y), "m"(*(const FLOAT (*)[n]) ap[0]),[ap0] "a"(ap[0]),
"m"(*(const FLOAT (*)[n]) ap[1]),[ap1] "a"(ap[1]),
"m"(*(const FLOAT (*)[2]) x),[x] "a"(x),[alpha] "m"(*alpha),[n] "r"(n)
: "cc", "r0", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
"v8", "v9", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23",
"v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31");
/*
 * NOTE(review): from here the same instruction sequence appears a second
 * time, ending in a reworked operand list (struct-typed "m" operands, the
 * ap0/ap1 locals, and "Q" instead of "m" for alpha). This looks like the
 * pre- and post-change sides of a diff rendered back to back - confirm
 * against the real source file before relying on this text.
 */
"vlrepf %%v1,4(%[x])\n\t"
"vlrepf %%v2,%[alpha]\n\t"
"vfmsb %%v0,%%v0,%%v2\n\t"
"vfmsb %%v1,%%v1,%%v2\n\t"
"xgr %%r1,%%r1\n\t"
"lghi %%r0,-32\n\t"
"ngr %%r0,%[n]\n\t"
"ltgr %%r0,%%r0\n\t"
"jz 1f\n\t"
"srlg %%r0,%%r0,5\n\t"
"0:\n\t"
"pfd 1,1024(%%r1,%[ap0])\n\t"
"pfd 1,1024(%%r1,%[ap1])\n\t"
"pfd 2,1024(%%r1,%[y])\n\t"
"vl %%v16,0(%%r1,%[ap0])\n\t"
"vl %%v17,0(%%r1,%[ap1])\n\t"
"vl %%v18,16(%%r1,%[ap0])\n\t"
"vl %%v19,16(%%r1,%[ap1])\n\t"
"vl %%v20,32(%%r1,%[ap0])\n\t"
"vl %%v21,32(%%r1,%[ap1])\n\t"
"vl %%v22,48(%%r1,%[ap0])\n\t"
"vl %%v23,48(%%r1,%[ap1])\n\t"
"vl %%v24,64(%%r1,%[ap0])\n\t"
"vl %%v25,64(%%r1,%[ap1])\n\t"
"vl %%v26,80(%%r1,%[ap0])\n\t"
"vl %%v27,80(%%r1,%[ap1])\n\t"
"vl %%v28,96(%%r1,%[ap0])\n\t"
"vl %%v29,96(%%r1,%[ap1])\n\t"
"vl %%v30,112(%%r1,%[ap0])\n\t"
"vl %%v31,112(%%r1,%[ap1])\n\t"
"vl %%v2,0(%%r1,%[y])\n\t"
"vl %%v3,16(%%r1,%[y])\n\t"
"vl %%v4,32(%%r1,%[y])\n\t"
"vl %%v5,48(%%r1,%[y])\n\t"
"vl %%v6,64(%%r1,%[y])\n\t"
"vl %%v7,80(%%r1,%[y])\n\t"
"vl %%v8,96(%%r1,%[y])\n\t"
"vl %%v9,112(%%r1,%[y])\n\t"
"vfmasb %%v2,%%v16,%%v0,%%v2\n\t"
"vfmasb %%v3,%%v18,%%v0,%%v3\n\t"
"vfmasb %%v4,%%v20,%%v0,%%v4\n\t"
"vfmasb %%v5,%%v22,%%v0,%%v5\n\t"
"vfmasb %%v6,%%v24,%%v0,%%v6\n\t"
"vfmasb %%v7,%%v26,%%v0,%%v7\n\t"
"vfmasb %%v8,%%v28,%%v0,%%v8\n\t"
"vfmasb %%v9,%%v30,%%v0,%%v9\n\t"
"vfmasb %%v2,%%v17,%%v1,%%v2\n\t"
"vfmasb %%v3,%%v19,%%v1,%%v3\n\t"
"vfmasb %%v4,%%v21,%%v1,%%v4\n\t"
"vfmasb %%v5,%%v23,%%v1,%%v5\n\t"
"vfmasb %%v6,%%v25,%%v1,%%v6\n\t"
"vfmasb %%v7,%%v27,%%v1,%%v7\n\t"
"vfmasb %%v8,%%v29,%%v1,%%v8\n\t"
"vfmasb %%v9,%%v31,%%v1,%%v9\n\t"
"vst %%v2,0(%%r1,%[y])\n\t"
"vst %%v3,16(%%r1,%[y])\n\t"
"vst %%v4,32(%%r1,%[y])\n\t"
"vst %%v5,48(%%r1,%[y])\n\t"
"vst %%v6,64(%%r1,%[y])\n\t"
"vst %%v7,80(%%r1,%[y])\n\t"
"vst %%v8,96(%%r1,%[y])\n\t"
"vst %%v9,112(%%r1,%[y])\n\t"
"agfi %%r1,128\n\t"
"brctg %%r0,0b\n\t"
"1:\n\t"
"lghi %%r0,28\n\t"
"ngr %%r0,%[n]\n\t"
"ltgr %%r0,%%r0\n\t"
"jz 3f\n\t"
"srlg %%r0,%%r0,2\n\t"
"2:\n\t"
"vl %%v16,0(%%r1,%[ap0])\n\t"
"vl %%v17,0(%%r1,%[ap1])\n\t"
"vl %%v2,0(%%r1,%[y])\n\t"
"vfmasb %%v2,%%v16,%%v0,%%v2\n\t"
"vfmasb %%v2,%%v17,%%v1,%%v2\n\t"
"vst %%v2,0(%%r1,%[y])\n\t"
"agfi %%r1,16\n\t"
"brctg %%r0,2b\n\t"
"3:\n\t"
"nop"
: "+m"(*(struct { FLOAT x[n]; } *) y)
: [y] "a"(y), "m"(*(const struct { FLOAT x[n]; } *) ap0),[ap0] "a"(ap0),
"m"(*(const struct { FLOAT x[n]; } *) ap1),[ap1] "a"(ap1),
"m"(*(const struct { FLOAT x[2]; } *) x),[x] "a"(x),[alpha] "Q"(*alpha),
[n] "r"(n)
: "cc", "r0", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
"v8", "v9", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23",
"v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31");
}
/*
 * sgemv_kernel_4x1: single-precision in-place update of y from one vector.
 *
 * For every processed index i the asm performs one fused multiply-add
 * (vfmasb):
 *     y[i] = a0[i] * (x[0] * alpha) + y[i]
 *
 * Parameters:
 *   n     - element count. It is masked with -32 (main loop, 32 floats per
 *           iteration) and with 28 (tail loop, 4 floats per iteration), so
 *           the two low bits of n are ignored; presumably callers pass a
 *           multiple of 4 - TODO confirm against the caller.
 *   a0    - input vector that is read.
 *   x     - only x[0] is read; it is replicated across a vector register.
 *   y     - read and written in place ("+m" operand).
 *   alpha - pointer to the scale factor multiplied into x[0] up front.
 *
 * v16 first holds the replicated alpha and is reused for a0 data once the
 * scaled x[0] is in v0.
 */
static void sgemv_kernel_4x1(BLASLONG n, FLOAT *a0, FLOAT *x, FLOAT *y,
FLOAT *alpha) {
__asm__("vlrepf %%v0,0(%[x])\n\t"
"vlrepf %%v16,%[alpha]\n\t"
"vfmsb %%v0,%%v0,%%v16\n\t"
/* r1 = 0 is the running byte offset; r0 = (n & -32) >> 5 main-loop trips */
"xgr %%r1,%%r1\n\t"
"lghi %%r0,-32\n\t"
"ngr %%r0,%[n]\n\t"
"ltgr %%r0,%%r0\n\t"
"jz 1f\n\t"
"srlg %%r0,%%r0,5\n\t"
/* main loop: 8 vectors (128 bytes) of a0 and y per iteration */
"0:\n\t"
"pfd 1,1024(%%r1,%[a0])\n\t"
"pfd 2,1024(%%r1,%[y])\n\t"
"vl %%v16,0(%%r1,%[a0])\n\t"
"vl %%v17,16(%%r1,%[a0])\n\t"
"vl %%v18,32(%%r1,%[a0])\n\t"
"vl %%v19,48(%%r1,%[a0])\n\t"
"vl %%v20,64(%%r1,%[a0])\n\t"
"vl %%v21,80(%%r1,%[a0])\n\t"
"vl %%v22,96(%%r1,%[a0])\n\t"
"vl %%v23,112(%%r1,%[a0])\n\t"
"vl %%v24,0(%%r1,%[y])\n\t"
"vl %%v25,16(%%r1,%[y])\n\t"
"vl %%v26,32(%%r1,%[y])\n\t"
"vl %%v27,48(%%r1,%[y])\n\t"
"vl %%v28,64(%%r1,%[y])\n\t"
"vl %%v29,80(%%r1,%[y])\n\t"
"vl %%v30,96(%%r1,%[y])\n\t"
"vl %%v31,112(%%r1,%[y])\n\t"
"vfmasb %%v24,%%v16,%%v0,%%v24\n\t"
"vfmasb %%v25,%%v17,%%v0,%%v25\n\t"
"vfmasb %%v26,%%v18,%%v0,%%v26\n\t"
"vfmasb %%v27,%%v19,%%v0,%%v27\n\t"
"vfmasb %%v28,%%v20,%%v0,%%v28\n\t"
"vfmasb %%v29,%%v21,%%v0,%%v29\n\t"
"vfmasb %%v30,%%v22,%%v0,%%v30\n\t"
"vfmasb %%v31,%%v23,%%v0,%%v31\n\t"
"vst %%v24,0(%%r1,%[y])\n\t"
"vst %%v25,16(%%r1,%[y])\n\t"
"vst %%v26,32(%%r1,%[y])\n\t"
"vst %%v27,48(%%r1,%[y])\n\t"
"vst %%v28,64(%%r1,%[y])\n\t"
"vst %%v29,80(%%r1,%[y])\n\t"
"vst %%v30,96(%%r1,%[y])\n\t"
"vst %%v31,112(%%r1,%[y])\n\t"
"agfi %%r1,128\n\t"
"brctg %%r0,0b\n\t"
/* tail: r0 = (n & 28) >> 2 trips, one vector (4 floats) per trip */
"1:\n\t"
"lghi %%r0,28\n\t"
"ngr %%r0,%[n]\n\t"
"ltgr %%r0,%%r0\n\t"
"jz 3f\n\t"
"srlg %%r0,%%r0,2\n\t"
"2:\n\t"
"vl %%v16,0(%%r1,%[a0])\n\t"
"vl %%v17,0(%%r1,%[y])\n\t"
"vfmasb %%v17,%%v16,%%v0,%%v17\n\t"
"vst %%v17,0(%%r1,%[y])\n\t"
"agfi %%r1,16\n\t"
"brctg %%r0,2b\n\t"
"3:\n\t"
"nop"
: "+m"(*(FLOAT (*)[n]) y)
: [y] "a"(y), "m"(*(const FLOAT (*)[n]) a0),[a0] "a"(a0),
"m"(*(const FLOAT (*)[1]) x),[x] "a"(x),[alpha] "m"(*alpha),
[n] "r"(n)
: "cc", "r0", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21",
"v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30",
"v31");
/*
 * NOTE(review): from here the same instruction sequence appears a second
 * time, ending in a reworked operand list (struct-typed "m" operands for y
 * and a0, and "Q" instead of "m" for alpha). This looks like the pre- and
 * post-change sides of a diff rendered back to back - confirm against the
 * real source file before relying on this text.
 */
"vlrepf %%v16,%[alpha]\n\t"
"vfmsb %%v0,%%v0,%%v16\n\t"
"xgr %%r1,%%r1\n\t"
"lghi %%r0,-32\n\t"
"ngr %%r0,%[n]\n\t"
"ltgr %%r0,%%r0\n\t"
"jz 1f\n\t"
"srlg %%r0,%%r0,5\n\t"
"0:\n\t"
"pfd 1,1024(%%r1,%[a0])\n\t"
"pfd 2,1024(%%r1,%[y])\n\t"
"vl %%v16,0(%%r1,%[a0])\n\t"
"vl %%v17,16(%%r1,%[a0])\n\t"
"vl %%v18,32(%%r1,%[a0])\n\t"
"vl %%v19,48(%%r1,%[a0])\n\t"
"vl %%v20,64(%%r1,%[a0])\n\t"
"vl %%v21,80(%%r1,%[a0])\n\t"
"vl %%v22,96(%%r1,%[a0])\n\t"
"vl %%v23,112(%%r1,%[a0])\n\t"
"vl %%v24,0(%%r1,%[y])\n\t"
"vl %%v25,16(%%r1,%[y])\n\t"
"vl %%v26,32(%%r1,%[y])\n\t"
"vl %%v27,48(%%r1,%[y])\n\t"
"vl %%v28,64(%%r1,%[y])\n\t"
"vl %%v29,80(%%r1,%[y])\n\t"
"vl %%v30,96(%%r1,%[y])\n\t"
"vl %%v31,112(%%r1,%[y])\n\t"
"vfmasb %%v24,%%v16,%%v0,%%v24\n\t"
"vfmasb %%v25,%%v17,%%v0,%%v25\n\t"
"vfmasb %%v26,%%v18,%%v0,%%v26\n\t"
"vfmasb %%v27,%%v19,%%v0,%%v27\n\t"
"vfmasb %%v28,%%v20,%%v0,%%v28\n\t"
"vfmasb %%v29,%%v21,%%v0,%%v29\n\t"
"vfmasb %%v30,%%v22,%%v0,%%v30\n\t"
"vfmasb %%v31,%%v23,%%v0,%%v31\n\t"
"vst %%v24,0(%%r1,%[y])\n\t"
"vst %%v25,16(%%r1,%[y])\n\t"
"vst %%v26,32(%%r1,%[y])\n\t"
"vst %%v27,48(%%r1,%[y])\n\t"
"vst %%v28,64(%%r1,%[y])\n\t"
"vst %%v29,80(%%r1,%[y])\n\t"
"vst %%v30,96(%%r1,%[y])\n\t"
"vst %%v31,112(%%r1,%[y])\n\t"
"agfi %%r1,128\n\t"
"brctg %%r0,0b\n\t"
"1:\n\t"
"lghi %%r0,28\n\t"
"ngr %%r0,%[n]\n\t"
"ltgr %%r0,%%r0\n\t"
"jz 3f\n\t"
"srlg %%r0,%%r0,2\n\t"
"2:\n\t"
"vl %%v16,0(%%r1,%[a0])\n\t"
"vl %%v17,0(%%r1,%[y])\n\t"
"vfmasb %%v17,%%v16,%%v0,%%v17\n\t"
"vst %%v17,0(%%r1,%[y])\n\t"
"agfi %%r1,16\n\t"
"brctg %%r0,2b\n\t"
"3:\n\t"
"nop"
: "+m"(*(struct { FLOAT x[n]; } *) y)
: [y] "a"(y), "m"(*(const struct { FLOAT x[n]; } *) a0),[a0] "a"(a0),
"m"(*(const FLOAT (*)[1]) x),[x] "a"(x),[alpha] "Q"(*alpha),
[n] "r"(n)
: "cc", "r0", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21",
"v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30",
"v31");
}
static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest) {

View File

@ -30,330 +30,338 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define NBMAX 2048
/*
 * sgemv_kernel_4x4: four single-precision dot products against one vector.
 *
 * Computes, over the processed indices i,
 *     y[j] = sum_i ap[j][i] * x[i]      for j = 0..3
 * and stores the four sums to y[0..3]. y is a write-only operand ("=m"),
 * so its previous contents are not read.
 *
 * Parameters:
 *   n  - element count. It is masked with -32 (main loop, 32 floats per
 *        iteration) and with 28 (tail loop, 4 floats per iteration), so the
 *        two low bits of n are ignored; presumably callers pass a multiple
 *        of 4 - TODO confirm against the caller.
 *   ap - ap[0]..ap[3] are the four input vectors that are read.
 *   x  - input vector of the same processed length.
 *   y  - receives four floats.
 *
 * Register use: v0-v3 accumulate ap0..ap3 products for x vectors 0,2,4,6 of
 * each 128-byte chunk (and for the tail loop), v4-v7 accumulate the same
 * rows for x vectors 1,3,5,7. The two sets are summed after the loops.
 */
static void sgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) {
register FLOAT *ap0 = ap[0];
register FLOAT *ap1 = ap[1];
register FLOAT *ap2 = ap[2];
register FLOAT *ap3 = ap[3];
__asm__("vzero %%v0\n\t"
"vzero %%v1\n\t"
"vzero %%v2\n\t"
"vzero %%v3\n\t"
"vzero %%v4\n\t"
"vzero %%v5\n\t"
"vzero %%v6\n\t"
"vzero %%v7\n\t"
/* r1 = 0 is the running byte offset; r0 = (n & -32) >> 5 main-loop trips */
"xgr %%r1,%%r1\n\t"
"lghi %%r0,-32\n\t"
"ngr %%r0,%[n]\n\t"
"ltgr %%r0,%%r0\n\t"
"jz 1f\n\t"
"srlg %%r0,%%r0,5\n\t"
/* main loop: 8 vectors (128 bytes) of x against each of ap0..ap3 */
"0:\n\t"
"pfd 1,1024(%%r1,%[ap0])\n\t"
"pfd 1,1024(%%r1,%[ap1])\n\t"
"pfd 1,1024(%%r1,%[ap2])\n\t"
"pfd 1,1024(%%r1,%[ap3])\n\t"
"pfd 1,1024(%%r1,%[x])\n\t"
"vl %%v16,0(%%r1,%[x])\n\t"
"vl %%v17,16(%%r1,%[x])\n\t"
"vl %%v18,32(%%r1,%[x])\n\t"
"vl %%v19,48(%%r1,%[x])\n\t"
"vl %%v20,64(%%r1,%[x])\n\t"
"vl %%v21,80(%%r1,%[x])\n\t"
"vl %%v22,96(%%r1,%[x])\n\t"
"vl %%v23,112(%%r1,%[x])\n\t"
"vl %%v24,0(%%r1,%[ap0])\n\t"
"vfmasb %%v0,%%v16,%%v24,%%v0\n\t"
"vl %%v25,0(%%r1,%[ap1])\n\t"
"vfmasb %%v1,%%v16,%%v25,%%v1\n\t"
"vl %%v26,0(%%r1,%[ap2])\n\t"
"vfmasb %%v2,%%v16,%%v26,%%v2\n\t"
"vl %%v27,0(%%r1,%[ap3])\n\t"
"vfmasb %%v3,%%v16,%%v27,%%v3\n\t"
"vl %%v28,16(%%r1,%[ap0])\n\t"
"vfmasb %%v4,%%v17,%%v28,%%v4\n\t"
"vl %%v29,16(%%r1,%[ap1])\n\t"
"vfmasb %%v5,%%v17,%%v29,%%v5\n\t"
"vl %%v30,16(%%r1,%[ap2])\n\t"
"vfmasb %%v6,%%v17,%%v30,%%v6\n\t"
"vl %%v31,16(%%r1,%[ap3])\n\t"
"vfmasb %%v7,%%v17,%%v31,%%v7\n\t"
"vl %%v24,32(%%r1,%[ap0])\n\t"
"vfmasb %%v0,%%v18,%%v24,%%v0\n\t"
"vl %%v25,32(%%r1,%[ap1])\n\t"
"vfmasb %%v1,%%v18,%%v25,%%v1\n\t"
"vl %%v26,32(%%r1,%[ap2])\n\t"
"vfmasb %%v2,%%v18,%%v26,%%v2\n\t"
"vl %%v27,32(%%r1,%[ap3])\n\t"
"vfmasb %%v3,%%v18,%%v27,%%v3\n\t"
"vl %%v28,48(%%r1,%[ap0])\n\t"
"vfmasb %%v4,%%v19,%%v28,%%v4\n\t"
"vl %%v29,48(%%r1,%[ap1])\n\t"
"vfmasb %%v5,%%v19,%%v29,%%v5\n\t"
"vl %%v30,48(%%r1,%[ap2])\n\t"
"vfmasb %%v6,%%v19,%%v30,%%v6\n\t"
"vl %%v31,48(%%r1,%[ap3])\n\t"
"vfmasb %%v7,%%v19,%%v31,%%v7\n\t"
"vl %%v24,64(%%r1,%[ap0])\n\t"
"vfmasb %%v0,%%v20,%%v24,%%v0\n\t"
"vl %%v25,64(%%r1,%[ap1])\n\t"
"vfmasb %%v1,%%v20,%%v25,%%v1\n\t"
"vl %%v26,64(%%r1,%[ap2])\n\t"
"vfmasb %%v2,%%v20,%%v26,%%v2\n\t"
"vl %%v27,64(%%r1,%[ap3])\n\t"
"vfmasb %%v3,%%v20,%%v27,%%v3\n\t"
"vl %%v28,80(%%r1,%[ap0])\n\t"
"vfmasb %%v4,%%v21,%%v28,%%v4\n\t"
"vl %%v29,80(%%r1,%[ap1])\n\t"
"vfmasb %%v5,%%v21,%%v29,%%v5\n\t"
"vl %%v30,80(%%r1,%[ap2])\n\t"
"vfmasb %%v6,%%v21,%%v30,%%v6\n\t"
"vl %%v31,80(%%r1,%[ap3])\n\t"
"vfmasb %%v7,%%v21,%%v31,%%v7\n\t"
"vl %%v24,96(%%r1,%[ap0])\n\t"
"vfmasb %%v0,%%v22,%%v24,%%v0\n\t"
"vl %%v25,96(%%r1,%[ap1])\n\t"
"vfmasb %%v1,%%v22,%%v25,%%v1\n\t"
"vl %%v26,96(%%r1,%[ap2])\n\t"
"vfmasb %%v2,%%v22,%%v26,%%v2\n\t"
"vl %%v27,96(%%r1,%[ap3])\n\t"
"vfmasb %%v3,%%v22,%%v27,%%v3\n\t"
"vl %%v28,112(%%r1,%[ap0])\n\t"
"vfmasb %%v4,%%v23,%%v28,%%v4\n\t"
"vl %%v29,112(%%r1,%[ap1])\n\t"
"vfmasb %%v5,%%v23,%%v29,%%v5\n\t"
"vl %%v30,112(%%r1,%[ap2])\n\t"
"vfmasb %%v6,%%v23,%%v30,%%v6\n\t"
"vl %%v31,112(%%r1,%[ap3])\n\t"
"vfmasb %%v7,%%v23,%%v31,%%v7\n\t"
"agfi %%r1,128\n\t"
"brctg %%r0,0b\n\t"
/* tail: r0 = (n & 28) >> 2 trips, one vector (4 floats) per trip */
"1:\n\t"
"lghi %%r0,28\n\t"
"ngr %%r0,%[n]\n\t"
"ltgr %%r0,%%r0\n\t"
"jz 3f\n\t"
"srlg %%r0,%%r0,2\n\t"
"2:\n\t"
"vl %%v16,0(%%r1,%[x])\n\t"
"vl %%v24,0(%%r1,%[ap0])\n\t"
"vfmasb %%v0,%%v16,%%v24,%%v0\n\t"
"vl %%v25,0(%%r1,%[ap1])\n\t"
"vfmasb %%v1,%%v16,%%v25,%%v1\n\t"
"vl %%v26,0(%%r1,%[ap2])\n\t"
"vfmasb %%v2,%%v16,%%v26,%%v2\n\t"
"vl %%v27,0(%%r1,%[ap3])\n\t"
"vfmasb %%v3,%%v16,%%v27,%%v3\n\t"
"agfi %%r1,16\n\t"
"brctg %%r0,2b\n\t"
/*
 * Reduction: merge the two accumulator sets, then for each of v0..v3 fold
 * the four 32-bit lanes into one scalar (shift-and-add within each
 * doubleword, then add the two doublewords via the f-register view) and
 * store it to y[0..3].
 */
"3:\n\t"
"vfasb %%v0,%%v0,%%v4\n\t"
"vfasb %%v1,%%v1,%%v5\n\t"
"vfasb %%v2,%%v2,%%v6\n\t"
"vfasb %%v3,%%v3,%%v7\n\t"
"veslg %%v4,%%v0,32\n\t"
"vfasb %%v0,%%v0,%%v4\n\t"
"vrepg %%v4,%%v0,1\n\t"
"aebr %%f0,%%f4\n\t"
"ste %%f0,0(%[y])\n\t"
"veslg %%v4,%%v1,32\n\t"
"vfasb %%v1,%%v1,%%v4\n\t"
"vrepg %%v4,%%v1,1\n\t"
"aebr %%f1,%%f4\n\t"
"ste %%f1,4(%[y])\n\t"
"veslg %%v4,%%v2,32\n\t"
"vfasb %%v2,%%v2,%%v4\n\t"
"vrepg %%v4,%%v2,1\n\t"
"aebr %%f2,%%f4\n\t"
"ste %%f2,8(%[y])\n\t"
"veslg %%v4,%%v3,32\n\t"
"vfasb %%v3,%%v3,%%v4\n\t"
"vrepg %%v4,%%v3,1\n\t"
"aebr %%f3,%%f4\n\t"
"ste %%f3,12(%[y])"
: "=m"(*(FLOAT (*)[4]) y)
: [y] "a"(y), "m"(*(const FLOAT (*)[n]) ap[0]),[ap0] "a"(ap[0]),
"m"(*(const FLOAT (*)[n]) ap[1]),[ap1] "a"(ap[1]),
"m"(*(const FLOAT (*)[n]) ap[2]),[ap2] "a"(ap[2]),
"m"(*(const FLOAT (*)[n]) ap[3]),[ap3] "a"(ap[3]),
"m"(*(const FLOAT (*)[n]) x),[x] "a"(x),[n] "r"(n)
: "cc", "r0", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
"v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25",
"v26", "v27", "v28", "v29", "v30", "v31");
/*
 * NOTE(review): from here the same instruction sequence appears a second
 * time, ending in a reworked operand list (struct-typed "m" operands and
 * the ap0..ap3 locals). This looks like the pre- and post-change sides of a
 * diff rendered back to back - confirm against the real source file before
 * relying on this text.
 */
"vzero %%v1\n\t"
"vzero %%v2\n\t"
"vzero %%v3\n\t"
"vzero %%v4\n\t"
"vzero %%v5\n\t"
"vzero %%v6\n\t"
"vzero %%v7\n\t"
"xgr %%r1,%%r1\n\t"
"lghi %%r0,-32\n\t"
"ngr %%r0,%[n]\n\t"
"ltgr %%r0,%%r0\n\t"
"jz 1f\n\t"
"srlg %%r0,%%r0,5\n\t"
"0:\n\t"
"pfd 1,1024(%%r1,%[ap0])\n\t"
"pfd 1,1024(%%r1,%[ap1])\n\t"
"pfd 1,1024(%%r1,%[ap2])\n\t"
"pfd 1,1024(%%r1,%[ap3])\n\t"
"pfd 1,1024(%%r1,%[x])\n\t"
"vl %%v16,0(%%r1,%[x])\n\t"
"vl %%v17,16(%%r1,%[x])\n\t"
"vl %%v18,32(%%r1,%[x])\n\t"
"vl %%v19,48(%%r1,%[x])\n\t"
"vl %%v20,64(%%r1,%[x])\n\t"
"vl %%v21,80(%%r1,%[x])\n\t"
"vl %%v22,96(%%r1,%[x])\n\t"
"vl %%v23,112(%%r1,%[x])\n\t"
"vl %%v24,0(%%r1,%[ap0])\n\t"
"vfmasb %%v0,%%v16,%%v24,%%v0\n\t"
"vl %%v25,0(%%r1,%[ap1])\n\t"
"vfmasb %%v1,%%v16,%%v25,%%v1\n\t"
"vl %%v26,0(%%r1,%[ap2])\n\t"
"vfmasb %%v2,%%v16,%%v26,%%v2\n\t"
"vl %%v27,0(%%r1,%[ap3])\n\t"
"vfmasb %%v3,%%v16,%%v27,%%v3\n\t"
"vl %%v28,16(%%r1,%[ap0])\n\t"
"vfmasb %%v4,%%v17,%%v28,%%v4\n\t"
"vl %%v29,16(%%r1,%[ap1])\n\t"
"vfmasb %%v5,%%v17,%%v29,%%v5\n\t"
"vl %%v30,16(%%r1,%[ap2])\n\t"
"vfmasb %%v6,%%v17,%%v30,%%v6\n\t"
"vl %%v31,16(%%r1,%[ap3])\n\t"
"vfmasb %%v7,%%v17,%%v31,%%v7\n\t"
"vl %%v24,32(%%r1,%[ap0])\n\t"
"vfmasb %%v0,%%v18,%%v24,%%v0\n\t"
"vl %%v25,32(%%r1,%[ap1])\n\t"
"vfmasb %%v1,%%v18,%%v25,%%v1\n\t"
"vl %%v26,32(%%r1,%[ap2])\n\t"
"vfmasb %%v2,%%v18,%%v26,%%v2\n\t"
"vl %%v27,32(%%r1,%[ap3])\n\t"
"vfmasb %%v3,%%v18,%%v27,%%v3\n\t"
"vl %%v28,48(%%r1,%[ap0])\n\t"
"vfmasb %%v4,%%v19,%%v28,%%v4\n\t"
"vl %%v29,48(%%r1,%[ap1])\n\t"
"vfmasb %%v5,%%v19,%%v29,%%v5\n\t"
"vl %%v30,48(%%r1,%[ap2])\n\t"
"vfmasb %%v6,%%v19,%%v30,%%v6\n\t"
"vl %%v31,48(%%r1,%[ap3])\n\t"
"vfmasb %%v7,%%v19,%%v31,%%v7\n\t"
"vl %%v24,64(%%r1,%[ap0])\n\t"
"vfmasb %%v0,%%v20,%%v24,%%v0\n\t"
"vl %%v25,64(%%r1,%[ap1])\n\t"
"vfmasb %%v1,%%v20,%%v25,%%v1\n\t"
"vl %%v26,64(%%r1,%[ap2])\n\t"
"vfmasb %%v2,%%v20,%%v26,%%v2\n\t"
"vl %%v27,64(%%r1,%[ap3])\n\t"
"vfmasb %%v3,%%v20,%%v27,%%v3\n\t"
"vl %%v28,80(%%r1,%[ap0])\n\t"
"vfmasb %%v4,%%v21,%%v28,%%v4\n\t"
"vl %%v29,80(%%r1,%[ap1])\n\t"
"vfmasb %%v5,%%v21,%%v29,%%v5\n\t"
"vl %%v30,80(%%r1,%[ap2])\n\t"
"vfmasb %%v6,%%v21,%%v30,%%v6\n\t"
"vl %%v31,80(%%r1,%[ap3])\n\t"
"vfmasb %%v7,%%v21,%%v31,%%v7\n\t"
"vl %%v24,96(%%r1,%[ap0])\n\t"
"vfmasb %%v0,%%v22,%%v24,%%v0\n\t"
"vl %%v25,96(%%r1,%[ap1])\n\t"
"vfmasb %%v1,%%v22,%%v25,%%v1\n\t"
"vl %%v26,96(%%r1,%[ap2])\n\t"
"vfmasb %%v2,%%v22,%%v26,%%v2\n\t"
"vl %%v27,96(%%r1,%[ap3])\n\t"
"vfmasb %%v3,%%v22,%%v27,%%v3\n\t"
"vl %%v28,112(%%r1,%[ap0])\n\t"
"vfmasb %%v4,%%v23,%%v28,%%v4\n\t"
"vl %%v29,112(%%r1,%[ap1])\n\t"
"vfmasb %%v5,%%v23,%%v29,%%v5\n\t"
"vl %%v30,112(%%r1,%[ap2])\n\t"
"vfmasb %%v6,%%v23,%%v30,%%v6\n\t"
"vl %%v31,112(%%r1,%[ap3])\n\t"
"vfmasb %%v7,%%v23,%%v31,%%v7\n\t"
"agfi %%r1,128\n\t"
"brctg %%r0,0b\n\t"
"1:\n\t"
"lghi %%r0,28\n\t"
"ngr %%r0,%[n]\n\t"
"ltgr %%r0,%%r0\n\t"
"jz 3f\n\t"
"srlg %%r0,%%r0,2\n\t"
"2:\n\t"
"vl %%v16,0(%%r1,%[x])\n\t"
"vl %%v24,0(%%r1,%[ap0])\n\t"
"vfmasb %%v0,%%v16,%%v24,%%v0\n\t"
"vl %%v25,0(%%r1,%[ap1])\n\t"
"vfmasb %%v1,%%v16,%%v25,%%v1\n\t"
"vl %%v26,0(%%r1,%[ap2])\n\t"
"vfmasb %%v2,%%v16,%%v26,%%v2\n\t"
"vl %%v27,0(%%r1,%[ap3])\n\t"
"vfmasb %%v3,%%v16,%%v27,%%v3\n\t"
"agfi %%r1,16\n\t"
"brctg %%r0,2b\n\t"
"3:\n\t"
"vfasb %%v0,%%v0,%%v4\n\t"
"vfasb %%v1,%%v1,%%v5\n\t"
"vfasb %%v2,%%v2,%%v6\n\t"
"vfasb %%v3,%%v3,%%v7\n\t"
"veslg %%v4,%%v0,32\n\t"
"vfasb %%v0,%%v0,%%v4\n\t"
"vrepg %%v4,%%v0,1\n\t"
"aebr %%f0,%%f4\n\t"
"ste %%f0,0(%[y])\n\t"
"veslg %%v4,%%v1,32\n\t"
"vfasb %%v1,%%v1,%%v4\n\t"
"vrepg %%v4,%%v1,1\n\t"
"aebr %%f1,%%f4\n\t"
"ste %%f1,4(%[y])\n\t"
"veslg %%v4,%%v2,32\n\t"
"vfasb %%v2,%%v2,%%v4\n\t"
"vrepg %%v4,%%v2,1\n\t"
"aebr %%f2,%%f4\n\t"
"ste %%f2,8(%[y])\n\t"
"veslg %%v4,%%v3,32\n\t"
"vfasb %%v3,%%v3,%%v4\n\t"
"vrepg %%v4,%%v3,1\n\t"
"aebr %%f3,%%f4\n\t"
"ste %%f3,12(%[y])"
: "=m"(*(struct { FLOAT x[4]; } *) y)
: [y] "a"(y), "m"(*(const struct { FLOAT x[n]; } *) ap0),[ap0] "a"(ap0),
"m"(*(const struct { FLOAT x[n]; } *) ap1),[ap1] "a"(ap1),
"m"(*(const struct { FLOAT x[n]; } *) ap2),[ap2] "a"(ap2),
"m"(*(const struct { FLOAT x[n]; } *) ap3),[ap3] "a"(ap3),
"m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x),[n] "r"(n)
: "cc", "r0", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
"v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25",
"v26", "v27", "v28", "v29", "v30", "v31");
}
/*
 * sgemv_kernel_4x2: two single-precision dot products against one vector.
 *
 * Computes, over the processed indices i,
 *     y[0] = sum_i ap[0][i] * x[i]
 *     y[1] = sum_i ap[1][i] * x[i]
 * y is a write-only operand ("=m"), so its previous contents are not read.
 *
 * Parameters:
 *   n  - element count. It is masked with -32 (main loop, 32 floats per
 *        iteration) and with 28 (tail loop, 4 floats per iteration), so the
 *        two low bits of n are ignored; presumably callers pass a multiple
 *        of 4 - TODO confirm against the caller.
 *   ap - ap[0] and ap[1] are the two input vectors that are read.
 *   x  - input vector of the same processed length.
 *   y  - receives two floats.
 *
 * Register use: v0, v2, v4, v6 are partial sums for ap0 and v1, v3, v5, v7
 * are partial sums for ap1; they are combined after the loops.
 */
static void sgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) {
register FLOAT *ap0 = ap[0];
register FLOAT *ap1 = ap[1];
__asm__("vzero %%v0\n\t"
"vzero %%v1\n\t"
"vzero %%v2\n\t"
"vzero %%v3\n\t"
"vzero %%v4\n\t"
"vzero %%v5\n\t"
"vzero %%v6\n\t"
"vzero %%v7\n\t"
/* r1 = 0 is the running byte offset; r0 = (n & -32) >> 5 main-loop trips */
"xgr %%r1,%%r1\n\t"
"lghi %%r0,-32\n\t"
"ngr %%r0,%[n]\n\t"
"ltgr %%r0,%%r0\n\t"
"jz 1f\n\t"
"srlg %%r0,%%r0,5\n\t"
/* main loop: 8 vectors (128 bytes) of x against ap0 and ap1 */
"0:\n\t"
"pfd 1,1024(%%r1,%[ap0])\n\t"
"pfd 1,1024(%%r1,%[ap1])\n\t"
"pfd 1,1024(%%r1,%[x])\n\t"
"vl %%v16,0(%%r1,%[x])\n\t"
"vl %%v17,16(%%r1,%[x])\n\t"
"vl %%v18,32(%%r1,%[x])\n\t"
"vl %%v19,48(%%r1,%[x])\n\t"
"vl %%v20,64(%%r1,%[x])\n\t"
"vl %%v21,80(%%r1,%[x])\n\t"
"vl %%v22,96(%%r1,%[x])\n\t"
"vl %%v23,112(%%r1,%[x])\n\t"
"vl %%v24,0(%%r1,%[ap0])\n\t"
"vfmasb %%v0,%%v16,%%v24,%%v0\n\t"
"vl %%v25,0(%%r1,%[ap1])\n\t"
"vfmasb %%v1,%%v16,%%v25,%%v1\n\t"
"vl %%v26,16(%%r1,%[ap0])\n\t"
"vfmasb %%v2,%%v17,%%v26,%%v2\n\t"
"vl %%v27,16(%%r1,%[ap1])\n\t"
"vfmasb %%v3,%%v17,%%v27,%%v3\n\t"
"vl %%v28,32(%%r1,%[ap0])\n\t"
"vfmasb %%v4,%%v18,%%v28,%%v4\n\t"
"vl %%v29,32(%%r1,%[ap1])\n\t"
"vfmasb %%v5,%%v18,%%v29,%%v5\n\t"
"vl %%v30,48(%%r1,%[ap0])\n\t"
"vfmasb %%v6,%%v19,%%v30,%%v6\n\t"
"vl %%v31,48(%%r1,%[ap1])\n\t"
"vfmasb %%v7,%%v19,%%v31,%%v7\n\t"
"vl %%v24,64(%%r1,%[ap0])\n\t"
"vfmasb %%v0,%%v20,%%v24,%%v0\n\t"
"vl %%v25,64(%%r1,%[ap1])\n\t"
"vfmasb %%v1,%%v20,%%v25,%%v1\n\t"
"vl %%v26,80(%%r1,%[ap0])\n\t"
"vfmasb %%v2,%%v21,%%v26,%%v2\n\t"
"vl %%v27,80(%%r1,%[ap1])\n\t"
"vfmasb %%v3,%%v21,%%v27,%%v3\n\t"
"vl %%v28,96(%%r1,%[ap0])\n\t"
"vfmasb %%v4,%%v22,%%v28,%%v4\n\t"
"vl %%v29,96(%%r1,%[ap1])\n\t"
"vfmasb %%v5,%%v22,%%v29,%%v5\n\t"
"vl %%v30,112(%%r1,%[ap0])\n\t"
"vfmasb %%v6,%%v23,%%v30,%%v6\n\t"
"vl %%v31,112(%%r1,%[ap1])\n\t"
"vfmasb %%v7,%%v23,%%v31,%%v7\n\t"
"agfi %%r1,128\n\t"
"brctg %%r0,0b\n\t"
/* tail: r0 = (n & 28) >> 2 trips, one vector (4 floats) per trip */
"1:\n\t"
"lghi %%r0,28\n\t"
"ngr %%r0,%[n]\n\t"
"ltgr %%r0,%%r0\n\t"
"jz 3f\n\t"
"srlg %%r0,%%r0,2\n\t"
"2:\n\t"
"vl %%v16,0(%%r1,%[x])\n\t"
"vl %%v24,0(%%r1,%[ap0])\n\t"
"vfmasb %%v0,%%v16,%%v24,%%v0\n\t"
"vl %%v25,0(%%r1,%[ap1])\n\t"
"vfmasb %%v1,%%v16,%%v25,%%v1\n\t"
"agfi %%r1,16\n\t"
"brctg %%r0,2b\n\t"
/*
 * Reduction: add the even-numbered partial sums into v0 and the
 * odd-numbered ones into v1, then fold the four 32-bit lanes of each into
 * one scalar and store to y[0] and y[1].
 */
"3:\n\t"
"vfasb %%v0,%%v0,%%v2\n\t"
"vfasb %%v0,%%v0,%%v4\n\t"
"vfasb %%v0,%%v0,%%v6\n\t"
"vfasb %%v1,%%v1,%%v3\n\t"
"vfasb %%v1,%%v1,%%v5\n\t"
"vfasb %%v1,%%v1,%%v7\n\t"
"veslg %%v2,%%v0,32\n\t"
"vfasb %%v0,%%v0,%%v2\n\t"
"vrepg %%v2,%%v0,1\n\t"
"aebr %%f0,%%f2\n\t"
"ste %%f0,0(%[y])\n\t"
"veslg %%v2,%%v1,32\n\t"
"vfasb %%v1,%%v1,%%v2\n\t"
"vrepg %%v2,%%v1,1\n\t"
"aebr %%f1,%%f2\n\t"
"ste %%f1,4(%[y])"
: "=m"(*(FLOAT (*)[2]) y)
: [y] "a"(y), "m"(*(const FLOAT (*)[n]) ap[0]),[ap0] "a"(ap[0]),
"m"(*(const FLOAT (*)[n]) ap[1]),[ap1] "a"(ap[1]),
"m"(*(const FLOAT (*)[n]) x),[x] "a"(x),[n] "r"(n)
: "cc", "r0", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
"v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25",
"v26", "v27", "v28", "v29", "v30", "v31");
/*
 * NOTE(review): from here the same instruction sequence appears a second
 * time, ending in a reworked operand list (struct-typed "m" operands and
 * the ap0/ap1 locals). This looks like the pre- and post-change sides of a
 * diff rendered back to back - confirm against the real source file before
 * relying on this text.
 */
"vzero %%v1\n\t"
"vzero %%v2\n\t"
"vzero %%v3\n\t"
"vzero %%v4\n\t"
"vzero %%v5\n\t"
"vzero %%v6\n\t"
"vzero %%v7\n\t"
"xgr %%r1,%%r1\n\t"
"lghi %%r0,-32\n\t"
"ngr %%r0,%[n]\n\t"
"ltgr %%r0,%%r0\n\t"
"jz 1f\n\t"
"srlg %%r0,%%r0,5\n\t"
"0:\n\t"
"pfd 1,1024(%%r1,%[ap0])\n\t"
"pfd 1,1024(%%r1,%[ap1])\n\t"
"pfd 1,1024(%%r1,%[x])\n\t"
"vl %%v16,0(%%r1,%[x])\n\t"
"vl %%v17,16(%%r1,%[x])\n\t"
"vl %%v18,32(%%r1,%[x])\n\t"
"vl %%v19,48(%%r1,%[x])\n\t"
"vl %%v20,64(%%r1,%[x])\n\t"
"vl %%v21,80(%%r1,%[x])\n\t"
"vl %%v22,96(%%r1,%[x])\n\t"
"vl %%v23,112(%%r1,%[x])\n\t"
"vl %%v24,0(%%r1,%[ap0])\n\t"
"vfmasb %%v0,%%v16,%%v24,%%v0\n\t"
"vl %%v25,0(%%r1,%[ap1])\n\t"
"vfmasb %%v1,%%v16,%%v25,%%v1\n\t"
"vl %%v26,16(%%r1,%[ap0])\n\t"
"vfmasb %%v2,%%v17,%%v26,%%v2\n\t"
"vl %%v27,16(%%r1,%[ap1])\n\t"
"vfmasb %%v3,%%v17,%%v27,%%v3\n\t"
"vl %%v28,32(%%r1,%[ap0])\n\t"
"vfmasb %%v4,%%v18,%%v28,%%v4\n\t"
"vl %%v29,32(%%r1,%[ap1])\n\t"
"vfmasb %%v5,%%v18,%%v29,%%v5\n\t"
"vl %%v30,48(%%r1,%[ap0])\n\t"
"vfmasb %%v6,%%v19,%%v30,%%v6\n\t"
"vl %%v31,48(%%r1,%[ap1])\n\t"
"vfmasb %%v7,%%v19,%%v31,%%v7\n\t"
"vl %%v24,64(%%r1,%[ap0])\n\t"
"vfmasb %%v0,%%v20,%%v24,%%v0\n\t"
"vl %%v25,64(%%r1,%[ap1])\n\t"
"vfmasb %%v1,%%v20,%%v25,%%v1\n\t"
"vl %%v26,80(%%r1,%[ap0])\n\t"
"vfmasb %%v2,%%v21,%%v26,%%v2\n\t"
"vl %%v27,80(%%r1,%[ap1])\n\t"
"vfmasb %%v3,%%v21,%%v27,%%v3\n\t"
"vl %%v28,96(%%r1,%[ap0])\n\t"
"vfmasb %%v4,%%v22,%%v28,%%v4\n\t"
"vl %%v29,96(%%r1,%[ap1])\n\t"
"vfmasb %%v5,%%v22,%%v29,%%v5\n\t"
"vl %%v30,112(%%r1,%[ap0])\n\t"
"vfmasb %%v6,%%v23,%%v30,%%v6\n\t"
"vl %%v31,112(%%r1,%[ap1])\n\t"
"vfmasb %%v7,%%v23,%%v31,%%v7\n\t"
"agfi %%r1,128\n\t"
"brctg %%r0,0b\n\t"
"1:\n\t"
"lghi %%r0,28\n\t"
"ngr %%r0,%[n]\n\t"
"ltgr %%r0,%%r0\n\t"
"jz 3f\n\t"
"srlg %%r0,%%r0,2\n\t"
"2:\n\t"
"vl %%v16,0(%%r1,%[x])\n\t"
"vl %%v24,0(%%r1,%[ap0])\n\t"
"vfmasb %%v0,%%v16,%%v24,%%v0\n\t"
"vl %%v25,0(%%r1,%[ap1])\n\t"
"vfmasb %%v1,%%v16,%%v25,%%v1\n\t"
"agfi %%r1,16\n\t"
"brctg %%r0,2b\n\t"
"3:\n\t"
"vfasb %%v0,%%v0,%%v2\n\t"
"vfasb %%v0,%%v0,%%v4\n\t"
"vfasb %%v0,%%v0,%%v6\n\t"
"vfasb %%v1,%%v1,%%v3\n\t"
"vfasb %%v1,%%v1,%%v5\n\t"
"vfasb %%v1,%%v1,%%v7\n\t"
"veslg %%v2,%%v0,32\n\t"
"vfasb %%v0,%%v0,%%v2\n\t"
"vrepg %%v2,%%v0,1\n\t"
"aebr %%f0,%%f2\n\t"
"ste %%f0,0(%[y])\n\t"
"veslg %%v2,%%v1,32\n\t"
"vfasb %%v1,%%v1,%%v2\n\t"
"vrepg %%v2,%%v1,1\n\t"
"aebr %%f1,%%f2\n\t"
"ste %%f1,4(%[y])"
: "=m"(*(struct { FLOAT x[2]; } *) y)
: [y] "a"(y), "m"(*(const struct { FLOAT x[n]; } *) ap0),[ap0] "a"(ap0),
"m"(*(const struct { FLOAT x[n]; } *) ap1),[ap1] "a"(ap1),
"m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x),[n] "r"(n)
: "cc", "r0", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
"v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25",
"v26", "v27", "v28", "v29", "v30", "v31");
}
/*
 * sgemv_kernel_4x1: one single-precision dot product.
 *
 * Computes, over the processed indices i,
 *     y[0] = sum_i a0[i] * x[i]
 * y is a write-only operand ("=m"), so its previous contents are not read.
 *
 * Parameters:
 *   n  - element count. It is masked with -32 (main loop, 32 floats per
 *        iteration) and with 28 (tail loop, 4 floats per iteration), so the
 *        two low bits of n are ignored; presumably callers pass a multiple
 *        of 4 - TODO confirm against the caller.
 *   a0 - first input vector.
 *   x  - second input vector of the same processed length.
 *   y  - receives one float.
 *
 * Register use: v0-v7 are eight independent partial sums, one per 16-byte
 * vector of each 128-byte chunk; the tail loop accumulates into v0 only.
 */
static void sgemv_kernel_4x1(BLASLONG n, FLOAT *a0, FLOAT *x, FLOAT *y) {
__asm__("vzero %%v0\n\t"
"vzero %%v1\n\t"
"vzero %%v2\n\t"
"vzero %%v3\n\t"
"vzero %%v4\n\t"
"vzero %%v5\n\t"
"vzero %%v6\n\t"
"vzero %%v7\n\t"
/* r1 = 0 is the running byte offset; r0 = (n & -32) >> 5 main-loop trips */
"xgr %%r1,%%r1\n\t"
"lghi %%r0,-32\n\t"
"ngr %%r0,%[n]\n\t"
"ltgr %%r0,%%r0\n\t"
"jz 1f\n\t"
"srlg %%r0,%%r0,5\n\t"
/* main loop: 8 vectors (128 bytes) of x against a0 */
"0:\n\t"
"pfd 1,1024(%%r1,%[a0])\n\t"
"pfd 1,1024(%%r1,%[x])\n\t"
"vl %%v16,0(%%r1,%[x])\n\t"
"vl %%v17,16(%%r1,%[x])\n\t"
"vl %%v18,32(%%r1,%[x])\n\t"
"vl %%v19,48(%%r1,%[x])\n\t"
"vl %%v20,64(%%r1,%[x])\n\t"
"vl %%v21,80(%%r1,%[x])\n\t"
"vl %%v22,96(%%r1,%[x])\n\t"
"vl %%v23,112(%%r1,%[x])\n\t"
"vl %%v24,0(%%r1,%[a0])\n\t"
"vfmasb %%v0,%%v16,%%v24,%%v0\n\t"
"vl %%v25,16(%%r1,%[a0])\n\t"
"vfmasb %%v1,%%v17,%%v25,%%v1\n\t"
"vl %%v26,32(%%r1,%[a0])\n\t"
"vfmasb %%v2,%%v18,%%v26,%%v2\n\t"
"vl %%v27,48(%%r1,%[a0])\n\t"
"vfmasb %%v3,%%v19,%%v27,%%v3\n\t"
"vl %%v28,64(%%r1,%[a0])\n\t"
"vfmasb %%v4,%%v20,%%v28,%%v4\n\t"
"vl %%v29,80(%%r1,%[a0])\n\t"
"vfmasb %%v5,%%v21,%%v29,%%v5\n\t"
"vl %%v30,96(%%r1,%[a0])\n\t"
"vfmasb %%v6,%%v22,%%v30,%%v6\n\t"
"vl %%v31,112(%%r1,%[a0])\n\t"
"vfmasb %%v7,%%v23,%%v31,%%v7\n\t"
"agfi %%r1,128\n\t"
"brctg %%r0,0b\n\t"
/* tail: r0 = (n & 28) >> 2 trips, one vector (4 floats) per trip */
"1:\n\t"
"lghi %%r0,28\n\t"
"ngr %%r0,%[n]\n\t"
"ltgr %%r0,%%r0\n\t"
"jz 3f\n\t"
"srlg %%r0,%%r0,2\n\t"
"2:\n\t"
"vl %%v16,0(%%r1,%[x])\n\t"
"vl %%v24,0(%%r1,%[a0])\n\t"
"vfmasb %%v0,%%v16,%%v24,%%v0\n\t"
"agfi %%r1,16\n\t"
"brctg %%r0,2b\n\t"
/*
 * Reduction: add v1..v7 into v0, then fold the four 32-bit lanes of v0
 * into one scalar and store it to y[0].
 */
"3:\n\t"
"vfasb %%v0,%%v0,%%v1\n\t"
"vfasb %%v0,%%v0,%%v2\n\t"
"vfasb %%v0,%%v0,%%v3\n\t"
"vfasb %%v0,%%v0,%%v4\n\t"
"vfasb %%v0,%%v0,%%v5\n\t"
"vfasb %%v0,%%v0,%%v6\n\t"
"vfasb %%v0,%%v0,%%v7\n\t"
"veslg %%v1,%%v0,32\n\t"
"vfasb %%v0,%%v0,%%v1\n\t"
"vrepg %%v1,%%v0,1\n\t"
"aebr %%f0,%%f1\n\t"
"ste %%f0,0(%[y])"
: "=m"(*(FLOAT (*)[1]) y)
: [y] "a"(y), "m"(*(const FLOAT (*)[n]) a0),[a0] "a"(a0),
"m"(*(const FLOAT (*)[n]) x),[x] "a"(x),[n] "r"(n)
: "cc", "r0", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
"v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25",
"v26", "v27", "v28", "v29", "v30", "v31");
/*
 * NOTE(review): from here the same instruction sequence appears a second
 * time, ending in a reworked operand list (struct-typed "m" operands for a0
 * and x; the y operand keeps the array-pointer cast). This looks like the
 * pre- and post-change sides of a diff rendered back to back - confirm
 * against the real source file before relying on this text.
 */
"vzero %%v1\n\t"
"vzero %%v2\n\t"
"vzero %%v3\n\t"
"vzero %%v4\n\t"
"vzero %%v5\n\t"
"vzero %%v6\n\t"
"vzero %%v7\n\t"
"xgr %%r1,%%r1\n\t"
"lghi %%r0,-32\n\t"
"ngr %%r0,%[n]\n\t"
"ltgr %%r0,%%r0\n\t"
"jz 1f\n\t"
"srlg %%r0,%%r0,5\n\t"
"0:\n\t"
"pfd 1,1024(%%r1,%[a0])\n\t"
"pfd 1,1024(%%r1,%[x])\n\t"
"vl %%v16,0(%%r1,%[x])\n\t"
"vl %%v17,16(%%r1,%[x])\n\t"
"vl %%v18,32(%%r1,%[x])\n\t"
"vl %%v19,48(%%r1,%[x])\n\t"
"vl %%v20,64(%%r1,%[x])\n\t"
"vl %%v21,80(%%r1,%[x])\n\t"
"vl %%v22,96(%%r1,%[x])\n\t"
"vl %%v23,112(%%r1,%[x])\n\t"
"vl %%v24,0(%%r1,%[a0])\n\t"
"vfmasb %%v0,%%v16,%%v24,%%v0\n\t"
"vl %%v25,16(%%r1,%[a0])\n\t"
"vfmasb %%v1,%%v17,%%v25,%%v1\n\t"
"vl %%v26,32(%%r1,%[a0])\n\t"
"vfmasb %%v2,%%v18,%%v26,%%v2\n\t"
"vl %%v27,48(%%r1,%[a0])\n\t"
"vfmasb %%v3,%%v19,%%v27,%%v3\n\t"
"vl %%v28,64(%%r1,%[a0])\n\t"
"vfmasb %%v4,%%v20,%%v28,%%v4\n\t"
"vl %%v29,80(%%r1,%[a0])\n\t"
"vfmasb %%v5,%%v21,%%v29,%%v5\n\t"
"vl %%v30,96(%%r1,%[a0])\n\t"
"vfmasb %%v6,%%v22,%%v30,%%v6\n\t"
"vl %%v31,112(%%r1,%[a0])\n\t"
"vfmasb %%v7,%%v23,%%v31,%%v7\n\t"
"agfi %%r1,128\n\t"
"brctg %%r0,0b\n\t"
"1:\n\t"
"lghi %%r0,28\n\t"
"ngr %%r0,%[n]\n\t"
"ltgr %%r0,%%r0\n\t"
"jz 3f\n\t"
"srlg %%r0,%%r0,2\n\t"
"2:\n\t"
"vl %%v16,0(%%r1,%[x])\n\t"
"vl %%v24,0(%%r1,%[a0])\n\t"
"vfmasb %%v0,%%v16,%%v24,%%v0\n\t"
"agfi %%r1,16\n\t"
"brctg %%r0,2b\n\t"
"3:\n\t"
"vfasb %%v0,%%v0,%%v1\n\t"
"vfasb %%v0,%%v0,%%v2\n\t"
"vfasb %%v0,%%v0,%%v3\n\t"
"vfasb %%v0,%%v0,%%v4\n\t"
"vfasb %%v0,%%v0,%%v5\n\t"
"vfasb %%v0,%%v0,%%v6\n\t"
"vfasb %%v0,%%v0,%%v7\n\t"
"veslg %%v1,%%v0,32\n\t"
"vfasb %%v0,%%v0,%%v1\n\t"
"vrepg %%v1,%%v0,1\n\t"
"aebr %%f0,%%f1\n\t"
"ste %%f0,0(%[y])"
: "=m"(*(FLOAT (*)[1]) y)
: [y] "a"(y), "m"(*(const struct { FLOAT x[n]; } *) a0),[a0] "a"(a0),
"m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x),[n] "r"(n)
: "cc", "r0", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
"v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25",
"v26", "v27", "v28", "v29", "v30", "v31");
}
static void copy_x(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_src) {
@ -366,70 +374,70 @@ static void copy_x(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_src) {
/*
 * add_y_kernel_4 - dest[i] += da * src[i] using vector fused multiply-add.
 *
 * n:    element count. The main loop handles the (n & -32) part, 32 elements
 *       (128 bytes) per pass; the tail loop handles the (n & 28) part,
 *       4 elements (16 bytes) per pass. The low two bits of n are never
 *       examined here.
 * da:   scalar multiplier, replicated into every lane of v0 by vlrepf.
 * src:  input vector, only read.
 * dest: vector that is loaded, updated and stored back in place.
 *
 * r1 is the running byte offset into both vectors, r0 the pass counter.
 *
 * NOTE(review): this span looks like a rendered diff hunk. The same
 * instruction sequence appears twice; only the operand lists differ
 * ("m" vs "Q" for da, array-pointer vs struct-wrapped memory operands).
 * Presumably the first copy is the pre-commit text and the second the
 * post-commit text - confirm against the real source file.
 */
static void add_y_kernel_4(BLASLONG n, FLOAT da, FLOAT *src, FLOAT *dest) {
__asm__("vlrepf %%v0,%[da]\n\t"
"xgr %%r1,%%r1\n\t"
/* r0 = n & -32; skip the unrolled loop when that is zero */
"lghi %%r0,-32\n\t"
"ngr %%r0,%[n]\n\t"
"ltgr %%r0,%%r0\n\t"
"jz 1f\n\t"
"srlg %%r0,%%r0,5\n\t"
/* main loop: eight 16-byte vectors of src and dest per pass */
"0:\n\t"
"pfd 1,1024(%%r1,%[src])\n\t"
"pfd 2,1024(%%r1,%[dest])\n\t"
"vl %%v16,0(%%r1,%[src])\n\t"
"vl %%v17,16(%%r1,%[src])\n\t"
"vl %%v18,32(%%r1,%[src])\n\t"
"vl %%v19,48(%%r1,%[src])\n\t"
"vl %%v20,64(%%r1,%[src])\n\t"
"vl %%v21,80(%%r1,%[src])\n\t"
"vl %%v22,96(%%r1,%[src])\n\t"
"vl %%v23,112(%%r1,%[src])\n\t"
"vl %%v24, 0(%%r1,%[dest])\n\t"
"vfmasb %%v24,%%v16,%%v0,%%v24\n\t"
"vst %%v24, 0(%%r1,%[dest])\n\t"
"vl %%v25, 16(%%r1,%[dest])\n\t"
"vfmasb %%v25,%%v17,%%v0,%%v25\n\t"
"vst %%v25, 16(%%r1,%[dest])\n\t"
"vl %%v26, 32(%%r1,%[dest])\n\t"
"vfmasb %%v26,%%v18,%%v0,%%v26\n\t"
"vst %%v26, 32(%%r1,%[dest])\n\t"
"vl %%v27, 48(%%r1,%[dest])\n\t"
"vfmasb %%v27,%%v19,%%v0,%%v27\n\t"
"vst %%v27, 48(%%r1,%[dest])\n\t"
"vl %%v28, 64(%%r1,%[dest])\n\t"
"vfmasb %%v28,%%v20,%%v0,%%v28\n\t"
"vst %%v28, 64(%%r1,%[dest])\n\t"
"vl %%v29, 80(%%r1,%[dest])\n\t"
"vfmasb %%v29,%%v21,%%v0,%%v29\n\t"
"vst %%v29, 80(%%r1,%[dest])\n\t"
"vl %%v30, 96(%%r1,%[dest])\n\t"
"vfmasb %%v30,%%v22,%%v0,%%v30\n\t"
"vst %%v30, 96(%%r1,%[dest])\n\t"
"vl %%v31, 112(%%r1,%[dest])\n\t"
"vfmasb %%v31,%%v23,%%v0,%%v31\n\t"
"vst %%v31, 112(%%r1,%[dest])\n\t"
"agfi %%r1,128\n\t"
"brctg %%r0,0b\n\t"
/* tail: r0 = (n & 28) >> 2 passes of one 16-byte vector each */
"1:\n\t"
"lghi %%r0,28\n\t"
"ngr %%r0,%[n]\n\t"
"ltgr %%r0,%%r0\n\t"
"jz 3f\n\t"
"srlg %%r0,%%r0,2\n\t"
"2:\n\t"
"vl %%v16,0(%%r1,%[src])\n\t"
"vl %%v24, 0(%%r1,%[dest])\n\t"
"vfmasb %%v24,%%v16,%%v0,%%v24\n\t"
"vst %%v24, 0(%%r1,%[dest])\n\t"
"agfi %%r1,16\n\t"
"brctg %%r0,2b\n\t"
"3:\n\t"
"nop"
: "+m"(*(FLOAT (*)[n]) dest)
: [dest] "a"(dest),[da] "m"(da), "m"(*(const FLOAT (*)[n]) src),
[src] "a"(src),[n] "r"(n)
: "cc", "r0", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21",
"v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30",
"v31");
/* NOTE(review): the lines below repeat the instruction sequence above;
   presumably this is the post-commit side of the diff hunk. */
"xgr %%r1,%%r1\n\t"
"lghi %%r0,-32\n\t"
"ngr %%r0,%[n]\n\t"
"ltgr %%r0,%%r0\n\t"
"jz 1f\n\t"
"srlg %%r0,%%r0,5\n\t"
"0:\n\t"
"pfd 1,1024(%%r1,%[src])\n\t"
"pfd 2,1024(%%r1,%[dest])\n\t"
"vl %%v16,0(%%r1,%[src])\n\t"
"vl %%v17,16(%%r1,%[src])\n\t"
"vl %%v18,32(%%r1,%[src])\n\t"
"vl %%v19,48(%%r1,%[src])\n\t"
"vl %%v20,64(%%r1,%[src])\n\t"
"vl %%v21,80(%%r1,%[src])\n\t"
"vl %%v22,96(%%r1,%[src])\n\t"
"vl %%v23,112(%%r1,%[src])\n\t"
"vl %%v24, 0(%%r1,%[dest])\n\t"
"vfmasb %%v24,%%v16,%%v0,%%v24\n\t"
"vst %%v24, 0(%%r1,%[dest])\n\t"
"vl %%v25, 16(%%r1,%[dest])\n\t"
"vfmasb %%v25,%%v17,%%v0,%%v25\n\t"
"vst %%v25, 16(%%r1,%[dest])\n\t"
"vl %%v26, 32(%%r1,%[dest])\n\t"
"vfmasb %%v26,%%v18,%%v0,%%v26\n\t"
"vst %%v26, 32(%%r1,%[dest])\n\t"
"vl %%v27, 48(%%r1,%[dest])\n\t"
"vfmasb %%v27,%%v19,%%v0,%%v27\n\t"
"vst %%v27, 48(%%r1,%[dest])\n\t"
"vl %%v28, 64(%%r1,%[dest])\n\t"
"vfmasb %%v28,%%v20,%%v0,%%v28\n\t"
"vst %%v28, 64(%%r1,%[dest])\n\t"
"vl %%v29, 80(%%r1,%[dest])\n\t"
"vfmasb %%v29,%%v21,%%v0,%%v29\n\t"
"vst %%v29, 80(%%r1,%[dest])\n\t"
"vl %%v30, 96(%%r1,%[dest])\n\t"
"vfmasb %%v30,%%v22,%%v0,%%v30\n\t"
"vst %%v30, 96(%%r1,%[dest])\n\t"
"vl %%v31, 112(%%r1,%[dest])\n\t"
"vfmasb %%v31,%%v23,%%v0,%%v31\n\t"
"vst %%v31, 112(%%r1,%[dest])\n\t"
"agfi %%r1,128\n\t"
"brctg %%r0,0b\n\t"
"1:\n\t"
"lghi %%r0,28\n\t"
"ngr %%r0,%[n]\n\t"
"ltgr %%r0,%%r0\n\t"
"jz 3f\n\t"
"srlg %%r0,%%r0,2\n\t"
"2:\n\t"
"vl %%v16,0(%%r1,%[src])\n\t"
"vl %%v24, 0(%%r1,%[dest])\n\t"
"vfmasb %%v24,%%v16,%%v0,%%v24\n\t"
"vst %%v24, 0(%%r1,%[dest])\n\t"
"agfi %%r1,16\n\t"
"brctg %%r0,2b\n\t"
"3:\n\t"
"nop"
: "+m"(*(struct { FLOAT x[n]; } *) dest)
: [dest] "a"(dest),[da] "Q"(da), "m"(*(const struct { FLOAT x[n]; } *) src),
[src] "a"(src),[n] "r"(n)
: "cc", "r0", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21",
"v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30",
"v31");
}
static void add_y(BLASLONG n, FLOAT da, FLOAT *src, FLOAT *dest,
BLASLONG inc_dest) {

View File

/*
 * smax_kernel_64 - return the largest value found in x.
 *
 * n: element count; it is shifted right by 6 to get the pass count, so each
 *    pass consumes 64 elements (256 bytes, sixteen 16-byte vectors).
 * x: input vector, only read.
 *
 * v0 is seeded with the first 16 bytes of x and accumulates the lane-wise
 * maximum; after the loop its lanes are folded together and the value left
 * in f0 is copied to the result.
 *
 * NOTE(review): the loop body is entered without any check on the pass
 * count, so the caller presumably guarantees n >= 64 - confirm.
 * NOTE(review): this span looks like a rendered diff hunk (the signature
 * sits on the hunk-header line and the asm text appears twice, differing
 * only in the memory-operand cast) - confirm against the real source file.
 */
@ -31,53 +31,53 @@ static FLOAT smax_kernel_64(BLASLONG n, FLOAT *x) {
FLOAT max;
__asm__("vl %%v0,0(%[x])\n\t"
"srlg %[n],%[n],6\n\t"
"xgr %%r1,%%r1\n\t"
"0:\n\t"
"pfd 1, 1024(%%r1,%[x])\n\t"
"vl %%v16,0(%%r1,%[x])\n\t"
"vl %%v17,16(%%r1,%[x])\n\t"
"vl %%v18,32(%%r1,%[x])\n\t"
"vl %%v19,48(%%r1,%[x])\n\t"
"vl %%v20,64(%%r1,%[x])\n\t"
"vl %%v21,80(%%r1,%[x])\n\t"
"vl %%v22,96(%%r1,%[x])\n\t"
"vl %%v23,112(%%r1,%[x])\n\t"
"vl %%v24,128(%%r1,%[x])\n\t"
"vl %%v25,144(%%r1,%[x])\n\t"
"vl %%v26,160(%%r1,%[x])\n\t"
"vl %%v27,176(%%r1,%[x])\n\t"
"vl %%v28,192(%%r1,%[x])\n\t"
"vl %%v29,208(%%r1,%[x])\n\t"
"vl %%v30,224(%%r1,%[x])\n\t"
"vl %%v31,240(%%r1,%[x])\n\t"
/* pairwise reduction of the sixteen loaded vectors down to v16 */
"vfmaxsb %%v16,%%v16,%%v24,0\n\t"
"vfmaxsb %%v17,%%v17,%%v25,0\n\t"
"vfmaxsb %%v18,%%v18,%%v26,0\n\t"
"vfmaxsb %%v19,%%v19,%%v27,0\n\t"
"vfmaxsb %%v20,%%v20,%%v28,0\n\t"
"vfmaxsb %%v21,%%v21,%%v29,0\n\t"
"vfmaxsb %%v22,%%v22,%%v30,0\n\t"
"vfmaxsb %%v23,%%v23,%%v31,0\n\t"
"vfmaxsb %%v16,%%v16,%%v20,0\n\t"
"vfmaxsb %%v17,%%v17,%%v21,0\n\t"
"vfmaxsb %%v18,%%v18,%%v22,0\n\t"
"vfmaxsb %%v19,%%v19,%%v23,0\n\t"
"vfmaxsb %%v16,%%v16,%%v18,0\n\t"
"vfmaxsb %%v17,%%v17,%%v19,0\n\t"
"vfmaxsb %%v16,%%v16,%%v17,0\n\t"
"vfmaxsb %%v0,%%v0,%%v16,0\n\t"
"agfi %%r1, 256\n\t"
"brctg %[n], 0b\n\t"
/* fold the lanes of v0 together, then hand the scalar back through f0 */
"veslg %%v16,%%v0,32\n\t"
"vfmaxsb %%v0,%%v0,%%v16,0\n\t"
"vrepf %%v16,%%v0,2\n\t"
"wfmaxsb %%v0,%%v0,%%v16,0\n\t"
"ler %[max],%%f0"
: [max] "=f"(max),[n] "+&r"(n)
: "m"(*(const FLOAT (*)[n]) x),[x] "a"(x)
: "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22",
"v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31");
/* NOTE(review): the lines below repeat the instruction sequence above;
   presumably this is the post-commit side of the diff hunk. */
"srlg %[n],%[n],6\n\t"
"xgr %%r1,%%r1\n\t"
"0:\n\t"
"pfd 1, 1024(%%r1,%[x])\n\t"
"vl %%v16,0(%%r1,%[x])\n\t"
"vl %%v17,16(%%r1,%[x])\n\t"
"vl %%v18,32(%%r1,%[x])\n\t"
"vl %%v19,48(%%r1,%[x])\n\t"
"vl %%v20,64(%%r1,%[x])\n\t"
"vl %%v21,80(%%r1,%[x])\n\t"
"vl %%v22,96(%%r1,%[x])\n\t"
"vl %%v23,112(%%r1,%[x])\n\t"
"vl %%v24,128(%%r1,%[x])\n\t"
"vl %%v25,144(%%r1,%[x])\n\t"
"vl %%v26,160(%%r1,%[x])\n\t"
"vl %%v27,176(%%r1,%[x])\n\t"
"vl %%v28,192(%%r1,%[x])\n\t"
"vl %%v29,208(%%r1,%[x])\n\t"
"vl %%v30,224(%%r1,%[x])\n\t"
"vl %%v31,240(%%r1,%[x])\n\t"
"vfmaxsb %%v16,%%v16,%%v24,0\n\t"
"vfmaxsb %%v17,%%v17,%%v25,0\n\t"
"vfmaxsb %%v18,%%v18,%%v26,0\n\t"
"vfmaxsb %%v19,%%v19,%%v27,0\n\t"
"vfmaxsb %%v20,%%v20,%%v28,0\n\t"
"vfmaxsb %%v21,%%v21,%%v29,0\n\t"
"vfmaxsb %%v22,%%v22,%%v30,0\n\t"
"vfmaxsb %%v23,%%v23,%%v31,0\n\t"
"vfmaxsb %%v16,%%v16,%%v20,0\n\t"
"vfmaxsb %%v17,%%v17,%%v21,0\n\t"
"vfmaxsb %%v18,%%v18,%%v22,0\n\t"
"vfmaxsb %%v19,%%v19,%%v23,0\n\t"
"vfmaxsb %%v16,%%v16,%%v18,0\n\t"
"vfmaxsb %%v17,%%v17,%%v19,0\n\t"
"vfmaxsb %%v16,%%v16,%%v17,0\n\t"
"vfmaxsb %%v0,%%v0,%%v16,0\n\t"
"agfi %%r1, 256\n\t"
"brctg %[n], 0b\n\t"
"veslg %%v16,%%v0,32\n\t"
"vfmaxsb %%v0,%%v0,%%v16,0\n\t"
"vrepf %%v16,%%v0,2\n\t"
"wfmaxsb %%v0,%%v0,%%v16,0\n\t"
"ler %[max],%%f0"
: [max] "=f"(max),[n] "+&r"(n)
: "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x)
: "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22",
"v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31");
return max;
}

View File

/*
 * smin_kernel_64 - return the smallest value found in x.
 *
 * n: element count; it is shifted right by 6 to get the pass count, so each
 *    pass consumes 64 elements (256 bytes, sixteen 16-byte vectors).
 * x: input vector, only read.
 *
 * v0 is seeded with the first 16 bytes of x and accumulates the lane-wise
 * minimum; after the loop its lanes are folded together and the value left
 * in f0 is copied to the result.
 *
 * NOTE(review): the loop body is entered without any check on the pass
 * count, so the caller presumably guarantees n >= 64 - confirm.
 * NOTE(review): this span looks like a rendered diff hunk (the signature
 * sits on the hunk-header line and the asm text appears twice, differing
 * only in the memory-operand cast) - confirm against the real source file.
 */
@ -31,53 +31,53 @@ static FLOAT smin_kernel_64(BLASLONG n, FLOAT *x) {
FLOAT min;
__asm__("vl %%v0,0(%[x])\n\t"
"srlg %[n],%[n],6\n\t"
"xgr %%r1,%%r1\n\t"
"0:\n\t"
"pfd 1, 1024(%%r1,%[x])\n\t"
"vl %%v16,0(%%r1,%[x])\n\t"
"vl %%v17,16(%%r1,%[x])\n\t"
"vl %%v18,32(%%r1,%[x])\n\t"
"vl %%v19,48(%%r1,%[x])\n\t"
"vl %%v20,64(%%r1,%[x])\n\t"
"vl %%v21,80(%%r1,%[x])\n\t"
"vl %%v22,96(%%r1,%[x])\n\t"
"vl %%v23,112(%%r1,%[x])\n\t"
"vl %%v24,128(%%r1,%[x])\n\t"
"vl %%v25,144(%%r1,%[x])\n\t"
"vl %%v26,160(%%r1,%[x])\n\t"
"vl %%v27,176(%%r1,%[x])\n\t"
"vl %%v28,192(%%r1,%[x])\n\t"
"vl %%v29,208(%%r1,%[x])\n\t"
"vl %%v30,224(%%r1,%[x])\n\t"
"vl %%v31,240(%%r1,%[x])\n\t"
/* pairwise reduction of the sixteen loaded vectors down to v16 */
"vfminsb %%v16,%%v16,%%v24,0\n\t"
"vfminsb %%v17,%%v17,%%v25,0\n\t"
"vfminsb %%v18,%%v18,%%v26,0\n\t"
"vfminsb %%v19,%%v19,%%v27,0\n\t"
"vfminsb %%v20,%%v20,%%v28,0\n\t"
"vfminsb %%v21,%%v21,%%v29,0\n\t"
"vfminsb %%v22,%%v22,%%v30,0\n\t"
"vfminsb %%v23,%%v23,%%v31,0\n\t"
"vfminsb %%v16,%%v16,%%v20,0\n\t"
"vfminsb %%v17,%%v17,%%v21,0\n\t"
"vfminsb %%v18,%%v18,%%v22,0\n\t"
"vfminsb %%v19,%%v19,%%v23,0\n\t"
"vfminsb %%v16,%%v16,%%v18,0\n\t"
"vfminsb %%v17,%%v17,%%v19,0\n\t"
"vfminsb %%v16,%%v16,%%v17,0\n\t"
"vfminsb %%v0,%%v0,%%v16,0\n\t"
"agfi %%r1, 256\n\t"
"brctg %[n], 0b\n\t"
/* fold the lanes of v0 together, then hand the scalar back through f0 */
"veslg %%v16,%%v0,32\n\t"
"vfminsb %%v0,%%v0,%%v16,0\n\t"
"vrepf %%v16,%%v0,2\n\t"
"wfminsb %%v0,%%v0,%%v16,0\n\t"
"ler %[min],%%f0"
: [min] "=f"(min),[n] "+&r"(n)
: "m"(*(const FLOAT (*)[n]) x),[x] "a"(x)
: "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22",
"v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31");
/* NOTE(review): the lines below repeat the instruction sequence above;
   presumably this is the post-commit side of the diff hunk. */
"srlg %[n],%[n],6\n\t"
"xgr %%r1,%%r1\n\t"
"0:\n\t"
"pfd 1, 1024(%%r1,%[x])\n\t"
"vl %%v16,0(%%r1,%[x])\n\t"
"vl %%v17,16(%%r1,%[x])\n\t"
"vl %%v18,32(%%r1,%[x])\n\t"
"vl %%v19,48(%%r1,%[x])\n\t"
"vl %%v20,64(%%r1,%[x])\n\t"
"vl %%v21,80(%%r1,%[x])\n\t"
"vl %%v22,96(%%r1,%[x])\n\t"
"vl %%v23,112(%%r1,%[x])\n\t"
"vl %%v24,128(%%r1,%[x])\n\t"
"vl %%v25,144(%%r1,%[x])\n\t"
"vl %%v26,160(%%r1,%[x])\n\t"
"vl %%v27,176(%%r1,%[x])\n\t"
"vl %%v28,192(%%r1,%[x])\n\t"
"vl %%v29,208(%%r1,%[x])\n\t"
"vl %%v30,224(%%r1,%[x])\n\t"
"vl %%v31,240(%%r1,%[x])\n\t"
"vfminsb %%v16,%%v16,%%v24,0\n\t"
"vfminsb %%v17,%%v17,%%v25,0\n\t"
"vfminsb %%v18,%%v18,%%v26,0\n\t"
"vfminsb %%v19,%%v19,%%v27,0\n\t"
"vfminsb %%v20,%%v20,%%v28,0\n\t"
"vfminsb %%v21,%%v21,%%v29,0\n\t"
"vfminsb %%v22,%%v22,%%v30,0\n\t"
"vfminsb %%v23,%%v23,%%v31,0\n\t"
"vfminsb %%v16,%%v16,%%v20,0\n\t"
"vfminsb %%v17,%%v17,%%v21,0\n\t"
"vfminsb %%v18,%%v18,%%v22,0\n\t"
"vfminsb %%v19,%%v19,%%v23,0\n\t"
"vfminsb %%v16,%%v16,%%v18,0\n\t"
"vfminsb %%v17,%%v17,%%v19,0\n\t"
"vfminsb %%v16,%%v16,%%v17,0\n\t"
"vfminsb %%v0,%%v0,%%v16,0\n\t"
"agfi %%r1, 256\n\t"
"brctg %[n], 0b\n\t"
"veslg %%v16,%%v0,32\n\t"
"vfminsb %%v0,%%v0,%%v16,0\n\t"
"vrepf %%v16,%%v0,2\n\t"
"wfminsb %%v0,%%v0,%%v16,0\n\t"
"ler %[min],%%f0"
: [min] "=f"(min),[n] "+&r"(n)
: "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x)
: "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22",
"v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31");
return min;
}

View File

@ -29,151 +29,152 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
/*
 * srot_kernel_64 - rotate the vector pair (x, y) in place:
 *     x[i] = c * x[i] + s * y[i]
 *     y[i] = c * y[i] - s * x[i]   (using the old x[i])
 *
 * n:    element count; shifted right by 6 for the pass count, so each pass
 *       covers 64 elements (256 bytes) of both vectors, handled as four
 *       identical 64-byte groups.
 * x, y: vectors that are loaded, updated and stored back.
 * c, s: pointers to the two scalars, replicated into v0 and v1.
 *
 * NOTE(review): the loop body is entered without any check on the pass
 * count, so the caller presumably guarantees n >= 64 - confirm.
 * NOTE(review): this span looks like a rendered diff hunk. The same
 * instruction sequence appears twice; only the operand lists differ
 * ("m" vs "Q" for c and s, array-pointer vs struct-wrapped memory
 * operands). Presumably the first copy is the pre-commit text and the
 * second the post-commit text - confirm against the real source file.
 */
static void srot_kernel_64(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s) {
__asm__("vlrepf %%v0,%[c]\n\t"
"vlrepf %%v1,%[s]\n\t"
"srlg %[n],%[n],6\n\t"
"xgr %%r1,%%r1\n\t"
"0:\n\t"
"pfd 2, 1024(%%r1,%[x])\n\t"
"pfd 2, 1024(%%r1,%[y])\n\t"
/* group 1: bytes 0..63 of this pass; v24-v27 hold x, v16-v19 hold y */
"vl %%v24, 0(%%r1,%[x])\n\t"
"vl %%v25, 16(%%r1,%[x])\n\t"
"vl %%v26, 32(%%r1,%[x])\n\t"
"vl %%v27, 48(%%r1,%[x])\n\t"
"vl %%v16, 0(%%r1,%[y])\n\t"
"vl %%v17, 16(%%r1,%[y])\n\t"
"vl %%v18, 32(%%r1,%[y])\n\t"
"vl %%v19, 48(%%r1,%[y])\n\t"
"vfmsb %%v28,%%v24,%%v0\n\t"
"vfmsb %%v29,%%v25,%%v0\n\t"
"vfmsb %%v20,%%v24,%%v1\n\t" /* yn=x*s */
"vfmsb %%v21,%%v25,%%v1\n\t" /* yn=x*s */
"vfmsb %%v30,%%v26,%%v0\n\t"
"vfmsb %%v22,%%v26,%%v1\n\t" /* yn=x*s */
"vfmsb %%v31,%%v27,%%v0\n\t"
"vfmsb %%v23,%%v27,%%v1\n\t" /* yn=x*s */
/* 2nd parts */
"vfmasb %%v28,%%v16,%%v1,%%v28\n\t"
"vfmssb %%v20,%%v16,%%v0,%%v20\n\t" /* yn=y*c-yn */
"vfmasb %%v29,%%v17,%%v1,%%v29\n\t"
"vfmssb %%v21,%%v17,%%v0,%%v21\n\t" /* yn=y*c-yn */
"vfmasb %%v30,%%v18,%%v1,%%v30\n\t"
"vfmssb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */
"vfmasb %%v31,%%v19,%%v1,%%v31\n\t"
"vfmssb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */
"vst %%v28, 0(%%r1,%[x])\n\t"
"vst %%v29, 16(%%r1,%[x])\n\t"
"vst %%v30, 32(%%r1,%[x])\n\t"
"vst %%v31, 48(%%r1,%[x])\n\t"
"vst %%v20, 0(%%r1,%[y])\n\t"
"vst %%v21, 16(%%r1,%[y])\n\t"
"vst %%v22, 32(%%r1,%[y])\n\t"
"vst %%v23, 48(%%r1,%[y])\n\t"
/* group 2: bytes 64..127 */
"vl %%v24, 64(%%r1,%[x])\n\t"
"vl %%v25, 80(%%r1,%[x])\n\t"
"vl %%v26, 96(%%r1,%[x])\n\t"
"vl %%v27, 112(%%r1,%[x])\n\t"
"vl %%v16, 64(%%r1,%[y])\n\t"
"vl %%v17, 80(%%r1,%[y])\n\t"
"vl %%v18, 96(%%r1,%[y])\n\t"
"vl %%v19, 112(%%r1,%[y])\n\t"
"vfmsb %%v28,%%v24,%%v0\n\t"
"vfmsb %%v29,%%v25,%%v0\n\t"
"vfmsb %%v20,%%v24,%%v1\n\t" /* yn=x*s */
"vfmsb %%v21,%%v25,%%v1\n\t" /* yn=x*s */
"vfmsb %%v30,%%v26,%%v0\n\t"
"vfmsb %%v22,%%v26,%%v1\n\t" /* yn=x*s */
"vfmsb %%v31,%%v27,%%v0\n\t"
"vfmsb %%v23,%%v27,%%v1\n\t" /* yn=x*s */
/* 2nd parts */
"vfmasb %%v28,%%v16,%%v1,%%v28\n\t"
"vfmssb %%v20,%%v16,%%v0,%%v20\n\t" /* yn=y*c-yn */
"vfmasb %%v29,%%v17,%%v1,%%v29\n\t"
"vfmssb %%v21,%%v17,%%v0,%%v21\n\t" /* yn=y*c-yn */
"vfmasb %%v30,%%v18,%%v1,%%v30\n\t"
"vfmssb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */
"vfmasb %%v31,%%v19,%%v1,%%v31\n\t"
"vfmssb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */
"vst %%v28, 64(%%r1,%[x])\n\t"
"vst %%v29, 80(%%r1,%[x])\n\t"
"vst %%v30, 96(%%r1,%[x])\n\t"
"vst %%v31, 112(%%r1,%[x])\n\t"
"vst %%v20, 64(%%r1,%[y])\n\t"
"vst %%v21, 80(%%r1,%[y])\n\t"
"vst %%v22, 96(%%r1,%[y])\n\t"
"vst %%v23, 112(%%r1,%[y])\n\t"
/* group 3: bytes 128..191 */
"vl %%v24, 128(%%r1,%[x])\n\t"
"vl %%v25, 144(%%r1,%[x])\n\t"
"vl %%v26, 160(%%r1,%[x])\n\t"
"vl %%v27, 176(%%r1,%[x])\n\t"
"vl %%v16, 128(%%r1,%[y])\n\t"
"vl %%v17, 144(%%r1,%[y])\n\t"
"vl %%v18, 160(%%r1,%[y])\n\t"
"vl %%v19, 176(%%r1,%[y])\n\t"
"vfmsb %%v28,%%v24,%%v0\n\t"
"vfmsb %%v29,%%v25,%%v0\n\t"
"vfmsb %%v20,%%v24,%%v1\n\t" /* yn=x*s */
"vfmsb %%v21,%%v25,%%v1\n\t" /* yn=x*s */
"vfmsb %%v30,%%v26,%%v0\n\t"
"vfmsb %%v22,%%v26,%%v1\n\t" /* yn=x*s */
"vfmsb %%v31,%%v27,%%v0\n\t"
"vfmsb %%v23,%%v27,%%v1\n\t" /* yn=x*s */
/* 2nd parts */
"vfmasb %%v28,%%v16,%%v1,%%v28\n\t"
"vfmssb %%v20,%%v16,%%v0,%%v20\n\t" /* yn=y*c-yn */
"vfmasb %%v29,%%v17,%%v1,%%v29\n\t"
"vfmssb %%v21,%%v17,%%v0,%%v21\n\t" /* yn=y*c-yn */
"vfmasb %%v30,%%v18,%%v1,%%v30\n\t"
"vfmssb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */
"vfmasb %%v31,%%v19,%%v1,%%v31\n\t"
"vfmssb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */
"vst %%v28, 128(%%r1,%[x])\n\t"
"vst %%v29, 144(%%r1,%[x])\n\t"
"vst %%v30, 160(%%r1,%[x])\n\t"
"vst %%v31, 176(%%r1,%[x])\n\t"
"vst %%v20, 128(%%r1,%[y])\n\t"
"vst %%v21, 144(%%r1,%[y])\n\t"
"vst %%v22, 160(%%r1,%[y])\n\t"
"vst %%v23, 176(%%r1,%[y])\n\t"
/* group 4: bytes 192..255 */
"vl %%v24, 192(%%r1,%[x])\n\t"
"vl %%v25, 208(%%r1,%[x])\n\t"
"vl %%v26, 224(%%r1,%[x])\n\t"
"vl %%v27, 240(%%r1,%[x])\n\t"
"vl %%v16, 192(%%r1,%[y])\n\t"
"vl %%v17, 208(%%r1,%[y])\n\t"
"vl %%v18, 224(%%r1,%[y])\n\t"
"vl %%v19, 240(%%r1,%[y])\n\t"
"vfmsb %%v28,%%v24,%%v0\n\t"
"vfmsb %%v29,%%v25,%%v0\n\t"
"vfmsb %%v20,%%v24,%%v1\n\t" /* yn=x*s */
"vfmsb %%v21,%%v25,%%v1\n\t" /* yn=x*s */
"vfmsb %%v30,%%v26,%%v0\n\t"
"vfmsb %%v22,%%v26,%%v1\n\t" /* yn=x*s */
"vfmsb %%v31,%%v27,%%v0\n\t"
"vfmsb %%v23,%%v27,%%v1\n\t" /* yn=x*s */
/* 2nd parts */
"vfmasb %%v28,%%v16,%%v1,%%v28\n\t"
"vfmssb %%v20,%%v16,%%v0,%%v20\n\t" /* yn=y*c-yn */
"vfmasb %%v29,%%v17,%%v1,%%v29\n\t"
"vfmssb %%v21,%%v17,%%v0,%%v21\n\t" /* yn=y*c-yn */
"vfmasb %%v30,%%v18,%%v1,%%v30\n\t"
"vfmssb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */
"vfmasb %%v31,%%v19,%%v1,%%v31\n\t"
"vfmssb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */
"vst %%v28, 192(%%r1,%[x])\n\t"
"vst %%v29, 208(%%r1,%[x])\n\t"
"vst %%v30, 224(%%r1,%[x])\n\t"
"vst %%v31, 240(%%r1,%[x])\n\t"
"vst %%v20, 192(%%r1,%[y])\n\t"
"vst %%v21, 208(%%r1,%[y])\n\t"
"vst %%v22, 224(%%r1,%[y])\n\t"
"vst %%v23, 240(%%r1,%[y])\n\t"
"agfi %%r1,256\n\t"
"brctg %[n],0b"
: "+m"(*(FLOAT (*)[n]) x), "+m"(*(FLOAT (*)[n]) y),[n] "+&r"(n)
: [x] "a"(x),[y] "a"(y),[c] "m"(*c),[s] "m"(*s)
: "cc", "r1", "v0", "v1", "v16", "v17", "v18", "v19", "v20", "v21",
"v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30",
"v31");
/* NOTE(review): the lines below repeat the instruction sequence above;
   presumably this is the post-commit side of the diff hunk. */
"vlrepf %%v1,%[s]\n\t"
"srlg %[n],%[n],6\n\t"
"xgr %%r1,%%r1\n\t"
"0:\n\t"
"pfd 2, 1024(%%r1,%[x])\n\t"
"pfd 2, 1024(%%r1,%[y])\n\t"
"vl %%v24, 0(%%r1,%[x])\n\t"
"vl %%v25, 16(%%r1,%[x])\n\t"
"vl %%v26, 32(%%r1,%[x])\n\t"
"vl %%v27, 48(%%r1,%[x])\n\t"
"vl %%v16, 0(%%r1,%[y])\n\t"
"vl %%v17, 16(%%r1,%[y])\n\t"
"vl %%v18, 32(%%r1,%[y])\n\t"
"vl %%v19, 48(%%r1,%[y])\n\t"
"vfmsb %%v28,%%v24,%%v0\n\t"
"vfmsb %%v29,%%v25,%%v0\n\t"
"vfmsb %%v20,%%v24,%%v1\n\t" /* yn=x*s */
"vfmsb %%v21,%%v25,%%v1\n\t" /* yn=x*s */
"vfmsb %%v30,%%v26,%%v0\n\t"
"vfmsb %%v22,%%v26,%%v1\n\t" /* yn=x*s */
"vfmsb %%v31,%%v27,%%v0\n\t"
"vfmsb %%v23,%%v27,%%v1\n\t" /* yn=x*s */
/* 2nd parts */
"vfmasb %%v28,%%v16,%%v1,%%v28\n\t"
"vfmssb %%v20,%%v16,%%v0,%%v20\n\t" /* yn=y*c-yn */
"vfmasb %%v29,%%v17,%%v1,%%v29\n\t"
"vfmssb %%v21,%%v17,%%v0,%%v21\n\t" /* yn=y*c-yn */
"vfmasb %%v30,%%v18,%%v1,%%v30\n\t"
"vfmssb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */
"vfmasb %%v31,%%v19,%%v1,%%v31\n\t"
"vfmssb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */
"vst %%v28, 0(%%r1,%[x])\n\t"
"vst %%v29, 16(%%r1,%[x])\n\t"
"vst %%v30, 32(%%r1,%[x])\n\t"
"vst %%v31, 48(%%r1,%[x])\n\t"
"vst %%v20, 0(%%r1,%[y])\n\t"
"vst %%v21, 16(%%r1,%[y])\n\t"
"vst %%v22, 32(%%r1,%[y])\n\t"
"vst %%v23, 48(%%r1,%[y])\n\t"
"vl %%v24, 64(%%r1,%[x])\n\t"
"vl %%v25, 80(%%r1,%[x])\n\t"
"vl %%v26, 96(%%r1,%[x])\n\t"
"vl %%v27, 112(%%r1,%[x])\n\t"
"vl %%v16, 64(%%r1,%[y])\n\t"
"vl %%v17, 80(%%r1,%[y])\n\t"
"vl %%v18, 96(%%r1,%[y])\n\t"
"vl %%v19, 112(%%r1,%[y])\n\t"
"vfmsb %%v28,%%v24,%%v0\n\t"
"vfmsb %%v29,%%v25,%%v0\n\t"
"vfmsb %%v20,%%v24,%%v1\n\t" /* yn=x*s */
"vfmsb %%v21,%%v25,%%v1\n\t" /* yn=x*s */
"vfmsb %%v30,%%v26,%%v0\n\t"
"vfmsb %%v22,%%v26,%%v1\n\t" /* yn=x*s */
"vfmsb %%v31,%%v27,%%v0\n\t"
"vfmsb %%v23,%%v27,%%v1\n\t" /* yn=x*s */
/* 2nd parts */
"vfmasb %%v28,%%v16,%%v1,%%v28\n\t"
"vfmssb %%v20,%%v16,%%v0,%%v20\n\t" /* yn=y*c-yn */
"vfmasb %%v29,%%v17,%%v1,%%v29\n\t"
"vfmssb %%v21,%%v17,%%v0,%%v21\n\t" /* yn=y*c-yn */
"vfmasb %%v30,%%v18,%%v1,%%v30\n\t"
"vfmssb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */
"vfmasb %%v31,%%v19,%%v1,%%v31\n\t"
"vfmssb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */
"vst %%v28, 64(%%r1,%[x])\n\t"
"vst %%v29, 80(%%r1,%[x])\n\t"
"vst %%v30, 96(%%r1,%[x])\n\t"
"vst %%v31, 112(%%r1,%[x])\n\t"
"vst %%v20, 64(%%r1,%[y])\n\t"
"vst %%v21, 80(%%r1,%[y])\n\t"
"vst %%v22, 96(%%r1,%[y])\n\t"
"vst %%v23, 112(%%r1,%[y])\n\t"
"vl %%v24, 128(%%r1,%[x])\n\t"
"vl %%v25, 144(%%r1,%[x])\n\t"
"vl %%v26, 160(%%r1,%[x])\n\t"
"vl %%v27, 176(%%r1,%[x])\n\t"
"vl %%v16, 128(%%r1,%[y])\n\t"
"vl %%v17, 144(%%r1,%[y])\n\t"
"vl %%v18, 160(%%r1,%[y])\n\t"
"vl %%v19, 176(%%r1,%[y])\n\t"
"vfmsb %%v28,%%v24,%%v0\n\t"
"vfmsb %%v29,%%v25,%%v0\n\t"
"vfmsb %%v20,%%v24,%%v1\n\t" /* yn=x*s */
"vfmsb %%v21,%%v25,%%v1\n\t" /* yn=x*s */
"vfmsb %%v30,%%v26,%%v0\n\t"
"vfmsb %%v22,%%v26,%%v1\n\t" /* yn=x*s */
"vfmsb %%v31,%%v27,%%v0\n\t"
"vfmsb %%v23,%%v27,%%v1\n\t" /* yn=x*s */
/* 2nd parts */
"vfmasb %%v28,%%v16,%%v1,%%v28\n\t"
"vfmssb %%v20,%%v16,%%v0,%%v20\n\t" /* yn=y*c-yn */
"vfmasb %%v29,%%v17,%%v1,%%v29\n\t"
"vfmssb %%v21,%%v17,%%v0,%%v21\n\t" /* yn=y*c-yn */
"vfmasb %%v30,%%v18,%%v1,%%v30\n\t"
"vfmssb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */
"vfmasb %%v31,%%v19,%%v1,%%v31\n\t"
"vfmssb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */
"vst %%v28, 128(%%r1,%[x])\n\t"
"vst %%v29, 144(%%r1,%[x])\n\t"
"vst %%v30, 160(%%r1,%[x])\n\t"
"vst %%v31, 176(%%r1,%[x])\n\t"
"vst %%v20, 128(%%r1,%[y])\n\t"
"vst %%v21, 144(%%r1,%[y])\n\t"
"vst %%v22, 160(%%r1,%[y])\n\t"
"vst %%v23, 176(%%r1,%[y])\n\t"
"vl %%v24, 192(%%r1,%[x])\n\t"
"vl %%v25, 208(%%r1,%[x])\n\t"
"vl %%v26, 224(%%r1,%[x])\n\t"
"vl %%v27, 240(%%r1,%[x])\n\t"
"vl %%v16, 192(%%r1,%[y])\n\t"
"vl %%v17, 208(%%r1,%[y])\n\t"
"vl %%v18, 224(%%r1,%[y])\n\t"
"vl %%v19, 240(%%r1,%[y])\n\t"
"vfmsb %%v28,%%v24,%%v0\n\t"
"vfmsb %%v29,%%v25,%%v0\n\t"
"vfmsb %%v20,%%v24,%%v1\n\t" /* yn=x*s */
"vfmsb %%v21,%%v25,%%v1\n\t" /* yn=x*s */
"vfmsb %%v30,%%v26,%%v0\n\t"
"vfmsb %%v22,%%v26,%%v1\n\t" /* yn=x*s */
"vfmsb %%v31,%%v27,%%v0\n\t"
"vfmsb %%v23,%%v27,%%v1\n\t" /* yn=x*s */
/* 2nd parts */
"vfmasb %%v28,%%v16,%%v1,%%v28\n\t"
"vfmssb %%v20,%%v16,%%v0,%%v20\n\t" /* yn=y*c-yn */
"vfmasb %%v29,%%v17,%%v1,%%v29\n\t"
"vfmssb %%v21,%%v17,%%v0,%%v21\n\t" /* yn=y*c-yn */
"vfmasb %%v30,%%v18,%%v1,%%v30\n\t"
"vfmssb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */
"vfmasb %%v31,%%v19,%%v1,%%v31\n\t"
"vfmssb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */
"vst %%v28, 192(%%r1,%[x])\n\t"
"vst %%v29, 208(%%r1,%[x])\n\t"
"vst %%v30, 224(%%r1,%[x])\n\t"
"vst %%v31, 240(%%r1,%[x])\n\t"
"vst %%v20, 192(%%r1,%[y])\n\t"
"vst %%v21, 208(%%r1,%[y])\n\t"
"vst %%v22, 224(%%r1,%[y])\n\t"
"vst %%v23, 240(%%r1,%[y])\n\t"
"agfi %%r1,256\n\t"
"brctg %[n],0b"
: "+m"(*(struct { FLOAT x[n]; } *) x), "+m"(*(struct { FLOAT x[n]; } *) y),
[n] "+&r"(n)
: [x] "a"(x),[y] "a"(y),[c] "Q"(*c),[s] "Q"(*s)
: "cc", "r1", "v0", "v1", "v16", "v17", "v18", "v19", "v20", "v21",
"v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30",
"v31");
}
int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y,

View File

@ -29,61 +29,61 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
/*
 * sscal_kernel_32 - scale a vector in place: x[i] = x[i] * da.
 *
 * n:  element count; shifted right by 5 for the pass count, so each pass
 *     covers 32 elements (128 bytes, eight 16-byte vectors).
 * da: scalar multiplier, replicated into every lane of v0 by vlrepf.
 * x:  vector that is loaded, multiplied and stored back.
 *
 * NOTE(review): the loop body is entered without any check on the pass
 * count, so the caller presumably guarantees n >= 32 - confirm.
 * NOTE(review): this span looks like a rendered diff hunk. The same
 * instruction sequence appears twice; only the operand lists differ
 * ("m" vs "Q" for da, array-pointer vs struct-wrapped memory operand).
 * Presumably the first copy is the pre-commit text and the second the
 * post-commit text - confirm against the real source file.
 */
static void sscal_kernel_32(BLASLONG n, FLOAT da, FLOAT *x) {
__asm__("vlrepf %%v0,%[da]\n\t"
"srlg %[n],%[n],5\n\t"
"xgr %%r1,%%r1\n\t"
/* r1 is the running byte offset; each vector is load / multiply / store */
"0:\n\t"
"pfd 2, 1024(%%r1,%[x])\n\t"
"vl %%v24,0(%%r1,%[x])\n\t"
"vfmsb %%v24,%%v24,%%v0\n\t"
"vst %%v24,0(%%r1,%[x])\n\t"
"vl %%v25,16(%%r1,%[x])\n\t"
"vfmsb %%v25,%%v25,%%v0\n\t"
"vst %%v25,16(%%r1,%[x])\n\t"
"vl %%v26,32(%%r1,%[x])\n\t"
"vfmsb %%v26,%%v26,%%v0\n\t"
"vst %%v26,32(%%r1,%[x])\n\t"
"vl %%v27,48(%%r1,%[x])\n\t"
"vfmsb %%v27,%%v27,%%v0\n\t"
"vst %%v27,48(%%r1,%[x])\n\t"
"vl %%v28,64(%%r1,%[x])\n\t"
"vfmsb %%v28,%%v28,%%v0\n\t"
"vst %%v28,64(%%r1,%[x])\n\t"
"vl %%v29,80(%%r1,%[x])\n\t"
"vfmsb %%v29,%%v29,%%v0\n\t"
"vst %%v29,80(%%r1,%[x])\n\t"
"vl %%v30,96(%%r1,%[x])\n\t"
"vfmsb %%v30,%%v30,%%v0\n\t"
"vst %%v30,96(%%r1,%[x])\n\t"
"vl %%v31,112(%%r1,%[x])\n\t"
"vfmsb %%v31,%%v31,%%v0\n\t"
"vst %%v31,112(%%r1,%[x])\n\t"
"agfi %%r1,128\n\t"
"brctg %[n],0b"
: "+m"(*(FLOAT (*)[n]) x),[n] "+&r"(n)
: [x] "a"(x),[da] "m"(da)
: "cc", "r1", "v0", "v24", "v25", "v26", "v27", "v28", "v29", "v30",
"v31");
/* NOTE(review): the lines below repeat the instruction sequence above;
   presumably this is the post-commit side of the diff hunk. */
"srlg %[n],%[n],5\n\t"
"xgr %%r1,%%r1\n\t"
"0:\n\t"
"pfd 2, 1024(%%r1,%[x])\n\t"
"vl %%v24,0(%%r1,%[x])\n\t"
"vfmsb %%v24,%%v24,%%v0\n\t"
"vst %%v24,0(%%r1,%[x])\n\t"
"vl %%v25,16(%%r1,%[x])\n\t"
"vfmsb %%v25,%%v25,%%v0\n\t"
"vst %%v25,16(%%r1,%[x])\n\t"
"vl %%v26,32(%%r1,%[x])\n\t"
"vfmsb %%v26,%%v26,%%v0\n\t"
"vst %%v26,32(%%r1,%[x])\n\t"
"vl %%v27,48(%%r1,%[x])\n\t"
"vfmsb %%v27,%%v27,%%v0\n\t"
"vst %%v27,48(%%r1,%[x])\n\t"
"vl %%v28,64(%%r1,%[x])\n\t"
"vfmsb %%v28,%%v28,%%v0\n\t"
"vst %%v28,64(%%r1,%[x])\n\t"
"vl %%v29,80(%%r1,%[x])\n\t"
"vfmsb %%v29,%%v29,%%v0\n\t"
"vst %%v29,80(%%r1,%[x])\n\t"
"vl %%v30,96(%%r1,%[x])\n\t"
"vfmsb %%v30,%%v30,%%v0\n\t"
"vst %%v30,96(%%r1,%[x])\n\t"
"vl %%v31,112(%%r1,%[x])\n\t"
"vfmsb %%v31,%%v31,%%v0\n\t"
"vst %%v31,112(%%r1,%[x])\n\t"
"agfi %%r1,128\n\t"
"brctg %[n],0b"
: "+m"(*(struct { FLOAT x[n]; } *) x),[n] "+&r"(n)
: [x] "a"(x),[da] "Q"(da)
: "cc", "r1", "v0", "v24", "v25", "v26", "v27", "v28", "v29", "v30",
"v31");
}
/*
 * sscal_kernel_32_zero - overwrite x with zeros.
 *
 * n: element count; shifted right by 5 for the pass count, so each pass
 *    stores 128 bytes (eight 16-byte vectors, i.e. 32 elements).
 * x: vector that is only written (declared as an "=m" output).
 *
 * NOTE(review): the loop body is entered without any check on the pass
 * count, so the caller presumably guarantees n >= 32 - confirm.
 * NOTE(review): this span looks like a rendered diff hunk. The same
 * instruction sequence appears twice; only the memory-operand cast
 * differs. Presumably the first copy is the pre-commit text and the
 * second the post-commit text - confirm against the real source file.
 */
static void sscal_kernel_32_zero(BLASLONG n, FLOAT *x) {
__asm__("vzero %%v0\n\t"
"srlg %[n],%[n],5\n\t"
"xgr %%r1,%%r1\n\t"
"0:\n\t"
"pfd 2, 1024(%%r1,%[x])\n\t"
"vst %%v0,0(%%r1,%[x])\n\t"
"vst %%v0,16(%%r1,%[x])\n\t"
"vst %%v0,32(%%r1,%[x])\n\t"
"vst %%v0,48(%%r1,%[x])\n\t"
"vst %%v0,64(%%r1,%[x])\n\t"
"vst %%v0,80(%%r1,%[x])\n\t"
"vst %%v0,96(%%r1,%[x])\n\t"
"vst %%v0,112(%%r1,%[x])\n\t"
"agfi %%r1,128\n\t"
"brctg %[n],0b"
: "=m"(*(FLOAT (*)[n]) x),[n] "+&r"(n)
: [x] "a"(x)
: "cc", "r1", "v0");
/* NOTE(review): the lines below repeat the instruction sequence above;
   presumably this is the post-commit side of the diff hunk. */
"srlg %[n],%[n],5\n\t"
"xgr %%r1,%%r1\n\t"
"0:\n\t"
"pfd 2, 1024(%%r1,%[x])\n\t"
"vst %%v0,0(%%r1,%[x])\n\t"
"vst %%v0,16(%%r1,%[x])\n\t"
"vst %%v0,32(%%r1,%[x])\n\t"
"vst %%v0,48(%%r1,%[x])\n\t"
"vst %%v0,64(%%r1,%[x])\n\t"
"vst %%v0,80(%%r1,%[x])\n\t"
"vst %%v0,96(%%r1,%[x])\n\t"
"vst %%v0,112(%%r1,%[x])\n\t"
"agfi %%r1,128\n\t"
"brctg %[n],0b"
: "=m"(*(struct { FLOAT x[n]; } *) x),[n] "+&r"(n)
: [x] "a"(x)
: "cc", "r1", "v0");
}
int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x,

View File

@ -29,81 +29,82 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
/*
 * sswap_kernel_64 - exchange the contents of x and y.
 *
 * n:    element count; shifted right by 6 for the pass count, so each pass
 *       swaps 256 bytes (64 elements) of the two vectors.
 * x, y: vectors that are both read and written.
 *
 * Per pass, all 256 bytes of x are first held in v16-v31; y is then copied
 * into x through v0-v7 in two 128-byte halves, and finally the saved x
 * data is stored to y.
 *
 * NOTE(review): the loop body is entered without any check on the pass
 * count, so the caller presumably guarantees n >= 64 - confirm.
 * NOTE(review): this span looks like a rendered diff hunk. The same
 * instruction sequence appears twice; only the memory-operand casts
 * differ. Presumably the first copy is the pre-commit text and the second
 * the post-commit text - confirm against the real source file.
 */
static void sswap_kernel_64(BLASLONG n, FLOAT *x, FLOAT *y) {
__asm__("srlg %[n],%[n],6\n\t"
"xgr %%r1,%%r1\n\t"
"0:\n\t"
"pfd 2, 1024(%%r1,%[x])\n\t"
"pfd 2, 1024(%%r1,%[y])\n\t"
/* save this pass's 256 bytes of x in v16-v31 */
"vl %%v16, 0(%%r1,%[x])\n\t"
"vl %%v17, 16(%%r1,%[x])\n\t"
"vl %%v18, 32(%%r1,%[x])\n\t"
"vl %%v19, 48(%%r1,%[x])\n\t"
"vl %%v20, 64(%%r1,%[x])\n\t"
"vl %%v21, 80(%%r1,%[x])\n\t"
"vl %%v22, 96(%%r1,%[x])\n\t"
"vl %%v23, 112(%%r1,%[x])\n\t"
"vl %%v24, 128(%%r1,%[x])\n\t"
"vl %%v25, 144(%%r1,%[x])\n\t"
"vl %%v26, 160(%%r1,%[x])\n\t"
"vl %%v27, 176(%%r1,%[x])\n\t"
"vl %%v28, 192(%%r1,%[x])\n\t"
"vl %%v29, 208(%%r1,%[x])\n\t"
"vl %%v30, 224(%%r1,%[x])\n\t"
"vl %%v31, 240(%%r1,%[x])\n\t"
/* first 128 bytes of y -> x */
"vl %%v0, 0(%%r1,%[y])\n\t"
"vl %%v1, 16(%%r1,%[y])\n\t"
"vl %%v2, 32(%%r1,%[y])\n\t"
"vl %%v3, 48(%%r1,%[y])\n\t"
"vl %%v4, 64(%%r1,%[y])\n\t"
"vl %%v5, 80(%%r1,%[y])\n\t"
"vl %%v6, 96(%%r1,%[y])\n\t"
"vl %%v7, 112(%%r1,%[y])\n\t"
"vst %%v0, 0(%%r1,%[x])\n\t"
"vst %%v1, 16(%%r1,%[x])\n\t"
"vst %%v2, 32(%%r1,%[x])\n\t"
"vst %%v3, 48(%%r1,%[x])\n\t"
"vst %%v4, 64(%%r1,%[x])\n\t"
"vst %%v5, 80(%%r1,%[x])\n\t"
"vst %%v6, 96(%%r1,%[x])\n\t"
"vst %%v7, 112(%%r1,%[x])\n\t"
/* second 128 bytes of y -> x */
"vl %%v0, 128(%%r1,%[y])\n\t"
"vl %%v1, 144(%%r1,%[y])\n\t"
"vl %%v2, 160(%%r1,%[y])\n\t"
"vl %%v3, 176(%%r1,%[y])\n\t"
"vl %%v4, 192(%%r1,%[y])\n\t"
"vl %%v5, 208(%%r1,%[y])\n\t"
"vl %%v6, 224(%%r1,%[y])\n\t"
"vl %%v7, 240(%%r1,%[y])\n\t"
"vst %%v0, 128(%%r1,%[x])\n\t"
"vst %%v1, 144(%%r1,%[x])\n\t"
"vst %%v2, 160(%%r1,%[x])\n\t"
"vst %%v3, 176(%%r1,%[x])\n\t"
"vst %%v4, 192(%%r1,%[x])\n\t"
"vst %%v5, 208(%%r1,%[x])\n\t"
"vst %%v6, 224(%%r1,%[x])\n\t"
"vst %%v7, 240(%%r1,%[x])\n\t"
/* saved x data -> y */
"vst %%v16, 0(%%r1,%[y])\n\t"
"vst %%v17, 16(%%r1,%[y])\n\t"
"vst %%v18, 32(%%r1,%[y])\n\t"
"vst %%v19, 48(%%r1,%[y])\n\t"
"vst %%v20, 64(%%r1,%[y])\n\t"
"vst %%v21, 80(%%r1,%[y])\n\t"
"vst %%v22, 96(%%r1,%[y])\n\t"
"vst %%v23, 112(%%r1,%[y])\n\t"
"vst %%v24, 128(%%r1,%[y])\n\t"
"vst %%v25, 144(%%r1,%[y])\n\t"
"vst %%v26, 160(%%r1,%[y])\n\t"
"vst %%v27, 176(%%r1,%[y])\n\t"
"vst %%v28, 192(%%r1,%[y])\n\t"
"vst %%v29, 208(%%r1,%[y])\n\t"
"vst %%v30, 224(%%r1,%[y])\n\t"
"vst %%v31, 240(%%r1,%[y])\n\t"
"agfi %%r1,256\n\t"
"brctg %[n],0b"
: "+m"(*(FLOAT (*)[n]) x), "+m"(*(FLOAT (*)[n]) y),[n] "+&r"(n)
: [x] "a"(x),[y] "a"(y)
: "cc", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16",
"v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26",
"v27", "v28", "v29", "v30", "v31");
/* NOTE(review): the lines below repeat the instruction sequence above;
   presumably this is the post-commit side of the diff hunk. */
"xgr %%r1,%%r1\n\t"
"0:\n\t"
"pfd 2, 1024(%%r1,%[x])\n\t"
"pfd 2, 1024(%%r1,%[y])\n\t"
"vl %%v16, 0(%%r1,%[x])\n\t"
"vl %%v17, 16(%%r1,%[x])\n\t"
"vl %%v18, 32(%%r1,%[x])\n\t"
"vl %%v19, 48(%%r1,%[x])\n\t"
"vl %%v20, 64(%%r1,%[x])\n\t"
"vl %%v21, 80(%%r1,%[x])\n\t"
"vl %%v22, 96(%%r1,%[x])\n\t"
"vl %%v23, 112(%%r1,%[x])\n\t"
"vl %%v24, 128(%%r1,%[x])\n\t"
"vl %%v25, 144(%%r1,%[x])\n\t"
"vl %%v26, 160(%%r1,%[x])\n\t"
"vl %%v27, 176(%%r1,%[x])\n\t"
"vl %%v28, 192(%%r1,%[x])\n\t"
"vl %%v29, 208(%%r1,%[x])\n\t"
"vl %%v30, 224(%%r1,%[x])\n\t"
"vl %%v31, 240(%%r1,%[x])\n\t"
"vl %%v0, 0(%%r1,%[y])\n\t"
"vl %%v1, 16(%%r1,%[y])\n\t"
"vl %%v2, 32(%%r1,%[y])\n\t"
"vl %%v3, 48(%%r1,%[y])\n\t"
"vl %%v4, 64(%%r1,%[y])\n\t"
"vl %%v5, 80(%%r1,%[y])\n\t"
"vl %%v6, 96(%%r1,%[y])\n\t"
"vl %%v7, 112(%%r1,%[y])\n\t"
"vst %%v0, 0(%%r1,%[x])\n\t"
"vst %%v1, 16(%%r1,%[x])\n\t"
"vst %%v2, 32(%%r1,%[x])\n\t"
"vst %%v3, 48(%%r1,%[x])\n\t"
"vst %%v4, 64(%%r1,%[x])\n\t"
"vst %%v5, 80(%%r1,%[x])\n\t"
"vst %%v6, 96(%%r1,%[x])\n\t"
"vst %%v7, 112(%%r1,%[x])\n\t"
"vl %%v0, 128(%%r1,%[y])\n\t"
"vl %%v1, 144(%%r1,%[y])\n\t"
"vl %%v2, 160(%%r1,%[y])\n\t"
"vl %%v3, 176(%%r1,%[y])\n\t"
"vl %%v4, 192(%%r1,%[y])\n\t"
"vl %%v5, 208(%%r1,%[y])\n\t"
"vl %%v6, 224(%%r1,%[y])\n\t"
"vl %%v7, 240(%%r1,%[y])\n\t"
"vst %%v0, 128(%%r1,%[x])\n\t"
"vst %%v1, 144(%%r1,%[x])\n\t"
"vst %%v2, 160(%%r1,%[x])\n\t"
"vst %%v3, 176(%%r1,%[x])\n\t"
"vst %%v4, 192(%%r1,%[x])\n\t"
"vst %%v5, 208(%%r1,%[x])\n\t"
"vst %%v6, 224(%%r1,%[x])\n\t"
"vst %%v7, 240(%%r1,%[x])\n\t"
"vst %%v16, 0(%%r1,%[y])\n\t"
"vst %%v17, 16(%%r1,%[y])\n\t"
"vst %%v18, 32(%%r1,%[y])\n\t"
"vst %%v19, 48(%%r1,%[y])\n\t"
"vst %%v20, 64(%%r1,%[y])\n\t"
"vst %%v21, 80(%%r1,%[y])\n\t"
"vst %%v22, 96(%%r1,%[y])\n\t"
"vst %%v23, 112(%%r1,%[y])\n\t"
"vst %%v24, 128(%%r1,%[y])\n\t"
"vst %%v25, 144(%%r1,%[y])\n\t"
"vst %%v26, 160(%%r1,%[y])\n\t"
"vst %%v27, 176(%%r1,%[y])\n\t"
"vst %%v28, 192(%%r1,%[y])\n\t"
"vst %%v29, 208(%%r1,%[y])\n\t"
"vst %%v30, 224(%%r1,%[y])\n\t"
"vst %%v31, 240(%%r1,%[y])\n\t"
"agfi %%r1,256\n\t"
"brctg %[n],0b"
: "+m"(*(struct { FLOAT x[n]; } *) x), "+m"(*(struct { FLOAT x[n]; } *) y),
[n] "+&r"(n)
: [x] "a"(x),[y] "a"(y)
: "cc", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16",
"v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26",
"v27", "v28", "v29", "v30", "v31");
}
int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT *x,

View File

/*
 * zamax_kernel_16 - return the largest value of |a| + |b| over the counted
 * elements of x, where a and b are the two consecutive 8-byte values that
 * make up one element (presumably the real and imaginary part of a
 * double-precision complex number - confirm against the caller).
 *
 * n: element count; shifted right by 4 for the pass count, so each pass
 *    covers 16 elements (256 bytes). The memory operand spans n * 2 FLOATs.
 * x: input vector, only read.
 *
 * vleg gathers the first value of two neighbouring elements into one
 * register and the second values into another, so that vflpdb (absolute
 * value) followed by vfadb (add) yields |a| + |b| for two elements per
 * register. v0 is seeded from the first two elements and accumulates the
 * lane-wise maximum; its two lanes are combined after the loop.
 *
 * NOTE(review): the loop body is entered without any check on the pass
 * count, so the caller presumably guarantees n >= 16 - confirm.
 * NOTE(review): this span looks like a rendered diff hunk (the signature
 * sits on the hunk-header line and the asm text appears twice, differing
 * only in the memory-operand cast) - confirm against the real source file.
 */
@ -34,89 +34,89 @@ static FLOAT zamax_kernel_16(BLASLONG n, FLOAT *x) {
FLOAT amax;
__asm__("vleg %%v0,0(%[x]),0\n\t"
"vleg %%v16,8(%[x]),0\n\t"
"vleg %%v0,16(%[x]),1\n\t"
"vleg %%v16,24(%[x]),1\n\t"
"vflpdb %%v0,%%v0\n\t"
"vflpdb %%v16,%%v16\n\t"
"vfadb %%v0,%%v0,%%v16\n\t"
"srlg %[n],%[n],4\n\t"
"xgr %%r1,%%r1\n\t"
"0:\n\t"
"pfd 1, 1024(%%r1,%[x])\n\t"
/* even registers collect the first value of each element, odd the second */
"vleg %%v16,0(%%r1,%[x]),0\n\t"
"vleg %%v17,8(%%r1,%[x]),0\n\t"
"vleg %%v16,16(%%r1,%[x]),1\n\t"
"vleg %%v17,24(%%r1,%[x]),1\n\t"
"vleg %%v18,32(%%r1,%[x]),0\n\t"
"vleg %%v19,40(%%r1,%[x]),0\n\t"
"vleg %%v18,48(%%r1,%[x]),1\n\t"
"vleg %%v19,56(%%r1,%[x]),1\n\t"
"vleg %%v20,64(%%r1,%[x]),0\n\t"
"vleg %%v21,72(%%r1,%[x]),0\n\t"
"vleg %%v20,80(%%r1,%[x]),1\n\t"
"vleg %%v21,88(%%r1,%[x]),1\n\t"
"vleg %%v22,96(%%r1,%[x]),0\n\t"
"vleg %%v23,104(%%r1,%[x]),0\n\t"
"vleg %%v22,112(%%r1,%[x]),1\n\t"
"vleg %%v23,120(%%r1,%[x]),1\n\t"
"vleg %%v24,128(%%r1,%[x]),0\n\t"
"vleg %%v25,136(%%r1,%[x]),0\n\t"
"vleg %%v24,144(%%r1,%[x]),1\n\t"
"vleg %%v25,152(%%r1,%[x]),1\n\t"
"vleg %%v26,160(%%r1,%[x]),0\n\t"
"vleg %%v27,168(%%r1,%[x]),0\n\t"
"vleg %%v26,176(%%r1,%[x]),1\n\t"
"vleg %%v27,184(%%r1,%[x]),1\n\t"
"vleg %%v28,192(%%r1,%[x]),0\n\t"
"vleg %%v29,200(%%r1,%[x]),0\n\t"
"vleg %%v28,208(%%r1,%[x]),1\n\t"
"vleg %%v29,216(%%r1,%[x]),1\n\t"
"vleg %%v30,224(%%r1,%[x]),0\n\t"
"vleg %%v31,232(%%r1,%[x]),0\n\t"
"vleg %%v30,240(%%r1,%[x]),1\n\t"
"vleg %%v31,248(%%r1,%[x]),1\n\t"
"vflpdb %%v16,%%v16\n\t"
"vflpdb %%v17,%%v17\n\t"
"vflpdb %%v18,%%v18\n\t"
"vflpdb %%v19,%%v19\n\t"
"vflpdb %%v20,%%v20\n\t"
"vflpdb %%v21,%%v21\n\t"
"vflpdb %%v22,%%v22\n\t"
"vflpdb %%v23,%%v23\n\t"
"vflpdb %%v24,%%v24\n\t"
"vflpdb %%v25,%%v25\n\t"
"vflpdb %%v26,%%v26\n\t"
"vflpdb %%v27,%%v27\n\t"
"vflpdb %%v28,%%v28\n\t"
"vflpdb %%v29,%%v29\n\t"
"vflpdb %%v30,%%v30\n\t"
"vflpdb %%v31,%%v31\n\t"
"vfadb %%v16,%%v16,%%v17\n\t"
"vfadb %%v18,%%v18,%%v19\n\t"
"vfadb %%v20,%%v20,%%v21\n\t"
"vfadb %%v22,%%v22,%%v23\n\t"
"vfadb %%v24,%%v24,%%v25\n\t"
"vfadb %%v26,%%v26,%%v27\n\t"
"vfadb %%v28,%%v28,%%v29\n\t"
"vfadb %%v30,%%v30,%%v31\n\t"
/* pairwise reduction of the eight sums down to v16, then into v0 */
"vfmaxdb %%v16,%%v16,%%v24,0\n\t"
"vfmaxdb %%v18,%%v18,%%v26,0\n\t"
"vfmaxdb %%v20,%%v20,%%v28,0\n\t"
"vfmaxdb %%v22,%%v22,%%v30,0\n\t"
"vfmaxdb %%v16,%%v16,%%v20,0\n\t"
"vfmaxdb %%v18,%%v18,%%v22,0\n\t"
"vfmaxdb %%v16,%%v16,%%v18,0\n\t"
"vfmaxdb %%v0,%%v0,%%v16,0\n\t"
"agfi %%r1, 256\n\t"
"brctg %[n], 0b\n\t"
"vrepg %%v16,%%v0,1\n\t"
"wfmaxdb %%v0,%%v0,%%v16,0\n\t"
"ldr %[amax],%%f0"
: [amax] "=f"(amax),[n] "+&r"(n)
: "m"(*(const FLOAT (*)[n * 2]) x),[x] "a"(x)
: "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22",
"v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31");
/* NOTE(review): the lines below repeat the instruction sequence above;
   presumably this is the post-commit side of the diff hunk. */
"vleg %%v16,8(%[x]),0\n\t"
"vleg %%v0,16(%[x]),1\n\t"
"vleg %%v16,24(%[x]),1\n\t"
"vflpdb %%v0,%%v0\n\t"
"vflpdb %%v16,%%v16\n\t"
"vfadb %%v0,%%v0,%%v16\n\t"
"srlg %[n],%[n],4\n\t"
"xgr %%r1,%%r1\n\t"
"0:\n\t"
"pfd 1, 1024(%%r1,%[x])\n\t"
"vleg %%v16,0(%%r1,%[x]),0\n\t"
"vleg %%v17,8(%%r1,%[x]),0\n\t"
"vleg %%v16,16(%%r1,%[x]),1\n\t"
"vleg %%v17,24(%%r1,%[x]),1\n\t"
"vleg %%v18,32(%%r1,%[x]),0\n\t"
"vleg %%v19,40(%%r1,%[x]),0\n\t"
"vleg %%v18,48(%%r1,%[x]),1\n\t"
"vleg %%v19,56(%%r1,%[x]),1\n\t"
"vleg %%v20,64(%%r1,%[x]),0\n\t"
"vleg %%v21,72(%%r1,%[x]),0\n\t"
"vleg %%v20,80(%%r1,%[x]),1\n\t"
"vleg %%v21,88(%%r1,%[x]),1\n\t"
"vleg %%v22,96(%%r1,%[x]),0\n\t"
"vleg %%v23,104(%%r1,%[x]),0\n\t"
"vleg %%v22,112(%%r1,%[x]),1\n\t"
"vleg %%v23,120(%%r1,%[x]),1\n\t"
"vleg %%v24,128(%%r1,%[x]),0\n\t"
"vleg %%v25,136(%%r1,%[x]),0\n\t"
"vleg %%v24,144(%%r1,%[x]),1\n\t"
"vleg %%v25,152(%%r1,%[x]),1\n\t"
"vleg %%v26,160(%%r1,%[x]),0\n\t"
"vleg %%v27,168(%%r1,%[x]),0\n\t"
"vleg %%v26,176(%%r1,%[x]),1\n\t"
"vleg %%v27,184(%%r1,%[x]),1\n\t"
"vleg %%v28,192(%%r1,%[x]),0\n\t"
"vleg %%v29,200(%%r1,%[x]),0\n\t"
"vleg %%v28,208(%%r1,%[x]),1\n\t"
"vleg %%v29,216(%%r1,%[x]),1\n\t"
"vleg %%v30,224(%%r1,%[x]),0\n\t"
"vleg %%v31,232(%%r1,%[x]),0\n\t"
"vleg %%v30,240(%%r1,%[x]),1\n\t"
"vleg %%v31,248(%%r1,%[x]),1\n\t"
"vflpdb %%v16,%%v16\n\t"
"vflpdb %%v17,%%v17\n\t"
"vflpdb %%v18,%%v18\n\t"
"vflpdb %%v19,%%v19\n\t"
"vflpdb %%v20,%%v20\n\t"
"vflpdb %%v21,%%v21\n\t"
"vflpdb %%v22,%%v22\n\t"
"vflpdb %%v23,%%v23\n\t"
"vflpdb %%v24,%%v24\n\t"
"vflpdb %%v25,%%v25\n\t"
"vflpdb %%v26,%%v26\n\t"
"vflpdb %%v27,%%v27\n\t"
"vflpdb %%v28,%%v28\n\t"
"vflpdb %%v29,%%v29\n\t"
"vflpdb %%v30,%%v30\n\t"
"vflpdb %%v31,%%v31\n\t"
"vfadb %%v16,%%v16,%%v17\n\t"
"vfadb %%v18,%%v18,%%v19\n\t"
"vfadb %%v20,%%v20,%%v21\n\t"
"vfadb %%v22,%%v22,%%v23\n\t"
"vfadb %%v24,%%v24,%%v25\n\t"
"vfadb %%v26,%%v26,%%v27\n\t"
"vfadb %%v28,%%v28,%%v29\n\t"
"vfadb %%v30,%%v30,%%v31\n\t"
"vfmaxdb %%v16,%%v16,%%v24,0\n\t"
"vfmaxdb %%v18,%%v18,%%v26,0\n\t"
"vfmaxdb %%v20,%%v20,%%v28,0\n\t"
"vfmaxdb %%v22,%%v22,%%v30,0\n\t"
"vfmaxdb %%v16,%%v16,%%v20,0\n\t"
"vfmaxdb %%v18,%%v18,%%v22,0\n\t"
"vfmaxdb %%v16,%%v16,%%v18,0\n\t"
"vfmaxdb %%v0,%%v0,%%v16,0\n\t"
"agfi %%r1, 256\n\t"
"brctg %[n], 0b\n\t"
"vrepg %%v16,%%v0,1\n\t"
"wfmaxdb %%v0,%%v0,%%v16,0\n\t"
"ldr %[amax],%%f0"
: [amax] "=f"(amax),[n] "+&r"(n)
: "m"(*(const struct { FLOAT x[n * 2]; } *) x),[x] "a"(x)
: "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22",
"v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31");
return amax;
}

View File

@ -34,98 +34,98 @@ static FLOAT zamax_kernel_16(BLASLONG n, FLOAT *x) {
/*
 * zamax_kernel_16: returns the largest value of |x[2k]| + |x[2k+1]| over the
 * FLOAT pairs it visits (presumably the real and imaginary parts of
 * interleaved complex doubles -- confirm against the caller).
 *
 *   n  element count; shifted right by 4 to form the loop count, and each
 *      iteration consumes 256 bytes of x.
 *   x  base address of the input; not written by the asm.
 *
 * NOTE(review): this text is a rendered diff hunk without +/- markers. The
 * body below appears twice: the first copy looks like the pre-change lines
 * and the second copy like the post-change lines. The instruction lines are
 * the same in both copies; only the memory input operand differs.
 * NOTE(review): brctg decrements before it tests, so the loop body runs at
 * least once; presumably callers pass n as a non-zero multiple of 16 --
 * confirm.
 */
FLOAT amax;
__asm__("vleg %%v0,0(%[x]),0\n\t"
/* ---- first copy of the changed lines (appears to be the pre-change text) ---- */
"vleg %%v16,8(%[x]),0\n\t"
"vleg %%v0,16(%[x]),1\n\t"
"vleg %%v16,24(%[x]),1\n\t"
"vflpdb %%v0,%%v0\n\t"
"vflpdb %%v16,%%v16\n\t"
"vfadb %%v0,%%v0,%%v16\n\t"
"srlg %[n],%[n],4\n\t"
"xgr %%r1,%%r1\n\t"
"0:\n\t"
"pfd 1, 1024(%%r1,%[x])\n\t"
"vleg %%v16,0(%%r1,%[x]),0\n\t"
"vleg %%v17,8(%%r1,%[x]),0\n\t"
"vleg %%v16,16(%%r1,%[x]),1\n\t"
"vleg %%v17,24(%%r1,%[x]),1\n\t"
"vleg %%v18,32(%%r1,%[x]),0\n\t"
"vleg %%v19,40(%%r1,%[x]),0\n\t"
"vleg %%v18,48(%%r1,%[x]),1\n\t"
"vleg %%v19,56(%%r1,%[x]),1\n\t"
"vleg %%v20,64(%%r1,%[x]),0\n\t"
"vleg %%v21,72(%%r1,%[x]),0\n\t"
"vleg %%v20,80(%%r1,%[x]),1\n\t"
"vleg %%v21,88(%%r1,%[x]),1\n\t"
"vleg %%v22,96(%%r1,%[x]),0\n\t"
"vleg %%v23,104(%%r1,%[x]),0\n\t"
"vleg %%v22,112(%%r1,%[x]),1\n\t"
"vleg %%v23,120(%%r1,%[x]),1\n\t"
"vflpdb %%v16, %%v16\n\t"
"vflpdb %%v17, %%v17\n\t"
"vflpdb %%v18, %%v18\n\t"
"vflpdb %%v19, %%v19\n\t"
"vflpdb %%v20, %%v20\n\t"
"vflpdb %%v21, %%v21\n\t"
"vflpdb %%v22, %%v22\n\t"
"vflpdb %%v23, %%v23\n\t"
"vfadb %%v16,%%v16,%%v17\n\t"
"vfadb %%v17,%%v18,%%v19\n\t"
"vfadb %%v18,%%v20,%%v21\n\t"
"vfadb %%v19,%%v22,%%v23\n\t"
"vfchdb %%v24,%%v16,%%v17\n\t"
"vfchdb %%v25,%%v18,%%v19\n\t"
"vsel %%v24,%%v16,%%v17,%%v24\n\t"
"vsel %%v25,%%v18,%%v19,%%v25\n\t"
"vfchdb %%v26,%%v24,%%v25\n\t"
"vsel %%v26,%%v24,%%v25,%%v26\n\t"
"vfchdb %%v27,%%v26,%%v0\n\t"
"vsel %%v0,%%v26,%%v0,%%v27\n\t"
"vleg %%v16,128(%%r1,%[x]),0\n\t"
"vleg %%v17,136(%%r1,%[x]),0\n\t"
"vleg %%v16,144(%%r1,%[x]),1\n\t"
"vleg %%v17,152(%%r1,%[x]),1\n\t"
"vleg %%v18,160(%%r1,%[x]),0\n\t"
"vleg %%v19,168(%%r1,%[x]),0\n\t"
"vleg %%v18,176(%%r1,%[x]),1\n\t"
"vleg %%v19,184(%%r1,%[x]),1\n\t"
"vleg %%v20,192(%%r1,%[x]),0\n\t"
"vleg %%v21,200(%%r1,%[x]),0\n\t"
"vleg %%v20,208(%%r1,%[x]),1\n\t"
"vleg %%v21,216(%%r1,%[x]),1\n\t"
"vleg %%v22,224(%%r1,%[x]),0\n\t"
"vleg %%v23,232(%%r1,%[x]),0\n\t"
"vleg %%v22,240(%%r1,%[x]),1\n\t"
"vleg %%v23,248(%%r1,%[x]),1\n\t"
"vflpdb %%v16, %%v16\n\t"
"vflpdb %%v17, %%v17\n\t"
"vflpdb %%v18, %%v18\n\t"
"vflpdb %%v19, %%v19\n\t"
"vflpdb %%v20, %%v20\n\t"
"vflpdb %%v21, %%v21\n\t"
"vflpdb %%v22, %%v22\n\t"
"vflpdb %%v23, %%v23\n\t"
"vfadb %%v16,%%v16,%%v17\n\t"
"vfadb %%v17,%%v18,%%v19\n\t"
"vfadb %%v18,%%v20,%%v21\n\t"
"vfadb %%v19,%%v22,%%v23\n\t"
"vfchdb %%v24,%%v16,%%v17\n\t"
"vfchdb %%v25,%%v18,%%v19\n\t"
"vsel %%v24,%%v16,%%v17,%%v24\n\t"
"vsel %%v25,%%v18,%%v19,%%v25\n\t"
"vfchdb %%v26,%%v24,%%v25\n\t"
"vsel %%v26,%%v24,%%v25,%%v26\n\t"
"vfchdb %%v27,%%v26,%%v0\n\t"
"vsel %%v0,%%v26,%%v0,%%v27\n\t"
"agfi %%r1, 256\n\t"
"brctg %[n], 0b\n\t"
"vrepg %%v16,%%v0,1\n\t"
"wfchdb %%v17,%%v0,%%v16\n\t"
"vsel %%v0,%%v0,%%v16,%%v17\n\t"
"ldr %[amax],%%f0"
: [amax] "=f"(amax),[n] "+&r"(n)
/* In this copy the memory input is declared as an array of n FLOATs. */
: "m"(*(const FLOAT (*)[n]) x),[x] "a"(x)
: "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22",
"v23", "v24", "v25", "v26", "v27");
/* ---- second copy of the changed lines (appears to be the post-change text) ---- */
/* Seed v0 with |x[0]|+|x[1]| and |x[2]|+|x[3]| (v0 lane 0 was loaded by the
   shared first line of the asm statement). */
"vleg %%v16,8(%[x]),0\n\t"
"vleg %%v0,16(%[x]),1\n\t"
"vleg %%v16,24(%[x]),1\n\t"
"vflpdb %%v0,%%v0\n\t"
"vflpdb %%v16,%%v16\n\t"
"vfadb %%v0,%%v0,%%v16\n\t"
/* Loop count = n >> 4; r1 is the running byte offset into x. */
"srlg %[n],%[n],4\n\t"
"xgr %%r1,%%r1\n\t"
"0:\n\t"
"pfd 1, 1024(%%r1,%[x])\n\t"
/* First 128 bytes: even-indexed FLOATs go to v16/v18/v20/v22, odd-indexed
   FLOATs go to v17/v19/v21/v23, two lanes per register. */
"vleg %%v16,0(%%r1,%[x]),0\n\t"
"vleg %%v17,8(%%r1,%[x]),0\n\t"
"vleg %%v16,16(%%r1,%[x]),1\n\t"
"vleg %%v17,24(%%r1,%[x]),1\n\t"
"vleg %%v18,32(%%r1,%[x]),0\n\t"
"vleg %%v19,40(%%r1,%[x]),0\n\t"
"vleg %%v18,48(%%r1,%[x]),1\n\t"
"vleg %%v19,56(%%r1,%[x]),1\n\t"
"vleg %%v20,64(%%r1,%[x]),0\n\t"
"vleg %%v21,72(%%r1,%[x]),0\n\t"
"vleg %%v20,80(%%r1,%[x]),1\n\t"
"vleg %%v21,88(%%r1,%[x]),1\n\t"
"vleg %%v22,96(%%r1,%[x]),0\n\t"
"vleg %%v23,104(%%r1,%[x]),0\n\t"
"vleg %%v22,112(%%r1,%[x]),1\n\t"
"vleg %%v23,120(%%r1,%[x]),1\n\t"
/* Absolute values, then pairwise sums left in v16..v19. */
"vflpdb %%v16, %%v16\n\t"
"vflpdb %%v17, %%v17\n\t"
"vflpdb %%v18, %%v18\n\t"
"vflpdb %%v19, %%v19\n\t"
"vflpdb %%v20, %%v20\n\t"
"vflpdb %%v21, %%v21\n\t"
"vflpdb %%v22, %%v22\n\t"
"vflpdb %%v23, %%v23\n\t"
"vfadb %%v16,%%v16,%%v17\n\t"
"vfadb %%v17,%%v18,%%v19\n\t"
"vfadb %%v18,%%v20,%%v21\n\t"
"vfadb %%v19,%%v22,%%v23\n\t"
/* Compare-high builds a lane mask and vsel keeps the larger lane; the
   tree ends by folding the result into the running maximum in v0. */
"vfchdb %%v24,%%v16,%%v17\n\t"
"vfchdb %%v25,%%v18,%%v19\n\t"
"vsel %%v24,%%v16,%%v17,%%v24\n\t"
"vsel %%v25,%%v18,%%v19,%%v25\n\t"
"vfchdb %%v26,%%v24,%%v25\n\t"
"vsel %%v26,%%v24,%%v25,%%v26\n\t"
"vfchdb %%v27,%%v26,%%v0\n\t"
"vsel %%v0,%%v26,%%v0,%%v27\n\t"
/* Second 128 bytes of the iteration: same sequence at offsets 128..248. */
"vleg %%v16,128(%%r1,%[x]),0\n\t"
"vleg %%v17,136(%%r1,%[x]),0\n\t"
"vleg %%v16,144(%%r1,%[x]),1\n\t"
"vleg %%v17,152(%%r1,%[x]),1\n\t"
"vleg %%v18,160(%%r1,%[x]),0\n\t"
"vleg %%v19,168(%%r1,%[x]),0\n\t"
"vleg %%v18,176(%%r1,%[x]),1\n\t"
"vleg %%v19,184(%%r1,%[x]),1\n\t"
"vleg %%v20,192(%%r1,%[x]),0\n\t"
"vleg %%v21,200(%%r1,%[x]),0\n\t"
"vleg %%v20,208(%%r1,%[x]),1\n\t"
"vleg %%v21,216(%%r1,%[x]),1\n\t"
"vleg %%v22,224(%%r1,%[x]),0\n\t"
"vleg %%v23,232(%%r1,%[x]),0\n\t"
"vleg %%v22,240(%%r1,%[x]),1\n\t"
"vleg %%v23,248(%%r1,%[x]),1\n\t"
"vflpdb %%v16, %%v16\n\t"
"vflpdb %%v17, %%v17\n\t"
"vflpdb %%v18, %%v18\n\t"
"vflpdb %%v19, %%v19\n\t"
"vflpdb %%v20, %%v20\n\t"
"vflpdb %%v21, %%v21\n\t"
"vflpdb %%v22, %%v22\n\t"
"vflpdb %%v23, %%v23\n\t"
"vfadb %%v16,%%v16,%%v17\n\t"
"vfadb %%v17,%%v18,%%v19\n\t"
"vfadb %%v18,%%v20,%%v21\n\t"
"vfadb %%v19,%%v22,%%v23\n\t"
"vfchdb %%v24,%%v16,%%v17\n\t"
"vfchdb %%v25,%%v18,%%v19\n\t"
"vsel %%v24,%%v16,%%v17,%%v24\n\t"
"vsel %%v25,%%v18,%%v19,%%v25\n\t"
"vfchdb %%v26,%%v24,%%v25\n\t"
"vsel %%v26,%%v24,%%v25,%%v26\n\t"
"vfchdb %%v27,%%v26,%%v0\n\t"
"vsel %%v0,%%v26,%%v0,%%v27\n\t"
/* Advance 256 bytes and loop while the decremented count is non-zero. */
"agfi %%r1, 256\n\t"
"brctg %[n], 0b\n\t"
/* Horizontal step: compare lane 0 with lane 1 and keep the larger in
   lane 0, then copy it out through f0. */
"vrepg %%v16,%%v0,1\n\t"
"wfchdb %%v17,%%v0,%%v16\n\t"
"vsel %%v0,%%v0,%%v16,%%v17\n\t"
"ldr %[amax],%%f0"
: [amax] "=f"(amax),[n] "+&r"(n)
/* The unnamed "m" input is not referenced by the template; it declares the
   n * 2 FLOATs at x as read by the asm. */
: "m"(*(const struct { FLOAT x[n * 2]; } *) x),[x] "a"(x)
: "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22",
"v23", "v24", "v25", "v26", "v27");
return amax;
}

View File

@ -34,89 +34,89 @@ static FLOAT zamin_kernel_16(BLASLONG n, FLOAT *x) {
/*
 * zamin_kernel_16 (vfmindb variant): returns the smallest value of
 * |x[2k]| + |x[2k+1]| over the FLOAT pairs it visits (presumably the real
 * and imaginary parts of interleaved complex doubles -- confirm against the
 * caller).
 *
 *   n  element count; shifted right by 4 to form the loop count, and each
 *      iteration consumes 256 bytes of x.
 *   x  base address of the input; not written by the asm.
 *
 * NOTE(review): this text is a rendered diff hunk without +/- markers. The
 * body below appears twice: the first copy looks like the pre-change lines
 * and the second copy like the post-change lines. The instruction lines are
 * the same in both copies; only the spelling of the memory input operand
 * differs.
 * NOTE(review): brctg decrements before it tests, so the loop body runs at
 * least once; presumably callers pass n as a non-zero multiple of 16 --
 * confirm.
 */
FLOAT amin;
__asm__("vleg %%v0,0(%[x]),0\n\t"
/* ---- first copy of the changed lines (appears to be the pre-change text) ---- */
"vleg %%v16,8(%[x]),0\n\t"
"vleg %%v0,16(%[x]),1\n\t"
"vleg %%v16,24(%[x]),1\n\t"
"vflpdb %%v0,%%v0\n\t"
"vflpdb %%v16,%%v16\n\t"
"vfadb %%v0,%%v0,%%v16\n\t"
"srlg %[n],%[n],4\n\t"
"xgr %%r1,%%r1\n\t"
"0:\n\t"
"pfd 1, 1024(%%r1,%[x])\n\t"
"vleg %%v16,0(%%r1,%[x]),0\n\t"
"vleg %%v17,8(%%r1,%[x]),0\n\t"
"vleg %%v16,16(%%r1,%[x]),1\n\t"
"vleg %%v17,24(%%r1,%[x]),1\n\t"
"vleg %%v18,32(%%r1,%[x]),0\n\t"
"vleg %%v19,40(%%r1,%[x]),0\n\t"
"vleg %%v18,48(%%r1,%[x]),1\n\t"
"vleg %%v19,56(%%r1,%[x]),1\n\t"
"vleg %%v20,64(%%r1,%[x]),0\n\t"
"vleg %%v21,72(%%r1,%[x]),0\n\t"
"vleg %%v20,80(%%r1,%[x]),1\n\t"
"vleg %%v21,88(%%r1,%[x]),1\n\t"
"vleg %%v22,96(%%r1,%[x]),0\n\t"
"vleg %%v23,104(%%r1,%[x]),0\n\t"
"vleg %%v22,112(%%r1,%[x]),1\n\t"
"vleg %%v23,120(%%r1,%[x]),1\n\t"
"vleg %%v24,128(%%r1,%[x]),0\n\t"
"vleg %%v25,136(%%r1,%[x]),0\n\t"
"vleg %%v24,144(%%r1,%[x]),1\n\t"
"vleg %%v25,152(%%r1,%[x]),1\n\t"
"vleg %%v26,160(%%r1,%[x]),0\n\t"
"vleg %%v27,168(%%r1,%[x]),0\n\t"
"vleg %%v26,176(%%r1,%[x]),1\n\t"
"vleg %%v27,184(%%r1,%[x]),1\n\t"
"vleg %%v28,192(%%r1,%[x]),0\n\t"
"vleg %%v29,200(%%r1,%[x]),0\n\t"
"vleg %%v28,208(%%r1,%[x]),1\n\t"
"vleg %%v29,216(%%r1,%[x]),1\n\t"
"vleg %%v30,224(%%r1,%[x]),0\n\t"
"vleg %%v31,232(%%r1,%[x]),0\n\t"
"vleg %%v30,240(%%r1,%[x]),1\n\t"
"vleg %%v31,248(%%r1,%[x]),1\n\t"
"vflpdb %%v16,%%v16\n\t"
"vflpdb %%v17,%%v17\n\t"
"vflpdb %%v18,%%v18\n\t"
"vflpdb %%v19,%%v19\n\t"
"vflpdb %%v20,%%v20\n\t"
"vflpdb %%v21,%%v21\n\t"
"vflpdb %%v22,%%v22\n\t"
"vflpdb %%v23,%%v23\n\t"
"vflpdb %%v24,%%v24\n\t"
"vflpdb %%v25,%%v25\n\t"
"vflpdb %%v26,%%v26\n\t"
"vflpdb %%v27,%%v27\n\t"
"vflpdb %%v28,%%v28\n\t"
"vflpdb %%v29,%%v29\n\t"
"vflpdb %%v30,%%v30\n\t"
"vflpdb %%v31,%%v31\n\t"
"vfadb %%v16,%%v16,%%v17\n\t"
"vfadb %%v18,%%v18,%%v19\n\t"
"vfadb %%v20,%%v20,%%v21\n\t"
"vfadb %%v22,%%v22,%%v23\n\t"
"vfadb %%v24,%%v24,%%v25\n\t"
"vfadb %%v26,%%v26,%%v27\n\t"
"vfadb %%v28,%%v28,%%v29\n\t"
"vfadb %%v30,%%v30,%%v31\n\t"
"vfmindb %%v16,%%v16,%%v24,0\n\t"
"vfmindb %%v18,%%v18,%%v26,0\n\t"
"vfmindb %%v20,%%v20,%%v28,0\n\t"
"vfmindb %%v22,%%v22,%%v30,0\n\t"
"vfmindb %%v16,%%v16,%%v20,0\n\t"
"vfmindb %%v18,%%v18,%%v22,0\n\t"
"vfmindb %%v16,%%v16,%%v18,0\n\t"
"vfmindb %%v0,%%v0,%%v16,0\n\t"
"agfi %%r1, 256\n\t"
"brctg %[n], 0b\n\t"
"vrepg %%v16,%%v0,1\n\t"
"wfmindb %%v0,%%v0,%%v16,0\n\t"
"ldr %[amin],%%f0"
: [amin] "=f"(amin),[n] "+&r"(n)
/* In this copy the memory input is spelled as a pointer-to-array cast. */
: "m"(*(const FLOAT (*)[n * 2]) x),[x] "a"(x)
: "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22",
"v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31");
/* ---- second copy of the changed lines (appears to be the post-change text) ---- */
/* Seed v0 with |x[0]|+|x[1]| and |x[2]|+|x[3]| (v0 lane 0 was loaded by the
   shared first line of the asm statement). */
"vleg %%v16,8(%[x]),0\n\t"
"vleg %%v0,16(%[x]),1\n\t"
"vleg %%v16,24(%[x]),1\n\t"
"vflpdb %%v0,%%v0\n\t"
"vflpdb %%v16,%%v16\n\t"
"vfadb %%v0,%%v0,%%v16\n\t"
/* Loop count = n >> 4; r1 is the running byte offset into x. */
"srlg %[n],%[n],4\n\t"
"xgr %%r1,%%r1\n\t"
"0:\n\t"
"pfd 1, 1024(%%r1,%[x])\n\t"
/* Load 256 bytes: even-indexed FLOATs go to the even registers v16..v30,
   odd-indexed FLOATs to the odd registers v17..v31, two lanes each. */
"vleg %%v16,0(%%r1,%[x]),0\n\t"
"vleg %%v17,8(%%r1,%[x]),0\n\t"
"vleg %%v16,16(%%r1,%[x]),1\n\t"
"vleg %%v17,24(%%r1,%[x]),1\n\t"
"vleg %%v18,32(%%r1,%[x]),0\n\t"
"vleg %%v19,40(%%r1,%[x]),0\n\t"
"vleg %%v18,48(%%r1,%[x]),1\n\t"
"vleg %%v19,56(%%r1,%[x]),1\n\t"
"vleg %%v20,64(%%r1,%[x]),0\n\t"
"vleg %%v21,72(%%r1,%[x]),0\n\t"
"vleg %%v20,80(%%r1,%[x]),1\n\t"
"vleg %%v21,88(%%r1,%[x]),1\n\t"
"vleg %%v22,96(%%r1,%[x]),0\n\t"
"vleg %%v23,104(%%r1,%[x]),0\n\t"
"vleg %%v22,112(%%r1,%[x]),1\n\t"
"vleg %%v23,120(%%r1,%[x]),1\n\t"
"vleg %%v24,128(%%r1,%[x]),0\n\t"
"vleg %%v25,136(%%r1,%[x]),0\n\t"
"vleg %%v24,144(%%r1,%[x]),1\n\t"
"vleg %%v25,152(%%r1,%[x]),1\n\t"
"vleg %%v26,160(%%r1,%[x]),0\n\t"
"vleg %%v27,168(%%r1,%[x]),0\n\t"
"vleg %%v26,176(%%r1,%[x]),1\n\t"
"vleg %%v27,184(%%r1,%[x]),1\n\t"
"vleg %%v28,192(%%r1,%[x]),0\n\t"
"vleg %%v29,200(%%r1,%[x]),0\n\t"
"vleg %%v28,208(%%r1,%[x]),1\n\t"
"vleg %%v29,216(%%r1,%[x]),1\n\t"
"vleg %%v30,224(%%r1,%[x]),0\n\t"
"vleg %%v31,232(%%r1,%[x]),0\n\t"
"vleg %%v30,240(%%r1,%[x]),1\n\t"
"vleg %%v31,248(%%r1,%[x]),1\n\t"
/* Absolute values of every loaded lane. */
"vflpdb %%v16,%%v16\n\t"
"vflpdb %%v17,%%v17\n\t"
"vflpdb %%v18,%%v18\n\t"
"vflpdb %%v19,%%v19\n\t"
"vflpdb %%v20,%%v20\n\t"
"vflpdb %%v21,%%v21\n\t"
"vflpdb %%v22,%%v22\n\t"
"vflpdb %%v23,%%v23\n\t"
"vflpdb %%v24,%%v24\n\t"
"vflpdb %%v25,%%v25\n\t"
"vflpdb %%v26,%%v26\n\t"
"vflpdb %%v27,%%v27\n\t"
"vflpdb %%v28,%%v28\n\t"
"vflpdb %%v29,%%v29\n\t"
"vflpdb %%v30,%%v30\n\t"
"vflpdb %%v31,%%v31\n\t"
/* Pairwise sums |even| + |odd| left in the even registers. */
"vfadb %%v16,%%v16,%%v17\n\t"
"vfadb %%v18,%%v18,%%v19\n\t"
"vfadb %%v20,%%v20,%%v21\n\t"
"vfadb %%v22,%%v22,%%v23\n\t"
"vfadb %%v24,%%v24,%%v25\n\t"
"vfadb %%v26,%%v26,%%v27\n\t"
"vfadb %%v28,%%v28,%%v29\n\t"
"vfadb %%v30,%%v30,%%v31\n\t"
/* Minimum tree over the eight sums, folded into the running minimum v0. */
"vfmindb %%v16,%%v16,%%v24,0\n\t"
"vfmindb %%v18,%%v18,%%v26,0\n\t"
"vfmindb %%v20,%%v20,%%v28,0\n\t"
"vfmindb %%v22,%%v22,%%v30,0\n\t"
"vfmindb %%v16,%%v16,%%v20,0\n\t"
"vfmindb %%v18,%%v18,%%v22,0\n\t"
"vfmindb %%v16,%%v16,%%v18,0\n\t"
"vfmindb %%v0,%%v0,%%v16,0\n\t"
/* Advance 256 bytes and loop while the decremented count is non-zero. */
"agfi %%r1, 256\n\t"
"brctg %[n], 0b\n\t"
/* Horizontal step: minimum of lane 0 and lane 1, copied out through f0. */
"vrepg %%v16,%%v0,1\n\t"
"wfmindb %%v0,%%v0,%%v16,0\n\t"
"ldr %[amin],%%f0"
: [amin] "=f"(amin),[n] "+&r"(n)
/* The unnamed "m" input is not referenced by the template; it declares the
   n * 2 FLOATs at x as read by the asm. */
: "m"(*(const struct { FLOAT x[n * 2]; } *) x),[x] "a"(x)
: "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22",
"v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31");
return amin;
}

View File

@ -34,98 +34,98 @@ static FLOAT zamin_kernel_16(BLASLONG n, FLOAT *x) {
/*
 * zamin_kernel_16 (vfchdb/vsel variant): returns the smallest value of
 * |x[2k]| + |x[2k+1]| over the FLOAT pairs it visits (presumably the real
 * and imaginary parts of interleaved complex doubles -- confirm against the
 * caller).
 *
 *   n  element count; shifted right by 4 to form the loop count, and each
 *      iteration consumes 256 bytes of x.
 *   x  base address of the input; not written by the asm.
 *
 * NOTE(review): this text is a rendered diff hunk without +/- markers. The
 * body below appears twice: the first copy looks like the pre-change lines
 * and the second copy like the post-change lines. The instruction lines are
 * the same in both copies; only the memory input operand differs.
 * NOTE(review): brctg decrements before it tests, so the loop body runs at
 * least once; presumably callers pass n as a non-zero multiple of 16 --
 * confirm.
 */
FLOAT amin;
__asm__("vleg %%v0,0(%[x]),0\n\t"
/* ---- first copy of the changed lines (appears to be the pre-change text) ---- */
"vleg %%v16,8(%[x]),0\n\t"
"vleg %%v0,16(%[x]),1\n\t"
"vleg %%v16,24(%[x]),1\n\t"
"vflpdb %%v0,%%v0\n\t"
"vflpdb %%v16,%%v16\n\t"
"vfadb %%v0,%%v0,%%v16\n\t"
"srlg %[n],%[n],4\n\t"
"xgr %%r1,%%r1\n\t"
"0:\n\t"
"pfd 1, 1024(%%r1,%[x])\n\t"
"vleg %%v16,0(%%r1,%[x]),0\n\t"
"vleg %%v17,8(%%r1,%[x]),0\n\t"
"vleg %%v16,16(%%r1,%[x]),1\n\t"
"vleg %%v17,24(%%r1,%[x]),1\n\t"
"vleg %%v18,32(%%r1,%[x]),0\n\t"
"vleg %%v19,40(%%r1,%[x]),0\n\t"
"vleg %%v18,48(%%r1,%[x]),1\n\t"
"vleg %%v19,56(%%r1,%[x]),1\n\t"
"vleg %%v20,64(%%r1,%[x]),0\n\t"
"vleg %%v21,72(%%r1,%[x]),0\n\t"
"vleg %%v20,80(%%r1,%[x]),1\n\t"
"vleg %%v21,88(%%r1,%[x]),1\n\t"
"vleg %%v22,96(%%r1,%[x]),0\n\t"
"vleg %%v23,104(%%r1,%[x]),0\n\t"
"vleg %%v22,112(%%r1,%[x]),1\n\t"
"vleg %%v23,120(%%r1,%[x]),1\n\t"
"vflpdb %%v16, %%v16\n\t"
"vflpdb %%v17, %%v17\n\t"
"vflpdb %%v18, %%v18\n\t"
"vflpdb %%v19, %%v19\n\t"
"vflpdb %%v20, %%v20\n\t"
"vflpdb %%v21, %%v21\n\t"
"vflpdb %%v22, %%v22\n\t"
"vflpdb %%v23, %%v23\n\t"
"vfadb %%v16,%%v16,%%v17\n\t"
"vfadb %%v17,%%v18,%%v19\n\t"
"vfadb %%v18,%%v20,%%v21\n\t"
"vfadb %%v19,%%v22,%%v23\n\t"
"vfchdb %%v24,%%v17,%%v16\n\t"
"vfchdb %%v25,%%v19,%%v18\n\t"
"vsel %%v24,%%v16,%%v17,%%v24\n\t"
"vsel %%v25,%%v18,%%v19,%%v25\n\t"
"vfchdb %%v26,%%v25,%%v24\n\t"
"vsel %%v26,%%v24,%%v25,%%v26\n\t"
"vfchdb %%v27,%%v0,%%v26\n\t"
"vsel %%v0,%%v26,%%v0,%%v27\n\t"
"vleg %%v16,128(%%r1,%[x]),0\n\t"
"vleg %%v17,136(%%r1,%[x]),0\n\t"
"vleg %%v16,144(%%r1,%[x]),1\n\t"
"vleg %%v17,152(%%r1,%[x]),1\n\t"
"vleg %%v18,160(%%r1,%[x]),0\n\t"
"vleg %%v19,168(%%r1,%[x]),0\n\t"
"vleg %%v18,176(%%r1,%[x]),1\n\t"
"vleg %%v19,184(%%r1,%[x]),1\n\t"
"vleg %%v20,192(%%r1,%[x]),0\n\t"
"vleg %%v21,200(%%r1,%[x]),0\n\t"
"vleg %%v20,208(%%r1,%[x]),1\n\t"
"vleg %%v21,216(%%r1,%[x]),1\n\t"
"vleg %%v22,224(%%r1,%[x]),0\n\t"
"vleg %%v23,232(%%r1,%[x]),0\n\t"
"vleg %%v22,240(%%r1,%[x]),1\n\t"
"vleg %%v23,248(%%r1,%[x]),1\n\t"
"vflpdb %%v16, %%v16\n\t"
"vflpdb %%v17, %%v17\n\t"
"vflpdb %%v18, %%v18\n\t"
"vflpdb %%v19, %%v19\n\t"
"vflpdb %%v20, %%v20\n\t"
"vflpdb %%v21, %%v21\n\t"
"vflpdb %%v22, %%v22\n\t"
"vflpdb %%v23, %%v23\n\t"
"vfadb %%v16,%%v16,%%v17\n\t"
"vfadb %%v17,%%v18,%%v19\n\t"
"vfadb %%v18,%%v20,%%v21\n\t"
"vfadb %%v19,%%v22,%%v23\n\t"
"vfchdb %%v24,%%v17,%%v16\n\t"
"vfchdb %%v25,%%v19,%%v18\n\t"
"vsel %%v24,%%v16,%%v17,%%v24\n\t"
"vsel %%v25,%%v18,%%v19,%%v25\n\t"
"vfchdb %%v26,%%v25,%%v24\n\t"
"vsel %%v26,%%v24,%%v25,%%v26\n\t"
"vfchdb %%v27,%%v0,%%v26\n\t"
"vsel %%v0,%%v26,%%v0,%%v27\n\t"
"agfi %%r1, 256\n\t"
"brctg %[n], 0b\n\t"
"vrepg %%v16,%%v0,1\n\t"
"wfchdb %%v17,%%v16,%%v0\n\t"
"vsel %%v0,%%v0,%%v16,%%v17\n\t"
"ldr %[amin],%%f0"
: [amin] "=f"(amin),[n] "+&r"(n)
/* In this copy the memory input is declared as an array of n FLOATs. */
: "m"(*(const FLOAT (*)[n]) x),[x] "a"(x)
: "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22",
"v23", "v24", "v25", "v26", "v27");
/* ---- second copy of the changed lines (appears to be the post-change text) ---- */
/* Seed v0 with |x[0]|+|x[1]| and |x[2]|+|x[3]| (v0 lane 0 was loaded by the
   shared first line of the asm statement). */
"vleg %%v16,8(%[x]),0\n\t"
"vleg %%v0,16(%[x]),1\n\t"
"vleg %%v16,24(%[x]),1\n\t"
"vflpdb %%v0,%%v0\n\t"
"vflpdb %%v16,%%v16\n\t"
"vfadb %%v0,%%v0,%%v16\n\t"
/* Loop count = n >> 4; r1 is the running byte offset into x. */
"srlg %[n],%[n],4\n\t"
"xgr %%r1,%%r1\n\t"
"0:\n\t"
"pfd 1, 1024(%%r1,%[x])\n\t"
/* First 128 bytes: even-indexed FLOATs go to v16/v18/v20/v22, odd-indexed
   FLOATs go to v17/v19/v21/v23, two lanes per register. */
"vleg %%v16,0(%%r1,%[x]),0\n\t"
"vleg %%v17,8(%%r1,%[x]),0\n\t"
"vleg %%v16,16(%%r1,%[x]),1\n\t"
"vleg %%v17,24(%%r1,%[x]),1\n\t"
"vleg %%v18,32(%%r1,%[x]),0\n\t"
"vleg %%v19,40(%%r1,%[x]),0\n\t"
"vleg %%v18,48(%%r1,%[x]),1\n\t"
"vleg %%v19,56(%%r1,%[x]),1\n\t"
"vleg %%v20,64(%%r1,%[x]),0\n\t"
"vleg %%v21,72(%%r1,%[x]),0\n\t"
"vleg %%v20,80(%%r1,%[x]),1\n\t"
"vleg %%v21,88(%%r1,%[x]),1\n\t"
"vleg %%v22,96(%%r1,%[x]),0\n\t"
"vleg %%v23,104(%%r1,%[x]),0\n\t"
"vleg %%v22,112(%%r1,%[x]),1\n\t"
"vleg %%v23,120(%%r1,%[x]),1\n\t"
/* Absolute values, then pairwise sums left in v16..v19. */
"vflpdb %%v16, %%v16\n\t"
"vflpdb %%v17, %%v17\n\t"
"vflpdb %%v18, %%v18\n\t"
"vflpdb %%v19, %%v19\n\t"
"vflpdb %%v20, %%v20\n\t"
"vflpdb %%v21, %%v21\n\t"
"vflpdb %%v22, %%v22\n\t"
"vflpdb %%v23, %%v23\n\t"
"vfadb %%v16,%%v16,%%v17\n\t"
"vfadb %%v17,%%v18,%%v19\n\t"
"vfadb %%v18,%%v20,%%v21\n\t"
"vfadb %%v19,%%v22,%%v23\n\t"
/* The compare operands are swapped relative to the vsel operands, so each
   vsel keeps the smaller lane; the tree ends by folding the result into
   the running minimum in v0. */
"vfchdb %%v24,%%v17,%%v16\n\t"
"vfchdb %%v25,%%v19,%%v18\n\t"
"vsel %%v24,%%v16,%%v17,%%v24\n\t"
"vsel %%v25,%%v18,%%v19,%%v25\n\t"
"vfchdb %%v26,%%v25,%%v24\n\t"
"vsel %%v26,%%v24,%%v25,%%v26\n\t"
"vfchdb %%v27,%%v0,%%v26\n\t"
"vsel %%v0,%%v26,%%v0,%%v27\n\t"
/* Second 128 bytes of the iteration: same sequence at offsets 128..248. */
"vleg %%v16,128(%%r1,%[x]),0\n\t"
"vleg %%v17,136(%%r1,%[x]),0\n\t"
"vleg %%v16,144(%%r1,%[x]),1\n\t"
"vleg %%v17,152(%%r1,%[x]),1\n\t"
"vleg %%v18,160(%%r1,%[x]),0\n\t"
"vleg %%v19,168(%%r1,%[x]),0\n\t"
"vleg %%v18,176(%%r1,%[x]),1\n\t"
"vleg %%v19,184(%%r1,%[x]),1\n\t"
"vleg %%v20,192(%%r1,%[x]),0\n\t"
"vleg %%v21,200(%%r1,%[x]),0\n\t"
"vleg %%v20,208(%%r1,%[x]),1\n\t"
"vleg %%v21,216(%%r1,%[x]),1\n\t"
"vleg %%v22,224(%%r1,%[x]),0\n\t"
"vleg %%v23,232(%%r1,%[x]),0\n\t"
"vleg %%v22,240(%%r1,%[x]),1\n\t"
"vleg %%v23,248(%%r1,%[x]),1\n\t"
"vflpdb %%v16, %%v16\n\t"
"vflpdb %%v17, %%v17\n\t"
"vflpdb %%v18, %%v18\n\t"
"vflpdb %%v19, %%v19\n\t"
"vflpdb %%v20, %%v20\n\t"
"vflpdb %%v21, %%v21\n\t"
"vflpdb %%v22, %%v22\n\t"
"vflpdb %%v23, %%v23\n\t"
"vfadb %%v16,%%v16,%%v17\n\t"
"vfadb %%v17,%%v18,%%v19\n\t"
"vfadb %%v18,%%v20,%%v21\n\t"
"vfadb %%v19,%%v22,%%v23\n\t"
"vfchdb %%v24,%%v17,%%v16\n\t"
"vfchdb %%v25,%%v19,%%v18\n\t"
"vsel %%v24,%%v16,%%v17,%%v24\n\t"
"vsel %%v25,%%v18,%%v19,%%v25\n\t"
"vfchdb %%v26,%%v25,%%v24\n\t"
"vsel %%v26,%%v24,%%v25,%%v26\n\t"
"vfchdb %%v27,%%v0,%%v26\n\t"
"vsel %%v0,%%v26,%%v0,%%v27\n\t"
/* Advance 256 bytes and loop while the decremented count is non-zero. */
"agfi %%r1, 256\n\t"
"brctg %[n], 0b\n\t"
/* Horizontal step: compare lane 1 with lane 0 and keep the smaller in
   lane 0, then copy it out through f0. */
"vrepg %%v16,%%v0,1\n\t"
"wfchdb %%v17,%%v16,%%v0\n\t"
"vsel %%v0,%%v0,%%v16,%%v17\n\t"
"ldr %[amin],%%f0"
: [amin] "=f"(amin),[n] "+&r"(n)
/* The unnamed "m" input is not referenced by the template; it declares the
   n * 2 FLOATs at x as read by the asm. */
: "m"(*(const struct { FLOAT x[n * 2]; } *) x),[x] "a"(x)
: "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22",
"v23", "v24", "v25", "v26", "v27");
return amin;
}

View File

@ -34,81 +34,81 @@ static FLOAT zasum_kernel_16(BLASLONG n, FLOAT *x) {
/*
 * zasum_kernel_16: returns the sum of the absolute values of the FLOATs it
 * reads from x. Each loop iteration consumes 256 bytes (sixteen 16-byte
 * vector loads) and the loop count is n >> 4.
 *
 *   n  element count; shifted right by 4 to form the loop count.
 *   x  base address of the input; not written by the asm.
 *
 * NOTE(review): this text is a rendered diff hunk without +/- markers. The
 * body below appears twice: the first copy looks like the pre-change lines
 * and the second copy like the post-change lines. The instruction lines are
 * the same in both copies; the output constraint ("=m" vs "=Q") and the
 * spelling of the memory input differ.
 * NOTE(review): brctg decrements before it tests, so the loop body runs at
 * least once; presumably callers pass n as a non-zero multiple of 16 --
 * confirm.
 */
FLOAT asum;
__asm__("vzero %%v24\n\t"
/* ---- first copy of the changed lines (appears to be the pre-change text) ---- */
"vzero %%v25\n\t"
"vzero %%v26\n\t"
"vzero %%v27\n\t"
"vzero %%v28\n\t"
"vzero %%v29\n\t"
"vzero %%v30\n\t"
"vzero %%v31\n\t"
"srlg %[n],%[n],4\n\t"
"xgr %%r1,%%r1\n\t"
"0:\n\t"
"pfd 1, 1024(%%r1,%[x])\n\t"
"vl %%v16, 0(%%r1,%[x])\n\t"
"vl %%v17, 16(%%r1,%[x])\n\t"
"vl %%v18, 32(%%r1,%[x])\n\t"
"vl %%v19, 48(%%r1,%[x])\n\t"
"vl %%v20, 64(%%r1,%[x])\n\t"
"vl %%v21, 80(%%r1,%[x])\n\t"
"vl %%v22, 96(%%r1,%[x])\n\t"
"vl %%v23, 112(%%r1,%[x])\n\t"
"vflpdb %%v16, %%v16\n\t"
"vflpdb %%v17, %%v17\n\t"
"vflpdb %%v18, %%v18\n\t"
"vflpdb %%v19, %%v19\n\t"
"vflpdb %%v20, %%v20\n\t"
"vflpdb %%v21, %%v21\n\t"
"vflpdb %%v22, %%v22\n\t"
"vflpdb %%v23, %%v23\n\t"
"vfadb %%v24,%%v24,%%v16\n\t"
"vfadb %%v25,%%v25,%%v17\n\t"
"vfadb %%v26,%%v26,%%v18\n\t"
"vfadb %%v27,%%v27,%%v19\n\t"
"vfadb %%v28,%%v28,%%v20\n\t"
"vfadb %%v29,%%v29,%%v21\n\t"
"vfadb %%v30,%%v30,%%v22\n\t"
"vfadb %%v31,%%v31,%%v23\n\t"
"vl %%v16, 128(%%r1,%[x])\n\t"
"vl %%v17, 144(%%r1,%[x])\n\t"
"vl %%v18, 160(%%r1,%[x])\n\t"
"vl %%v19, 176(%%r1,%[x])\n\t"
"vl %%v20, 192(%%r1,%[x])\n\t"
"vl %%v21, 208(%%r1,%[x])\n\t"
"vl %%v22, 224(%%r1,%[x])\n\t"
"vl %%v23, 240(%%r1,%[x])\n\t"
"vflpdb %%v16, %%v16\n\t"
"vflpdb %%v17, %%v17\n\t"
"vflpdb %%v18, %%v18\n\t"
"vflpdb %%v19, %%v19\n\t"
"vflpdb %%v20, %%v20\n\t"
"vflpdb %%v21, %%v21\n\t"
"vflpdb %%v22, %%v22\n\t"
"vflpdb %%v23, %%v23\n\t"
"vfadb %%v24,%%v24,%%v16\n\t"
"vfadb %%v25,%%v25,%%v17\n\t"
"vfadb %%v26,%%v26,%%v18\n\t"
"vfadb %%v27,%%v27,%%v19\n\t"
"vfadb %%v28,%%v28,%%v20\n\t"
"vfadb %%v29,%%v29,%%v21\n\t"
"vfadb %%v30,%%v30,%%v22\n\t"
"vfadb %%v31,%%v31,%%v23\n\t"
"agfi %%r1,256\n\t"
"brctg %[n],0b\n\t"
"vfadb %%v24,%%v24,%%v25\n\t"
"vfadb %%v24,%%v24,%%v26\n\t"
"vfadb %%v24,%%v24,%%v27\n\t"
"vfadb %%v24,%%v24,%%v28\n\t"
"vfadb %%v24,%%v24,%%v29\n\t"
"vfadb %%v24,%%v24,%%v30\n\t"
"vfadb %%v24,%%v24,%%v31\n\t"
"vrepg %%v25,%%v24,1\n\t"
"vfadb %%v24,%%v24,%%v25\n\t"
"vsteg %%v24,%[asum],0"
/* In this copy the result slot uses the generic "=m" constraint. */
: [asum] "=m"(asum),[n] "+&r"(n)
: "m"(*(const FLOAT (*)[n * 2]) x),[x] "a"(x)
: "cc", "r1", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23",
"v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31");
/* ---- second copy of the changed lines (appears to be the post-change text) ---- */
/* Clear the eight partial-sum accumulators v24..v31 (v24 is cleared by the
   shared first line of the asm statement). */
"vzero %%v25\n\t"
"vzero %%v26\n\t"
"vzero %%v27\n\t"
"vzero %%v28\n\t"
"vzero %%v29\n\t"
"vzero %%v30\n\t"
"vzero %%v31\n\t"
/* Loop count = n >> 4; r1 is the running byte offset into x. */
"srlg %[n],%[n],4\n\t"
"xgr %%r1,%%r1\n\t"
"0:\n\t"
"pfd 1, 1024(%%r1,%[x])\n\t"
/* First 128 bytes: load, take absolute values, add into v24..v31. */
"vl %%v16, 0(%%r1,%[x])\n\t"
"vl %%v17, 16(%%r1,%[x])\n\t"
"vl %%v18, 32(%%r1,%[x])\n\t"
"vl %%v19, 48(%%r1,%[x])\n\t"
"vl %%v20, 64(%%r1,%[x])\n\t"
"vl %%v21, 80(%%r1,%[x])\n\t"
"vl %%v22, 96(%%r1,%[x])\n\t"
"vl %%v23, 112(%%r1,%[x])\n\t"
"vflpdb %%v16, %%v16\n\t"
"vflpdb %%v17, %%v17\n\t"
"vflpdb %%v18, %%v18\n\t"
"vflpdb %%v19, %%v19\n\t"
"vflpdb %%v20, %%v20\n\t"
"vflpdb %%v21, %%v21\n\t"
"vflpdb %%v22, %%v22\n\t"
"vflpdb %%v23, %%v23\n\t"
"vfadb %%v24,%%v24,%%v16\n\t"
"vfadb %%v25,%%v25,%%v17\n\t"
"vfadb %%v26,%%v26,%%v18\n\t"
"vfadb %%v27,%%v27,%%v19\n\t"
"vfadb %%v28,%%v28,%%v20\n\t"
"vfadb %%v29,%%v29,%%v21\n\t"
"vfadb %%v30,%%v30,%%v22\n\t"
"vfadb %%v31,%%v31,%%v23\n\t"
/* Second 128 bytes of the iteration: same sequence at offsets 128..240. */
"vl %%v16, 128(%%r1,%[x])\n\t"
"vl %%v17, 144(%%r1,%[x])\n\t"
"vl %%v18, 160(%%r1,%[x])\n\t"
"vl %%v19, 176(%%r1,%[x])\n\t"
"vl %%v20, 192(%%r1,%[x])\n\t"
"vl %%v21, 208(%%r1,%[x])\n\t"
"vl %%v22, 224(%%r1,%[x])\n\t"
"vl %%v23, 240(%%r1,%[x])\n\t"
"vflpdb %%v16, %%v16\n\t"
"vflpdb %%v17, %%v17\n\t"
"vflpdb %%v18, %%v18\n\t"
"vflpdb %%v19, %%v19\n\t"
"vflpdb %%v20, %%v20\n\t"
"vflpdb %%v21, %%v21\n\t"
"vflpdb %%v22, %%v22\n\t"
"vflpdb %%v23, %%v23\n\t"
"vfadb %%v24,%%v24,%%v16\n\t"
"vfadb %%v25,%%v25,%%v17\n\t"
"vfadb %%v26,%%v26,%%v18\n\t"
"vfadb %%v27,%%v27,%%v19\n\t"
"vfadb %%v28,%%v28,%%v20\n\t"
"vfadb %%v29,%%v29,%%v21\n\t"
"vfadb %%v30,%%v30,%%v22\n\t"
"vfadb %%v31,%%v31,%%v23\n\t"
/* Advance 256 bytes and loop while the decremented count is non-zero. */
"agfi %%r1,256\n\t"
"brctg %[n],0b\n\t"
/* Collapse the eight accumulators into v24, then add its two lanes. */
"vfadb %%v24,%%v24,%%v25\n\t"
"vfadb %%v24,%%v24,%%v26\n\t"
"vfadb %%v24,%%v24,%%v27\n\t"
"vfadb %%v24,%%v24,%%v28\n\t"
"vfadb %%v24,%%v24,%%v29\n\t"
"vfadb %%v24,%%v24,%%v30\n\t"
"vfadb %%v24,%%v24,%%v31\n\t"
"vrepg %%v25,%%v24,1\n\t"
"vfadb %%v24,%%v24,%%v25\n\t"
/* Store lane 0 of v24 straight into the C variable asum. */
"vsteg %%v24,%[asum],0"
/* NOTE(review): "Q" is presumably the S/390 constraint for a memory operand
   without index register and with a short displacement -- confirm against
   the GCC machine-constraints documentation. */
: [asum] "=Q"(asum),[n] "+&r"(n)
/* The unnamed "m" input is not referenced by the template; it declares the
   n * 2 FLOATs at x as read by the asm. */
: "m"(*(const struct { FLOAT x[n * 2]; } *) x),[x] "a"(x)
: "cc", "r1", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23",
"v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31");
return asum;
}

View File

@ -30,77 +30,77 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
/*
 * zaxpy_kernel_8: updates y in place. For every 16-byte pair loaded from x
 * and y the asm computes
 *     y_pair = x_pair * v0 + swapped(x_pair) * v1 + y_pair
 * lane by lane, where swapped() exchanges the two 8-byte lanes (vpdi ...,4)
 * and v0/v1 are built from alpha[0] and alpha[1] in the preamble. Without
 * CONJ, v0 = {alpha[0], alpha[0]} and v1 = {-alpha[1], alpha[1]}; with CONJ,
 * v0 = {alpha[0], -alpha[0]} and v1 = {alpha[1], alpha[1]}. This presumably
 * implements the complex y += alpha * x (conjugated form under CONJ) --
 * confirm against the caller.
 *
 *   n      element count; shifted right by 3 to form the loop count, and
 *          each iteration consumes 128 bytes of x and of y.
 *   x      input vector, read only.
 *   y      vector that is read and written.
 *   alpha  two FLOATs, read only.
 *
 * NOTE(review): this text is a rendered diff hunk without +/- markers. Each
 * changed run appears twice: the first copy looks like the pre-change lines
 * and the second copy like the post-change lines. The instruction lines are
 * the same in both copies; only the spelling of the memory operands differs.
 * NOTE(review): brctg decrements before it tests, so the loop body runs at
 * least once; presumably callers pass n as a non-zero multiple of 8 --
 * confirm.
 */
static void zaxpy_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) {
__asm__(
#if !defined(CONJ)
/* first copy of the non-CONJ preamble */
"vlrepg %%v0,0(%[alpha])\n\t"
"vleg %%v1,8(%[alpha]),0\n\t"
"wflcdb %%v1,%%v1\n\t"
"vleg %%v1,8(%[alpha]),1\n\t"
/* second copy: v0 = {alpha[0], alpha[0]}, v1 = {-alpha[1], alpha[1]} */
"vlrepg %%v0,0(%[alpha])\n\t"
"vleg %%v1,8(%[alpha]),0\n\t"
"wflcdb %%v1,%%v1\n\t"
"vleg %%v1,8(%[alpha]),1\n\t"
#else
/* first copy of the CONJ preamble */
"vleg %%v0,0(%[alpha]),1\n\t"
"vflcdb %%v0,%%v0\n\t"
"vleg %%v0,0(%[alpha]),0\n\t"
"vlrepg %%v1,8(%[alpha])\n\t"
/* second copy: v0 = {alpha[0], -alpha[0]}, v1 = {alpha[1], alpha[1]} */
"vleg %%v0,0(%[alpha]),1\n\t"
"vflcdb %%v0,%%v0\n\t"
"vleg %%v0,0(%[alpha]),0\n\t"
"vlrepg %%v1,8(%[alpha])\n\t"
#endif
/* ---- first copy of the loop and operand lines (appears to be pre-change) ---- */
"srlg %[n],%[n],3\n\t"
"xgr %%r1,%%r1\n\t"
"0:\n\t"
"pfd 1, 1024(%%r1,%[x])\n\t"
"pfd 2, 1024(%%r1,%[y])\n\t"
"vl %%v8,0(%%r1,%[x])\n\t"
"vl %%v9,16(%%r1,%[x])\n\t"
"vl %%v10,32(%%r1,%[x])\n\t"
"vl %%v11,48(%%r1,%[x])\n\t"
"vl %%v12,0(%%r1,%[y])\n\t"
"vl %%v13,16(%%r1,%[y])\n\t"
"vl %%v14,32(%%r1,%[y])\n\t"
"vl %%v15,48(%%r1,%[y])\n\t"
"vl %%v16,64(%%r1,%[x])\n\t"
"vl %%v17,80(%%r1,%[x])\n\t"
"vl %%v18,96(%%r1,%[x])\n\t"
"vl %%v19,112(%%r1,%[x])\n\t"
"vl %%v20,64(%%r1,%[y])\n\t"
"vl %%v21,80(%%r1,%[y])\n\t"
"vl %%v22,96(%%r1,%[y])\n\t"
"vl %%v23,112(%%r1,%[y])\n\t"
"vpdi %%v24,%%v8,%%v8,4\n\t"
"vpdi %%v25,%%v9,%%v9,4\n\t"
"vpdi %%v26,%%v10,%%v10,4\n\t"
"vpdi %%v27,%%v11,%%v11,4\n\t"
"vpdi %%v28,%%v16,%%v16,4\n\t"
"vpdi %%v29,%%v17,%%v17,4\n\t"
"vpdi %%v30,%%v18,%%v18,4\n\t"
"vpdi %%v31,%%v19,%%v19,4\n\t"
"vfmadb %%v8,%%v8,%%v0,%%v12\n\t"
"vfmadb %%v9,%%v9,%%v0,%%v13\n\t"
"vfmadb %%v10,%%v10,%%v0,%%v14\n\t"
"vfmadb %%v11,%%v11,%%v0,%%v15\n\t"
"vfmadb %%v16,%%v16,%%v0,%%v20\n\t"
"vfmadb %%v17,%%v17,%%v0,%%v21\n\t"
"vfmadb %%v18,%%v18,%%v0,%%v22\n\t"
"vfmadb %%v19,%%v19,%%v0,%%v23\n\t"
"vfmadb %%v8,%%v24,%%v1,%%v8\n\t"
"vfmadb %%v9,%%v25,%%v1,%%v9\n\t"
"vfmadb %%v10,%%v26,%%v1,%%v10\n\t"
"vfmadb %%v11,%%v27,%%v1,%%v11\n\t"
"vfmadb %%v16,%%v28,%%v1,%%v16\n\t"
"vfmadb %%v17,%%v29,%%v1,%%v17\n\t"
"vfmadb %%v18,%%v30,%%v1,%%v18\n\t"
"vfmadb %%v19,%%v31,%%v1,%%v19\n\t"
"vst %%v8,0(%%r1,%[y])\n\t"
"vst %%v9,16(%%r1,%[y])\n\t"
"vst %%v10,32(%%r1,%[y])\n\t"
"vst %%v11,48(%%r1,%[y])\n\t"
"vst %%v16,64(%%r1,%[y])\n\t"
"vst %%v17,80(%%r1,%[y])\n\t"
"vst %%v18,96(%%r1,%[y])\n\t"
"vst %%v19,112(%%r1,%[y])\n\t"
"agfi %%r1,128\n\t"
"brctg %[n],0b"
/* In this copy the memory operands are spelled as pointer-to-array casts. */
: "+m"(*(FLOAT (*)[n * 2]) y),[n] "+&r"(n)
: [y] "a"(y), "m"(*(const FLOAT (*)[n * 2]) x),[x] "a"(x),
"m"(*(const FLOAT (*)[2]) alpha),[alpha] "a"(alpha)
: "cc", "r1", "v0", "v1", "v8", "v9", "v10", "v11", "v12", "v13",
"v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23",
"v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31");
/* ---- second copy of the loop and operand lines (appears to be post-change) ---- */
/* Loop count = n >> 3; r1 is the running byte offset into x and y. */
"srlg %[n],%[n],3\n\t"
"xgr %%r1,%%r1\n\t"
"0:\n\t"
"pfd 1, 1024(%%r1,%[x])\n\t"
"pfd 2, 1024(%%r1,%[y])\n\t"
/* Load 128 bytes of x (v8..v11, v16..v19) and of y (v12..v15, v20..v23). */
"vl %%v8,0(%%r1,%[x])\n\t"
"vl %%v9,16(%%r1,%[x])\n\t"
"vl %%v10,32(%%r1,%[x])\n\t"
"vl %%v11,48(%%r1,%[x])\n\t"
"vl %%v12,0(%%r1,%[y])\n\t"
"vl %%v13,16(%%r1,%[y])\n\t"
"vl %%v14,32(%%r1,%[y])\n\t"
"vl %%v15,48(%%r1,%[y])\n\t"
"vl %%v16,64(%%r1,%[x])\n\t"
"vl %%v17,80(%%r1,%[x])\n\t"
"vl %%v18,96(%%r1,%[x])\n\t"
"vl %%v19,112(%%r1,%[x])\n\t"
"vl %%v20,64(%%r1,%[y])\n\t"
"vl %%v21,80(%%r1,%[y])\n\t"
"vl %%v22,96(%%r1,%[y])\n\t"
"vl %%v23,112(%%r1,%[y])\n\t"
/* Lane-swapped copies of the x vectors. */
"vpdi %%v24,%%v8,%%v8,4\n\t"
"vpdi %%v25,%%v9,%%v9,4\n\t"
"vpdi %%v26,%%v10,%%v10,4\n\t"
"vpdi %%v27,%%v11,%%v11,4\n\t"
"vpdi %%v28,%%v16,%%v16,4\n\t"
"vpdi %%v29,%%v17,%%v17,4\n\t"
"vpdi %%v30,%%v18,%%v18,4\n\t"
"vpdi %%v31,%%v19,%%v19,4\n\t"
/* First multiply-add: x * v0 + y. */
"vfmadb %%v8,%%v8,%%v0,%%v12\n\t"
"vfmadb %%v9,%%v9,%%v0,%%v13\n\t"
"vfmadb %%v10,%%v10,%%v0,%%v14\n\t"
"vfmadb %%v11,%%v11,%%v0,%%v15\n\t"
"vfmadb %%v16,%%v16,%%v0,%%v20\n\t"
"vfmadb %%v17,%%v17,%%v0,%%v21\n\t"
"vfmadb %%v18,%%v18,%%v0,%%v22\n\t"
"vfmadb %%v19,%%v19,%%v0,%%v23\n\t"
/* Second multiply-add: swapped(x) * v1 + previous result. */
"vfmadb %%v8,%%v24,%%v1,%%v8\n\t"
"vfmadb %%v9,%%v25,%%v1,%%v9\n\t"
"vfmadb %%v10,%%v26,%%v1,%%v10\n\t"
"vfmadb %%v11,%%v27,%%v1,%%v11\n\t"
"vfmadb %%v16,%%v28,%%v1,%%v16\n\t"
"vfmadb %%v17,%%v29,%%v1,%%v17\n\t"
"vfmadb %%v18,%%v30,%%v1,%%v18\n\t"
"vfmadb %%v19,%%v31,%%v1,%%v19\n\t"
/* Write the eight result vectors back to y. */
"vst %%v8,0(%%r1,%[y])\n\t"
"vst %%v9,16(%%r1,%[y])\n\t"
"vst %%v10,32(%%r1,%[y])\n\t"
"vst %%v11,48(%%r1,%[y])\n\t"
"vst %%v16,64(%%r1,%[y])\n\t"
"vst %%v17,80(%%r1,%[y])\n\t"
"vst %%v18,96(%%r1,%[y])\n\t"
"vst %%v19,112(%%r1,%[y])\n\t"
/* Advance 128 bytes and loop while the decremented count is non-zero. */
"agfi %%r1,128\n\t"
"brctg %[n],0b"
/* The unnamed memory operands are not referenced by the template; they
   declare that the asm reads and writes n * 2 FLOATs at y and reads
   n * 2 FLOATs at x and 2 FLOATs at alpha. */
: "+m"(*(struct { FLOAT x[n * 2]; } *) y),[n] "+&r"(n)
: [y] "a"(y), "m"(*(const struct { FLOAT x[n * 2]; } *) x),[x] "a"(x),
"m"(*(const struct { FLOAT x[2]; } *) alpha),[alpha] "a"(alpha)
: "cc", "r1", "v0", "v1", "v8", "v9", "v10", "v11", "v12", "v13",
"v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23",
"v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31");
}
int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i,

View File

@ -29,16 +29,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
/*
 * zcopy_kernel_16: copies from x to y in 256-byte chunks with mvc. The loop
 * count is n >> 4, and both pointers are advanced by 256 bytes per
 * iteration (the asm modifies its local copies of x, y and n).
 *
 *   n  element count; shifted right by 4 to form the loop count.
 *   x  source, read only.
 *   y  destination, written.
 *
 * NOTE(review): this text is a rendered diff hunk without +/- markers. The
 * body below appears twice: the first copy looks like the pre-change lines
 * and the second copy like the post-change lines. The instruction lines are
 * the same in both copies; only the spelling of the memory operands differs.
 * NOTE(review): brctg decrements before it tests, so the loop body runs at
 * least once; presumably callers pass n as a non-zero multiple of 16 --
 * confirm.
 */
static void zcopy_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y) {
__asm__("srlg %[n],%[n],4\n\t"
/* ---- first copy of the changed lines (appears to be the pre-change text) ---- */
"0:\n\t"
"pfd 1, 1024(%[x])\n\t"
"pfd 2, 1024(%[y])\n\t"
"mvc 0(256,%[y]),0(%[x])\n\t"
"la %[x],256(%[x])\n\t"
"la %[y],256(%[y])\n\t"
"brctg %[n],0b"
: "=m"(*(FLOAT (*)[n * 2]) y),[x] "+&a"(x),[y] "+&a"(y),[n] "+&r"(n)
: "m"(*(const FLOAT (*)[n * 2]) x)
: "cc");
/* ---- second copy of the changed lines (appears to be the post-change text) ---- */
"0:\n\t"
"pfd 1, 1024(%[x])\n\t"
"pfd 2, 1024(%[y])\n\t"
/* Copy 256 bytes, then step both addresses forward by the same amount. */
"mvc 0(256,%[y]),0(%[x])\n\t"
"la %[x],256(%[x])\n\t"
"la %[y],256(%[y])\n\t"
"brctg %[n],0b"
/* The unnamed memory operands are not referenced by the template; they
   declare the n * 2 FLOATs at y as written and those at x as read. */
: "=m"(*(struct { FLOAT x[n * 2]; } *) y),[x] "+&a"(x),[y] "+&a"(y),
[n] "+&r"(n)
: "m"(*(const struct { FLOAT x[n * 2]; } *) x)
: "cc");
}
int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) {

View File

@ -29,76 +29,76 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
/*
 * zdot_kernel_8: accumulates four partial dot products of x and y and
 * stores them to d. Following the lane layout of the loads, the lane swap
 * (vpdi ...,4) and the final vsteg lane indices:
 *     d[0] = sum of x[2k]   * y[2k]
 *     d[1] = sum of x[2k+1] * y[2k+1]
 *     d[2] = sum of x[2k]   * y[2k+1]
 *     d[3] = sum of x[2k+1] * y[2k]
 * over the pairs visited. Presumably the caller combines these into the
 * complex (conjugated or unconjugated) dot product -- confirm.
 *
 *   n  element count; shifted right by 3 to form the loop count, and each
 *      iteration consumes 128 bytes of x and of y.
 *   x  first input vector, read only.
 *   y  second input vector, read only.
 *   d  output, four FLOATs written.
 *
 * NOTE(review): this text is a rendered diff hunk without +/- markers. The
 * body below appears twice: the first copy looks like the pre-change lines
 * and the second copy like the post-change lines. The instruction lines are
 * the same in both copies; only the spelling of the memory operands differs.
 * NOTE(review): brctg decrements before it tests, so the loop body runs at
 * least once; presumably callers pass n as a non-zero multiple of 8 --
 * confirm.
 */
static void zdot_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *d) {
__asm__("vzero %%v24\n\t"
/* ---- first copy of the changed lines (appears to be the pre-change text) ---- */
"vzero %%v25\n\t"
"vzero %%v26\n\t"
"vzero %%v27\n\t"
"vzero %%v28\n\t"
"vzero %%v29\n\t"
"vzero %%v30\n\t"
"vzero %%v31\n\t"
"srlg %[n],%[n],3\n\t"
"xgr %%r1,%%r1\n\t"
"0:\n\t"
"pfd 1, 1024(%%r1,%[x])\n\t"
"pfd 1, 1024(%%r1,%[y])\n\t"
"vl %%v16, 0(%%r1,%[x])\n\t"
"vl %%v17, 16(%%r1,%[x])\n\t"
"vl %%v18, 32(%%r1,%[x])\n\t"
"vl %%v19, 48(%%r1,%[x])\n\t"
"vl %%v0, 0(%%r1,%[y])\n\t"
"vl %%v1, 16(%%r1,%[y])\n\t"
"vl %%v2, 32(%%r1,%[y])\n\t"
"vl %%v3, 48(%%r1,%[y])\n\t"
"vpdi %%v20,%%v16,%%v16,4\n\t"
"vpdi %%v21,%%v17,%%v17,4\n\t"
"vpdi %%v22,%%v18,%%v18,4\n\t"
"vpdi %%v23,%%v19,%%v19,4\n\t"
"vfmadb %%v24,%%v16,%%v0,%%v24\n\t"
"vfmadb %%v25,%%v20,%%v0,%%v25\n\t"
"vfmadb %%v26,%%v17,%%v1,%%v26\n\t"
"vfmadb %%v27,%%v21,%%v1,%%v27\n\t"
"vfmadb %%v28,%%v18,%%v2,%%v28\n\t"
"vfmadb %%v29,%%v22,%%v2,%%v29\n\t"
"vfmadb %%v30,%%v19,%%v3,%%v30\n\t"
"vfmadb %%v31,%%v23,%%v3,%%v31\n\t"
"vl %%v16, 64(%%r1,%[x])\n\t"
"vl %%v17, 80(%%r1,%[x])\n\t"
"vl %%v18, 96(%%r1,%[x])\n\t"
"vl %%v19, 112(%%r1,%[x])\n\t"
"vl %%v0, 64(%%r1,%[y])\n\t"
"vl %%v1, 80(%%r1,%[y])\n\t"
"vl %%v2, 96(%%r1,%[y])\n\t"
"vl %%v3, 112(%%r1,%[y])\n\t"
"vpdi %%v20,%%v16,%%v16,4\n\t"
"vpdi %%v21,%%v17,%%v17,4\n\t"
"vpdi %%v22,%%v18,%%v18,4\n\t"
"vpdi %%v23,%%v19,%%v19,4\n\t"
"vfmadb %%v24,%%v16,%%v0,%%v24\n\t"
"vfmadb %%v25,%%v20,%%v0,%%v25\n\t"
"vfmadb %%v26,%%v17,%%v1,%%v26\n\t"
"vfmadb %%v27,%%v21,%%v1,%%v27\n\t"
"vfmadb %%v28,%%v18,%%v2,%%v28\n\t"
"vfmadb %%v29,%%v22,%%v2,%%v29\n\t"
"vfmadb %%v30,%%v19,%%v3,%%v30\n\t"
"vfmadb %%v31,%%v23,%%v3,%%v31\n\t"
"agfi %%r1,128\n\t"
"brctg %[n],0b\n\t"
"vfadb %%v24,%%v24,%%v26\n\t"
"vfadb %%v24,%%v24,%%v28\n\t"
"vfadb %%v24,%%v24,%%v30\n\t"
"vfadb %%v25,%%v25,%%v27\n\t"
"vfadb %%v25,%%v25,%%v29\n\t"
"vfadb %%v25,%%v25,%%v31\n\t"
"vsteg %%v24,0(%[d]),0\n\t"
"vsteg %%v24,8(%[d]),1\n\t"
"vsteg %%v25,16(%[d]),1\n\t"
"vsteg %%v25,24(%[d]),0"
/* In this copy the memory operands are spelled as pointer-to-array casts. */
: "=m"(*(FLOAT (*)[4]) d),[n] "+&r"(n)
: [d] "a"(d), "m"(*(const FLOAT (*)[n * 2]) x),[x] "a"(x),
"m"(*(const FLOAT (*)[n * 2]) y),[y] "a"(y)
: "cc", "r1", "v0", "v1", "v2", "v3", "v16", "v17", "v18", "v19", "v20",
"v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30",
"v31");
/* ---- second copy of the changed lines (appears to be the post-change text) ---- */
/* Clear the accumulators v24..v31 (v24 is cleared by the shared first line
   of the asm statement). */
"vzero %%v25\n\t"
"vzero %%v26\n\t"
"vzero %%v27\n\t"
"vzero %%v28\n\t"
"vzero %%v29\n\t"
"vzero %%v30\n\t"
"vzero %%v31\n\t"
/* Loop count = n >> 3; r1 is the running byte offset into x and y. */
"srlg %[n],%[n],3\n\t"
"xgr %%r1,%%r1\n\t"
"0:\n\t"
"pfd 1, 1024(%%r1,%[x])\n\t"
"pfd 1, 1024(%%r1,%[y])\n\t"
/* First 64 bytes: x in v16..v19, y in v0..v3, lane-swapped x in v20..v23. */
"vl %%v16, 0(%%r1,%[x])\n\t"
"vl %%v17, 16(%%r1,%[x])\n\t"
"vl %%v18, 32(%%r1,%[x])\n\t"
"vl %%v19, 48(%%r1,%[x])\n\t"
"vl %%v0, 0(%%r1,%[y])\n\t"
"vl %%v1, 16(%%r1,%[y])\n\t"
"vl %%v2, 32(%%r1,%[y])\n\t"
"vl %%v3, 48(%%r1,%[y])\n\t"
"vpdi %%v20,%%v16,%%v16,4\n\t"
"vpdi %%v21,%%v17,%%v17,4\n\t"
"vpdi %%v22,%%v18,%%v18,4\n\t"
"vpdi %%v23,%%v19,%%v19,4\n\t"
/* Even accumulators take x * y, odd accumulators take swapped(x) * y. */
"vfmadb %%v24,%%v16,%%v0,%%v24\n\t"
"vfmadb %%v25,%%v20,%%v0,%%v25\n\t"
"vfmadb %%v26,%%v17,%%v1,%%v26\n\t"
"vfmadb %%v27,%%v21,%%v1,%%v27\n\t"
"vfmadb %%v28,%%v18,%%v2,%%v28\n\t"
"vfmadb %%v29,%%v22,%%v2,%%v29\n\t"
"vfmadb %%v30,%%v19,%%v3,%%v30\n\t"
"vfmadb %%v31,%%v23,%%v3,%%v31\n\t"
/* Second 64 bytes of the iteration: same sequence at offsets 64..112. */
"vl %%v16, 64(%%r1,%[x])\n\t"
"vl %%v17, 80(%%r1,%[x])\n\t"
"vl %%v18, 96(%%r1,%[x])\n\t"
"vl %%v19, 112(%%r1,%[x])\n\t"
"vl %%v0, 64(%%r1,%[y])\n\t"
"vl %%v1, 80(%%r1,%[y])\n\t"
"vl %%v2, 96(%%r1,%[y])\n\t"
"vl %%v3, 112(%%r1,%[y])\n\t"
"vpdi %%v20,%%v16,%%v16,4\n\t"
"vpdi %%v21,%%v17,%%v17,4\n\t"
"vpdi %%v22,%%v18,%%v18,4\n\t"
"vpdi %%v23,%%v19,%%v19,4\n\t"
"vfmadb %%v24,%%v16,%%v0,%%v24\n\t"
"vfmadb %%v25,%%v20,%%v0,%%v25\n\t"
"vfmadb %%v26,%%v17,%%v1,%%v26\n\t"
"vfmadb %%v27,%%v21,%%v1,%%v27\n\t"
"vfmadb %%v28,%%v18,%%v2,%%v28\n\t"
"vfmadb %%v29,%%v22,%%v2,%%v29\n\t"
"vfmadb %%v30,%%v19,%%v3,%%v30\n\t"
"vfmadb %%v31,%%v23,%%v3,%%v31\n\t"
/* Advance 128 bytes and loop while the decremented count is non-zero. */
"agfi %%r1,128\n\t"
"brctg %[n],0b\n\t"
/* Collapse even accumulators into v24 and odd accumulators into v25. */
"vfadb %%v24,%%v24,%%v26\n\t"
"vfadb %%v24,%%v24,%%v28\n\t"
"vfadb %%v24,%%v24,%%v30\n\t"
"vfadb %%v25,%%v25,%%v27\n\t"
"vfadb %%v25,%%v25,%%v29\n\t"
"vfadb %%v25,%%v25,%%v31\n\t"
/* Store v24 lanes 0,1 to d[0],d[1]; v25 lanes 1,0 to d[2],d[3]. */
"vsteg %%v24,0(%[d]),0\n\t"
"vsteg %%v24,8(%[d]),1\n\t"
"vsteg %%v25,16(%[d]),1\n\t"
"vsteg %%v25,24(%[d]),0"
/* The unnamed memory operands are not referenced by the template; they
   declare the 4 FLOATs at d as written and the n * 2 FLOATs at x and at y
   as read. */
: "=m"(*(struct { FLOAT x[4]; } *) d),[n] "+&r"(n)
: [d] "a"(d), "m"(*(const struct { FLOAT x[n * 2]; } *) x),[x] "a"(x),
"m"(*(const struct { FLOAT x[n * 2]; } *) y),[y] "a"(y)
: "cc", "r1", "v0", "v1", "v2", "v3", "v16", "v17", "v18", "v19", "v20",
"v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30",
"v31");
}
OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y,

View File

@ -30,235 +30,243 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define NBMAX 1024
/*
 * Complex GEMV (non-transposed) micro-kernel for four columns.
 *
 * For every complex element of y, accumulates the contributions of the four
 * columns ap[0..3], each multiplied by the matching complex value of x.
 *
 *   n   number of complex elements to process.  Two elements are handled per
 *       iteration (n >> 1 iterations) and the counter is tested at the bottom
 *       of the loop (brctg), so the body always runs at least once.
 *       NOTE(review): callers presumably pass an even n >= 2 -- confirm.
 *   ap  ap[0..3] are the column pointers; each column is read as n
 *       interleaved (re, im) pairs.
 *   x   four complex multipliers (8 FLOATs), loaded once before the loop.
 *   y   n complex values (2 * n FLOATs), updated in place.
 *
 * Every access uses 8-byte elements and double-precision (*db) instructions,
 * i.e. the kernel assumes FLOAT is double.
 */
static void zgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) {
  register FLOAT *ap0 = ap[0];
  register FLOAT *ap1 = ap[1];
  register FLOAT *ap2 = ap[2];
  register FLOAT *ap3 = ap[3];

  __asm__(/* v16..v19 = x[0..3], each as (re, im) */
          "vl %%v16,0(%[x])\n\t"
          "vl %%v17,16(%[x])\n\t"
          "vl %%v18,32(%[x])\n\t"
          "vl %%v19,48(%[x])\n\t"
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
          /* v20..v23 = (-im, re) of x[0..3]: im goes to element 0, is
             negated there, then re is loaded into element 1 */
          "vleg %%v20,8(%[x]),0\n\t"
          "wflcdb %%v20,%%v20\n\t"
          "vleg %%v20,0(%[x]),1\n\t"
          "vleg %%v21,24(%[x]),0\n\t"
          "wflcdb %%v21,%%v21\n\t"
          "vleg %%v21,16(%[x]),1\n\t"
          "vleg %%v22,40(%[x]),0\n\t"
          "wflcdb %%v22,%%v22\n\t"
          "vleg %%v22,32(%[x]),1\n\t"
          "vleg %%v23,56(%[x]),0\n\t"
          "wflcdb %%v23,%%v23\n\t"
          "vleg %%v23,48(%[x]),1\n\t"
#else
          /* v20..v23 = (im, -re) of x[0..3]: re goes to element 1, the
             vector is negated, then im overwrites element 0 */
          "vleg %%v20,0(%[x]),1\n\t"
          "vflcdb %%v20,%%v20\n\t"
          "vleg %%v20,8(%[x]),0\n\t"
          "vleg %%v21,16(%[x]),1\n\t"
          "vflcdb %%v21,%%v21\n\t"
          "vleg %%v21,24(%[x]),0\n\t"
          "vleg %%v22,32(%[x]),1\n\t"
          "vflcdb %%v22,%%v22\n\t"
          "vleg %%v22,40(%[x]),0\n\t"
          "vleg %%v23,48(%[x]),1\n\t"
          "vflcdb %%v23,%%v23\n\t"
          "vleg %%v23,56(%[x]),0\n\t"
#endif
          /* r1 = running byte offset, n = number of 2-element iterations */
          "xgr %%r1,%%r1\n\t"
          "srlg %[n],%[n],1\n\t"
          "0:\n\t"
          "pfd 1,1024(%%r1,%[ap0])\n\t"
          "pfd 1,1024(%%r1,%[ap1])\n\t"
          "pfd 1,1024(%%r1,%[ap2])\n\t"
          "pfd 1,1024(%%r1,%[ap3])\n\t"
          "pfd 2,1024(%%r1,%[y])\n\t"
          /* v0, v1 = two consecutive complex elements of y */
          "vl %%v0,0(%%r1,%[y])\n\t"
          "vl %%v1,16(%%r1,%[y])\n\t"
          /* columns 0 and 1: broadcast re and im of both rows */
          "vlrepg %%v24,0(%%r1,%[ap0])\n\t"
          "vlrepg %%v25,8(%%r1,%[ap0])\n\t"
          "vlrepg %%v26,0(%%r1,%[ap1])\n\t"
          "vlrepg %%v27,8(%%r1,%[ap1])\n\t"
          "vlrepg %%v28,16(%%r1,%[ap0])\n\t"
          "vlrepg %%v29,24(%%r1,%[ap0])\n\t"
          "vlrepg %%v30,16(%%r1,%[ap1])\n\t"
          "vlrepg %%v31,24(%%r1,%[ap1])\n\t"
          /* y += a_re * (x_re, x_im) + a_im * (swapped/negated x) */
          "vfmadb %%v0,%%v24,%%v16,%%v0\n\t"
          "vfmadb %%v1,%%v28,%%v16,%%v1\n\t"
          "vfmadb %%v0,%%v25,%%v20,%%v0\n\t"
          "vfmadb %%v1,%%v29,%%v20,%%v1\n\t"
          "vfmadb %%v0,%%v26,%%v17,%%v0\n\t"
          "vfmadb %%v1,%%v30,%%v17,%%v1\n\t"
          "vfmadb %%v0,%%v27,%%v21,%%v0\n\t"
          "vfmadb %%v1,%%v31,%%v21,%%v1\n\t"
          /* columns 2 and 3, same scheme */
          "vlrepg %%v24,0(%%r1,%[ap2])\n\t"
          "vlrepg %%v25,8(%%r1,%[ap2])\n\t"
          "vlrepg %%v26,0(%%r1,%[ap3])\n\t"
          "vlrepg %%v27,8(%%r1,%[ap3])\n\t"
          "vlrepg %%v28,16(%%r1,%[ap2])\n\t"
          "vlrepg %%v29,24(%%r1,%[ap2])\n\t"
          "vlrepg %%v30,16(%%r1,%[ap3])\n\t"
          "vlrepg %%v31,24(%%r1,%[ap3])\n\t"
          "vfmadb %%v0,%%v24,%%v18,%%v0\n\t"
          "vfmadb %%v1,%%v28,%%v18,%%v1\n\t"
          "vfmadb %%v0,%%v25,%%v22,%%v0\n\t"
          "vfmadb %%v1,%%v29,%%v22,%%v1\n\t"
          "vfmadb %%v0,%%v26,%%v19,%%v0\n\t"
          "vfmadb %%v1,%%v30,%%v19,%%v1\n\t"
          "vfmadb %%v0,%%v27,%%v23,%%v0\n\t"
          "vfmadb %%v1,%%v31,%%v23,%%v1\n\t"
          "vst %%v0,0(%%r1,%[y])\n\t"
          "vst %%v1,16(%%r1,%[y])\n\t"
          "agfi %%r1,32\n\t"
          "brctg %[n],0b"
          /* The struct-wrapped "m" operands tell the compiler which memory
             the asm reads and writes. */
          : "+m"(*(struct { FLOAT x[n * 2]; } *) y), [n] "+&r"(n)
          : [y] "a"(y),
            "m"(*(const struct { FLOAT x[n * 2]; } *) ap0), [ap0] "a"(ap0),
            "m"(*(const struct { FLOAT x[n * 2]; } *) ap1), [ap1] "a"(ap1),
            "m"(*(const struct { FLOAT x[n * 2]; } *) ap2), [ap2] "a"(ap2),
            "m"(*(const struct { FLOAT x[n * 2]; } *) ap3), [ap3] "a"(ap3),
            "m"(*(const struct { FLOAT x[8]; } *) x), [x] "a"(x)
          : "cc", "r1", "v0", "v1", "v16", "v17", "v18", "v19", "v20", "v21",
            "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30",
            "v31");
}
/*
 * Complex GEMV (non-transposed) micro-kernel for two columns.
 *
 * For every complex element of y, accumulates the contributions of the two
 * columns ap[0..1], each multiplied by the matching complex value of x.
 *
 *   n   number of complex elements to process.  Two elements are handled per
 *       iteration (n >> 1 iterations) and the counter is tested at the bottom
 *       of the loop (brctg), so the body always runs at least once.
 *       NOTE(review): callers presumably pass an even n >= 2 -- confirm.
 *   ap  ap[0..1] are the column pointers; each column is read as n
 *       interleaved (re, im) pairs.
 *   x   two complex multipliers (4 FLOATs), loaded once before the loop.
 *   y   n complex values (2 * n FLOATs), updated in place.
 *
 * Every access uses 8-byte elements and double-precision (*db) instructions,
 * i.e. the kernel assumes FLOAT is double.
 */
static void zgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) {
  register FLOAT *ap0 = ap[0];
  register FLOAT *ap1 = ap[1];

  __asm__(/* v16, v17 = x[0], x[1] as (re, im) */
          "vl %%v16,0(%[x])\n\t"
          "vl %%v17,16(%[x])\n\t"
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
          /* v18, v19 = (-im, re) of x[0], x[1] */
          "vleg %%v18,8(%[x]),0\n\t"
          "wflcdb %%v18,%%v18\n\t"
          "vleg %%v18,0(%[x]),1\n\t"
          "vleg %%v19,24(%[x]),0\n\t"
          "wflcdb %%v19,%%v19\n\t"
          "vleg %%v19,16(%[x]),1\n\t"
#else
          /* v18, v19 = (im, -re) of x[0], x[1] */
          "vleg %%v18,0(%[x]),1\n\t"
          "vflcdb %%v18,%%v18\n\t"
          "vleg %%v18,8(%[x]),0\n\t"
          "vleg %%v19,16(%[x]),1\n\t"
          "vflcdb %%v19,%%v19\n\t"
          "vleg %%v19,24(%[x]),0\n\t"
#endif
          /* r1 = running byte offset, n = number of 2-element iterations */
          "xgr %%r1,%%r1\n\t"
          "srlg %[n],%[n],1\n\t"
          "0:\n\t"
          "pfd 1,1024(%%r1,%[ap0])\n\t"
          "pfd 1,1024(%%r1,%[ap1])\n\t"
          "pfd 2,1024(%%r1,%[y])\n\t"
          /* v0, v1 = two consecutive complex elements of y */
          "vl %%v0,0(%%r1,%[y])\n\t"
          "vl %%v1,16(%%r1,%[y])\n\t"
          /* broadcast re and im of both rows of each column */
          "vlrepg %%v20,0(%%r1,%[ap0])\n\t"
          "vlrepg %%v21,8(%%r1,%[ap0])\n\t"
          "vlrepg %%v22,0(%%r1,%[ap1])\n\t"
          "vlrepg %%v23,8(%%r1,%[ap1])\n\t"
          "vlrepg %%v24,16(%%r1,%[ap0])\n\t"
          "vlrepg %%v25,24(%%r1,%[ap0])\n\t"
          "vlrepg %%v26,16(%%r1,%[ap1])\n\t"
          "vlrepg %%v27,24(%%r1,%[ap1])\n\t"
          /* y += a_re * (x_re, x_im) + a_im * (swapped/negated x) */
          "vfmadb %%v0,%%v20,%%v16,%%v0\n\t"
          "vfmadb %%v1,%%v24,%%v16,%%v1\n\t"
          "vfmadb %%v0,%%v21,%%v18,%%v0\n\t"
          "vfmadb %%v1,%%v25,%%v18,%%v1\n\t"
          "vfmadb %%v0,%%v22,%%v17,%%v0\n\t"
          "vfmadb %%v1,%%v26,%%v17,%%v1\n\t"
          "vfmadb %%v0,%%v23,%%v19,%%v0\n\t"
          "vfmadb %%v1,%%v27,%%v19,%%v1\n\t"
          "vst %%v0,0(%%r1,%[y])\n\t"
          "vst %%v1,16(%%r1,%[y])\n\t"
          "agfi %%r1,32\n\t"
          "brctg %[n],0b"
          : "+m"(*(struct { FLOAT x[n * 2]; } *) y), [n] "+&r"(n)
          : [y] "a"(y),
            "m"(*(const struct { FLOAT x[n * 2]; } *) ap0), [ap0] "a"(ap0),
            "m"(*(const struct { FLOAT x[n * 2]; } *) ap1), [ap1] "a"(ap1),
            "m"(*(const struct { FLOAT x[4]; } *) x), [x] "a"(x)
          : "cc", "r1", "v0", "v1", "v16", "v17", "v18", "v19", "v20", "v21",
            "v22", "v23", "v24", "v25", "v26", "v27");
}
/*
 * Complex GEMV (non-transposed) micro-kernel for a single column.
 *
 * For every complex element of y, accumulates the matching element of the
 * column ap multiplied by the single complex value in x.
 *
 *   n   number of complex elements to process.  Two elements are handled per
 *       iteration (n >> 1 iterations) and the counter is tested at the bottom
 *       of the loop (brctg), so the body always runs at least once.
 *       NOTE(review): callers presumably pass an even n >= 2 -- confirm.
 *   ap  the column, read as n interleaved (re, im) pairs.
 *   x   one complex multiplier (2 FLOATs), loaded once before the loop.
 *   y   n complex values (2 * n FLOATs), updated in place.
 *
 * Every access uses 8-byte elements and double-precision (*db) instructions,
 * i.e. the kernel assumes FLOAT is double.
 */
static void zgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) {
  __asm__(/* v16 = x[0] as (re, im) */
          "vl %%v16,0(%[x])\n\t"
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
          /* v17 = (-im, re) of x[0] */
          "vleg %%v17,8(%[x]),0\n\t"
          "wflcdb %%v17,%%v17\n\t"
          "vleg %%v17,0(%[x]),1\n\t"
#else
          /* v17 = (im, -re) of x[0] */
          "vleg %%v17,0(%[x]),1\n\t"
          "vflcdb %%v17,%%v17\n\t"
          "vleg %%v17,8(%[x]),0\n\t"
#endif
          /* r1 = running byte offset, n = number of 2-element iterations */
          "xgr %%r1,%%r1\n\t"
          "srlg %[n],%[n],1\n\t"
          "0:\n\t"
          "pfd 1,1024(%%r1,%[ap])\n\t"
          "pfd 2,1024(%%r1,%[y])\n\t"
          /* v0, v1 = two consecutive complex elements of y */
          "vl %%v0,0(%%r1,%[y])\n\t"
          "vl %%v1,16(%%r1,%[y])\n\t"
          /* broadcast re and im of the two column elements */
          "vlrepg %%v18,0(%%r1,%[ap])\n\t"
          "vlrepg %%v19,8(%%r1,%[ap])\n\t"
          "vlrepg %%v20,16(%%r1,%[ap])\n\t"
          "vlrepg %%v21,24(%%r1,%[ap])\n\t"
          /* y += a_re * (x_re, x_im) + a_im * (swapped/negated x) */
          "vfmadb %%v0,%%v18,%%v16,%%v0\n\t"
          "vfmadb %%v1,%%v20,%%v16,%%v1\n\t"
          "vfmadb %%v0,%%v19,%%v17,%%v0\n\t"
          "vfmadb %%v1,%%v21,%%v17,%%v1\n\t"
          "vst %%v0,0(%%r1,%[y])\n\t"
          "vst %%v1,16(%%r1,%[y])\n\t"
          "agfi %%r1,32\n\t"
          "brctg %[n],0b"
          : "+m"(*(struct { FLOAT x[n * 2]; } *) y), [n] "+&r"(n)
          : [y] "a"(y),
            "m"(*(const struct { FLOAT x[n * 2]; } *) ap), [ap] "a"(ap),
            "m"(*(const struct { FLOAT x[2]; } *) x), [x] "a"(x)
          : "cc", "r1", "v0", "v1", "v16", "v17", "v18", "v19", "v20", "v21");
}
/*
 * dest += alpha * src for contiguous complex vectors, four complex elements
 * per iteration.
 *
 * With XCONJ undefined the update per element is
 *   (d_re + s_re*a_r - s_im*a_i,  d_im + s_im*a_r + s_re*a_i);
 * with XCONJ defined it is
 *   (d_re + s_re*a_r + s_im*a_i,  d_im - s_im*a_r + s_re*a_i).
 *
 *   n        number of complex elements.  The loop runs n >> 2 times and is
 *            count-tested at the bottom (brctg), so the body always runs at
 *            least once.  NOTE(review): callers presumably pass a multiple of
 *            4 that is >= 4 -- confirm.
 *   src      n interleaved (re, im) pairs, read only.
 *   dest     n interleaved (re, im) pairs, updated in place.
 *   alpha_r  real part of the scale factor (read from memory by the asm).
 *   alpha_i  imaginary part of the scale factor (read from memory by the asm).
 *
 * Every access uses 8-byte elements and double-precision (*db) instructions,
 * i.e. the kernel assumes FLOAT is double.
 */
static void add_y_4(BLASLONG n, FLOAT *src, FLOAT *dest, FLOAT alpha_r,
                    FLOAT alpha_i) {
  __asm__(
#if !defined(XCONJ)
          /* v0 = (a_r, a_r), v1 = (-a_i, a_i) */
          "vlrepg %%v0,%[alpha_r]\n\t"
          "vleg %%v1,%[alpha_i],0\n\t"
          "wflcdb %%v1,%%v1\n\t"
          "vleg %%v1,%[alpha_i],1\n\t"
#else
          /* v0 = (a_r, -a_r), v1 = (a_i, a_i) */
          "vleg %%v0,%[alpha_r],1\n\t"
          "vflcdb %%v0,%%v0\n\t"
          "vleg %%v0,%[alpha_r],0\n\t"
          "vlrepg %%v1,%[alpha_i]\n\t"
#endif
          /* r1 = running byte offset, n = number of 4-element iterations */
          "xgr %%r1,%%r1\n\t"
          "srlg %[n],%[n],2\n\t"
          "0:\n\t"
          "pfd 1,1024(%%r1,%[src])\n\t"
          "pfd 2,1024(%%r1,%[dest])\n\t"
          "vl %%v16,0(%%r1,%[src])\n\t"
          "vl %%v17,16(%%r1,%[src])\n\t"
          "vl %%v18,32(%%r1,%[src])\n\t"
          "vl %%v19,48(%%r1,%[src])\n\t"
          "vl %%v20,0(%%r1,%[dest])\n\t"
          "vl %%v21,16(%%r1,%[dest])\n\t"
          "vl %%v22,32(%%r1,%[dest])\n\t"
          "vl %%v23,48(%%r1,%[dest])\n\t"
          /* v24..v27 = src elements with re and im swapped */
          "vpdi %%v24,%%v16,%%v16,4\n\t"
          "vpdi %%v25,%%v17,%%v17,4\n\t"
          "vpdi %%v26,%%v18,%%v18,4\n\t"
          "vpdi %%v27,%%v19,%%v19,4\n\t"
          /* result = dest + src * v0 + swapped(src) * v1 */
          "vfmadb %%v28,%%v16,%%v0,%%v20\n\t"
          "vfmadb %%v29,%%v17,%%v0,%%v21\n\t"
          "vfmadb %%v30,%%v18,%%v0,%%v22\n\t"
          "vfmadb %%v31,%%v19,%%v0,%%v23\n\t"
          "vfmadb %%v28,%%v24,%%v1,%%v28\n\t"
          "vfmadb %%v29,%%v25,%%v1,%%v29\n\t"
          "vfmadb %%v30,%%v26,%%v1,%%v30\n\t"
          "vfmadb %%v31,%%v27,%%v1,%%v31\n\t"
          "vst %%v28,0(%%r1,%[dest])\n\t"
          "vst %%v29,16(%%r1,%[dest])\n\t"
          "vst %%v30,32(%%r1,%[dest])\n\t"
          "vst %%v31,48(%%r1,%[dest])\n\t"
          "agfi %%r1,64\n\t"
          "brctg %[n],0b"
          : "+m"(*(struct { FLOAT x[n * 2]; } *) dest), [n] "+&r"(n)
          : [dest] "a"(dest),
            "m"(*(const struct { FLOAT x[n * 2]; } *) src), [src] "a"(src),
            [alpha_r] "m"(alpha_r), [alpha_i] "m"(alpha_i)
          : "cc", "r1", "v0", "v1", "v16", "v17", "v18", "v19", "v20", "v21",
            "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30",
            "v31");
}
static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest,

View File

@ -31,266 +31,274 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
/*
 * Complex GEMV (transposed) micro-kernel for four columns.
 *
 * Forms four complex dot products, one per column ap[0..3], against x over n
 * complex elements, then adds alpha times each result to y[0..3].
 *
 *   n      number of complex elements per column.  Two elements are handled
 *          per iteration (n >> 1 iterations) and the counter is tested at the
 *          bottom of the loop (brctg), so the body always runs at least once.
 *          NOTE(review): callers presumably pass an even n >= 2 -- confirm.
 *   ap     ap[0..3] are the column pointers, n interleaved (re, im) pairs
 *          each.
 *   x      n interleaved (re, im) pairs, read only.
 *   y      four complex results (8 FLOATs), updated in place.
 *   alpha  alpha[0] is the real part and alpha[1] the imaginary part of the
 *          scale factor.
 *
 * Every access uses 8-byte elements and double-precision (*db) instructions,
 * i.e. the kernel assumes FLOAT is double.
 */
static void zgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y,
                             FLOAT *alpha) {
  register FLOAT *ap0 = ap[0];
  register FLOAT *ap1 = ap[1];
  register FLOAT *ap2 = ap[2];
  register FLOAT *ap3 = ap[3];

  __asm__(/* v16..v19 accumulate a_re * x, v20..v23 accumulate a_im * x' */
          "vzero %%v16\n\t"
          "vzero %%v17\n\t"
          "vzero %%v18\n\t"
          "vzero %%v19\n\t"
          "vzero %%v20\n\t"
          "vzero %%v21\n\t"
          "vzero %%v22\n\t"
          "vzero %%v23\n\t"
          /* r1 = running byte offset, n = number of 2-element iterations */
          "xgr %%r1,%%r1\n\t"
          "srlg %[n],%[n],1\n\t"
          "0:\n\t"
          "pfd 1,1024(%%r1,%[ap0])\n\t"
          "pfd 1,1024(%%r1,%[ap1])\n\t"
          "pfd 1,1024(%%r1,%[ap2])\n\t"
          "pfd 1,1024(%%r1,%[ap3])\n\t"
          "pfd 1,1024(%%r1,%[x])\n\t"
          /* first element of the pair: v0 = (x_re, x_im) */
          "vl %%v0,0(%%r1,%[x])\n\t"
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
          /* v1 = (-x_im, x_re) */
          "vleg %%v1,8(%%r1,%[x]),0\n\t"
          "wflcdb %%v1,%%v1\n\t"
          "vleg %%v1,0(%%r1,%[x]),1\n\t"
#else
          /* v1 = (x_im, -x_re) */
          "vleg %%v1,0(%%r1,%[x]),1\n\t"
          "vflcdb %%v1,%%v1\n\t"
          "vleg %%v1,8(%%r1,%[x]),0\n\t"
#endif
          "vlrepg %%v24,0(%%r1,%[ap0])\n\t"
          "vlrepg %%v25,8(%%r1,%[ap0])\n\t"
          "vlrepg %%v26,0(%%r1,%[ap1])\n\t"
          "vlrepg %%v27,8(%%r1,%[ap1])\n\t"
          "vlrepg %%v28,0(%%r1,%[ap2])\n\t"
          "vlrepg %%v29,8(%%r1,%[ap2])\n\t"
          "vlrepg %%v30,0(%%r1,%[ap3])\n\t"
          "vlrepg %%v31,8(%%r1,%[ap3])\n\t"
          "vfmadb %%v16,%%v24,%%v0,%%v16\n\t"
          "vfmadb %%v20,%%v25,%%v1,%%v20\n\t"
          "vfmadb %%v17,%%v26,%%v0,%%v17\n\t"
          "vfmadb %%v21,%%v27,%%v1,%%v21\n\t"
          "vfmadb %%v18,%%v28,%%v0,%%v18\n\t"
          "vfmadb %%v22,%%v29,%%v1,%%v22\n\t"
          "vfmadb %%v19,%%v30,%%v0,%%v19\n\t"
          "vfmadb %%v23,%%v31,%%v1,%%v23\n\t"
          /* second element of the pair, same scheme */
          "vl %%v0,16(%%r1,%[x])\n\t"
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
          "vleg %%v1,24(%%r1,%[x]),0\n\t"
          "wflcdb %%v1,%%v1\n\t"
          "vleg %%v1,16(%%r1,%[x]),1\n\t"
#else
          "vleg %%v1,16(%%r1,%[x]),1\n\t"
          "vflcdb %%v1,%%v1\n\t"
          "vleg %%v1,24(%%r1,%[x]),0\n\t"
#endif
          "vlrepg %%v24,16(%%r1,%[ap0])\n\t"
          "vlrepg %%v25,24(%%r1,%[ap0])\n\t"
          "vlrepg %%v26,16(%%r1,%[ap1])\n\t"
          "vlrepg %%v27,24(%%r1,%[ap1])\n\t"
          "vlrepg %%v28,16(%%r1,%[ap2])\n\t"
          "vlrepg %%v29,24(%%r1,%[ap2])\n\t"
          "vlrepg %%v30,16(%%r1,%[ap3])\n\t"
          "vlrepg %%v31,24(%%r1,%[ap3])\n\t"
          "vfmadb %%v16,%%v24,%%v0,%%v16\n\t"
          "vfmadb %%v20,%%v25,%%v1,%%v20\n\t"
          "vfmadb %%v17,%%v26,%%v0,%%v17\n\t"
          "vfmadb %%v21,%%v27,%%v1,%%v21\n\t"
          "vfmadb %%v18,%%v28,%%v0,%%v18\n\t"
          "vfmadb %%v22,%%v29,%%v1,%%v22\n\t"
          "vfmadb %%v19,%%v30,%%v0,%%v19\n\t"
          "vfmadb %%v23,%%v31,%%v1,%%v23\n\t"
          "agfi %%r1,32\n\t"
          "brctg %[n],0b\n\t"
          /* v16..v19 = the four dot products; v20..v23 = the same with
             re and im swapped */
          "vfadb %%v16,%%v16,%%v20\n\t"
          "vfadb %%v17,%%v17,%%v21\n\t"
          "vfadb %%v18,%%v18,%%v22\n\t"
          "vfadb %%v19,%%v19,%%v23\n\t"
          "vpdi %%v20,%%v16,%%v16,4\n\t"
          "vpdi %%v21,%%v17,%%v17,4\n\t"
          "vpdi %%v22,%%v18,%%v18,4\n\t"
          "vpdi %%v23,%%v19,%%v19,4\n\t"
#if !defined(XCONJ)
          /* v24 = (a_r, a_r), v25 = (-a_i, a_i) */
          "vlrepg %%v24,0(%[alpha])\n\t"
          "vleg %%v25,8(%[alpha]),0\n\t"
          "wflcdb %%v25,%%v25\n\t"
          "vleg %%v25,8(%[alpha]),1\n\t"
#else
          /* v24 = (a_r, -a_r), v25 = (a_i, a_i) */
          "vleg %%v24,0(%[alpha]),1\n\t"
          "vflcdb %%v24,%%v24\n\t"
          "vleg %%v24,0(%[alpha]),0\n\t"
          "vlrepg %%v25,8(%[alpha])\n\t"
#endif
          /* y[j] += dot[j] * v24 + swapped(dot[j]) * v25 */
          "vl %%v26,0(%[y])\n\t"
          "vl %%v27,16(%[y])\n\t"
          "vl %%v28,32(%[y])\n\t"
          "vl %%v29,48(%[y])\n\t"
          "vfmadb %%v26,%%v16,%%v24,%%v26\n\t"
          "vfmadb %%v26,%%v20,%%v25,%%v26\n\t"
          "vfmadb %%v27,%%v17,%%v24,%%v27\n\t"
          "vfmadb %%v27,%%v21,%%v25,%%v27\n\t"
          "vfmadb %%v28,%%v18,%%v24,%%v28\n\t"
          "vfmadb %%v28,%%v22,%%v25,%%v28\n\t"
          "vfmadb %%v29,%%v19,%%v24,%%v29\n\t"
          "vfmadb %%v29,%%v23,%%v25,%%v29\n\t"
          "vst %%v26,0(%[y])\n\t"
          "vst %%v27,16(%[y])\n\t"
          "vst %%v28,32(%[y])\n\t"
          "vst %%v29,48(%[y])"
          : "+m"(*(struct { FLOAT x[8]; } *) y), [n] "+&r"(n)
          : [y] "a"(y),
            "m"(*(const struct { FLOAT x[n * 2]; } *) ap0), [ap0] "a"(ap0),
            "m"(*(const struct { FLOAT x[n * 2]; } *) ap1), [ap1] "a"(ap1),
            "m"(*(const struct { FLOAT x[n * 2]; } *) ap2), [ap2] "a"(ap2),
            "m"(*(const struct { FLOAT x[n * 2]; } *) ap3), [ap3] "a"(ap3),
            "m"(*(const struct { FLOAT x[n * 2]; } *) x), [x] "a"(x),
            "m"(*(const struct { FLOAT x[2]; } *) alpha), [alpha] "a"(alpha)
          : "cc", "r1", "v0", "v1", "v16", "v17", "v18", "v19", "v20", "v21",
            "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30",
            "v31");
}
/*
 * Complex GEMV (transposed) micro-kernel for two columns.
 *
 * Forms two complex dot products, one per column ap[0..1], against x over n
 * complex elements, then adds alpha times each result to y[0..1].
 *
 *   n      number of complex elements per column.  Two elements are handled
 *          per iteration (n >> 1 iterations) and the counter is tested at the
 *          bottom of the loop (brctg), so the body always runs at least once.
 *          NOTE(review): callers presumably pass an even n >= 2 -- confirm.
 *   ap     ap[0..1] are the column pointers, n interleaved (re, im) pairs
 *          each.
 *   x      n interleaved (re, im) pairs, read only.
 *   y      two complex results (4 FLOATs), updated in place.
 *   alpha  alpha[0] is the real part and alpha[1] the imaginary part of the
 *          scale factor.
 *
 * Every access uses 8-byte elements and double-precision (*db) instructions,
 * i.e. the kernel assumes FLOAT is double.
 */
static void zgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y,
                             FLOAT *alpha) {
  register FLOAT *ap0 = ap[0];
  register FLOAT *ap1 = ap[1];

  __asm__(/* v16, v17 accumulate a_re * x; v18, v19 accumulate a_im * x' */
          "vzero %%v16\n\t"
          "vzero %%v17\n\t"
          "vzero %%v18\n\t"
          "vzero %%v19\n\t"
          /* r1 = running byte offset, n = number of 2-element iterations */
          "xgr %%r1,%%r1\n\t"
          "srlg %[n],%[n],1\n\t"
          "0:\n\t"
          "pfd 1,1024(%%r1,%[ap0])\n\t"
          "pfd 1,1024(%%r1,%[ap1])\n\t"
          "pfd 1,1024(%%r1,%[x])\n\t"
          /* first element of the pair: v0 = (x_re, x_im) */
          "vl %%v0,0(%%r1,%[x])\n\t"
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
          /* v1 = (-x_im, x_re) */
          "vleg %%v1,8(%%r1,%[x]),0\n\t"
          "wflcdb %%v1,%%v1\n\t"
          "vleg %%v1,0(%%r1,%[x]),1\n\t"
#else
          /* v1 = (x_im, -x_re) */
          "vleg %%v1,0(%%r1,%[x]),1\n\t"
          "vflcdb %%v1,%%v1\n\t"
          "vleg %%v1,8(%%r1,%[x]),0\n\t"
#endif
          "vlrepg %%v20,0(%%r1,%[ap0])\n\t"
          "vlrepg %%v21,8(%%r1,%[ap0])\n\t"
          "vlrepg %%v22,0(%%r1,%[ap1])\n\t"
          "vlrepg %%v23,8(%%r1,%[ap1])\n\t"
          "vfmadb %%v16,%%v20,%%v0,%%v16\n\t"
          "vfmadb %%v18,%%v21,%%v1,%%v18\n\t"
          "vfmadb %%v17,%%v22,%%v0,%%v17\n\t"
          "vfmadb %%v19,%%v23,%%v1,%%v19\n\t"
          /* second element of the pair, same scheme */
          "vl %%v0,16(%%r1,%[x])\n\t"
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
          "vleg %%v1,24(%%r1,%[x]),0\n\t"
          "wflcdb %%v1,%%v1\n\t"
          "vleg %%v1,16(%%r1,%[x]),1\n\t"
#else
          "vleg %%v1,16(%%r1,%[x]),1\n\t"
          "vflcdb %%v1,%%v1\n\t"
          "vleg %%v1,24(%%r1,%[x]),0\n\t"
#endif
          "vlrepg %%v20,16(%%r1,%[ap0])\n\t"
          "vlrepg %%v21,24(%%r1,%[ap0])\n\t"
          "vlrepg %%v22,16(%%r1,%[ap1])\n\t"
          "vlrepg %%v23,24(%%r1,%[ap1])\n\t"
          "vfmadb %%v16,%%v20,%%v0,%%v16\n\t"
          "vfmadb %%v18,%%v21,%%v1,%%v18\n\t"
          "vfmadb %%v17,%%v22,%%v0,%%v17\n\t"
          "vfmadb %%v19,%%v23,%%v1,%%v19\n\t"
          "agfi %%r1,32\n\t"
          "brctg %[n],0b\n\t"
          /* v16, v17 = the two dot products; v18, v19 = the same with
             re and im swapped */
          "vfadb %%v16,%%v16,%%v18\n\t"
          "vfadb %%v17,%%v17,%%v19\n\t"
          "vpdi %%v18,%%v16,%%v16,4\n\t"
          "vpdi %%v19,%%v17,%%v17,4\n\t"
#if !defined(XCONJ)
          /* v20 = (a_r, a_r), v21 = (-a_i, a_i) */
          "vlrepg %%v20,0(%[alpha])\n\t"
          "vleg %%v21,8(%[alpha]),0\n\t"
          "wflcdb %%v21,%%v21\n\t"
          "vleg %%v21,8(%[alpha]),1\n\t"
#else
          /* v20 = (a_r, -a_r), v21 = (a_i, a_i) */
          "vleg %%v20,0(%[alpha]),1\n\t"
          "vflcdb %%v20,%%v20\n\t"
          "vleg %%v20,0(%[alpha]),0\n\t"
          "vlrepg %%v21,8(%[alpha])\n\t"
#endif
          /* y[j] += dot[j] * v20 + swapped(dot[j]) * v21 */
          "vl %%v22,0(%[y])\n\t"
          "vl %%v23,16(%[y])\n\t"
          "vfmadb %%v22,%%v16,%%v20,%%v22\n\t"
          "vfmadb %%v22,%%v18,%%v21,%%v22\n\t"
          "vfmadb %%v23,%%v17,%%v20,%%v23\n\t"
          "vfmadb %%v23,%%v19,%%v21,%%v23\n\t"
          "vst %%v22,0(%[y])\n\t"
          "vst %%v23,16(%[y])\n\t"
          : "+m"(*(struct { FLOAT x[4]; } *) y), [n] "+&r"(n)
          : [y] "a"(y),
            "m"(*(const struct { FLOAT x[n * 2]; } *) ap0), [ap0] "a"(ap0),
            "m"(*(const struct { FLOAT x[n * 2]; } *) ap1), [ap1] "a"(ap1),
            "m"(*(const struct { FLOAT x[n * 2]; } *) x), [x] "a"(x),
            "m"(*(const struct { FLOAT x[2]; } *) alpha), [alpha] "a"(alpha)
          : "cc", "r1", "v0", "v1", "v16", "v17", "v18", "v19", "v20", "v21",
            "v22", "v23");
}
/*
 * Complex GEMV (transposed) micro-kernel for a single column.
 *
 * Forms the complex dot product of the column ap against x over n complex
 * elements, then adds alpha times the result to y[0].
 *
 *   n      number of complex elements.  Two elements are handled per
 *          iteration (n >> 1 iterations) and the counter is tested at the
 *          bottom of the loop (brctg), so the body always runs at least once.
 *          NOTE(review): callers presumably pass an even n >= 2 -- confirm.
 *   ap     the column, n interleaved (re, im) pairs.
 *   x      n interleaved (re, im) pairs, read only.
 *   y      one complex result (2 FLOATs), updated in place.
 *   alpha  alpha[0] is the real part and alpha[1] the imaginary part of the
 *          scale factor.
 *
 * Every access uses 8-byte elements and double-precision (*db) instructions,
 * i.e. the kernel assumes FLOAT is double.
 */
static void zgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y,
                             FLOAT *alpha) {
  __asm__(/* v16 accumulates a_re * x, v17 accumulates a_im * x' */
          "vzero %%v16\n\t"
          "vzero %%v17\n\t"
          /* r1 = running byte offset, n = number of 2-element iterations */
          "xgr %%r1,%%r1\n\t"
          "srlg %[n],%[n],1\n\t"
          "0:\n\t"
          "pfd 1,1024(%%r1,%[ap])\n\t"
          "pfd 1,1024(%%r1,%[x])\n\t"
          /* first element of the pair: v0 = (x_re, x_im) */
          "vl %%v0,0(%%r1,%[x])\n\t"
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
          /* v1 = (-x_im, x_re) */
          "vleg %%v1,8(%%r1,%[x]),0\n\t"
          "wflcdb %%v1,%%v1\n\t"
          "vleg %%v1,0(%%r1,%[x]),1\n\t"
#else
          /* v1 = (x_im, -x_re) */
          "vleg %%v1,0(%%r1,%[x]),1\n\t"
          "vflcdb %%v1,%%v1\n\t"
          "vleg %%v1,8(%%r1,%[x]),0\n\t"
#endif
          "vlrepg %%v18,0(%%r1,%[ap])\n\t"
          "vlrepg %%v19,8(%%r1,%[ap])\n\t"
          "vfmadb %%v16,%%v18,%%v0,%%v16\n\t"
          "vfmadb %%v17,%%v19,%%v1,%%v17\n\t"
          /* second element of the pair, same scheme */
          "vl %%v0,16(%%r1,%[x])\n\t"
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
          "vleg %%v1,24(%%r1,%[x]),0\n\t"
          "wflcdb %%v1,%%v1\n\t"
          "vleg %%v1,16(%%r1,%[x]),1\n\t"
#else
          "vleg %%v1,16(%%r1,%[x]),1\n\t"
          "vflcdb %%v1,%%v1\n\t"
          "vleg %%v1,24(%%r1,%[x]),0\n\t"
#endif
          "vlrepg %%v18,16(%%r1,%[ap])\n\t"
          "vlrepg %%v19,24(%%r1,%[ap])\n\t"
          "vfmadb %%v16,%%v18,%%v0,%%v16\n\t"
          "vfmadb %%v17,%%v19,%%v1,%%v17\n\t"
          "agfi %%r1,32\n\t"
          "brctg %[n],0b\n\t"
          /* v16 = the dot product; v17 = the same with re and im swapped */
          "vfadb %%v16,%%v16,%%v17\n\t"
          "vpdi %%v17,%%v16,%%v16,4\n\t"
#if !defined(XCONJ)
          /* v18 = (a_r, a_r), v19 = (-a_i, a_i) */
          "vlrepg %%v18,0(%[alpha])\n\t"
          "vleg %%v19,8(%[alpha]),0\n\t"
          "wflcdb %%v19,%%v19\n\t"
          "vleg %%v19,8(%[alpha]),1\n\t"
#else
          /* v18 = (a_r, -a_r), v19 = (a_i, a_i) */
          "vleg %%v18,0(%[alpha]),1\n\t"
          "vflcdb %%v18,%%v18\n\t"
          "vleg %%v18,0(%[alpha]),0\n\t"
          "vlrepg %%v19,8(%[alpha])\n\t"
#endif
          /* y[0] += dot * v18 + swapped(dot) * v19 */
          "vl %%v0,0(%[y])\n\t"
          "vfmadb %%v0,%%v16,%%v18,%%v0\n\t"
          "vfmadb %%v0,%%v17,%%v19,%%v0\n\t"
          "vst %%v0,0(%[y])\n\t"
          : "+m"(*(struct { FLOAT x[2]; } *) y), [n] "+&r"(n)
          : [y] "a"(y),
            "m"(*(const struct { FLOAT x[n * 2]; } *) ap), [ap] "a"(ap),
            "m"(*(const struct { FLOAT x[n * 2]; } *) x), [x] "a"(x),
            "m"(*(const struct { FLOAT x[2]; } *) alpha), [alpha] "a"(alpha)
          : "cc", "r1", "v0", "v1", "v16", "v17", "v18", "v19");
}
static void copy_x(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_src) {

View File

@ -29,151 +29,152 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
/*
 * zrot_kernel_16: in-place plane rotation of the vectors x and y, written
 * with z/Architecture vector instructions.
 *
 * For every 64-bit lane the asm computes
 *     x_new = x * c + y * s
 *     y_new = y * c - x * s
 * where *c and *s are replicated into both lanes of v0 and v1 (vlrepg).
 *
 * n is shifted right by 4 to obtain the iteration count.  Each iteration
 * handles 256 bytes of x and 256 bytes of y (four groups of four 16-byte
 * vectors) and then advances the byte offset held in r1 by 256.  The memory
 * operands are declared as n * 2 FLOATs; FLOAT is presumably double here
 * (the asm uses the long-BFP "db" instruction forms) -- confirm.
 *
 * The loop counter is only tested at the bottom (brctg), so the body runs
 * before the first test; callers presumably guarantee n >= 16 -- confirm.
 *
 * NOTE(review): after the shared first asm line, this listing carries the
 * asm text twice, back to back.  The first copy ends with "m" constraints
 * for c/s and `FLOAT (*)[n * 2]` casts on the memory operands; the second
 * copy ends with "Q" constraints and struct-wrapped casts.  The instruction
 * sequences of the two copies are the same.
 */
static void zrot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s) {
__asm__("vlrepg %%v0,%[c]\n\t"
"vlrepg %%v1,%[s]\n\t"
"srlg %[n],%[n],4\n\t"
"xgr %%r1,%%r1\n\t"
"0:\n\t"
"pfd 2, 1024(%%r1,%[x])\n\t"
"pfd 2, 1024(%%r1,%[y])\n\t"
"vl %%v24, 0(%%r1,%[x])\n\t"
"vl %%v25, 16(%%r1,%[x])\n\t"
"vl %%v26, 32(%%r1,%[x])\n\t"
"vl %%v27, 48(%%r1,%[x])\n\t"
"vl %%v16, 0(%%r1,%[y])\n\t"
"vl %%v17, 16(%%r1,%[y])\n\t"
"vl %%v18, 32(%%r1,%[y])\n\t"
"vl %%v19, 48(%%r1,%[y])\n\t"
"vfmdb %%v28,%%v24,%%v0\n\t"
"vfmdb %%v29,%%v25,%%v0\n\t"
"vfmdb %%v20,%%v24,%%v1\n\t" /* yn=x*s */
"vfmdb %%v21,%%v25,%%v1\n\t" /* yn=x*s */
"vfmdb %%v30,%%v26,%%v0\n\t"
"vfmdb %%v22,%%v26,%%v1\n\t" /* yn=x*s */
"vfmdb %%v31,%%v27,%%v0\n\t"
"vfmdb %%v23,%%v27,%%v1\n\t" /* yn=x*s */
/* 2nd parts */
"vfmadb %%v28,%%v16,%%v1,%%v28\n\t"
"vfmsdb %%v20,%%v16,%%v0,%%v20\n\t" /* yn=y*c-yn */
"vfmadb %%v29,%%v17,%%v1,%%v29\n\t"
"vfmsdb %%v21,%%v17,%%v0,%%v21\n\t" /* yn=y*c-yn */
"vfmadb %%v30,%%v18,%%v1,%%v30\n\t"
"vfmsdb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */
"vfmadb %%v31,%%v19,%%v1,%%v31\n\t"
"vfmsdb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */
"vst %%v28, 0(%%r1,%[x])\n\t"
"vst %%v29, 16(%%r1,%[x])\n\t"
"vst %%v30, 32(%%r1,%[x])\n\t"
"vst %%v31, 48(%%r1,%[x])\n\t"
"vst %%v20, 0(%%r1,%[y])\n\t"
"vst %%v21, 16(%%r1,%[y])\n\t"
"vst %%v22, 32(%%r1,%[y])\n\t"
"vst %%v23, 48(%%r1,%[y])\n\t"
"vl %%v24, 64(%%r1,%[x])\n\t"
"vl %%v25, 80(%%r1,%[x])\n\t"
"vl %%v26, 96(%%r1,%[x])\n\t"
"vl %%v27, 112(%%r1,%[x])\n\t"
"vl %%v16, 64(%%r1,%[y])\n\t"
"vl %%v17, 80(%%r1,%[y])\n\t"
"vl %%v18, 96(%%r1,%[y])\n\t"
"vl %%v19, 112(%%r1,%[y])\n\t"
"vfmdb %%v28,%%v24,%%v0\n\t"
"vfmdb %%v29,%%v25,%%v0\n\t"
"vfmdb %%v20,%%v24,%%v1\n\t" /* yn=x*s */
"vfmdb %%v21,%%v25,%%v1\n\t" /* yn=x*s */
"vfmdb %%v30,%%v26,%%v0\n\t"
"vfmdb %%v22,%%v26,%%v1\n\t" /* yn=x*s */
"vfmdb %%v31,%%v27,%%v0\n\t"
"vfmdb %%v23,%%v27,%%v1\n\t" /* yn=x*s */
/* 2nd parts */
"vfmadb %%v28,%%v16,%%v1,%%v28\n\t"
"vfmsdb %%v20,%%v16,%%v0,%%v20\n\t" /* yn=y*c-yn */
"vfmadb %%v29,%%v17,%%v1,%%v29\n\t"
"vfmsdb %%v21,%%v17,%%v0,%%v21\n\t" /* yn=y*c-yn */
"vfmadb %%v30,%%v18,%%v1,%%v30\n\t"
"vfmsdb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */
"vfmadb %%v31,%%v19,%%v1,%%v31\n\t"
"vfmsdb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */
"vst %%v28, 64(%%r1,%[x])\n\t"
"vst %%v29, 80(%%r1,%[x])\n\t"
"vst %%v30, 96(%%r1,%[x])\n\t"
"vst %%v31, 112(%%r1,%[x])\n\t"
"vst %%v20, 64(%%r1,%[y])\n\t"
"vst %%v21, 80(%%r1,%[y])\n\t"
"vst %%v22, 96(%%r1,%[y])\n\t"
"vst %%v23, 112(%%r1,%[y])\n\t"
"vl %%v24, 128(%%r1,%[x])\n\t"
"vl %%v25, 144(%%r1,%[x])\n\t"
"vl %%v26, 160(%%r1,%[x])\n\t"
"vl %%v27, 176(%%r1,%[x])\n\t"
"vl %%v16, 128(%%r1,%[y])\n\t"
"vl %%v17, 144(%%r1,%[y])\n\t"
"vl %%v18, 160(%%r1,%[y])\n\t"
"vl %%v19, 176(%%r1,%[y])\n\t"
"vfmdb %%v28,%%v24,%%v0\n\t"
"vfmdb %%v29,%%v25,%%v0\n\t"
"vfmdb %%v20,%%v24,%%v1\n\t" /* yn=x*s */
"vfmdb %%v21,%%v25,%%v1\n\t" /* yn=x*s */
"vfmdb %%v30,%%v26,%%v0\n\t"
"vfmdb %%v22,%%v26,%%v1\n\t" /* yn=x*s */
"vfmdb %%v31,%%v27,%%v0\n\t"
"vfmdb %%v23,%%v27,%%v1\n\t" /* yn=x*s */
/* 2nd parts */
"vfmadb %%v28,%%v16,%%v1,%%v28\n\t"
"vfmsdb %%v20,%%v16,%%v0,%%v20\n\t" /* yn=y*c-yn */
"vfmadb %%v29,%%v17,%%v1,%%v29\n\t"
"vfmsdb %%v21,%%v17,%%v0,%%v21\n\t" /* yn=y*c-yn */
"vfmadb %%v30,%%v18,%%v1,%%v30\n\t"
"vfmsdb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */
"vfmadb %%v31,%%v19,%%v1,%%v31\n\t"
"vfmsdb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */
"vst %%v28, 128(%%r1,%[x])\n\t"
"vst %%v29, 144(%%r1,%[x])\n\t"
"vst %%v30, 160(%%r1,%[x])\n\t"
"vst %%v31, 176(%%r1,%[x])\n\t"
"vst %%v20, 128(%%r1,%[y])\n\t"
"vst %%v21, 144(%%r1,%[y])\n\t"
"vst %%v22, 160(%%r1,%[y])\n\t"
"vst %%v23, 176(%%r1,%[y])\n\t"
"vl %%v24, 192(%%r1,%[x])\n\t"
"vl %%v25, 208(%%r1,%[x])\n\t"
"vl %%v26, 224(%%r1,%[x])\n\t"
"vl %%v27, 240(%%r1,%[x])\n\t"
"vl %%v16, 192(%%r1,%[y])\n\t"
"vl %%v17, 208(%%r1,%[y])\n\t"
"vl %%v18, 224(%%r1,%[y])\n\t"
"vl %%v19, 240(%%r1,%[y])\n\t"
"vfmdb %%v28,%%v24,%%v0\n\t"
"vfmdb %%v29,%%v25,%%v0\n\t"
"vfmdb %%v20,%%v24,%%v1\n\t" /* yn=x*s */
"vfmdb %%v21,%%v25,%%v1\n\t" /* yn=x*s */
"vfmdb %%v30,%%v26,%%v0\n\t"
"vfmdb %%v22,%%v26,%%v1\n\t" /* yn=x*s */
"vfmdb %%v31,%%v27,%%v0\n\t"
"vfmdb %%v23,%%v27,%%v1\n\t" /* yn=x*s */
/* 2nd parts */
"vfmadb %%v28,%%v16,%%v1,%%v28\n\t"
"vfmsdb %%v20,%%v16,%%v0,%%v20\n\t" /* yn=y*c-yn */
"vfmadb %%v29,%%v17,%%v1,%%v29\n\t"
"vfmsdb %%v21,%%v17,%%v0,%%v21\n\t" /* yn=y*c-yn */
"vfmadb %%v30,%%v18,%%v1,%%v30\n\t"
"vfmsdb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */
"vfmadb %%v31,%%v19,%%v1,%%v31\n\t"
"vfmsdb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */
"vst %%v28, 192(%%r1,%[x])\n\t"
"vst %%v29, 208(%%r1,%[x])\n\t"
"vst %%v30, 224(%%r1,%[x])\n\t"
"vst %%v31, 240(%%r1,%[x])\n\t"
"vst %%v20, 192(%%r1,%[y])\n\t"
"vst %%v21, 208(%%r1,%[y])\n\t"
"vst %%v22, 224(%%r1,%[y])\n\t"
"vst %%v23, 240(%%r1,%[y])\n\t"
"agfi %%r1,256\n\t"
"brctg %[n],0b"
: "+m"(*(FLOAT (*)[n * 2]) x), "+m"(*(FLOAT (*)[n * 2]) y),[n] "+&r"(n)
: [x] "a"(x),[y] "a"(y),[c] "m"(*c),[s] "m"(*s)
: "cc", "r1", "v0", "v1", "v16", "v17", "v18", "v19", "v20", "v21",
"v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30",
"v31");
/* Second copy of the asm text starts here; it continues from the shared
   first asm line and differs from the copy above only in its operand
   declarations ("Q" constraints for c/s, struct-wrapped memory casts). */
"vlrepg %%v1,%[s]\n\t"
"srlg %[n],%[n],4\n\t"
"xgr %%r1,%%r1\n\t"
"0:\n\t"
"pfd 2, 1024(%%r1,%[x])\n\t"
"pfd 2, 1024(%%r1,%[y])\n\t"
"vl %%v24, 0(%%r1,%[x])\n\t"
"vl %%v25, 16(%%r1,%[x])\n\t"
"vl %%v26, 32(%%r1,%[x])\n\t"
"vl %%v27, 48(%%r1,%[x])\n\t"
"vl %%v16, 0(%%r1,%[y])\n\t"
"vl %%v17, 16(%%r1,%[y])\n\t"
"vl %%v18, 32(%%r1,%[y])\n\t"
"vl %%v19, 48(%%r1,%[y])\n\t"
"vfmdb %%v28,%%v24,%%v0\n\t"
"vfmdb %%v29,%%v25,%%v0\n\t"
"vfmdb %%v20,%%v24,%%v1\n\t" /* yn=x*s */
"vfmdb %%v21,%%v25,%%v1\n\t" /* yn=x*s */
"vfmdb %%v30,%%v26,%%v0\n\t"
"vfmdb %%v22,%%v26,%%v1\n\t" /* yn=x*s */
"vfmdb %%v31,%%v27,%%v0\n\t"
"vfmdb %%v23,%%v27,%%v1\n\t" /* yn=x*s */
/* 2nd parts */
"vfmadb %%v28,%%v16,%%v1,%%v28\n\t"
"vfmsdb %%v20,%%v16,%%v0,%%v20\n\t" /* yn=y*c-yn */
"vfmadb %%v29,%%v17,%%v1,%%v29\n\t"
"vfmsdb %%v21,%%v17,%%v0,%%v21\n\t" /* yn=y*c-yn */
"vfmadb %%v30,%%v18,%%v1,%%v30\n\t"
"vfmsdb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */
"vfmadb %%v31,%%v19,%%v1,%%v31\n\t"
"vfmsdb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */
"vst %%v28, 0(%%r1,%[x])\n\t"
"vst %%v29, 16(%%r1,%[x])\n\t"
"vst %%v30, 32(%%r1,%[x])\n\t"
"vst %%v31, 48(%%r1,%[x])\n\t"
"vst %%v20, 0(%%r1,%[y])\n\t"
"vst %%v21, 16(%%r1,%[y])\n\t"
"vst %%v22, 32(%%r1,%[y])\n\t"
"vst %%v23, 48(%%r1,%[y])\n\t"
"vl %%v24, 64(%%r1,%[x])\n\t"
"vl %%v25, 80(%%r1,%[x])\n\t"
"vl %%v26, 96(%%r1,%[x])\n\t"
"vl %%v27, 112(%%r1,%[x])\n\t"
"vl %%v16, 64(%%r1,%[y])\n\t"
"vl %%v17, 80(%%r1,%[y])\n\t"
"vl %%v18, 96(%%r1,%[y])\n\t"
"vl %%v19, 112(%%r1,%[y])\n\t"
"vfmdb %%v28,%%v24,%%v0\n\t"
"vfmdb %%v29,%%v25,%%v0\n\t"
"vfmdb %%v20,%%v24,%%v1\n\t" /* yn=x*s */
"vfmdb %%v21,%%v25,%%v1\n\t" /* yn=x*s */
"vfmdb %%v30,%%v26,%%v0\n\t"
"vfmdb %%v22,%%v26,%%v1\n\t" /* yn=x*s */
"vfmdb %%v31,%%v27,%%v0\n\t"
"vfmdb %%v23,%%v27,%%v1\n\t" /* yn=x*s */
/* 2nd parts */
"vfmadb %%v28,%%v16,%%v1,%%v28\n\t"
"vfmsdb %%v20,%%v16,%%v0,%%v20\n\t" /* yn=y*c-yn */
"vfmadb %%v29,%%v17,%%v1,%%v29\n\t"
"vfmsdb %%v21,%%v17,%%v0,%%v21\n\t" /* yn=y*c-yn */
"vfmadb %%v30,%%v18,%%v1,%%v30\n\t"
"vfmsdb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */
"vfmadb %%v31,%%v19,%%v1,%%v31\n\t"
"vfmsdb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */
"vst %%v28, 64(%%r1,%[x])\n\t"
"vst %%v29, 80(%%r1,%[x])\n\t"
"vst %%v30, 96(%%r1,%[x])\n\t"
"vst %%v31, 112(%%r1,%[x])\n\t"
"vst %%v20, 64(%%r1,%[y])\n\t"
"vst %%v21, 80(%%r1,%[y])\n\t"
"vst %%v22, 96(%%r1,%[y])\n\t"
"vst %%v23, 112(%%r1,%[y])\n\t"
"vl %%v24, 128(%%r1,%[x])\n\t"
"vl %%v25, 144(%%r1,%[x])\n\t"
"vl %%v26, 160(%%r1,%[x])\n\t"
"vl %%v27, 176(%%r1,%[x])\n\t"
"vl %%v16, 128(%%r1,%[y])\n\t"
"vl %%v17, 144(%%r1,%[y])\n\t"
"vl %%v18, 160(%%r1,%[y])\n\t"
"vl %%v19, 176(%%r1,%[y])\n\t"
"vfmdb %%v28,%%v24,%%v0\n\t"
"vfmdb %%v29,%%v25,%%v0\n\t"
"vfmdb %%v20,%%v24,%%v1\n\t" /* yn=x*s */
"vfmdb %%v21,%%v25,%%v1\n\t" /* yn=x*s */
"vfmdb %%v30,%%v26,%%v0\n\t"
"vfmdb %%v22,%%v26,%%v1\n\t" /* yn=x*s */
"vfmdb %%v31,%%v27,%%v0\n\t"
"vfmdb %%v23,%%v27,%%v1\n\t" /* yn=x*s */
/* 2nd parts */
"vfmadb %%v28,%%v16,%%v1,%%v28\n\t"
"vfmsdb %%v20,%%v16,%%v0,%%v20\n\t" /* yn=y*c-yn */
"vfmadb %%v29,%%v17,%%v1,%%v29\n\t"
"vfmsdb %%v21,%%v17,%%v0,%%v21\n\t" /* yn=y*c-yn */
"vfmadb %%v30,%%v18,%%v1,%%v30\n\t"
"vfmsdb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */
"vfmadb %%v31,%%v19,%%v1,%%v31\n\t"
"vfmsdb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */
"vst %%v28, 128(%%r1,%[x])\n\t"
"vst %%v29, 144(%%r1,%[x])\n\t"
"vst %%v30, 160(%%r1,%[x])\n\t"
"vst %%v31, 176(%%r1,%[x])\n\t"
"vst %%v20, 128(%%r1,%[y])\n\t"
"vst %%v21, 144(%%r1,%[y])\n\t"
"vst %%v22, 160(%%r1,%[y])\n\t"
"vst %%v23, 176(%%r1,%[y])\n\t"
"vl %%v24, 192(%%r1,%[x])\n\t"
"vl %%v25, 208(%%r1,%[x])\n\t"
"vl %%v26, 224(%%r1,%[x])\n\t"
"vl %%v27, 240(%%r1,%[x])\n\t"
"vl %%v16, 192(%%r1,%[y])\n\t"
"vl %%v17, 208(%%r1,%[y])\n\t"
"vl %%v18, 224(%%r1,%[y])\n\t"
"vl %%v19, 240(%%r1,%[y])\n\t"
"vfmdb %%v28,%%v24,%%v0\n\t"
"vfmdb %%v29,%%v25,%%v0\n\t"
"vfmdb %%v20,%%v24,%%v1\n\t" /* yn=x*s */
"vfmdb %%v21,%%v25,%%v1\n\t" /* yn=x*s */
"vfmdb %%v30,%%v26,%%v0\n\t"
"vfmdb %%v22,%%v26,%%v1\n\t" /* yn=x*s */
"vfmdb %%v31,%%v27,%%v0\n\t"
"vfmdb %%v23,%%v27,%%v1\n\t" /* yn=x*s */
/* 2nd parts */
"vfmadb %%v28,%%v16,%%v1,%%v28\n\t"
"vfmsdb %%v20,%%v16,%%v0,%%v20\n\t" /* yn=y*c-yn */
"vfmadb %%v29,%%v17,%%v1,%%v29\n\t"
"vfmsdb %%v21,%%v17,%%v0,%%v21\n\t" /* yn=y*c-yn */
"vfmadb %%v30,%%v18,%%v1,%%v30\n\t"
"vfmsdb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */
"vfmadb %%v31,%%v19,%%v1,%%v31\n\t"
"vfmsdb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */
"vst %%v28, 192(%%r1,%[x])\n\t"
"vst %%v29, 208(%%r1,%[x])\n\t"
"vst %%v30, 224(%%r1,%[x])\n\t"
"vst %%v31, 240(%%r1,%[x])\n\t"
"vst %%v20, 192(%%r1,%[y])\n\t"
"vst %%v21, 208(%%r1,%[y])\n\t"
"vst %%v22, 224(%%r1,%[y])\n\t"
"vst %%v23, 240(%%r1,%[y])\n\t"
"agfi %%r1,256\n\t"
"brctg %[n],0b"
: "+m"(*(struct { FLOAT x[n * 2]; } *) x),
"+m"(*(struct { FLOAT x[n * 2]; } *) y),[n] "+&r"(n)
: [x] "a"(x),[y] "a"(y),[c] "Q"(*c),[s] "Q"(*s)
: "cc", "r1", "v0", "v1", "v16", "v17", "v18", "v19", "v20", "v21",
"v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30",
"v31");
}
int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y,

View File

@ -29,167 +29,170 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
/*
 * zscal_kernel_8: multiplies each (re, im) pair of x in place by the complex
 * scalar stored at alpha[0] (real part) and alpha[1] (imaginary part).
 *
 * Setup:
 *   v0 = { alpha[0],  alpha[0] }                   (vlrepg)
 *   v1 = { -alpha[1], alpha[1] }                   (vleg, wflcdb on lane 0,
 *                                                   then vleg into lane 1)
 * Per 16-byte vector of x = { re, im }:
 *   swapped = { im, re }                           (vpdi ...,4)
 *   result  = x * v0 + swapped * v1
 *           = { re*a_r - im*a_i, im*a_r + re*a_i }
 *
 * n is shifted right by 3 to get the iteration count; each iteration handles
 * eight 16-byte vectors (128 bytes) and advances the byte offset in r1 by
 * 128.  The loop counter is tested only at the bottom (brctg), so callers
 * presumably guarantee n >= 8 -- confirm.  FLOAT is presumably double (the
 * asm uses 64-bit element forms) -- confirm.
 *
 * NOTE(review): after the shared first asm line, this listing carries the
 * asm text twice, back to back.  The two copies have the same instructions
 * and differ only in how the memory operands are cast
 * (`FLOAT (*)[...]` first, struct-wrapped arrays second).
 */
static void zscal_kernel_8(BLASLONG n, FLOAT *alpha, FLOAT *x) {
__asm__("vlrepg %%v0,0(%[alpha])\n\t"
"vleg %%v1,8(%[alpha]),0\n\t"
"wflcdb %%v1,%%v1\n\t"
"vleg %%v1,8(%[alpha]),1\n\t"
"srlg %[n],%[n],3\n\t"
"xgr %%r1,%%r1\n\t"
"0:\n\t"
"pfd 2, 1024(%%r1,%[x])\n\t"
"vl %%v16,0(%%r1,%[x])\n\t"
"vl %%v17,16(%%r1,%[x])\n\t"
"vl %%v18,32(%%r1,%[x])\n\t"
"vl %%v19,48(%%r1,%[x])\n\t"
"vl %%v20,64(%%r1,%[x])\n\t"
"vl %%v21,80(%%r1,%[x])\n\t"
"vl %%v22,96(%%r1,%[x])\n\t"
"vl %%v23,112(%%r1,%[x])\n\t"
"vpdi %%v24,%%v16,%%v16,4\n\t"
"vpdi %%v25,%%v17,%%v17,4\n\t"
"vpdi %%v26,%%v18,%%v18,4\n\t"
"vpdi %%v27,%%v19,%%v19,4\n\t"
"vpdi %%v28,%%v20,%%v20,4\n\t"
"vpdi %%v29,%%v21,%%v21,4\n\t"
"vpdi %%v30,%%v22,%%v22,4\n\t"
"vpdi %%v31,%%v23,%%v23,4\n\t"
"vfmdb %%v16,%%v16,%%v0\n\t"
"vfmdb %%v17,%%v17,%%v0\n\t"
"vfmdb %%v18,%%v18,%%v0\n\t"
"vfmdb %%v19,%%v19,%%v0\n\t"
"vfmdb %%v20,%%v20,%%v0\n\t"
"vfmdb %%v21,%%v21,%%v0\n\t"
"vfmdb %%v22,%%v22,%%v0\n\t"
"vfmdb %%v23,%%v23,%%v0\n\t"
"vfmadb %%v16,%%v24,%%v1,%%v16\n\t"
"vfmadb %%v17,%%v25,%%v1,%%v17\n\t"
"vfmadb %%v18,%%v26,%%v1,%%v18\n\t"
"vfmadb %%v19,%%v27,%%v1,%%v19\n\t"
"vfmadb %%v20,%%v28,%%v1,%%v20\n\t"
"vfmadb %%v21,%%v29,%%v1,%%v21\n\t"
"vfmadb %%v22,%%v30,%%v1,%%v22\n\t"
"vfmadb %%v23,%%v31,%%v1,%%v23\n\t"
"vst %%v16,0(%%r1,%[x])\n\t"
"vst %%v17,16(%%r1,%[x])\n\t"
"vst %%v18,32(%%r1,%[x])\n\t"
"vst %%v19,48(%%r1,%[x])\n\t"
"vst %%v20,64(%%r1,%[x])\n\t"
"vst %%v21,80(%%r1,%[x])\n\t"
"vst %%v22,96(%%r1,%[x])\n\t"
"vst %%v23,112(%%r1,%[x])\n\t"
"agfi %%r1,128\n\t"
"brctg %[n],0b"
: "+m"(*(FLOAT (*)[n * 2]) x),[n] "+&r"(n)
: [x] "a"(x), "m"(*(const FLOAT (*)[2]) alpha),[alpha] "a"(alpha)
: "cc", "r1", "v0", "v1", "v16", "v17", "v18", "v19", "v20", "v21",
"v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30",
"v31");
/* Second copy of the asm text starts here; it continues from the shared
   first asm line and differs only in the memory-operand casts. */
"vleg %%v1,8(%[alpha]),0\n\t"
"wflcdb %%v1,%%v1\n\t"
"vleg %%v1,8(%[alpha]),1\n\t"
"srlg %[n],%[n],3\n\t"
"xgr %%r1,%%r1\n\t"
"0:\n\t"
"pfd 2, 1024(%%r1,%[x])\n\t"
"vl %%v16,0(%%r1,%[x])\n\t"
"vl %%v17,16(%%r1,%[x])\n\t"
"vl %%v18,32(%%r1,%[x])\n\t"
"vl %%v19,48(%%r1,%[x])\n\t"
"vl %%v20,64(%%r1,%[x])\n\t"
"vl %%v21,80(%%r1,%[x])\n\t"
"vl %%v22,96(%%r1,%[x])\n\t"
"vl %%v23,112(%%r1,%[x])\n\t"
"vpdi %%v24,%%v16,%%v16,4\n\t"
"vpdi %%v25,%%v17,%%v17,4\n\t"
"vpdi %%v26,%%v18,%%v18,4\n\t"
"vpdi %%v27,%%v19,%%v19,4\n\t"
"vpdi %%v28,%%v20,%%v20,4\n\t"
"vpdi %%v29,%%v21,%%v21,4\n\t"
"vpdi %%v30,%%v22,%%v22,4\n\t"
"vpdi %%v31,%%v23,%%v23,4\n\t"
"vfmdb %%v16,%%v16,%%v0\n\t"
"vfmdb %%v17,%%v17,%%v0\n\t"
"vfmdb %%v18,%%v18,%%v0\n\t"
"vfmdb %%v19,%%v19,%%v0\n\t"
"vfmdb %%v20,%%v20,%%v0\n\t"
"vfmdb %%v21,%%v21,%%v0\n\t"
"vfmdb %%v22,%%v22,%%v0\n\t"
"vfmdb %%v23,%%v23,%%v0\n\t"
"vfmadb %%v16,%%v24,%%v1,%%v16\n\t"
"vfmadb %%v17,%%v25,%%v1,%%v17\n\t"
"vfmadb %%v18,%%v26,%%v1,%%v18\n\t"
"vfmadb %%v19,%%v27,%%v1,%%v19\n\t"
"vfmadb %%v20,%%v28,%%v1,%%v20\n\t"
"vfmadb %%v21,%%v29,%%v1,%%v21\n\t"
"vfmadb %%v22,%%v30,%%v1,%%v22\n\t"
"vfmadb %%v23,%%v31,%%v1,%%v23\n\t"
"vst %%v16,0(%%r1,%[x])\n\t"
"vst %%v17,16(%%r1,%[x])\n\t"
"vst %%v18,32(%%r1,%[x])\n\t"
"vst %%v19,48(%%r1,%[x])\n\t"
"vst %%v20,64(%%r1,%[x])\n\t"
"vst %%v21,80(%%r1,%[x])\n\t"
"vst %%v22,96(%%r1,%[x])\n\t"
"vst %%v23,112(%%r1,%[x])\n\t"
"agfi %%r1,128\n\t"
"brctg %[n],0b"
: "+m"(*(struct { FLOAT x[n * 2]; } *) x),[n] "+&r"(n)
: [x] "a"(x), "m"(*(const struct { FLOAT x[2]; } *) alpha),
[alpha] "a"(alpha)
: "cc", "r1", "v0", "v1", "v16", "v17", "v18", "v19", "v20", "v21",
"v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30",
"v31");
}
/*
 * zscal_kernel_8_zero_r: in-place scaling of x that reads only alpha[1]
 * (the value at byte offset 8 of alpha); alpha[0] is never loaded.  Going by
 * the name, this is presumably the variant for a scalar whose real part is
 * zero -- confirm against the caller.
 *
 * Setup:
 *   v0 = { -alpha[1], alpha[1] }   (vleg lane 0, wflcdb negates lane 0,
 *                                   vleg lane 1)
 * Per 16-byte vector of x = { re, im }:
 *   x = { im, re }                 (vpdi ...,4 swaps the two doublewords)
 *   x = x * v0 = { -im*a_i, re*a_i }
 *
 * n is shifted right by 3 to get the iteration count; each iteration handles
 * eight 16-byte vectors (128 bytes).  The loop counter is tested only at the
 * bottom (brctg), so callers presumably guarantee n >= 8 -- confirm.
 *
 * NOTE(review): after the shared first asm line, this listing carries the
 * asm text twice, back to back; the copies differ only in the casts used for
 * the memory operands.
 */
static void zscal_kernel_8_zero_r(BLASLONG n, FLOAT *alpha, FLOAT *x) {
__asm__("vleg %%v0,8(%[alpha]),0\n\t"
"wflcdb %%v0,%%v0\n\t"
"vleg %%v0,8(%[alpha]),1\n\t"
"srlg %[n],%[n],3\n\t"
"xgr %%r1,%%r1\n\t"
"0:\n\t"
"pfd 2, 1024(%%r1,%[x])\n\t"
"vl %%v16,0(%%r1,%[x])\n\t"
"vl %%v17,16(%%r1,%[x])\n\t"
"vl %%v18,32(%%r1,%[x])\n\t"
"vl %%v19,48(%%r1,%[x])\n\t"
"vl %%v20,64(%%r1,%[x])\n\t"
"vl %%v21,80(%%r1,%[x])\n\t"
"vl %%v22,96(%%r1,%[x])\n\t"
"vl %%v23,112(%%r1,%[x])\n\t"
"vpdi %%v16,%%v16,%%v16,4\n\t"
"vpdi %%v17,%%v17,%%v17,4\n\t"
"vpdi %%v18,%%v18,%%v18,4\n\t"
"vpdi %%v19,%%v19,%%v19,4\n\t"
"vpdi %%v20,%%v20,%%v20,4\n\t"
"vpdi %%v21,%%v21,%%v21,4\n\t"
"vpdi %%v22,%%v22,%%v22,4\n\t"
"vpdi %%v23,%%v23,%%v23,4\n\t"
"vfmdb %%v16,%%v16,%%v0\n\t"
"vfmdb %%v17,%%v17,%%v0\n\t"
"vfmdb %%v18,%%v18,%%v0\n\t"
"vfmdb %%v19,%%v19,%%v0\n\t"
"vfmdb %%v20,%%v20,%%v0\n\t"
"vfmdb %%v21,%%v21,%%v0\n\t"
"vfmdb %%v22,%%v22,%%v0\n\t"
"vfmdb %%v23,%%v23,%%v0\n\t"
"vst %%v16,0(%%r1,%[x])\n\t"
"vst %%v17,16(%%r1,%[x])\n\t"
"vst %%v18,32(%%r1,%[x])\n\t"
"vst %%v19,48(%%r1,%[x])\n\t"
"vst %%v20,64(%%r1,%[x])\n\t"
"vst %%v21,80(%%r1,%[x])\n\t"
"vst %%v22,96(%%r1,%[x])\n\t"
"vst %%v23,112(%%r1,%[x])\n\t"
"agfi %%r1,128\n\t"
"brctg %[n],0b"
: "+m"(*(FLOAT (*)[n * 2]) x),[n] "+&r"(n)
: [x] "a"(x), "m"(*(const FLOAT (*)[2]) alpha),[alpha] "a"(alpha)
: "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22",
"v23");
/* Second copy of the asm text starts here; it continues from the shared
   first asm line and differs only in the memory-operand casts. */
"wflcdb %%v0,%%v0\n\t"
"vleg %%v0,8(%[alpha]),1\n\t"
"srlg %[n],%[n],3\n\t"
"xgr %%r1,%%r1\n\t"
"0:\n\t"
"pfd 2, 1024(%%r1,%[x])\n\t"
"vl %%v16,0(%%r1,%[x])\n\t"
"vl %%v17,16(%%r1,%[x])\n\t"
"vl %%v18,32(%%r1,%[x])\n\t"
"vl %%v19,48(%%r1,%[x])\n\t"
"vl %%v20,64(%%r1,%[x])\n\t"
"vl %%v21,80(%%r1,%[x])\n\t"
"vl %%v22,96(%%r1,%[x])\n\t"
"vl %%v23,112(%%r1,%[x])\n\t"
"vpdi %%v16,%%v16,%%v16,4\n\t"
"vpdi %%v17,%%v17,%%v17,4\n\t"
"vpdi %%v18,%%v18,%%v18,4\n\t"
"vpdi %%v19,%%v19,%%v19,4\n\t"
"vpdi %%v20,%%v20,%%v20,4\n\t"
"vpdi %%v21,%%v21,%%v21,4\n\t"
"vpdi %%v22,%%v22,%%v22,4\n\t"
"vpdi %%v23,%%v23,%%v23,4\n\t"
"vfmdb %%v16,%%v16,%%v0\n\t"
"vfmdb %%v17,%%v17,%%v0\n\t"
"vfmdb %%v18,%%v18,%%v0\n\t"
"vfmdb %%v19,%%v19,%%v0\n\t"
"vfmdb %%v20,%%v20,%%v0\n\t"
"vfmdb %%v21,%%v21,%%v0\n\t"
"vfmdb %%v22,%%v22,%%v0\n\t"
"vfmdb %%v23,%%v23,%%v0\n\t"
"vst %%v16,0(%%r1,%[x])\n\t"
"vst %%v17,16(%%r1,%[x])\n\t"
"vst %%v18,32(%%r1,%[x])\n\t"
"vst %%v19,48(%%r1,%[x])\n\t"
"vst %%v20,64(%%r1,%[x])\n\t"
"vst %%v21,80(%%r1,%[x])\n\t"
"vst %%v22,96(%%r1,%[x])\n\t"
"vst %%v23,112(%%r1,%[x])\n\t"
"agfi %%r1,128\n\t"
"brctg %[n],0b"
: "+m"(*(struct { FLOAT x[n * 2]; } *) x),[n] "+&r"(n)
: [x] "a"(x), "m"(*(const struct { FLOAT x[2]; } *) alpha),
[alpha] "a"(alpha)
: "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22",
"v23");
}
/*
 * zscal_kernel_8_zero_i: in-place scaling of x that reads only alpha[0];
 * alpha[1] is never loaded.  Going by the name, this is presumably the
 * variant for a scalar whose imaginary part is zero -- confirm against the
 * caller.
 *
 * v0 holds alpha[0] replicated into both 64-bit lanes (vlrepg); every
 * 16-byte vector of x is multiplied lane-wise by v0 and stored back.
 *
 * n is shifted right by 3 to get the iteration count; each iteration handles
 * eight 16-byte vectors (128 bytes).  The loop counter is tested only at the
 * bottom (brctg), so callers presumably guarantee n >= 8 -- confirm.
 *
 * NOTE(review): after the shared first asm line, this listing carries the
 * asm text twice, back to back; the copies differ only in the casts used for
 * the memory operands.
 */
static void zscal_kernel_8_zero_i(BLASLONG n, FLOAT *alpha, FLOAT *x) {
__asm__("vlrepg %%v0,0(%[alpha])\n\t"
"srlg %[n],%[n],3\n\t"
"xgr %%r1,%%r1\n\t"
"0:\n\t"
"pfd 2, 1024(%%r1,%[x])\n\t"
"vl %%v16,0(%%r1,%[x])\n\t"
"vl %%v17,16(%%r1,%[x])\n\t"
"vl %%v18,32(%%r1,%[x])\n\t"
"vl %%v19,48(%%r1,%[x])\n\t"
"vl %%v20,64(%%r1,%[x])\n\t"
"vl %%v21,80(%%r1,%[x])\n\t"
"vl %%v22,96(%%r1,%[x])\n\t"
"vl %%v23,112(%%r1,%[x])\n\t"
"vfmdb %%v16,%%v16,%%v0\n\t"
"vfmdb %%v17,%%v17,%%v0\n\t"
"vfmdb %%v18,%%v18,%%v0\n\t"
"vfmdb %%v19,%%v19,%%v0\n\t"
"vfmdb %%v20,%%v20,%%v0\n\t"
"vfmdb %%v21,%%v21,%%v0\n\t"
"vfmdb %%v22,%%v22,%%v0\n\t"
"vfmdb %%v23,%%v23,%%v0\n\t"
"vst %%v16,0(%%r1,%[x])\n\t"
"vst %%v17,16(%%r1,%[x])\n\t"
"vst %%v18,32(%%r1,%[x])\n\t"
"vst %%v19,48(%%r1,%[x])\n\t"
"vst %%v20,64(%%r1,%[x])\n\t"
"vst %%v21,80(%%r1,%[x])\n\t"
"vst %%v22,96(%%r1,%[x])\n\t"
"vst %%v23,112(%%r1,%[x])\n\t"
"agfi %%r1,128\n\t"
"brctg %[n],0b"
: "+m"(*(FLOAT (*)[n * 2]) x),[n] "+&r"(n)
: [x] "a"(x), "m"(*(const FLOAT (*)[2]) alpha),[alpha] "a"(alpha)
: "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22",
"v23");
/* Second copy of the asm text starts here; it continues from the shared
   first asm line and differs only in the memory-operand casts. */
"srlg %[n],%[n],3\n\t"
"xgr %%r1,%%r1\n\t"
"0:\n\t"
"pfd 2, 1024(%%r1,%[x])\n\t"
"vl %%v16,0(%%r1,%[x])\n\t"
"vl %%v17,16(%%r1,%[x])\n\t"
"vl %%v18,32(%%r1,%[x])\n\t"
"vl %%v19,48(%%r1,%[x])\n\t"
"vl %%v20,64(%%r1,%[x])\n\t"
"vl %%v21,80(%%r1,%[x])\n\t"
"vl %%v22,96(%%r1,%[x])\n\t"
"vl %%v23,112(%%r1,%[x])\n\t"
"vfmdb %%v16,%%v16,%%v0\n\t"
"vfmdb %%v17,%%v17,%%v0\n\t"
"vfmdb %%v18,%%v18,%%v0\n\t"
"vfmdb %%v19,%%v19,%%v0\n\t"
"vfmdb %%v20,%%v20,%%v0\n\t"
"vfmdb %%v21,%%v21,%%v0\n\t"
"vfmdb %%v22,%%v22,%%v0\n\t"
"vfmdb %%v23,%%v23,%%v0\n\t"
"vst %%v16,0(%%r1,%[x])\n\t"
"vst %%v17,16(%%r1,%[x])\n\t"
"vst %%v18,32(%%r1,%[x])\n\t"
"vst %%v19,48(%%r1,%[x])\n\t"
"vst %%v20,64(%%r1,%[x])\n\t"
"vst %%v21,80(%%r1,%[x])\n\t"
"vst %%v22,96(%%r1,%[x])\n\t"
"vst %%v23,112(%%r1,%[x])\n\t"
"agfi %%r1,128\n\t"
"brctg %[n],0b"
: "+m"(*(struct { FLOAT x[n * 2]; } *) x),[n] "+&r"(n)
: [x] "a"(x), "m"(*(const struct { FLOAT x[2]; } *) alpha),
[alpha] "a"(alpha)
: "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22",
"v23");
}
/*
 * zscal_kernel_8_zero: overwrites x with zeros.
 *
 * v0 is cleared with vzero and then stored to eight consecutive 16-byte
 * slots (128 bytes) per iteration.  x is never read, which is why its memory
 * operand is declared write-only ("=m").
 *
 * n is shifted right by 3 to get the iteration count.  The loop counter is
 * tested only at the bottom (brctg), so callers presumably guarantee
 * n >= 8 -- confirm.
 *
 * NOTE(review): after the shared first asm line, this listing carries the
 * asm text twice, back to back; the copies differ only in the cast used for
 * the output memory operand.
 */
static void zscal_kernel_8_zero(BLASLONG n, FLOAT *x) {
__asm__("vzero %%v0\n\t"
"srlg %[n],%[n],3\n\t"
"xgr %%r1,%%r1\n\t"
"0:\n\t"
"pfd 2, 1024(%%r1,%[x])\n\t"
"vst %%v0,0(%%r1,%[x])\n\t"
"vst %%v0,16(%%r1,%[x])\n\t"
"vst %%v0,32(%%r1,%[x])\n\t"
"vst %%v0,48(%%r1,%[x])\n\t"
"vst %%v0,64(%%r1,%[x])\n\t"
"vst %%v0,80(%%r1,%[x])\n\t"
"vst %%v0,96(%%r1,%[x])\n\t"
"vst %%v0,112(%%r1,%[x])\n\t"
"agfi %%r1,128\n\t"
"brctg %[n],0b"
: "=m"(*(FLOAT (*)[n * 2]) x),[n] "+&r"(n)
: [x] "a"(x)
: "cc", "r1", "v0");
/* Second copy of the asm text starts here; it continues from the shared
   first asm line and differs only in the memory-operand cast. */
"srlg %[n],%[n],3\n\t"
"xgr %%r1,%%r1\n\t"
"0:\n\t"
"pfd 2, 1024(%%r1,%[x])\n\t"
"vst %%v0,0(%%r1,%[x])\n\t"
"vst %%v0,16(%%r1,%[x])\n\t"
"vst %%v0,32(%%r1,%[x])\n\t"
"vst %%v0,48(%%r1,%[x])\n\t"
"vst %%v0,64(%%r1,%[x])\n\t"
"vst %%v0,80(%%r1,%[x])\n\t"
"vst %%v0,96(%%r1,%[x])\n\t"
"vst %%v0,112(%%r1,%[x])\n\t"
"agfi %%r1,128\n\t"
"brctg %[n],0b"
: "=m"(*(struct { FLOAT x[n * 2]; } *) x),[n] "+&r"(n)
: [x] "a"(x)
: "cc", "r1", "v0");
}
static void zscal_kernel_inc_8(BLASLONG n, FLOAT *alpha, FLOAT *x,

View File

@ -29,81 +29,82 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
/*
 * zswap_kernel_16: exchanges the contents of x and y in place.
 *
 * Each iteration works on one 256-byte chunk of both arrays:
 *   1. load the 256 bytes of x into v16..v31;
 *   2. copy y to x in two 128-byte halves, staged through v0..v7;
 *   3. store the saved x values from v16..v31 into y.
 * r1 is the running byte offset and is advanced by 256 per iteration.
 *
 * n is shifted right by 4 to get the iteration count.  The loop counter is
 * tested only at the bottom (brctg), so callers presumably guarantee
 * n >= 16 -- confirm.
 *
 * NOTE(review): after the shared first asm line, this listing carries the
 * asm text twice, back to back; the copies differ only in the casts used for
 * the memory operands.
 */
static void zswap_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y) {
__asm__("srlg %[n],%[n],4\n\t"
"xgr %%r1,%%r1\n\t"
"0:\n\t"
"pfd 2, 1024(%%r1,%[x])\n\t"
"pfd 2, 1024(%%r1,%[y])\n\t"
"vl %%v16, 0(%%r1,%[x])\n\t"
"vl %%v17, 16(%%r1,%[x])\n\t"
"vl %%v18, 32(%%r1,%[x])\n\t"
"vl %%v19, 48(%%r1,%[x])\n\t"
"vl %%v20, 64(%%r1,%[x])\n\t"
"vl %%v21, 80(%%r1,%[x])\n\t"
"vl %%v22, 96(%%r1,%[x])\n\t"
"vl %%v23, 112(%%r1,%[x])\n\t"
"vl %%v24, 128(%%r1,%[x])\n\t"
"vl %%v25, 144(%%r1,%[x])\n\t"
"vl %%v26, 160(%%r1,%[x])\n\t"
"vl %%v27, 176(%%r1,%[x])\n\t"
"vl %%v28, 192(%%r1,%[x])\n\t"
"vl %%v29, 208(%%r1,%[x])\n\t"
"vl %%v30, 224(%%r1,%[x])\n\t"
"vl %%v31, 240(%%r1,%[x])\n\t"
"vl %%v0, 0(%%r1,%[y])\n\t"
"vl %%v1, 16(%%r1,%[y])\n\t"
"vl %%v2, 32(%%r1,%[y])\n\t"
"vl %%v3, 48(%%r1,%[y])\n\t"
"vl %%v4, 64(%%r1,%[y])\n\t"
"vl %%v5, 80(%%r1,%[y])\n\t"
"vl %%v6, 96(%%r1,%[y])\n\t"
"vl %%v7, 112(%%r1,%[y])\n\t"
"vst %%v0, 0(%%r1,%[x])\n\t"
"vst %%v1, 16(%%r1,%[x])\n\t"
"vst %%v2, 32(%%r1,%[x])\n\t"
"vst %%v3, 48(%%r1,%[x])\n\t"
"vst %%v4, 64(%%r1,%[x])\n\t"
"vst %%v5, 80(%%r1,%[x])\n\t"
"vst %%v6, 96(%%r1,%[x])\n\t"
"vst %%v7, 112(%%r1,%[x])\n\t"
"vl %%v0, 128(%%r1,%[y])\n\t"
"vl %%v1, 144(%%r1,%[y])\n\t"
"vl %%v2, 160(%%r1,%[y])\n\t"
"vl %%v3, 176(%%r1,%[y])\n\t"
"vl %%v4, 192(%%r1,%[y])\n\t"
"vl %%v5, 208(%%r1,%[y])\n\t"
"vl %%v6, 224(%%r1,%[y])\n\t"
"vl %%v7, 240(%%r1,%[y])\n\t"
"vst %%v0, 128(%%r1,%[x])\n\t"
"vst %%v1, 144(%%r1,%[x])\n\t"
"vst %%v2, 160(%%r1,%[x])\n\t"
"vst %%v3, 176(%%r1,%[x])\n\t"
"vst %%v4, 192(%%r1,%[x])\n\t"
"vst %%v5, 208(%%r1,%[x])\n\t"
"vst %%v6, 224(%%r1,%[x])\n\t"
"vst %%v7, 240(%%r1,%[x])\n\t"
"vst %%v16, 0(%%r1,%[y])\n\t"
"vst %%v17, 16(%%r1,%[y])\n\t"
"vst %%v18, 32(%%r1,%[y])\n\t"
"vst %%v19, 48(%%r1,%[y])\n\t"
"vst %%v20, 64(%%r1,%[y])\n\t"
"vst %%v21, 80(%%r1,%[y])\n\t"
"vst %%v22, 96(%%r1,%[y])\n\t"
"vst %%v23, 112(%%r1,%[y])\n\t"
"vst %%v24, 128(%%r1,%[y])\n\t"
"vst %%v25, 144(%%r1,%[y])\n\t"
"vst %%v26, 160(%%r1,%[y])\n\t"
"vst %%v27, 176(%%r1,%[y])\n\t"
"vst %%v28, 192(%%r1,%[y])\n\t"
"vst %%v29, 208(%%r1,%[y])\n\t"
"vst %%v30, 224(%%r1,%[y])\n\t"
"vst %%v31, 240(%%r1,%[y])\n\t"
"agfi %%r1,256\n\t"
"brctg %[n],0b"
: "+m"(*(FLOAT (*)[n * 2]) x), "+m"(*(FLOAT (*)[n * 2]) y),[n] "+&r"(n)
: [x] "a"(x),[y] "a"(y)
: "cc", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16",
"v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26",
"v27", "v28", "v29", "v30", "v31");
/* Second copy of the asm text starts here; it continues from the shared
   first asm line and differs only in the memory-operand casts. */
"xgr %%r1,%%r1\n\t"
"0:\n\t"
"pfd 2, 1024(%%r1,%[x])\n\t"
"pfd 2, 1024(%%r1,%[y])\n\t"
"vl %%v16, 0(%%r1,%[x])\n\t"
"vl %%v17, 16(%%r1,%[x])\n\t"
"vl %%v18, 32(%%r1,%[x])\n\t"
"vl %%v19, 48(%%r1,%[x])\n\t"
"vl %%v20, 64(%%r1,%[x])\n\t"
"vl %%v21, 80(%%r1,%[x])\n\t"
"vl %%v22, 96(%%r1,%[x])\n\t"
"vl %%v23, 112(%%r1,%[x])\n\t"
"vl %%v24, 128(%%r1,%[x])\n\t"
"vl %%v25, 144(%%r1,%[x])\n\t"
"vl %%v26, 160(%%r1,%[x])\n\t"
"vl %%v27, 176(%%r1,%[x])\n\t"
"vl %%v28, 192(%%r1,%[x])\n\t"
"vl %%v29, 208(%%r1,%[x])\n\t"
"vl %%v30, 224(%%r1,%[x])\n\t"
"vl %%v31, 240(%%r1,%[x])\n\t"
"vl %%v0, 0(%%r1,%[y])\n\t"
"vl %%v1, 16(%%r1,%[y])\n\t"
"vl %%v2, 32(%%r1,%[y])\n\t"
"vl %%v3, 48(%%r1,%[y])\n\t"
"vl %%v4, 64(%%r1,%[y])\n\t"
"vl %%v5, 80(%%r1,%[y])\n\t"
"vl %%v6, 96(%%r1,%[y])\n\t"
"vl %%v7, 112(%%r1,%[y])\n\t"
"vst %%v0, 0(%%r1,%[x])\n\t"
"vst %%v1, 16(%%r1,%[x])\n\t"
"vst %%v2, 32(%%r1,%[x])\n\t"
"vst %%v3, 48(%%r1,%[x])\n\t"
"vst %%v4, 64(%%r1,%[x])\n\t"
"vst %%v5, 80(%%r1,%[x])\n\t"
"vst %%v6, 96(%%r1,%[x])\n\t"
"vst %%v7, 112(%%r1,%[x])\n\t"
"vl %%v0, 128(%%r1,%[y])\n\t"
"vl %%v1, 144(%%r1,%[y])\n\t"
"vl %%v2, 160(%%r1,%[y])\n\t"
"vl %%v3, 176(%%r1,%[y])\n\t"
"vl %%v4, 192(%%r1,%[y])\n\t"
"vl %%v5, 208(%%r1,%[y])\n\t"
"vl %%v6, 224(%%r1,%[y])\n\t"
"vl %%v7, 240(%%r1,%[y])\n\t"
"vst %%v0, 128(%%r1,%[x])\n\t"
"vst %%v1, 144(%%r1,%[x])\n\t"
"vst %%v2, 160(%%r1,%[x])\n\t"
"vst %%v3, 176(%%r1,%[x])\n\t"
"vst %%v4, 192(%%r1,%[x])\n\t"
"vst %%v5, 208(%%r1,%[x])\n\t"
"vst %%v6, 224(%%r1,%[x])\n\t"
"vst %%v7, 240(%%r1,%[x])\n\t"
"vst %%v16, 0(%%r1,%[y])\n\t"
"vst %%v17, 16(%%r1,%[y])\n\t"
"vst %%v18, 32(%%r1,%[y])\n\t"
"vst %%v19, 48(%%r1,%[y])\n\t"
"vst %%v20, 64(%%r1,%[y])\n\t"
"vst %%v21, 80(%%r1,%[y])\n\t"
"vst %%v22, 96(%%r1,%[y])\n\t"
"vst %%v23, 112(%%r1,%[y])\n\t"
"vst %%v24, 128(%%r1,%[y])\n\t"
"vst %%v25, 144(%%r1,%[y])\n\t"
"vst %%v26, 160(%%r1,%[y])\n\t"
"vst %%v27, 176(%%r1,%[y])\n\t"
"vst %%v28, 192(%%r1,%[y])\n\t"
"vst %%v29, 208(%%r1,%[y])\n\t"
"vst %%v30, 224(%%r1,%[y])\n\t"
"vst %%v31, 240(%%r1,%[y])\n\t"
"agfi %%r1,256\n\t"
"brctg %[n],0b"
: "+m"(*(struct { FLOAT x[n * 2]; } *) x),
"+m"(*(struct { FLOAT x[n * 2]; } *) y),[n] "+&r"(n)
: [x] "a"(x),[y] "a"(y)
: "cc", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16",
"v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26",
"v27", "v28", "v29", "v30", "v31");
}
int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3,