Merge pull request #1426 from quickwritereader/develop

(Z13) BLAS1 microkernels can be inlined by gcc. Refactoring, fixes, tunings
This commit is contained in:
Martin Kroeker 2018-01-20 17:34:54 +01:00 committed by GitHub
commit e4c71a799a
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
18 changed files with 1535 additions and 1742 deletions

View File

@ -41,17 +41,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
static FLOAT dasum_kernel_32(BLASLONG n, FLOAT *x) {
FLOAT asum ;
__asm__ (
"pfd 1, 0(%3) \n\t"
"sllg %%r0,%2,3 \n\t"
"agr %%r0,%3 \n\t"
"pfd 1, 0(%[ptr_x]) \n\t"
"sllg %%r0,%[n],3 \n\t"
"agr %%r0,%[ptr_x] \n\t"
"vzero %%v0 \n\t"
"vzero %%v1 \n\t"
"vzero %%v2 \n\t"
"vzero %%v3 \n\t"
".align 16 \n\t"
"1: \n\t"
"pfd 1, 256(%1 ) \n\t"
"vlm %%v24,%%v31, 0(%1 ) \n\t"
"pfd 1, 256(%[ptr_temp] ) \n\t"
"vlm %%v24,%%v31, 0(%[ptr_temp] ) \n\t"
"vflpdb %%v24, %%v24 \n\t"
"vflpdb %%v25, %%v25 \n\t"
@ -71,7 +71,7 @@ static FLOAT dasum_kernel_32(BLASLONG n, FLOAT *x) {
"vfadb %%v2,%%v2,%%v30 \n\t"
"vfadb %%v3,%%v3,%%v31 \n\t"
"vlm %%v24,%%v31, 128(%1) \n\t"
"vlm %%v24,%%v31, 128(%[ptr_temp]) \n\t"
"vflpdb %%v24, %%v24 \n\t"
"vflpdb %%v25, %%v25 \n\t"
@ -81,7 +81,7 @@ static FLOAT dasum_kernel_32(BLASLONG n, FLOAT *x) {
"vflpdb %%v29, %%v29 \n\t"
"vflpdb %%v30, %%v30 \n\t"
"vflpdb %%v31, %%v31 \n\t"
"la %1,256(%1) \n\t"
"la %[ptr_temp],256(%[ptr_temp]) \n\t"
"vfadb %%v0,%%v0,%%v24 \n\t"
"vfadb %%v1,%%v1,%%v25 \n\t"
"vfadb %%v2,%%v2,%%v26 \n\t"
@ -91,15 +91,15 @@ static FLOAT dasum_kernel_32(BLASLONG n, FLOAT *x) {
"vfadb %%v2,%%v2,%%v30 \n\t"
"vfadb %%v3,%%v3,%%v31 \n\t"
"clgrjl %1,%%r0,1b \n\t"
"clgrjl %[ptr_temp],%%r0,1b \n\t"
"vfadb %%v24,%%v0,%%v1 \n\t"
"vfadb %%v25,%%v2,%%v3 \n\t"
"vfadb %%v0,%%v25,%%v24 \n\t"
"vrepg %%v1,%%v0,1 \n\t"
"adbr %%f0,%%f1 \n\t"
"ldr %0,%%f0 \n\t"
: "=f"(asum),"+&a"(x)
: "r"(n), "1"(x)
"ldr %[asum],%%f0 \n\t"
: [asum] "=f"(asum),[ptr_temp] "+&a"(x)
: [mem] "m"( *(const double (*)[n])x ), [n] "r"(n), [ptr_x] "a"(x)
: "cc", "r0" ,"f0","f1","v0","v1","v2","v3","v24","v25","v26","v27","v28","v29","v30","v31"
);
return asum;

View File

@ -27,15 +27,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h"
#define Z13_D 1
#define PREFETCH_INS 1
#if defined(Z13_A)
#include <vecintrin.h>
static void daxpy_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
static void daxpy_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT alpha)
{
BLASLONG i = 0;
__vector double v_a = {*alpha,*alpha};
__vector double v_a = {alpha,alpha};
__vector double * v_y=(__vector double *)y;
__vector double * v_x=(__vector double *)x;
@ -60,230 +60,27 @@ static void daxpy_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
}
}
#elif defined(Z13_B)
/*
 * daxpy_kernel_32 (Z13_B variant): y[i] = (*alpha) * x[i] + y[i], done in
 * s390x vector inline assembly.
 *
 * n      element count; the loop runs n >> 5 times and each pass covers
 *        256 bytes of x and y (32 elements, assuming FLOAT is an 8-byte
 *        double as the "db" instruction forms suggest).
 * x      source vector (asm operand %1), only loaded from.
 * y      destination vector (asm operand %2), loaded and stored in place.
 * alpha  pointer to the scalar multiplier (asm operand %3).
 *
 * The loop is bottom-tested (brctg), so this presumably requires n to be a
 * non-zero multiple of 32 -- TODO confirm against the caller.
 *
 * NOTE(review): operand %3 is declared input-only, yet the asm overwrites it
 * with the iteration count (srlg) and decrements it (brctg). It is not listed
 * as an output or read-write operand.
 * NOTE(review): "vlr %%v1,%%v0" writes v1, but v1 is missing from the clobber
 * list and is not read anywhere else in this variant.
 */
static void __attribute__ ((noinline)) daxpy_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
{
__asm__ volatile(
#if defined(PREFETCH_INS)
/* initial prefetch of x (code 1) and y (code 2) */
"pfd 1, 0(%1) \n\t"
"pfd 2, 0(%2) \n\t"
#endif
/* v0 = {*alpha, *alpha}: load the doubleword at 0(%3) and replicate it */
"vlrepg %%v0 , 0(%3) \n\t"
/* %3 = n >> 5 (iteration count); the alpha pointer in %3 is gone after this */
"srlg %3,%0,5 \n\t"
/* r1 = 0: running byte offset used as index register for both x and y */
"xgr %%r1,%%r1 \n\t"
"vlr %%v1,%%v0 \n\t"
".align 16 \n\t"
"1: \n\t"
#if defined(PREFETCH_INS)
/* prefetch 256 bytes ahead of the current offset */
"pfd 1, 256(%%r1,%1) \n\t"
"pfd 2, 256(%%r1,%2) \n\t"
#endif
/* each 4-instruction group: load y, load x, x = alpha*x + y, store to y */
"vl %%v24, 0(%%r1,%2) \n\t"
"vl %%v16, 0(%%r1,%1) \n\t"
"vfmadb %%v16,%%v0,%%v16,%%v24 \n\t"
"vst %%v16, 0(%%r1,%2) \n\t"
"vl %%v25, 16(%%r1,%2) \n\t"
"vl %%v17, 16(%%r1,%1) \n\t"
"vfmadb %%v17,%%v0,%%v17,%%v25 \n\t"
"vst %%v17, 16(%%r1,%2) \n\t"
"vl %%v26, 32(%%r1,%2) \n\t"
"vl %%v18, 32(%%r1,%1) \n\t"
"vfmadb %%v18,%%v0,%%v18,%%v26 \n\t"
"vst %%v18, 32(%%r1,%2) \n\t"
"vl %%v27, 48(%%r1,%2) \n\t"
"vl %%v19, 48(%%r1,%1) \n\t"
"vfmadb %%v19,%%v0,%%v19,%%v27 \n\t"
"vst %%v19, 48(%%r1,%2) \n\t"
/* bytes 64..127 of the current 256-byte chunk */
"vl %%v24,( 0+64)(%%r1,%2) \n\t"
"vl %%v16,( 0+64)(%%r1,%1) \n\t"
"vfmadb %%v16,%%v0,%%v16,%%v24 \n\t"
"vst %%v16,( 0+64)(%%r1,%2) \n\t"
"vl %%v25, (16+64)(%%r1,%2) \n\t"
"vl %%v17, (16+64)(%%r1,%1) \n\t"
"vfmadb %%v17,%%v0,%%v17,%%v25 \n\t"
"vst %%v17, (16+64)(%%r1,%2) \n\t"
"vl %%v26, (32+64)(%%r1,%2) \n\t"
"vl %%v18, (32+64)(%%r1,%1) \n\t"
"vfmadb %%v18,%%v0,%%v18,%%v26 \n\t"
"vst %%v18, (32+64)(%%r1,%2) \n\t"
"vl %%v27, (48+64)(%%r1,%2) \n\t"
"vl %%v19, (48+64)(%%r1,%1) \n\t"
"vfmadb %%v19,%%v0,%%v19,%%v27 \n\t"
"vst %%v19, (48+64)(%%r1,%2) \n\t"
/* bytes 128..191 */
"vl %%v24,( 0+128)(%%r1,%2) \n\t"
"vl %%v16,( 0+128)(%%r1,%1) \n\t"
"vfmadb %%v16,%%v0,%%v16,%%v24 \n\t"
"vst %%v16,( 0+128)(%%r1,%2) \n\t"
"vl %%v25, (16+128)(%%r1,%2) \n\t"
"vl %%v17, (16+128)(%%r1,%1) \n\t"
"vfmadb %%v17,%%v0,%%v17,%%v25 \n\t"
"vst %%v17, (16+128)(%%r1,%2) \n\t"
"vl %%v26, (32+128)(%%r1,%2) \n\t"
"vl %%v18, (32+128)(%%r1,%1) \n\t"
"vfmadb %%v18,%%v0,%%v18,%%v26 \n\t"
"vst %%v18, (32+128)(%%r1,%2) \n\t"
"vl %%v27, (48+128)(%%r1,%2) \n\t"
"vl %%v19, (48+128)(%%r1,%1) \n\t"
"vfmadb %%v19,%%v0,%%v19,%%v27 \n\t"
"vst %%v19, (48+128)(%%r1,%2) \n\t"
/* bytes 192..255 */
"vl %%v24,( 0+192)(%%r1,%2) \n\t"
"vl %%v16,( 0+192)(%%r1,%1) \n\t"
"vfmadb %%v16,%%v0,%%v16,%%v24 \n\t"
"vst %%v16,( 0+192)(%%r1,%2) \n\t"
"vl %%v25, (16+192)(%%r1,%2) \n\t"
"vl %%v17, (16+192)(%%r1,%1) \n\t"
"vfmadb %%v17,%%v0,%%v17,%%v25 \n\t"
"vst %%v17, (16+192)(%%r1,%2) \n\t"
"vl %%v26, (32+192)(%%r1,%2) \n\t"
"vl %%v18, (32+192)(%%r1,%1) \n\t"
"vfmadb %%v18,%%v0,%%v18,%%v26 \n\t"
"vst %%v18, (32+192)(%%r1,%2) \n\t"
"vl %%v27, (48+192)(%%r1,%2) \n\t"
"vl %%v19, (48+192)(%%r1,%1) \n\t"
"vfmadb %%v19,%%v0,%%v19,%%v27 \n\t"
"vst %%v19, (48+192)(%%r1,%2) \n\t"
/* advance the byte offset by one 256-byte chunk */
"la %%r1,256(%%r1) \n\t"
/* decrement the count in %3 and loop while it is non-zero */
"brctg %3,1b"
:
:"r"(n),"a"(x),"a"(y),"a"(alpha)
:"cc", "memory", "r1" ,"v0" ,"v16","v17","v18","v19", "v24","v25","v26","v27"
);
}
#elif defined(Z13_C)
static void __attribute__ ((noinline)) daxpy_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
#else
static void daxpy_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT alpha)
{
__asm__ volatile(
#if defined(PREFETCH_INS)
"pfd 1, 0(%1) \n\t"
"pfd 2, 0(%2) \n\t"
"pfd 1, 0(%[x_tmp]) \n\t"
"pfd 2, 0(%[y_tmp]) \n\t"
#endif
"vlrepg %%v0 , 0(%3) \n\t"
"srlg %3,%0,5 \n\t"
"xgr %%r1,%%r1 \n\t"
"lgdr %%r0,%[alpha] \n\t"
"vlvgp %%v0,%%r0,%%r0 \n\t"
"srlg %%r0,%[n],5 \n\t"
"vlr %%v1,%%v0 \n\t"
".align 16 \n\t"
"1: \n\t"
#if defined(PREFETCH_INS)
"pfd 1, 256(%%r1,%1) \n\t"
"pfd 2, 256(%%r1,%2) \n\t"
"pfd 1, 256(%[x_tmp]) \n\t"
"pfd 2, 256(%[y_tmp]) \n\t"
#endif
"vl %%v16, 0(%%r1,%1) \n\t"
"vl %%v17, 16(%%r1,%1) \n\t"
"vl %%v18, 32(%%r1,%1) \n\t"
"vl %%v19, 48(%%r1,%1) \n\t"
"vl %%v24, 0(%%r1,%2) \n\t"
"vl %%v25, 16(%%r1,%2) \n\t"
"vl %%v26, 32(%%r1,%2) \n\t"
"vl %%v27, 48(%%r1,%2) \n\t"
"vfmadb %%v16,%%v0,%%v16,%%v24 \n\t"
"vfmadb %%v17,%%v1,%%v17,%%v25 \n\t"
"vfmadb %%v18,%%v0,%%v18,%%v26 \n\t"
"vfmadb %%v19,%%v1,%%v19,%%v27 \n\t"
"vst %%v16, 0(%%r1,%2) \n\t"
"vst %%v17, 16(%%r1,%2) \n\t"
"vst %%v18, 32(%%r1,%2) \n\t"
"vst %%v19, 48(%%r1,%2) \n\t"
"vl %%v24, 64(%%r1,%1) \n\t"
"vl %%v25, 80(%%r1,%1) \n\t"
"vl %%v26, 96(%%r1,%1) \n\t"
"vl %%v27, 112(%%r1,%1) \n\t"
"vl %%v16, 64(%%r1,%2) \n\t"
"vl %%v17, 80(%%r1,%2) \n\t"
"vl %%v18, 96(%%r1,%2) \n\t"
"vl %%v19, 112(%%r1,%2) \n\t"
"vfmadb %%v24,%%v0,%%v24,%%v16 \n\t"
"vfmadb %%v25,%%v1,%%v25,%%v17 \n\t"
"vfmadb %%v26,%%v0,%%v26,%%v18 \n\t"
"vfmadb %%v27,%%v1,%%v27,%%v19 \n\t"
"vst %%v24, 64(%%r1,%2) \n\t"
"vst %%v25, 80(%%r1,%2) \n\t"
"vst %%v26, 96(%%r1,%2) \n\t"
"vst %%v27, 112(%%r1,%2) \n\t"
"vl %%v16, (0+128)(%%r1,%1) \n\t"
"vl %%v17, (16+128)(%%r1,%1) \n\t"
"vl %%v18, (32+128)(%%r1,%1) \n\t"
"vl %%v19, (48+128)(%%r1,%1) \n\t"
"vl %%v24, (0+128)(%%r1,%2) \n\t"
"vl %%v25, (16+128)(%%r1,%2) \n\t"
"vl %%v26, (32+128)(%%r1,%2) \n\t"
"vl %%v27, (48+128)(%%r1,%2) \n\t"
"vfmadb %%v16,%%v0,%%v16,%%v24 \n\t"
"vfmadb %%v17,%%v1,%%v17,%%v25 \n\t"
"vfmadb %%v18,%%v0,%%v18,%%v26 \n\t"
"vfmadb %%v19,%%v1,%%v19,%%v27 \n\t"
"vst %%v16, (0+128)(%%r1,%2) \n\t"
"vst %%v17, (16+128)(%%r1,%2) \n\t"
"vst %%v18, (32+128)(%%r1,%2) \n\t"
"vst %%v19, (48+128)(%%r1,%2) \n\t"
"vl %%v24, (64+128)(%%r1,%1) \n\t"
"vl %%v25, (80+128)(%%r1,%1) \n\t"
"vl %%v26, (96+128)(%%r1,%1) \n\t"
"vl %%v27, (112+128)(%%r1,%1) \n\t"
"vl %%v16, (64+128)(%%r1,%2) \n\t"
"vl %%v17, (80+128)(%%r1,%2) \n\t"
"vl %%v18, (96+128)(%%r1,%2) \n\t"
"vl %%v19, (112+128)(%%r1,%2) \n\t"
"vfmadb %%v24,%%v0,%%v24,%%v16 \n\t"
"vfmadb %%v25,%%v1,%%v25,%%v17 \n\t"
"vfmadb %%v26,%%v0,%%v26,%%v18 \n\t"
"vfmadb %%v27,%%v1,%%v27,%%v19 \n\t"
"vst %%v24, (64+128)(%%r1,%2) \n\t"
"vst %%v25, (80+128)(%%r1,%2) \n\t"
"vst %%v26, (96+128)(%%r1,%2) \n\t"
"vst %%v27, (112+128)(%%r1,%2) \n\t"
"la %%r1,256(%%r1) \n\t"
"brctg %3,1b"
:
:"r"(n),"a"(x),"a"(y),"a"(alpha)
:"cc", "memory", "r1" ,"v0","v1","v16","v17","v18","v19", "v24","v25","v26","v27"
);
}
#elif defined(Z13_D)
static void __attribute__ ((noinline)) daxpy_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
{
__asm__ volatile(
#if defined(PREFETCH_INS)
"pfd 1, 0(%1) \n\t"
"pfd 2, 0(%2) \n\t"
#endif
"vlrepg %%v0 , 0(%3) \n\t"
"srlg %3,%0,5 \n\t"
"vlr %%v1,%%v0 \n\t"
".align 16 \n\t"
"1: \n\t"
#if defined(PREFETCH_INS)
"pfd 1, 256(%1) \n\t"
"pfd 2, 256(%2) \n\t"
#endif
"vlm %%v16,%%v23, 0(%1) \n\t"
"vlm %%v24, %%v31, 0(%2) \n\t"
"vlm %%v16,%%v23, 0(%[x_tmp]) \n\t"
"vlm %%v24, %%v31, 0(%[y_tmp]) \n\t"
"vfmadb %%v16,%%v0,%%v16,%%v24 \n\t"
"vfmadb %%v17,%%v1,%%v17,%%v25 \n\t"
"vfmadb %%v18,%%v0,%%v18,%%v26 \n\t"
@ -292,9 +89,9 @@ static void __attribute__ ((noinline)) daxpy_kernel_32(BLASLONG n, FLOAT *x, FL
"vfmadb %%v21,%%v1,%%v21,%%v29 \n\t"
"vfmadb %%v22,%%v0,%%v22,%%v30 \n\t"
"vfmadb %%v23,%%v1,%%v23,%%v31 \n\t"
"vstm %%v16,%%v23, 0(%2) \n\t"
"vlm %%v24,%%v31, 128(%1) \n\t"
"vlm %%v16,%%v23, 128(%2) \n\t"
"vstm %%v16,%%v23, 0(%[y_tmp]) \n\t"
"vlm %%v24,%%v31, 128(%[x_tmp]) \n\t"
"vlm %%v16,%%v23, 128(%[y_tmp]) \n\t"
"vfmadb %%v24,%%v0,%%v24,%%v16 \n\t"
"vfmadb %%v25,%%v1,%%v25,%%v17 \n\t"
"vfmadb %%v26,%%v0,%%v26,%%v18 \n\t"
@ -303,13 +100,13 @@ static void __attribute__ ((noinline)) daxpy_kernel_32(BLASLONG n, FLOAT *x, FL
"vfmadb %%v29,%%v1,%%v29,%%v21 \n\t"
"vfmadb %%v30,%%v0,%%v30,%%v22 \n\t"
"vfmadb %%v31,%%v1,%%v31,%%v23 \n\t"
"la %1,256(%1) \n\t"
"vstm %%v24, %%v31, 128(%2) \n\t"
"la %2,256(%2) \n\t"
"brctg %3,1b"
:
:"r"(n),"a"(x),"a"(y),"a"(alpha)
:"cc", "memory", "v0","v1","v16","v17","v18","v19","v20","v21",
"la %[x_tmp],256(%[x_tmp]) \n\t"
"vstm %%v24, %%v31, 128(%[y_tmp]) \n\t"
"la %[y_tmp],256(%[y_tmp]) \n\t"
"brctg %%r0,1b"
: [mem_y] "+m" (*(double (*)[n])y), [x_tmp] "+&a"(x), [y_tmp] "+&a"(y)
: [mem_x] "m" (*(const double (*)[n])x), [n] "r"(n), [alpha] "f"(alpha)
:"cc", "r0", "v0","v1","v16","v17","v18","v19","v20","v21",
"v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
);
@ -334,7 +131,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS
BLASLONG n1 = n & -32;
if ( n1 )
daxpy_kernel_32(n1, x, y , &da );
daxpy_kernel_32(n1, x, y , da );
i = n1;
while(i < n)

View File

@ -30,83 +30,84 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#if defined(Z13mvc)
static void __attribute__ ((noinline)) dcopy_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y) {
static void dcopy_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y) {
__asm__ volatile(
"pfd 1, 0(%1) \n\t"
"pfd 2, 0(%2) \n\t"
"srlg %%r0,%0,5 \n\t"
"pfd 1, 0(%[ptr_x]) \n\t"
"pfd 2, 0(%[ptr_y]) \n\t"
"srlg %[n_tmp],%[n_tmp],5 \n\t"
".align 16 \n\t"
"1: \n\t"
"mvc 0(256,%2),0(%1) \n\t"
"la %1,256(%1) \n\t"
"la %2,256(%2) \n\t"
"brctg %%r0,1b"
:
: "r"(n), "a"(x), "a"(y)
: "cc", "memory","r0"
"mvc 0(256,%[ptr_y]),0(%[ptr_x]) \n\t"
"la %[ptr_x],256(%[ptr_x]) \n\t"
"la %[ptr_y],256(%[ptr_y]) \n\t"
"brctg %[n_tmp],1b"
: [mem_y] "=m" (*(double (*)[n])y), [n_tmp] "+&r"(n)
: [mem_x] "m" (*(const double (*)[n])x),
[ptr_x] "a"(x), [ptr_y] "a"(y)
: "cc"
);
return;
}
#else
static void __attribute__ ((noinline)) dcopy_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y) {
static void dcopy_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y) {
__asm__ volatile(
"pfd 1, 0(%1) \n\t"
"pfd 2, 0(%2) \n\t"
"srlg %%r0,%0,5 \n\t"
"pfd 1, 0(%[ptr_x]) \n\t"
"pfd 2, 0(%[ptr_y]) \n\t"
"srlg %[n_tmp],%[n_tmp],5 \n\t"
"xgr %%r1,%%r1 \n\t"
".align 16 \n\t"
"1: \n\t"
"pfd 1, 256(%%r1,%1) \n\t"
"pfd 2, 256(%%r1,%2) \n\t"
"pfd 1, 256(%%r1,%[ptr_x]) \n\t"
"pfd 2, 256(%%r1,%[ptr_y]) \n\t"
"vl %%v24, 0(%%r1,%1) \n\t"
"vst %%v24, 0(%%r1,%2) \n\t"
"vl %%v25, 16(%%r1,%1) \n\t"
"vst %%v25, 16(%%r1,%2) \n\t"
"vl %%v26, 32(%%r1,%1) \n\t"
"vst %%v26, 32(%%r1,%2) \n\t"
"vl %%v27, 48(%%r1,%1) \n\t"
"vst %%v27, 48(%%r1,%2) \n\t"
"vl %%v24, 0(%%r1,%[ptr_x]) \n\t"
"vst %%v24, 0(%%r1,%[ptr_y]) \n\t"
"vl %%v25, 16(%%r1,%[ptr_x]) \n\t"
"vst %%v25, 16(%%r1,%[ptr_y]) \n\t"
"vl %%v26, 32(%%r1,%[ptr_x]) \n\t"
"vst %%v26, 32(%%r1,%[ptr_y]) \n\t"
"vl %%v27, 48(%%r1,%[ptr_x]) \n\t"
"vst %%v27, 48(%%r1,%[ptr_y]) \n\t"
"vl %%v24, 64(%%r1,%1) \n\t"
"vst %%v24, 64(%%r1,%2) \n\t"
"vl %%v25, 80(%%r1,%1) \n\t"
"vst %%v25, 80(%%r1,%2) \n\t"
"vl %%v26, 96(%%r1,%1) \n\t"
"vst %%v26, 96(%%r1,%2) \n\t"
"vl %%v27, 112(%%r1,%1) \n\t"
"vst %%v27, 112(%%r1,%2) \n\t"
"vl %%v24, 64(%%r1,%[ptr_x]) \n\t"
"vst %%v24, 64(%%r1,%[ptr_y]) \n\t"
"vl %%v25, 80(%%r1,%[ptr_x]) \n\t"
"vst %%v25, 80(%%r1,%[ptr_y]) \n\t"
"vl %%v26, 96(%%r1,%[ptr_x]) \n\t"
"vst %%v26, 96(%%r1,%[ptr_y]) \n\t"
"vl %%v27, 112(%%r1,%[ptr_x]) \n\t"
"vst %%v27, 112(%%r1,%[ptr_y]) \n\t"
"vl %%v24, 128(%%r1,%1) \n\t"
"vst %%v24, 128(%%r1,%2) \n\t"
"vl %%v24, 128(%%r1,%[ptr_x]) \n\t"
"vst %%v24, 128(%%r1,%[ptr_y]) \n\t"
"vl %%v25, 144(%%r1,%1) \n\t"
"vst %%v25, 144(%%r1,%2) \n\t"
"vl %%v25, 144(%%r1,%[ptr_x]) \n\t"
"vst %%v25, 144(%%r1,%[ptr_y]) \n\t"
"vl %%v26, 160(%%r1,%1) \n\t"
"vst %%v26, 160(%%r1,%2) \n\t"
"vl %%v26, 160(%%r1,%[ptr_x]) \n\t"
"vst %%v26, 160(%%r1,%[ptr_y]) \n\t"
"vl %%v27, 176(%%r1,%1) \n\t"
"vst %%v27, 176(%%r1,%2) \n\t"
"vl %%v27, 176(%%r1,%[ptr_x]) \n\t"
"vst %%v27, 176(%%r1,%[ptr_y]) \n\t"
"vl %%v24, 192(%%r1,%1) \n\t"
"vst %%v24, 192(%%r1,%2) \n\t"
"vl %%v25, 208(%%r1,%1) \n\t"
"vst %%v25, 208(%%r1,%2) \n\t"
"vl %%v26, 224(%%r1,%1) \n\t"
"vst %%v26, 224(%%r1,%2) \n\t"
"vl %%v27, 240(%%r1,%1) \n\t"
"vst %%v27, 240(%%r1,%2) \n\t"
"vl %%v24, 192(%%r1,%[ptr_x]) \n\t"
"vst %%v24, 192(%%r1,%[ptr_y]) \n\t"
"vl %%v25, 208(%%r1,%[ptr_x]) \n\t"
"vst %%v25, 208(%%r1,%[ptr_y]) \n\t"
"vl %%v26, 224(%%r1,%[ptr_x]) \n\t"
"vst %%v26, 224(%%r1,%[ptr_y]) \n\t"
"vl %%v27, 240(%%r1,%[ptr_x]) \n\t"
"vst %%v27, 240(%%r1,%[ptr_y]) \n\t"
"la %%r1,256(%%r1) \n\t"
"brctg %%r0,1b"
:
: "r"(n), "a"(x), "a"(y)
: "cc", "memory","r0","r1", "v24","v25","v26","v27"
"brctg %[n_tmp],1b"
: [mem_y] "=m" (*(double (*)[n])y), [n_tmp] "+&r"(n)
: [mem_x] "m" (*(const double (*)[n])x), [ptr_x] "a"(x), [ptr_y] "a"(y)
: "cc", "r1", "v24","v25","v26","v27"
);
return;

View File

@ -30,65 +30,67 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#if defined(Z13)
static FLOAT ddot_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y)
static FLOAT ddot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y)
{
FLOAT dot;
__asm__ volatile(
"pfd 1, 0(%2) \n\t"
"pfd 1, 0(%3) \n\t"
"pfd 1, 0(%[ptr_x_tmp]) \n\t"
"pfd 1, 0(%[ptr_y_tmp]) \n\t"
"vzero %%v24 \n\t"
"vzero %%v25 \n\t"
"vzero %%v26 \n\t"
"vzero %%v27 \n\t"
"srlg %1,%1,4 \n\t"
"srlg %[n_tmp],%[n_tmp],4 \n\t"
"xgr %%r1,%%r1 \n\t"
".align 16 \n\t"
"1: \n\t"
"pfd 1, 256(%%r1,%2) \n\t"
"pfd 1, 256(%%r1,%3) \n\t"
"vl %%v16, 0(%%r1,%2) \n\t"
"vl %%v17, 16(%%r1,%2) \n\t"
"vl %%v18, 32(%%r1,%2) \n\t"
"vl %%v19, 48(%%r1,%2) \n\t"
"pfd 1, 256(%%r1,%[ptr_x_tmp]) \n\t"
"pfd 1, 256(%%r1,%[ptr_y_tmp]) \n\t"
"vl %%v16, 0(%%r1,%[ptr_x_tmp]) \n\t"
"vl %%v17, 16(%%r1,%[ptr_x_tmp]) \n\t"
"vl %%v18, 32(%%r1,%[ptr_x_tmp]) \n\t"
"vl %%v19, 48(%%r1,%[ptr_x_tmp]) \n\t"
"vl %%v28, 0(%%r1,%3) \n\t"
"vl %%v28, 0(%%r1,%[ptr_y_tmp]) \n\t"
"vfmadb %%v24,%%v16,%%v28,%%v24 \n\t"
"vl %%v29, 16(%%r1,%3) \n\t"
"vl %%v29, 16(%%r1,%[ptr_y_tmp]) \n\t"
"vfmadb %%v25,%%v17,%%v29,%%v25 \n\t"
"vl %%v30, 32(%%r1,%3) \n\t"
"vl %%v30, 32(%%r1,%[ptr_y_tmp]) \n\t"
"vfmadb %%v26,%%v18,%%v30,%%v26 \n\t"
"vl %%v31, 48(%%r1,%3) \n\t"
"vl %%v31, 48(%%r1,%[ptr_y_tmp]) \n\t"
"vfmadb %%v27,%%v19,%%v31,%%v27 \n\t"
"vl %%v16, 64(%%r1,%2) \n\t"
"vl %%v17, 80(%%r1,%2) \n\t"
"vl %%v18, 96(%%r1,%2) \n\t"
"vl %%v19, 112(%%r1,%2) \n\t"
"vl %%v16, 64(%%r1 ,%[ptr_x_tmp]) \n\t"
"vl %%v17, 80(%%r1,%[ptr_x_tmp]) \n\t"
"vl %%v18, 96(%%r1,%[ptr_x_tmp]) \n\t"
"vl %%v19, 112(%%r1,%[ptr_x_tmp]) \n\t"
"vl %%v28, 64(%%r1,%3) \n\t"
"vl %%v28, 64(%%r1,%[ptr_y_tmp]) \n\t"
"vfmadb %%v24,%%v16,%%v28,%%v24 \n\t"
"vl %%v29, 80(%%r1,%3) \n\t"
"vl %%v29, 80(%%r1,%[ptr_y_tmp]) \n\t"
"vfmadb %%v25,%%v17,%%v29,%%v25 \n\t"
"vl %%v30, 96(%%r1,%3) \n\t"
"vl %%v30, 96(%%r1,%[ptr_y_tmp]) \n\t"
"vfmadb %%v26,%%v18,%%v30,%%v26 \n\t"
"vl %%v31, 112(%%r1,%3) \n\t"
"vl %%v31, 112(%%r1,%[ptr_y_tmp]) \n\t"
"vfmadb %%v27,%%v19,%%v31,%%v27 \n\t"
"la %%r1,128(%%r1) \n\t"
"brctg %1,1b \n\t"
"brctg %[n_tmp],1b \n\t"
"vfadb %%v24,%%v25,%%v24 \n\t"
"vfadb %%v24,%%v26,%%v24 \n\t"
"vfadb %%v24,%%v27,%%v24 \n\t"
"vrepg %%v1,%%v24,1 \n\t"
"vfadb %%v1,%%v24,%%v1 \n\t"
"ldr %0, %%f1 \n\t"
: "=f"(dot) ,"+&r"(n)
: "a"(x),"a"(y)
:"cc" , "r1","v16", "v17","v18","v19","v20","v21","v22","v23",
"ldr %[dot], %%f1 \n\t"
: [dot] "=f"(dot) ,[n_tmp] "+&r"(n)
: [mem_x] "m"( *(const double (*)[n])x),
[mem_y] "m"( *(const double (*)[n])y),
[ptr_x_tmp]"a"(x), [ptr_y_tmp] "a"(y)
:"cc" , "r1","f1","v16", "v17","v18","v19","v20","v21","v22","v23",
"v24","v25","v26","v27","v28","v29","v30","v31"
);
@ -99,7 +101,7 @@ static FLOAT ddot_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y)
#else
static FLOAT ddot_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y )
static FLOAT ddot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y )
{
BLASLONG register i = 0;
FLOAT dot = 0.0;
@ -114,8 +116,17 @@ static FLOAT ddot_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y )
+ y[i+5] * x[i+5]
+ y[i+6] * x[i+6]
+ y[i+7] * x[i+7] ;
dot += y[i+8] * x[i+8]
+ y[i+9] * x[i+9]
+ y[i+10] * x[i+10]
+ y[i+11] * x[i+11]
+ y[i+12] * x[i+12]
+ y[i+13] * x[i+13]
+ y[i+14] * x[i+14]
+ y[i+15] * x[i+15] ;
i+=8 ;
i+=16 ;
}
return dot;
@ -138,10 +149,12 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
BLASLONG n1 = n & -16;
if ( n1 )
dot = ddot_kernel_8(n1, x, y );
if ( n1 ){
dot = ddot_kernel_16(n1, x, y );
i = n1;
}
while(i < n)
{

View File

@ -25,34 +25,31 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "common.h"
static void __attribute__ ((noinline)) drot_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s)
static void drot_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT cosA, FLOAT sinA)
{
__asm__ (
"pfd 2, 0(%1) \n\t"
"pfd 2, 0(%2) \n\t"
"vlrepg %%v0,0(%3) \n\t"
"vlrepg %%v1,0(%4) \n\t"
"srlg %%r0,%0,5 \n\t"
"pfd 2, 0(%[ptr_x]) \n\t"
"pfd 2, 0(%[ptr_y]) \n\t"
"lgdr %%r1,%[cos] \n\t"
"vlvgp %%v0,%%r1,%%r1 \n\t"
"lgdr %%r1,%[sin] \n\t"
"vlvgp %%v1,%%r1,%%r1 \n\t"
"srlg %[n_tmp],%[n_tmp],5 \n\t"
"xgr %%r1,%%r1 \n\t"
".align 16 \n\t"
"1: \n\t"
"pfd 2, 256(%%r1,%1) \n\t"
"pfd 2, 256(%%r1,%2) \n\t"
"vl %%v24, 0(%%r1,%1) \n\t"
"vl %%v25, 16(%%r1,%1) \n\t"
"vl %%v26, 32(%%r1,%1) \n\t"
"vl %%v27, 48(%%r1,%1) \n\t"
"vl %%v16, 0(%%r1,%2) \n\t"
"vl %%v17, 16(%%r1,%2) \n\t"
"vl %%v18, 32(%%r1,%2) \n\t"
"vl %%v19, 48(%%r1,%2) \n\t"
"pfd 2, 256(%%r1,%[ptr_x]) \n\t"
"pfd 2, 256(%%r1,%[ptr_y]) \n\t"
"vl %%v24, 0(%%r1,%[ptr_x]) \n\t"
"vl %%v25, 16(%%r1,%[ptr_x]) \n\t"
"vl %%v26, 32(%%r1,%[ptr_x]) \n\t"
"vl %%v27, 48(%%r1,%[ptr_x]) \n\t"
"vl %%v16, 0(%%r1,%[ptr_y]) \n\t"
"vl %%v17, 16(%%r1,%[ptr_y]) \n\t"
"vl %%v18, 32(%%r1,%[ptr_y]) \n\t"
"vl %%v19, 48(%%r1,%[ptr_y]) \n\t"
"vfmdb %%v28,%%v24,%%v0 \n\t"
"vfmdb %%v29,%%v25,%%v0 \n\t"
@ -72,25 +69,23 @@ static void __attribute__ ((noinline)) drot_kernel_32(BLASLONG n, FLOAT *x, FLO
"vfmadb %%v31,%%v19,%%v1,%%v31 \n\t"
"vfmsdb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */
"vst %%v28, 0(%%r1,%[ptr_x]) \n\t"
"vst %%v29, 16(%%r1,%[ptr_x]) \n\t"
"vst %%v30, 32(%%r1,%[ptr_x]) \n\t"
"vst %%v31, 48(%%r1,%[ptr_x]) \n\t"
"vst %%v20, 0(%%r1,%[ptr_y]) \n\t"
"vst %%v21, 16(%%r1,%[ptr_y]) \n\t"
"vst %%v22, 32(%%r1,%[ptr_y]) \n\t"
"vst %%v23, 48(%%r1,%[ptr_y]) \n\t"
"vst %%v28, 0(%%r1,%1) \n\t"
"vst %%v29, 16(%%r1,%1) \n\t"
"vst %%v30, 32(%%r1,%1) \n\t"
"vst %%v31, 48(%%r1,%1) \n\t"
"vst %%v20, 0(%%r1,%2) \n\t"
"vst %%v21, 16(%%r1,%2) \n\t"
"vst %%v22, 32(%%r1,%2) \n\t"
"vst %%v23, 48(%%r1,%2) \n\t"
"vl %%v24, 64(%%r1,%1) \n\t"
"vl %%v25, 80(%%r1,%1) \n\t"
"vl %%v26, 96(%%r1,%1) \n\t"
"vl %%v27, 112(%%r1,%1) \n\t"
"vl %%v16, 64(%%r1,%2) \n\t"
"vl %%v17, 80(%%r1,%2) \n\t"
"vl %%v18, 96(%%r1,%2) \n\t"
"vl %%v19, 112(%%r1,%2) \n\t"
"vl %%v24, 64(%%r1,%[ptr_x]) \n\t"
"vl %%v25, 80(%%r1,%[ptr_x]) \n\t"
"vl %%v26, 96(%%r1,%[ptr_x]) \n\t"
"vl %%v27, 112(%%r1,%[ptr_x]) \n\t"
"vl %%v16, 64(%%r1,%[ptr_y]) \n\t"
"vl %%v17, 80(%%r1,%[ptr_y]) \n\t"
"vl %%v18, 96(%%r1,%[ptr_y]) \n\t"
"vl %%v19, 112(%%r1,%[ptr_y]) \n\t"
"vfmdb %%v28,%%v24,%%v0 \n\t"
"vfmdb %%v29,%%v25,%%v0 \n\t"
@ -110,25 +105,23 @@ static void __attribute__ ((noinline)) drot_kernel_32(BLASLONG n, FLOAT *x, FLO
"vfmadb %%v31,%%v19,%%v1,%%v31 \n\t"
"vfmsdb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */
"vst %%v28, 64(%%r1,%[ptr_x]) \n\t"
"vst %%v29, 80(%%r1,%[ptr_x]) \n\t"
"vst %%v30, 96(%%r1,%[ptr_x]) \n\t"
"vst %%v31, 112(%%r1,%[ptr_x]) \n\t"
"vst %%v20, 64(%%r1,%[ptr_y]) \n\t"
"vst %%v21, 80(%%r1,%[ptr_y]) \n\t"
"vst %%v22, 96(%%r1,%[ptr_y]) \n\t"
"vst %%v23, 112(%%r1,%[ptr_y]) \n\t"
"vst %%v28, 64(%%r1,%1) \n\t"
"vst %%v29, 80(%%r1,%1) \n\t"
"vst %%v30, 96(%%r1,%1) \n\t"
"vst %%v31, 112(%%r1,%1) \n\t"
"vst %%v20, 64(%%r1,%2) \n\t"
"vst %%v21, 80(%%r1,%2) \n\t"
"vst %%v22, 96(%%r1,%2) \n\t"
"vst %%v23, 112(%%r1,%2) \n\t"
"vl %%v24, 128(%%r1,%1) \n\t"
"vl %%v25, 144(%%r1,%1) \n\t"
"vl %%v26, 160(%%r1,%1) \n\t"
"vl %%v27, 176(%%r1,%1) \n\t"
"vl %%v16, 128(%%r1,%2) \n\t"
"vl %%v17, 144(%%r1,%2) \n\t"
"vl %%v18, 160(%%r1,%2) \n\t"
"vl %%v19, 176(%%r1,%2) \n\t"
"vl %%v24, 128(%%r1,%[ptr_x]) \n\t"
"vl %%v25, 144(%%r1,%[ptr_x]) \n\t"
"vl %%v26, 160(%%r1,%[ptr_x]) \n\t"
"vl %%v27, 176(%%r1,%[ptr_x]) \n\t"
"vl %%v16, 128(%%r1,%[ptr_y]) \n\t"
"vl %%v17, 144(%%r1,%[ptr_y]) \n\t"
"vl %%v18, 160(%%r1,%[ptr_y]) \n\t"
"vl %%v19, 176(%%r1,%[ptr_y]) \n\t"
"vfmdb %%v28,%%v24,%%v0 \n\t"
"vfmdb %%v29,%%v25,%%v0 \n\t"
@ -148,25 +141,23 @@ static void __attribute__ ((noinline)) drot_kernel_32(BLASLONG n, FLOAT *x, FLO
"vfmadb %%v31,%%v19,%%v1,%%v31 \n\t"
"vfmsdb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */
"vst %%v28, 128(%%r1,%[ptr_x]) \n\t"
"vst %%v29, 144(%%r1,%[ptr_x]) \n\t"
"vst %%v30, 160(%%r1,%[ptr_x]) \n\t"
"vst %%v31, 176(%%r1,%[ptr_x]) \n\t"
"vst %%v20, 128(%%r1,%[ptr_y]) \n\t"
"vst %%v21, 144(%%r1,%[ptr_y]) \n\t"
"vst %%v22, 160(%%r1,%[ptr_y]) \n\t"
"vst %%v23, 176(%%r1,%[ptr_y]) \n\t"
"vst %%v28, 128(%%r1,%1) \n\t"
"vst %%v29, 144(%%r1,%1) \n\t"
"vst %%v30, 160(%%r1,%1) \n\t"
"vst %%v31, 176(%%r1,%1) \n\t"
"vst %%v20, 128(%%r1,%2) \n\t"
"vst %%v21, 144(%%r1,%2) \n\t"
"vst %%v22, 160(%%r1,%2) \n\t"
"vst %%v23, 176(%%r1,%2) \n\t"
"vl %%v24, 192(%%r1,%1) \n\t"
"vl %%v25, 208(%%r1,%1) \n\t"
"vl %%v26, 224(%%r1,%1) \n\t"
"vl %%v27, 240(%%r1,%1) \n\t"
"vl %%v16, 192(%%r1,%2) \n\t"
"vl %%v17, 208(%%r1,%2) \n\t"
"vl %%v18, 224(%%r1,%2) \n\t"
"vl %%v19, 240(%%r1,%2) \n\t"
"vl %%v24, 192(%%r1,%[ptr_x]) \n\t"
"vl %%v25, 208(%%r1,%[ptr_x]) \n\t"
"vl %%v26, 224(%%r1,%[ptr_x]) \n\t"
"vl %%v27, 240(%%r1,%[ptr_x]) \n\t"
"vl %%v16, 192(%%r1,%[ptr_y]) \n\t"
"vl %%v17, 208(%%r1,%[ptr_y]) \n\t"
"vl %%v18, 224(%%r1,%[ptr_y]) \n\t"
"vl %%v19, 240(%%r1,%[ptr_y]) \n\t"
"vfmdb %%v28,%%v24,%%v0 \n\t"
"vfmdb %%v29,%%v25,%%v0 \n\t"
@ -186,33 +177,28 @@ static void __attribute__ ((noinline)) drot_kernel_32(BLASLONG n, FLOAT *x, FLO
"vfmadb %%v31,%%v19,%%v1,%%v31 \n\t"
"vfmsdb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */
"vst %%v28, 192(%%r1,%1) \n\t"
"vst %%v29, 208(%%r1,%1) \n\t"
"vst %%v30, 224(%%r1,%1) \n\t"
"vst %%v31, 240(%%r1,%1) \n\t"
"vst %%v20, 192(%%r1,%2) \n\t"
"vst %%v21, 208(%%r1,%2) \n\t"
"vst %%v22, 224(%%r1,%2) \n\t"
"vst %%v23, 240(%%r1,%2) \n\t"
"vst %%v28, 192(%%r1,%[ptr_x]) \n\t"
"vst %%v29, 208(%%r1,%[ptr_x]) \n\t"
"vst %%v30, 224(%%r1,%[ptr_x]) \n\t"
"vst %%v31, 240(%%r1,%[ptr_x]) \n\t"
"vst %%v20, 192(%%r1,%[ptr_y]) \n\t"
"vst %%v21, 208(%%r1,%[ptr_y]) \n\t"
"vst %%v22, 224(%%r1,%[ptr_y]) \n\t"
"vst %%v23, 240(%%r1,%[ptr_y]) \n\t"
"la %%r1,256(%%r1) \n\t"
"brctg %%r0,1b"
:
: "r"(n), "a"(x), "a"(y),"a"(c),"a"(s)
: "cc", "memory","r0","r1" ,"v0","v1","v16",
"brctg %[n_tmp],1b"
: [mem_x] "+m" (*(double (*)[n])x),
[mem_y] "+m" (*(double (*)[n])y),
[n_tmp] "+&r"(n)
: [ptr_x] "a"(x), [ptr_y] "a"(y),[cos] "f"(cosA),[sin] "f"(sinA)
: "cc", "r1" ,"v0","v1","v16",
"v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
);
return;
}
int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT c, FLOAT s)
{
BLASLONG i=0;
@ -228,10 +214,8 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT
BLASLONG n1 = n & -32;
if ( n1 > 0 )
{
FLOAT cosa,sina;
cosa=c;
sina=s;
drot_kernel_32(n1, x, y, &cosa, &sina);
drot_kernel_32(n1, x, y, c, s);
i=n1;
}
@ -245,7 +229,6 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT
}
}
else
{
@ -267,4 +250,3 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT
}

View File

@ -28,78 +28,87 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h"
#if defined(Z13)
static void __attribute__ ((noinline)) dscal_kernel_8( BLASLONG n, FLOAT da , FLOAT *x )
static void dscal_kernel_32( BLASLONG n, FLOAT da , FLOAT *x )
{
__asm__ ("pfd 2, 0(%1) \n\t"
"vrepg %%v0 , %%v0,0 \n\t"
"sllg %%r0,%0,3 \n\t"
"xgr %%r1,%%r1 \n\t"
/* faster than sequence of triples(vl vfmd vst) (tested OPENBLAS_LOOPS=10000) */
__asm__ ("pfd 2, 0(%[x_ptr]) \n\t"
"lgdr %%r0,%[alpha] \n\t"
"vlvgp %%v0,%%r0,%%r0 \n\t"
"vlr %%v1,%%v0 \n\t"
"sllg %%r0,%[n],3 \n\t"
"agr %%r0,%[x_ptr] \n\t"
".align 16 \n\t"
"1: \n\t"
"pfd 2, 256(%%r1,%1) \n\t"
"vl %%v24, 0(%%r1,%1) \n\t"
"pfd 2, 256(%[x_ptr]) \n\t"
"vlm %%v16,%%v23, 0(%[x_ptr]) \n\t"
"vfmdb %%v16,%%v16,%%v0 \n\t"
"vfmdb %%v17,%%v17,%%v1 \n\t"
"vfmdb %%v18,%%v18,%%v0 \n\t"
"vfmdb %%v19,%%v19,%%v1 \n\t"
"vfmdb %%v20,%%v20,%%v0 \n\t"
"vfmdb %%v21,%%v21,%%v1 \n\t"
"vfmdb %%v22,%%v22,%%v0 \n\t"
"vfmdb %%v23,%%v23,%%v1 \n\t"
"vstm %%v16,%%v23, 0(%[x_ptr]) \n\t"
"vlm %%v24,%%v31,128(%[x_ptr]) \n\t"
"vfmdb %%v24,%%v24,%%v0 \n\t"
"vst %%v24, 0(%%r1,%1) \n\t"
"vl %%v25, 16(%%r1,%1) \n\t"
"vfmdb %%v25,%%v25,%%v0 \n\t"
"vst %%v25, 16(%%r1,%1) \n\t"
"vl %%v26, 32(%%r1,%1) \n\t"
"vfmdb %%v25,%%v25,%%v1 \n\t"
"vfmdb %%v26,%%v26,%%v0 \n\t"
"vst %%v26, 32(%%r1,%1) \n\t"
"vl %%v27, 48(%%r1,%1) \n\t"
"vfmdb %%v27,%%v27,%%v0 \n\t"
"vst %%v27, 48(%%r1,%1) \n\t"
"vl %%v24, 64(%%r1,%1) \n\t"
"vfmdb %%v24,%%v24,%%v0 \n\t"
"vst %%v24, 64(%%r1,%1) \n\t"
"vl %%v25, 80(%%r1,%1) \n\t"
"vfmdb %%v25,%%v25,%%v0 \n\t"
"vst %%v25, 80(%%r1,%1) \n\t"
"vl %%v26, 96(%%r1,%1) \n\t"
"vfmdb %%v26,%%v26,%%v0 \n\t"
"vst %%v26, 96(%%r1,%1) \n\t"
"vl %%v27, 112(%%r1,%1) \n\t"
"vfmdb %%v27,%%v27,%%v0 \n\t"
"vst %%v27, 112(%%r1,%1) \n\t"
"la %%r1,128(%%r1) \n\t"
"clgrjl %%r1,%%r0,1b \n\t"
:
:"r"(n),"a"(x),"f"(da)
:"cc" , "memory" ,"r0","r1","v0","v24","v25","v26","v27"
"vfmdb %%v27,%%v27,%%v1 \n\t"
"vfmdb %%v28,%%v28,%%v0 \n\t"
"vfmdb %%v29,%%v29,%%v1 \n\t"
"vfmdb %%v30,%%v30,%%v0 \n\t"
"vfmdb %%v31,%%v31,%%v1 \n\t"
"vstm %%v24,%%v31,128(%[x_ptr]) \n\t"
"la %[x_ptr], 256(%[x_ptr]) \n\t"
"clgrjl %[x_ptr],%%r0,1b \n\t"
: [mem] "+m" (*(double (*)[n])x) ,[x_ptr] "+&a"(x)
: [n] "r"(n),[alpha] "f"(da)
:"cc" , "r0","v0","v1","v16","v17","v18","v19","v20","v21",
"v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
);
}
static void __attribute__ ((noinline)) dscal_kernel_8_zero( BLASLONG n, FLOAT da , FLOAT *x )
static void dscal_kernel_32_zero( BLASLONG n, FLOAT *x )
{
__asm__ ("pfd 2, 0(%1) \n\t"
"vzero %%v0 \n\t"
"sllg %%r0,%0,3 \n\t"
"xgr %%r1,%%r1 \n\t"
__asm__ ("pfd 2, 0(%[x_ptr]) \n\t"
"vzero %%v24 \n\t"
"sllg %%r0,%[n],3 \n\t"
"vzero %%v25 \n\t"
"agr %%r0,%[x_ptr] \n\t"
".align 16 \n\t"
"1: \n\t"
"pfd 2, 256(%%r1,%1) \n\t"
"vst %%v0, 0(%%r1,%1) \n\t"
"vst %%v0, 16(%%r1,%1) \n\t"
"vst %%v0, 32(%%r1,%1) \n\t"
"vst %%v0, 48(%%r1,%1) \n\t"
"vst %%v0, 64(%%r1,%1) \n\t"
"vst %%v0, 80(%%r1,%1) \n\t"
"vst %%v0, 96(%%r1,%1) \n\t"
"vst %%v0, 112(%%r1,%1) \n\t"
"la %%r1,128(%%r1) \n\t"
"clgrjl %%r1,%%r0,1b \n\t"
:
:"r"(n),"a"(x),"f"(da)
:"cc" , "memory" ,"r0","r1","v0"
"pfd 2, 256(%[x_ptr]) \n\t"
"vst %%v24, 0(%[x_ptr]) \n\t"
"vst %%v25, 16(%[x_ptr]) \n\t"
"vst %%v24, 32(%[x_ptr]) \n\t"
"vst %%v25, 48(%[x_ptr]) \n\t"
"vst %%v24, 64(%[x_ptr]) \n\t"
"vst %%v25, 80(%[x_ptr]) \n\t"
"vst %%v24, 96(%[x_ptr]) \n\t"
"vst %%v25, 112(%[x_ptr]) \n\t"
"vst %%v24, 128(%[x_ptr]) \n\t"
"vst %%v25, 144(%[x_ptr]) \n\t"
"vst %%v24, 160(%[x_ptr]) \n\t"
"vst %%v25, 176(%[x_ptr]) \n\t"
"vst %%v24, 192(%[x_ptr]) \n\t"
"vst %%v25, 208(%[x_ptr]) \n\t"
"vst %%v24, 224(%[x_ptr]) \n\t"
"vst %%v25, 240(%[x_ptr]) \n\t"
"la %[x_ptr],256(%[x_ptr]) \n\t"
"clgrjl %[x_ptr],%%r0,1b \n\t"
: [mem] "=m" (*(double (*)[n])x) ,[x_ptr] "+&a"(x)
: [n] "r"(n)
:"cc" , "r0", "v24" ,"v25"
);
}
#endif
int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2)
{
@ -114,11 +123,11 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS
if ( da == 0.0 )
{
BLASLONG n1 = n & -16;
BLASLONG n1 = n & -32;
if ( n1 > 0 )
{
dscal_kernel_8_zero(n1 , da , x);
dscal_kernel_32_zero(n1 , x);
j=n1;
}
@ -133,10 +142,10 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS
else
{
BLASLONG n1 = n & -16;
BLASLONG n1 = n & -32;
if ( n1 > 0 )
{
dscal_kernel_8(n1 , da , x);
dscal_kernel_32(n1 , da , x);
j=n1;
}
while(j < n)

View File

@ -29,299 +29,205 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h"
#define Z13_SWAP_C 1
#if defined(Z13_SWAP_A)
/*
 * dswap_kernel_32 (Z13_SWAP_A variant): exchange the contents of x and y
 * using vector loads and stores (vl / vst).
 *
 * NOTE(review): this span is rendered diff output. The pre-change lines
 * (numbered operands %0/%1/%2, loop counter in %r0, noinline attribute) and
 * the post-change lines (named operands %[ptr_x]/%[ptr_y]/%[n_tmp]) appear
 * interleaved; only one line of each old/new pair belongs to the real file.
 *
 * n     element count; it is shifted right by 5 (srlg ...,5) to obtain the
 *       loop count, and every iteration advances the offset by 256 bytes.
 *       Presumably n must be a positive multiple of 32 -- TODO confirm
 *       against the caller.
 * x, y  the two arrays; both are read and written by the loop.
 */
static void __attribute__ ((noinline)) dswap_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y)
static void dswap_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y)
{
__asm__ volatile(
"pfd 1, 0(%1) \n\t"
"pfd 2, 0(%2) \n\t"
"srlg %%r0,%0,5 \n\t"
"pfd 1, 0(%[ptr_x]) \n\t"
"pfd 2, 0(%[ptr_y]) \n\t"
"srlg %[n_tmp],%[n_tmp],5 \n\t"
"xgr %%r1,%%r1 \n\t"
/* r1 is the byte offset used as index register for both x and y; it starts at 0 */
".align 16 \n\t"
"1: \n\t"
"pfd 2, 256(%%r1,%1) \n\t"
"pfd 2, 256(%%r1,%2) \n\t"
"pfd 2, 256(%%r1,%[ptr_x]) \n\t"
"pfd 2, 256(%%r1,%[ptr_y]) \n\t"
"vl %%v24, 0(%%r1,%1) \n\t"
"vl %%v16, 0(%%r1,%2) \n\t"
"vst %%v24, 0(%%r1,%2) \n\t"
"vst %%v16, 0(%%r1,%1) \n\t"
"vl %%v24, 0(%%r1,%[ptr_x]) \n\t"
"vl %%v16, 0(%%r1,%[ptr_y]) \n\t"
"vst %%v24, 0(%%r1,%[ptr_y]) \n\t"
"vst %%v16, 0(%%r1,%[ptr_x]) \n\t"
"vl %%v25, 16(%%r1,%1) \n\t"
"vl %%v17, 16(%%r1,%2) \n\t"
"vst %%v25, 16(%%r1,%2) \n\t"
"vst %%v17, 16(%%r1,%1) \n\t"
"vl %%v25, 16(%%r1,%[ptr_x]) \n\t"
"vl %%v17, 16(%%r1,%[ptr_y]) \n\t"
"vst %%v25, 16(%%r1,%[ptr_y]) \n\t"
"vst %%v17, 16(%%r1,%[ptr_x]) \n\t"
"vl %%v26, 32(%%r1,%1) \n\t"
"vl %%v18, 32(%%r1,%2) \n\t"
"vst %%v26, 32(%%r1,%2) \n\t"
"vst %%v18, 32(%%r1,%1) \n\t"
"vl %%v26, 32(%%r1,%[ptr_x]) \n\t"
"vl %%v18, 32(%%r1,%[ptr_y]) \n\t"
"vst %%v26, 32(%%r1,%[ptr_y]) \n\t"
"vst %%v18, 32(%%r1,%[ptr_x]) \n\t"
"vl %%v27, 48(%%r1,%1) \n\t"
"vl %%v19, 48(%%r1,%2) \n\t"
"vst %%v27, 48(%%r1,%2) \n\t"
"vst %%v19, 48(%%r1,%1) \n\t"
"vl %%v27, 48(%%r1,%[ptr_x]) \n\t"
"vl %%v19, 48(%%r1,%[ptr_y]) \n\t"
"vst %%v27, 48(%%r1,%[ptr_y]) \n\t"
"vst %%v19, 48(%%r1,%[ptr_x]) \n\t"
"vl %%v28, 64(%%r1,%1) \n\t"
"vl %%v20, 64(%%r1,%2) \n\t"
"vst %%v28, 64(%%r1,%2) \n\t"
"vst %%v20, 64(%%r1,%1) \n\t"
"vl %%v28, 64(%%r1,%[ptr_x]) \n\t"
"vl %%v20, 64(%%r1,%[ptr_y]) \n\t"
"vst %%v28, 64(%%r1,%[ptr_y]) \n\t"
"vst %%v20, 64(%%r1,%[ptr_x]) \n\t"
"vl %%v29, 80(%%r1,%1) \n\t"
"vl %%v21, 80(%%r1,%2) \n\t"
"vst %%v29, 80(%%r1,%2) \n\t"
"vst %%v21, 80(%%r1,%1) \n\t"
"vl %%v29, 80(%%r1,%[ptr_x]) \n\t"
"vl %%v21, 80(%%r1,%[ptr_y]) \n\t"
"vst %%v29, 80(%%r1,%[ptr_y]) \n\t"
"vst %%v21, 80(%%r1,%[ptr_x]) \n\t"
"vl %%v30, 96(%%r1,%1) \n\t"
"vl %%v22, 96(%%r1,%2) \n\t"
"vst %%v30, 96(%%r1,%2) \n\t"
"vst %%v22, 96(%%r1,%1) \n\t"
"vl %%v30, 96(%%r1,%[ptr_x]) \n\t"
"vl %%v22, 96(%%r1,%[ptr_y]) \n\t"
"vst %%v30, 96(%%r1,%[ptr_y]) \n\t"
"vst %%v22, 96(%%r1,%[ptr_x]) \n\t"
"vl %%v31, 112(%%r1,%1) \n\t"
"vl %%v23, 112(%%r1,%2) \n\t"
"vst %%v31, 112(%%r1,%2) \n\t"
"vst %%v23, 112(%%r1,%1) \n\t"
"vl %%v31, 112(%%r1,%[ptr_x]) \n\t"
"vl %%v23, 112(%%r1,%[ptr_y]) \n\t"
"vst %%v31, 112(%%r1,%[ptr_y]) \n\t"
"vst %%v23, 112(%%r1,%[ptr_x]) \n\t"
"vl %%v24, 128(%%r1,%1) \n\t"
"vl %%v16, 128(%%r1,%2) \n\t"
"vst %%v24, 128(%%r1,%2) \n\t"
"vst %%v16, 128(%%r1,%1) \n\t"
"vl %%v24, 128(%%r1,%[ptr_x]) \n\t"
"vl %%v16, 128(%%r1,%[ptr_y]) \n\t"
"vst %%v24, 128(%%r1,%[ptr_y]) \n\t"
"vst %%v16, 128(%%r1,%[ptr_x]) \n\t"
"vl %%v25, 144(%%r1,%1) \n\t"
"vl %%v17, 144(%%r1,%2) \n\t"
"vst %%v25, 144(%%r1,%2) \n\t"
"vst %%v17, 144(%%r1,%1) \n\t"
"vl %%v25, 144(%%r1,%[ptr_x]) \n\t"
"vl %%v17, 144(%%r1,%[ptr_y]) \n\t"
"vst %%v25, 144(%%r1,%[ptr_y]) \n\t"
"vst %%v17, 144(%%r1,%[ptr_x]) \n\t"
"vl %%v26, 160(%%r1,%1) \n\t"
"vl %%v18, 160(%%r1,%2) \n\t"
"vst %%v26, 160(%%r1,%2) \n\t"
"vst %%v18, 160(%%r1,%1) \n\t"
"vl %%v26, 160(%%r1,%[ptr_x]) \n\t"
"vl %%v18, 160(%%r1,%[ptr_y]) \n\t"
"vst %%v26, 160(%%r1,%[ptr_y]) \n\t"
"vst %%v18, 160(%%r1,%[ptr_x]) \n\t"
"vl %%v27, 176(%%r1,%1) \n\t"
"vl %%v19, 176(%%r1,%2) \n\t"
"vst %%v27, 176(%%r1,%2) \n\t"
"vst %%v19, 176(%%r1,%1) \n\t"
"vl %%v27, 176(%%r1,%[ptr_x]) \n\t"
"vl %%v19, 176(%%r1,%[ptr_y]) \n\t"
"vst %%v27, 176(%%r1,%[ptr_y]) \n\t"
"vst %%v19, 176(%%r1,%[ptr_x]) \n\t"
"vl %%v28, 192(%%r1,%1) \n\t"
"vl %%v20, 192(%%r1,%2) \n\t"
"vst %%v28, 192(%%r1,%2) \n\t"
"vst %%v20, 192(%%r1,%1) \n\t"
"vl %%v28, 192(%%r1,%[ptr_x]) \n\t"
"vl %%v20, 192(%%r1,%[ptr_y]) \n\t"
"vst %%v28, 192(%%r1,%[ptr_y]) \n\t"
"vst %%v20, 192(%%r1,%[ptr_x]) \n\t"
"vl %%v29, 208(%%r1,%1) \n\t"
"vl %%v21, 208(%%r1,%2) \n\t"
"vst %%v29, 208(%%r1,%2) \n\t"
"vst %%v21, 208(%%r1,%1) \n\t"
"vl %%v29, 208(%%r1,%[ptr_x]) \n\t"
"vl %%v21, 208(%%r1,%[ptr_y]) \n\t"
"vst %%v29, 208(%%r1,%[ptr_y]) \n\t"
"vst %%v21, 208(%%r1,%[ptr_x]) \n\t"
"vl %%v30, 224(%%r1,%1) \n\t"
"vl %%v22, 224(%%r1,%2) \n\t"
"vst %%v30, 224(%%r1,%2) \n\t"
"vst %%v22, 224(%%r1,%1) \n\t"
"vl %%v30, 224(%%r1,%[ptr_x]) \n\t"
"vl %%v22, 224(%%r1,%[ptr_y]) \n\t"
"vst %%v30, 224(%%r1,%[ptr_y]) \n\t"
"vst %%v22, 224(%%r1,%[ptr_x]) \n\t"
"vl %%v31, 240(%%r1,%1) \n\t"
"vl %%v23, 240(%%r1,%2) \n\t"
"vst %%v31, 240(%%r1,%2) \n\t"
"vst %%v23, 240(%%r1,%1) \n\t"
"vl %%v31, 240(%%r1,%[ptr_x]) \n\t"
"vl %%v23, 240(%%r1,%[ptr_y]) \n\t"
"vst %%v31, 240(%%r1,%[ptr_y]) \n\t"
"vst %%v23, 240(%%r1,%[ptr_x]) \n\t"
/* step the shared offset past the 256 bytes just swapped, then branch on count back to label 1 */
"la %%r1,256(%%r1) \n\t"
"brctg %%r0,1b"
:
: "r"(n), "a"(x), "a"(y)
: "cc", "memory" ,"r0","r1", "v16","v17","v18","v19","v20","v21","v22","v23"
"brctg %[n_tmp],1b"
/* post-change operands: x and y are declared as in/out memory ("+m"), and n is an in/out register because srlg/brctg modify it */
: [mem_x] "+m" (*(double (*)[n])x),
[mem_y] "+m" (*(double (*)[n])y),
[n_tmp] "+&r"(n)
: [ptr_x] "a"(x), [ptr_y] "a"(y)
: "cc", "r1", "v16","v17","v18","v19","v20","v21","v22","v23"
,"v24","v25","v26","v27","v28","v29","v30","v31"
);
return;
}
#elif defined(Z13_SWAP_B)
#else
static void __attribute__ ((noinline)) dswap_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y)
{
__asm__ volatile(
"pfd 2, 0(%1) \n\t"
"pfd 2, 0(%2) \n\t"
"srlg %%r0,%0,5 \n\t"
"pfd 2, 0(%[ptr_x]) \n\t"
"pfd 2, 0(%[ptr_y]) \n\t"
"srlg %[n_tmp],%[n_tmp],5 \n\t"
"xgr %%r1,%%r1 \n\t"
".align 16 \n\t"
"1: \n\t"
"pfd 2, 256(%%r1,%1) \n\t"
"pfd 2, 256(%%r1,%2) \n\t"
"vl %%v24, 0(%%r1,%1) \n\t"
"vl %%v25, 16(%%r1,%1) \n\t"
"vl %%v26, 32(%%r1,%1) \n\t"
"vl %%v27, 48(%%r1,%1) \n\t"
"vl %%v28, 64(%%r1,%1) \n\t"
"vl %%v29, 80(%%r1,%1) \n\t"
"vl %%v30, 96(%%r1,%1) \n\t"
"vl %%v31, 112(%%r1,%1) \n\t"
"vl %%v16, 0(%%r1,%2) \n\t"
"vl %%v17, 16(%%r1,%2) \n\t"
"vl %%v18, 32(%%r1,%2) \n\t"
"vl %%v19, 48(%%r1,%2) \n\t"
"vl %%v20, 64(%%r1,%2) \n\t"
"vl %%v21, 80(%%r1,%2) \n\t"
"vl %%v22, 96(%%r1,%2) \n\t"
"vl %%v23, 112(%%r1,%2) \n\t"
"pfd 2, 256(%%r1,%[ptr_x]) \n\t"
"pfd 2, 256(%%r1,%[ptr_y]) \n\t"
"vst %%v24, 0(%%r1,%2) \n\t"
"vst %%v25, 16(%%r1,%2) \n\t"
"vst %%v26, 32(%%r1,%2) \n\t"
"vst %%v27, 48(%%r1,%2) \n\t"
"vst %%v28, 64(%%r1,%2) \n\t"
"vst %%v29, 80(%%r1,%2) \n\t"
"vst %%v30, 96(%%r1,%2) \n\t"
"vst %%v31, 112(%%r1,%2)\n\t"
"vst %%v16, 0(%%r1,%1) \n\t"
"vst %%v17, 16(%%r1,%1) \n\t"
"vst %%v18, 32(%%r1,%1) \n\t"
"vst %%v19, 48(%%r1,%1) \n\t"
"vst %%v20, 64(%%r1,%1) \n\t"
"vst %%v21, 80(%%r1,%1) \n\t"
"vst %%v22, 96(%%r1,%1) \n\t"
"vst %%v23, 112(%%r1,%1)\n\t"
"vl %%v16, 0(%%r1,%[ptr_x]) \n\t"
"vl %%v17, 16(%%r1,%[ptr_x]) \n\t"
"vl %%v18, 32(%%r1,%[ptr_x]) \n\t"
"vl %%v19, 48(%%r1,%[ptr_x]) \n\t"
"vl %%v20, 64(%%r1,%[ptr_x]) \n\t"
"vl %%v21, 80(%%r1,%[ptr_x]) \n\t"
"vl %%v22, 96(%%r1,%[ptr_x]) \n\t"
"vl %%v23, 112(%%r1,%[ptr_x]) \n\t"
"vl %%v24, 128(%%r1,%[ptr_x]) \n\t"
"vl %%v25, 144(%%r1,%[ptr_x]) \n\t"
"vl %%v26, 160(%%r1,%[ptr_x]) \n\t"
"vl %%v27, 176(%%r1,%[ptr_x]) \n\t"
"vl %%v28, 192(%%r1,%[ptr_x]) \n\t"
"vl %%v29, 208(%%r1,%[ptr_x]) \n\t"
"vl %%v30, 224(%%r1,%[ptr_x]) \n\t"
"vl %%v31, 240(%%r1,%[ptr_x]) \n\t"
"vl %%v0, 0(%%r1,%[ptr_y]) \n\t"
"vl %%v1, 16(%%r1,%[ptr_y]) \n\t"
"vl %%v2, 32(%%r1,%[ptr_y]) \n\t"
"vl %%v3, 48(%%r1,%[ptr_y]) \n\t"
"vl %%v4, 64(%%r1,%[ptr_y]) \n\t"
"vl %%v5, 80(%%r1,%[ptr_y]) \n\t"
"vl %%v6, 96(%%r1,%[ptr_y]) \n\t"
"vl %%v7, 112(%%r1,%[ptr_y]) \n\t"
"vst %%v0, 0(%%r1,%[ptr_x]) \n\t"
"vst %%v1, 16(%%r1,%[ptr_x]) \n\t"
"vst %%v2, 32(%%r1,%[ptr_x]) \n\t"
"vst %%v3, 48(%%r1,%[ptr_x]) \n\t"
"vst %%v4, 64(%%r1,%[ptr_x]) \n\t"
"vst %%v5, 80(%%r1,%[ptr_x]) \n\t"
"vst %%v6, 96(%%r1,%[ptr_x]) \n\t"
"vst %%v7, 112(%%r1,%[ptr_x]) \n\t"
"vl %%v24, 128(%%r1,%1) \n\t"
"vl %%v25, 144(%%r1,%1) \n\t"
"vl %%v26, 160(%%r1,%1) \n\t"
"vl %%v27, 176(%%r1,%1) \n\t"
"vl %%v28, 192(%%r1,%1) \n\t"
"vl %%v29, 208(%%r1,%1) \n\t"
"vl %%v30, 224(%%r1,%1) \n\t"
"vl %%v31, 240(%%r1,%1) \n\t"
"vl %%v0, 128(%%r1,%[ptr_y]) \n\t"
"vl %%v1, 144(%%r1,%[ptr_y]) \n\t"
"vl %%v2, 160(%%r1,%[ptr_y]) \n\t"
"vl %%v3, 176(%%r1,%[ptr_y]) \n\t"
"vl %%v4, 192(%%r1,%[ptr_y]) \n\t"
"vl %%v5, 208(%%r1,%[ptr_y]) \n\t"
"vl %%v6, 224(%%r1,%[ptr_y]) \n\t"
"vl %%v7, 240(%%r1,%[ptr_y]) \n\t"
"vst %%v0, 128(%%r1,%[ptr_x]) \n\t"
"vst %%v1, 144(%%r1,%[ptr_x]) \n\t"
"vst %%v2, 160(%%r1,%[ptr_x]) \n\t"
"vst %%v3, 176(%%r1,%[ptr_x]) \n\t"
"vst %%v4, 192(%%r1,%[ptr_x]) \n\t"
"vst %%v5, 208(%%r1,%[ptr_x]) \n\t"
"vst %%v6, 224(%%r1,%[ptr_x]) \n\t"
"vst %%v7, 240(%%r1,%[ptr_x]) \n\t"
"vl %%v16, 128(%%r1,%2) \n\t"
"vl %%v17, 144(%%r1,%2) \n\t"
"vl %%v18, 160(%%r1,%2) \n\t"
"vl %%v19, 176(%%r1,%2) \n\t"
"vl %%v20, 192(%%r1,%2) \n\t"
"vl %%v21, 208(%%r1,%2) \n\t"
"vl %%v22, 224(%%r1,%2) \n\t"
"vl %%v23, 240(%%r1,%2) \n\t"
"vst %%v24, 128(%%r1,%2) \n\t"
"vst %%v25, 144(%%r1,%2) \n\t"
"vst %%v26, 160(%%r1,%2) \n\t"
"vst %%v27, 176(%%r1,%2) \n\t"
"vst %%v28, 192(%%r1,%2) \n\t"
"vst %%v29, 208(%%r1,%2) \n\t"
"vst %%v30, 224(%%r1,%2) \n\t"
"vst %%v31, 240(%%r1,%2) \n\t"
"vst %%v16, 128(%%r1,%1) \n\t"
"vst %%v17, 144(%%r1,%1) \n\t"
"vst %%v18, 160(%%r1,%1) \n\t"
"vst %%v19, 176(%%r1,%1) \n\t"
"vst %%v20, 192(%%r1,%1) \n\t"
"vst %%v21, 208(%%r1,%1) \n\t"
"vst %%v22, 224(%%r1,%1) \n\t"
"vst %%v23, 240(%%r1,%1) \n\t"
"vst %%v16, 0(%%r1,%[ptr_y]) \n\t"
"vst %%v17, 16(%%r1,%[ptr_y]) \n\t"
"vst %%v18, 32(%%r1,%[ptr_y]) \n\t"
"vst %%v19, 48(%%r1,%[ptr_y]) \n\t"
"vst %%v20, 64(%%r1,%[ptr_y]) \n\t"
"vst %%v21, 80(%%r1,%[ptr_y]) \n\t"
"vst %%v22, 96(%%r1,%[ptr_y]) \n\t"
"vst %%v23, 112(%%r1,%[ptr_y]) \n\t"
"vst %%v24, 128(%%r1,%[ptr_y]) \n\t"
"vst %%v25, 144(%%r1,%[ptr_y]) \n\t"
"vst %%v26, 160(%%r1,%[ptr_y]) \n\t"
"vst %%v27, 176(%%r1,%[ptr_y]) \n\t"
"vst %%v28, 192(%%r1,%[ptr_y]) \n\t"
"vst %%v29, 208(%%r1,%[ptr_y]) \n\t"
"vst %%v30, 224(%%r1,%[ptr_y]) \n\t"
"vst %%v31, 240(%%r1,%[ptr_y]) \n\t"
"la %%r1,256(%%r1) \n\t"
"brctg %%r0,1b"
:
: "r"(n), "a"(x), "a"(y)
: "cc", "memory","r0","r1", "v16",
"v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
);
return;
}
#elif defined(Z13_SWAP_C)
static void __attribute__ ((noinline)) dswap_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y)
{
__asm__ volatile(
"pfd 2, 0(%1) \n\t"
"pfd 2, 0(%2) \n\t"
"srlg %%r0,%0,5 \n\t"
"xgr %%r1,%%r1 \n\t"
".align 16 \n\t"
"1: \n\t"
"pfd 2, 256(%%r1,%1) \n\t"
"pfd 2, 256(%%r1,%2) \n\t"
"vl %%v16, 0(%%r1,%1) \n\t"
"vl %%v17, 16(%%r1,%1) \n\t"
"vl %%v18, 32(%%r1,%1) \n\t"
"vl %%v19, 48(%%r1,%1) \n\t"
"vl %%v20, 64(%%r1,%1) \n\t"
"vl %%v21, 80(%%r1,%1) \n\t"
"vl %%v22, 96(%%r1,%1) \n\t"
"vl %%v23, 112(%%r1,%1) \n\t"
"vl %%v24, 128(%%r1,%1) \n\t"
"vl %%v25, 144(%%r1,%1) \n\t"
"vl %%v26, 160(%%r1,%1) \n\t"
"vl %%v27, 176(%%r1,%1) \n\t"
"vl %%v28, 192(%%r1,%1) \n\t"
"vl %%v29, 208(%%r1,%1) \n\t"
"vl %%v30, 224(%%r1,%1) \n\t"
"vl %%v31, 240(%%r1,%1) \n\t"
"vl %%v0, 0(%%r1,%2) \n\t"
"vl %%v1, 16(%%r1,%2) \n\t"
"vl %%v2, 32(%%r1,%2) \n\t"
"vl %%v3, 48(%%r1,%2) \n\t"
"vl %%v4, 64(%%r1,%2) \n\t"
"vl %%v5, 80(%%r1,%2) \n\t"
"vl %%v6, 96(%%r1,%2) \n\t"
"vl %%v7, 112(%%r1,%2) \n\t"
"vst %%v0, 0(%%r1,%1) \n\t"
"vst %%v1, 16(%%r1,%1) \n\t"
"vst %%v2, 32(%%r1,%1) \n\t"
"vst %%v3, 48(%%r1,%1) \n\t"
"vst %%v4, 64(%%r1,%1) \n\t"
"vst %%v5, 80(%%r1,%1) \n\t"
"vst %%v6, 96(%%r1,%1) \n\t"
"vst %%v7, 112(%%r1,%1) \n\t"
"vl %%v0, 128(%%r1,%2) \n\t"
"vl %%v1, 144(%%r1,%2) \n\t"
"vl %%v2, 160(%%r1,%2) \n\t"
"vl %%v3, 176(%%r1,%2) \n\t"
"vl %%v4, 192(%%r1,%2) \n\t"
"vl %%v5, 208(%%r1,%2) \n\t"
"vl %%v6, 224(%%r1,%2) \n\t"
"vl %%v7, 240(%%r1,%2) \n\t"
"vst %%v0, 128(%%r1,%1) \n\t"
"vst %%v1, 144(%%r1,%1) \n\t"
"vst %%v2, 160(%%r1,%1) \n\t"
"vst %%v3, 176(%%r1,%1) \n\t"
"vst %%v4, 192(%%r1,%1) \n\t"
"vst %%v5, 208(%%r1,%1) \n\t"
"vst %%v6, 224(%%r1,%1) \n\t"
"vst %%v7, 240(%%r1,%1) \n\t"
"vst %%v16, 0(%%r1,%2) \n\t"
"vst %%v17, 16(%%r1,%2) \n\t"
"vst %%v18, 32(%%r1,%2) \n\t"
"vst %%v19, 48(%%r1,%2) \n\t"
"vst %%v20, 64(%%r1,%2) \n\t"
"vst %%v21, 80(%%r1,%2) \n\t"
"vst %%v22, 96(%%r1,%2) \n\t"
"vst %%v23, 112(%%r1,%2) \n\t"
"vst %%v24, 128(%%r1,%2) \n\t"
"vst %%v25, 144(%%r1,%2) \n\t"
"vst %%v26, 160(%%r1,%2) \n\t"
"vst %%v27, 176(%%r1,%2) \n\t"
"vst %%v28, 192(%%r1,%2) \n\t"
"vst %%v29, 208(%%r1,%2) \n\t"
"vst %%v30, 224(%%r1,%2) \n\t"
"vst %%v31, 240(%%r1,%2) \n\t"
"la %%r1,256(%%r1) \n\t"
"brctg %%r0,1b"
:
: "r"(n), "a"(x), "a"(y)
: "cc", "memory","r0","r1", "v0","v1","v2","v3","v4","v5","v6","v7","v16",
"brctg %[n_tmp],1b"
: [mem_x] "+m" (*(double (*)[n])x),
[mem_y] "+m" (*(double (*)[n])y),
[n_tmp] "+&r"(n)
: [ptr_x] "a"(x), [ptr_y] "a"(y)
: "cc", "memory","r1", "v0","v1","v2","v3","v4","v5","v6","v7","v16",
"v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
);
return;

View File

@ -43,15 +43,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
* Warning: requirements n>0 and n % 32 == 0
* @param n
* @param x pointer to the vector
* @param minf (out) maximum absolute value .( only for output )
* @param maxf (out) maximum absolute value (output only)
* @return index
*/
static BLASLONG diamax_kernel_32_TUNED(BLASLONG n, FLOAT *x, FLOAT *maxf) {
BLASLONG index;
__asm__(
"pfd 1, 0(%4) \n\t"
"sllg %%r0,%3,3 \n\t"
"agr %%r0,%4 \n\t"
"pfd 1, 0(%[ptr_x]) \n\t"
"sllg %%r0,%[n],3 \n\t"
"agr %%r0,%[ptr_x] \n\t"
"vleig %%v20,0,0 \n\t"
"vleig %%v20,1,1 \n\t"
"vleig %%v21,2,0 \n\t"
@ -66,8 +66,8 @@ static BLASLONG diamax_kernel_32_TUNED(BLASLONG n, FLOAT *x, FLOAT *maxf) {
"vzero %%v19 \n\t"
".align 16 \n\t"
"1: \n\t"
"pfd 1, 256(%2 ) \n\t"
"vlm %%v24,%%v31, 0(%2 ) \n\t"
"pfd 1, 256(%[ptr_tmp] ) \n\t"
"vlm %%v24,%%v31, 0(%[ptr_tmp] ) \n\t"
"vflpdb %%v24, %%v24 \n\t"
"vflpdb %%v25, %%v25 \n\t"
"vflpdb %%v26, %%v26 \n\t"
@ -106,7 +106,7 @@ static BLASLONG diamax_kernel_32_TUNED(BLASLONG n, FLOAT *x, FLOAT *maxf) {
"vsel %%v19,%%v28,%%v19,%%v17 \n\t"
"vsel %%v18,%%v29,%%v18,%%v17 \n\t"
"vag %%v5,%%v5,%%v4 \n\t"
"vlm %%v24,%%v31,128(%2 ) \n\t"
"vlm %%v24,%%v31,128(%[ptr_tmp] ) \n\t"
"vflpdb %%v24, %%v24 \n\t"
"vflpdb %%v25, %%v25 \n\t"
"vflpdb %%v26, %%v26 \n\t"
@ -136,7 +136,7 @@ static BLASLONG diamax_kernel_32_TUNED(BLASLONG n, FLOAT *x, FLOAT *maxf) {
"vsel %%v25,%%v27,%%v25,%%v29 \n\t"
"vag %%v1,%%v1,%%v5 \n\t"
"vag %%v24,%%v24,%%v5 \n\t"
"la %2,256(%2) \n\t"
"la %[ptr_tmp],256(%[ptr_tmp]) \n\t"
"vag %%v24,%%v24,%%v4 \n\t"
"vfchdb %%v16,%%v25 , %%v0 \n\t"
"vag %%v5,%%v5,%%v4 \n\t"
@ -146,25 +146,26 @@ static BLASLONG diamax_kernel_32_TUNED(BLASLONG n, FLOAT *x, FLOAT *maxf) {
"vsel %%v19,%%v28,%%v19,%%v17 \n\t"
"vsel %%v18,%%v29,%%v18,%%v17 \n\t"
"vag %%v5,%%v5,%%v4 \n\t"
"clgrjl %2,%%r0,1b \n\t"
"clgrjl %[ptr_tmp],%%r0,1b \n\t"
"vrepg %%v26,%%v18,1 \n\t"
"vrepg %%v5,%%v19,1 \n\t"
"wfcdb %%v26,%%v18 \n\t"
"jne 2f \n\t"
"vsteg %%v18,%1,0 \n\t"
"vsteg %%v18,%[maxf],0 \n\t"
"vmnlg %%v1,%%v5,%%v19 \n\t"
"vlgvg %0,%%v1,0 \n\t"
"j 3f \n\t"
"2: \n\t"
"wfchdb %%v16,%%v26,%%v18 \n\t"
"vsel %%v1,%%v5,%%v19,%%v16 \n\t"
"vsel %%v0,%%v26,%%v18,%%v16 \n\t"
"vlgvg %0,%%v1,0 \n\t"
"std %%f0,%1 \n\t"
"3: "
: "=r"(index) ,"=m"(*maxf) , "+&a"(x)
: "r"(n), "2"(x)
"std %%f0,%[maxf] \n\t"
"3: \n\t"
"vlgvg %[index],%%v1,0 \n\t"
: [index] "+r"(index) ,[maxf] "=m"(*maxf), [ptr_tmp] "+&a"(x)
: [mem] "m"( *(const double (*)[n])x), [n] "r"(n), [ptr_x] "r"(x)
: "cc", "r0", "f0","v0","v1","v2","v3","v4","v5","v6","v7","v16",
"v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
);

View File

@ -48,9 +48,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
static BLASLONG diamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *minf) {
BLASLONG index;
__asm__(
"pfd 1, 0(%4) \n\t"
"sllg %%r0,%3,3 \n\t"
"agr %%r0,%4 \n\t"
"pfd 1, 0(%[ptr_x]) \n\t"
"sllg %%r0,%[n],3 \n\t"
"agr %%r0,%[ptr_x] \n\t"
"vleig %%v20,0,0 \n\t"
"vleig %%v20,1,1 \n\t"
"vleig %%v21,2,0 \n\t"
@ -60,14 +60,14 @@ static BLASLONG diamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *minf) {
"vleig %%v23,6,0 \n\t"
"vleig %%v23,7,1 \n\t"
"vrepig %%v4,8 \n\t"
"vlrepg %%v18,0(%4) \n\t"
"vlrepg %%v18,0(%[ptr_x]) \n\t"
"vzero %%v5 \n\t"
"vflpdb %%v18, %%v18 \n\t"
"vzero %%v19 \n\t"
".align 16 \n\t"
"1: \n\t"
"pfd 1, 256(%2 ) \n\t"
"vlm %%v24,%%v31, 0(%2 ) \n\t"
"pfd 1, 256(%[ptr_tmp] ) \n\t"
"vlm %%v24,%%v31, 0(%[ptr_tmp] ) \n\t"
"vflpdb %%v24, %%v24 \n\t"
"vflpdb %%v25, %%v25 \n\t"
@ -114,7 +114,7 @@ static BLASLONG diamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *minf) {
"vag %%v5,%%v5,%%v4 \n\t"
"vlm %%v24,%%v31,128(%2 ) \n\t"
"vlm %%v24,%%v31,128(%[ptr_tmp] ) \n\t"
"vflpdb %%v24, %%v24 \n\t"
"vflpdb %%v25, %%v25 \n\t"
"vflpdb %%v26, %%v26 \n\t"
@ -147,7 +147,7 @@ static BLASLONG diamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *minf) {
"vag %%v1,%%v1,%%v5 \n\t"
"vag %%v24,%%v24,%%v5 \n\t"
"la %2,256(%2) \n\t"
"la %[ptr_tmp],256(%[ptr_tmp]) \n\t"
"vag %%v24,%%v24,%%v4 \n\t"
"vfchdb %%v16, %%v0,%%v25 \n\t"
@ -161,27 +161,28 @@ static BLASLONG diamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *minf) {
"vag %%v5,%%v5,%%v4 \n\t"
"clgrjl %2,%%r0,1b \n\t"
"clgrjl %[ptr_tmp],%%r0,1b \n\t"
"vrepg %%v26,%%v18,1 \n\t"
"vrepg %%v5,%%v19,1 \n\t"
"wfcdb %%v26,%%v18 \n\t"
"jne 2f \n\t"
"vsteg %%v18,%1,0 \n\t"
"vsteg %%v18,%[minf],0 \n\t"
"vmnlg %%v1,%%v5,%%v19 \n\t"
"vlgvg %0,%%v1,0 \n\t"
"j 3f \n\t"
"2: \n\t"
"wfchdb %%v16,%%v18 ,%%v26 \n\t "
"vsel %%v1,%%v5,%%v19,%%v16 \n\t"
"vsel %%v0,%%v26,%%v18,%%v16 \n\t"
"vlgvg %0,%%v1,0 \n\t"
"std %%f0,%1 \n\t"
"3:"
"std %%f0,%[minf] \n\t"
: "+r"(index) ,"=m"(*minf),"+&a"(x)
: "r"(n), "2"(x)
"3: \n\t"
"vlgvg %[index],%%v1,0 \n\t"
: [index] "+r"(index) ,[minf] "=m"(*minf), [ptr_tmp] "+&a"(x)
: [mem] "m"( *(const double (*)[n])x), [n] "r"(n), [ptr_x] "r"(x)
: "cc","r0", "f0","v0","v1","v2","v3","v4","v5","v6","v7","v16",
"v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"

View File

@ -37,16 +37,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
/**
* Find maximum index
* Warning: requirements n>0 and n % 8 == 0
* Warning: requirements n>0 and n % 16 == 0
* @param n
* @param x pointer to the vector
* @param minf (out) maximum absolute value .( only for output )
* @param maxf (out) maximum absolute value (output only)
* @return index
*/
static BLASLONG ziamax_kernel_8_TUNED(BLASLONG n, FLOAT *x, FLOAT *maxf) {
static BLASLONG ziamax_kernel_16_TUNED(BLASLONG n, FLOAT *x, FLOAT *maxf) {
BLASLONG index;
__asm__(
"pfd 1, 0(%4) \n\t"
"pfd 1, 0(%[ptr_x]) \n\t"
"vleig %%v16,0,0 \n\t"
"vleig %%v16,1,1 \n\t"
"vleig %%v17,2,0 \n\t"
@ -65,32 +65,32 @@ static BLASLONG ziamax_kernel_8_TUNED(BLASLONG n, FLOAT *x, FLOAT *maxf) {
"vleig %%v23,15,1 \n\t"
"sllg %%r0,%3,4 \n\t"
"agr %%r0,%4 \n\t"
"sllg %%r0,%[n],4 \n\t"
"agr %%r0,%[ptr_x] \n\t"
"vzero %%v6 \n\t"
"vzero %%v7 \n\t"
"vrepig %%v4,16 \n\t"
"vzero %%v5 \n\t"
".align 16 \n\t"
"1: \n\t"
"pfd 1, 256(%2 ) \n\t"
"pfd 1, 256(%[ptr_tmp] ) \n\t"
"vleg %%v24 , 0( %2),0 \n\t"
"vleg %%v25 , 8( %2),0 \n\t"
"vleg %%v24 , 16( %2),1 \n\t"
"vleg %%v25 , 24( %2),1 \n\t"
"vleg %%v26 , 32( %2),0 \n\t"
"vleg %%v27 , 40( %2),0 \n\t"
"vleg %%v26 , 48( %2),1 \n\t"
"vleg %%v27 , 56( %2),1 \n\t"
"vleg %%v28 , 64( %2),0 \n\t"
"vleg %%v29 , 72( %2),0 \n\t"
"vleg %%v28 , 80( %2),1 \n\t"
"vleg %%v29 , 88( %2),1 \n\t"
"vleg %%v30 , 96( %2),0 \n\t"
"vleg %%v31 ,104( %2),0 \n\t"
"vleg %%v30 ,112( %2),1 \n\t"
"vleg %%v31 ,120( %2),1 \n\t"
"vleg %%v24 , 0(%[ptr_tmp]),0 \n\t"
"vleg %%v25 , 8(%[ptr_tmp]),0 \n\t"
"vleg %%v24 , 16(%[ptr_tmp]),1 \n\t"
"vleg %%v25 , 24(%[ptr_tmp]),1 \n\t"
"vleg %%v26 , 32(%[ptr_tmp]),0 \n\t"
"vleg %%v27 , 40(%[ptr_tmp]),0 \n\t"
"vleg %%v26 , 48(%[ptr_tmp]),1 \n\t"
"vleg %%v27 , 56(%[ptr_tmp]),1 \n\t"
"vleg %%v28 , 64(%[ptr_tmp]),0 \n\t"
"vleg %%v29 , 72(%[ptr_tmp]),0 \n\t"
"vleg %%v28 , 80(%[ptr_tmp]),1 \n\t"
"vleg %%v29 , 88(%[ptr_tmp]),1 \n\t"
"vleg %%v30 , 96(%[ptr_tmp]),0 \n\t"
"vleg %%v31 ,104(%[ptr_tmp]),0 \n\t"
"vleg %%v30 ,112(%[ptr_tmp]),1 \n\t"
"vleg %%v31 ,120(%[ptr_tmp]),1 \n\t"
"vflpdb %%v24, %%v24 \n\t"
"vflpdb %%v25, %%v25 \n\t"
"vflpdb %%v26, %%v26 \n\t"
@ -106,22 +106,22 @@ static BLASLONG ziamax_kernel_8_TUNED(BLASLONG n, FLOAT *x, FLOAT *maxf) {
"vfadb %%v3,%%v30,%%v31 \n\t"
"vleg %%v24 , 128( %2),0 \n\t"
"vleg %%v25 , 136( %2),0 \n\t"
"vleg %%v24 , 144( %2),1 \n\t"
"vleg %%v25 , 152( %2),1 \n\t"
"vleg %%v26 , 160( %2),0 \n\t"
"vleg %%v27 , 168( %2),0 \n\t"
"vleg %%v26 , 176( %2),1 \n\t"
"vleg %%v27 , 184( %2),1 \n\t"
"vleg %%v28 , 192( %2),0 \n\t"
"vleg %%v29 , 200( %2),0 \n\t"
"vleg %%v28 , 208( %2),1 \n\t"
"vleg %%v29 , 216( %2),1 \n\t"
"vleg %%v30 , 224( %2),0 \n\t"
"vleg %%v31 , 232( %2),0 \n\t"
"vleg %%v30 , 240( %2),1 \n\t"
"vleg %%v31 , 248( %2),1 \n\t"
"vleg %%v24 , 128(%[ptr_tmp]),0 \n\t"
"vleg %%v25 , 136(%[ptr_tmp]),0 \n\t"
"vleg %%v24 , 144(%[ptr_tmp]),1 \n\t"
"vleg %%v25 , 152(%[ptr_tmp]),1 \n\t"
"vleg %%v26 , 160(%[ptr_tmp]),0 \n\t"
"vleg %%v27 , 168(%[ptr_tmp]),0 \n\t"
"vleg %%v26 , 176(%[ptr_tmp]),1 \n\t"
"vleg %%v27 , 184(%[ptr_tmp]),1 \n\t"
"vleg %%v28 , 192(%[ptr_tmp]),0 \n\t"
"vleg %%v29 , 200(%[ptr_tmp]),0 \n\t"
"vleg %%v28 , 208(%[ptr_tmp]),1 \n\t"
"vleg %%v29 , 216(%[ptr_tmp]),1 \n\t"
"vleg %%v30 , 224(%[ptr_tmp]),0 \n\t"
"vleg %%v31 , 232(%[ptr_tmp]),0 \n\t"
"vleg %%v30 , 240(%[ptr_tmp]),1 \n\t"
"vleg %%v31 , 248(%[ptr_tmp]),1 \n\t"
"vflpdb %%v24, %%v24 \n\t"
"vflpdb %%v25, %%v25 \n\t"
"vflpdb %%v26, %%v26 \n\t"
@ -160,7 +160,7 @@ static BLASLONG ziamax_kernel_8_TUNED(BLASLONG n, FLOAT *x, FLOAT *maxf) {
"vsel %%v29,%%v25,%%v2,%%v30 \n\t"
"vsel %%v31,%%v27,%%v3 ,%%v30 \n\t"
"la %2,256(%2) \n\t"
"la %[ptr_tmp],256(%[ptr_tmp]) \n\t"
"vfchdb %%v0, %%v31,%%v28 \n\t"
"vsel %%v25,%%v29,%%v26,%%v0 \n\t"
@ -175,26 +175,26 @@ static BLASLONG ziamax_kernel_8_TUNED(BLASLONG n, FLOAT *x, FLOAT *maxf) {
"vag %%v5,%%v5,%%v4 \n\t"
"clgrjl %2,%%r0,1b \n\t"
"clgrjl %[ptr_tmp],%%r0,1b \n\t"
// extract index
"vrepg %%v26,%%v6,1 \n\t"
"vrepg %%v5,%%v7,1 \n\t"
"wfcdb %%v26,%%v6 \n\t"
"jne 2f \n\t"
"vsteg %%v6,%1,0 \n\t"
"vsteg %%v6,%[maxf],0 \n\t"
"vmnlg %%v1,%%v5,%%v7 \n\t"
"vlgvg %0,%%v1,0 \n\t"
"vlgvg %[index],%%v1,0 \n\t"
"j 3 \n\t"
"2: \n\t"
"wfchdb %%v16,%%v26,%%v6 \n\t"
"vsel %%v1,%%v5,%%v7,%%v16 \n\t"
"vsel %%v0,%%v26,%%v6,%%v16 \n\t"
"vlgvg %0,%%v1,0 \n\t"
"std %%f0,%1 \n\t"
"vlgvg %[index],%%v1,0 \n\t"
"std %%f0,%[maxf] \n\t"
"3: \n\t"
: "=r"(index),"=m"(*maxf),"+&a"(x)
: "r"(n), "2"(x)
: [index] "+r"(index) ,[maxf] "=m"(*maxf), [ptr_tmp] "+&a"(x)
: [mem] "m"( *(const double (*)[2*n])x), [n] "r"(n), [ptr_x] "r"(x)
: "cc","r0", "f0","v0","v1","v2","v3","v4","v5","v6","v7","v16",
"v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
@ -220,12 +220,12 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
if (inc_x == 1) {
BLASLONG n1 = n & -8;
BLASLONG n1 = n & -16;
if (n1 > 0) {
max = ziamax_kernel_8_TUNED(n1, x, &maxf);
max = ziamax_kernel_16_TUNED(n1, x, &maxf);
i = n1;
ix = n1 << 1;
}
while(i < n)

View File

@ -35,16 +35,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
/**
* Find minimum index
* Warning: requirements n>0 and n % 8 == 0
* Warning: requirements n>0 and n % 16 == 0
* @param n
* @param x pointer to the vector
* @param minf (out) minimum absolute value (output only)
* @return minimum index
*/
static BLASLONG ziamin_kernel_8_TUNED(BLASLONG n, FLOAT *x, FLOAT *minf) {
static BLASLONG ziamin_kernel_16_TUNED(BLASLONG n, FLOAT *x, FLOAT *minf) {
BLASLONG index ;
__asm__(
"pfd 1, 0(%4) \n\t"
"pfd 1, 0(%[ptr_x]) \n\t"
"vleig %%v16,0,0 \n\t"
"vleig %%v16,1,1 \n\t"
"vleig %%v17,2,0 \n\t"
@ -61,37 +61,37 @@ static BLASLONG ziamin_kernel_8_TUNED(BLASLONG n, FLOAT *x, FLOAT *minf) {
"vleig %%v22,13,1 \n\t"
"vleig %%v23,14,0 \n\t"
"vleig %%v23,15,1 \n\t"
"ld %%f6,0(%4) \n\t"
"ld %%f6,0(%[ptr_x]) \n\t"
"lpdbr %%f6,%%f6 \n\t"
"ld %%f7,8(%4) \n\t"
"ld %%f7,8(%[ptr_x]) \n\t"
"lpdbr %%f7,%%f7 \n\t"
"adbr %%f6,%%f7 \n\t"
"sllg %%r0,%3,4 \n\t"
"agr %%r0,%4 \n\t"
"sllg %%r0,%[n],4 \n\t"
"agr %%r0,%[ptr_x] \n\t"
"vrepg %%v6,%%v6,0 \n\t"
"vzero %%v7 \n\t"
"vrepig %%v4,16 \n\t"
"vzero %%v5 \n\t"
".align 16 \n\t"
"1: \n\t"
"pfd 1, 256(%2 ) \n\t"
"pfd 1, 256(%[ptr_tmp] ) \n\t"
"vleg %%v24 , 0( %2),0 \n\t"
"vleg %%v25 , 8( %2),0 \n\t"
"vleg %%v24 , 16( %2),1 \n\t"
"vleg %%v25 , 24( %2),1 \n\t"
"vleg %%v26 , 32( %2),0 \n\t"
"vleg %%v27 , 40( %2),0 \n\t"
"vleg %%v26 , 48( %2),1 \n\t"
"vleg %%v27 , 56( %2),1 \n\t"
"vleg %%v28 , 64( %2),0 \n\t"
"vleg %%v29 , 72( %2),0 \n\t"
"vleg %%v28 , 80( %2),1 \n\t"
"vleg %%v29 , 88( %2),1 \n\t"
"vleg %%v30 , 96( %2),0 \n\t"
"vleg %%v31 ,104( %2),0 \n\t"
"vleg %%v30 ,112( %2),1 \n\t"
"vleg %%v31 ,120( %2),1 \n\t"
"vleg %%v24 , 0(%[ptr_tmp]),0 \n\t"
"vleg %%v25 , 8(%[ptr_tmp]),0 \n\t"
"vleg %%v24 , 16(%[ptr_tmp]),1 \n\t"
"vleg %%v25 , 24(%[ptr_tmp]),1 \n\t"
"vleg %%v26 , 32(%[ptr_tmp]),0 \n\t"
"vleg %%v27 , 40(%[ptr_tmp]),0 \n\t"
"vleg %%v26 , 48(%[ptr_tmp]),1 \n\t"
"vleg %%v27 , 56(%[ptr_tmp]),1 \n\t"
"vleg %%v28 , 64(%[ptr_tmp]),0 \n\t"
"vleg %%v29 , 72(%[ptr_tmp]),0 \n\t"
"vleg %%v28 , 80(%[ptr_tmp]),1 \n\t"
"vleg %%v29 , 88(%[ptr_tmp]),1 \n\t"
"vleg %%v30 , 96(%[ptr_tmp]),0 \n\t"
"vleg %%v31 ,104(%[ptr_tmp]),0 \n\t"
"vleg %%v30 ,112(%[ptr_tmp]),1 \n\t"
"vleg %%v31 ,120(%[ptr_tmp]),1 \n\t"
"vflpdb %%v24, %%v24 \n\t"
"vflpdb %%v25, %%v25 \n\t"
"vflpdb %%v26, %%v26 \n\t"
@ -107,22 +107,22 @@ static BLASLONG ziamin_kernel_8_TUNED(BLASLONG n, FLOAT *x, FLOAT *minf) {
"vfadb %%v3,%%v30,%%v31 \n\t"
"vleg %%v24 ,128( %2),0 \n\t"
"vleg %%v25 ,136( %2),0 \n\t"
"vleg %%v24 ,144( %2),1 \n\t"
"vleg %%v25 ,152( %2),1 \n\t"
"vleg %%v26 ,160( %2),0 \n\t"
"vleg %%v27 ,168( %2),0 \n\t"
"vleg %%v26 ,176( %2),1 \n\t"
"vleg %%v27 ,184( %2),1 \n\t"
"vleg %%v28 ,192( %2),0 \n\t"
"vleg %%v29 ,200( %2),0 \n\t"
"vleg %%v28 ,208( %2),1 \n\t"
"vleg %%v29 ,216( %2),1 \n\t"
"vleg %%v30 ,224( %2),0 \n\t"
"vleg %%v31 ,232( %2),0 \n\t"
"vleg %%v30 ,240( %2),1 \n\t"
"vleg %%v31 ,248( %2),1 \n\t"
"vleg %%v24 ,128(%[ptr_tmp]),0 \n\t"
"vleg %%v25 ,136(%[ptr_tmp]),0 \n\t"
"vleg %%v24 ,144(%[ptr_tmp]),1 \n\t"
"vleg %%v25 ,152(%[ptr_tmp]),1 \n\t"
"vleg %%v26 ,160(%[ptr_tmp]),0 \n\t"
"vleg %%v27 ,168(%[ptr_tmp]),0 \n\t"
"vleg %%v26 ,176(%[ptr_tmp]),1 \n\t"
"vleg %%v27 ,184(%[ptr_tmp]),1 \n\t"
"vleg %%v28 ,192(%[ptr_tmp]),0 \n\t"
"vleg %%v29 ,200(%[ptr_tmp]),0 \n\t"
"vleg %%v28 ,208(%[ptr_tmp]),1 \n\t"
"vleg %%v29 ,216(%[ptr_tmp]),1 \n\t"
"vleg %%v30 ,224(%[ptr_tmp]),0 \n\t"
"vleg %%v31 ,232(%[ptr_tmp]),0 \n\t"
"vleg %%v30 ,240(%[ptr_tmp]),1 \n\t"
"vleg %%v31 ,248(%[ptr_tmp]),1 \n\t"
"vflpdb %%v24, %%v24 \n\t"
"vflpdb %%v25, %%v25 \n\t"
"vflpdb %%v26, %%v26 \n\t"
@ -162,7 +162,7 @@ static BLASLONG ziamin_kernel_8_TUNED(BLASLONG n, FLOAT *x, FLOAT *minf) {
"vsel %%v29,%%v25,%%v2,%%v30 \n\t"
"vsel %%v31,%%v27,%%v3 ,%%v30 \n\t"
"la %2,256(%2) \n\t"
"la %[ptr_tmp],256(%[ptr_tmp]) \n\t"
"vfchdb %%v0,%%v28, %%v31 \n\t"
"vsel %%v25,%%v29,%%v26,%%v0 \n\t"
@ -177,27 +177,27 @@ static BLASLONG ziamin_kernel_8_TUNED(BLASLONG n, FLOAT *x, FLOAT *minf) {
"vag %%v5,%%v5,%%v4 \n\t"
"clgrjl %2,%%r0,1b \n\t"
"clgrjl %[ptr_tmp],%%r0,1b \n\t"
// extract index
"vrepg %%v26,%%v6,1 \n\t"
"vrepg %%v5,%%v7,1 \n\t"
"wfcdb %%v26,%%v6 \n\t"
"jne 2f \n\t"
"vsteg %%v6,%1,0 \n\t"
"vsteg %%v6,%[minf],0 \n\t"
"vmnlg %%v1,%%v5,%%v7 \n\t"
"vlgvg %0,%%v1,0 \n\t"
"vlgvg %[index],%%v1,0 \n\t"
"j 3f \n\t"
"2: \n\t"
"wfchdb %%v16,%%v6 ,%%v26 \n\t"
"vsel %%v1,%%v5,%%v7,%%v16 \n\t"
"vsel %%v0,%%v26,%%v6,%%v16 \n\t"
"vlgvg %0,%%v1,0 \n\t"
"std %%f0,%1 \n\t"
"vlgvg %[index],%%v1,0 \n\t"
"std %%f0,%[minf] \n\t"
"3: \n\t"
: "+r"(index) ,"=m"(*minf), "+&a"(x)
: "r"(n), "2"(x)
: [index] "+r"(index) ,[minf] "=m"(*minf), [ptr_tmp] "+&a"(x)
: [mem] "m"( *(const double (*)[2*n])x), [n] "r"(n), [ptr_x] "r"(x)
: "cc","r0","f0","v0","v1","v2","v3","v4","v5","v6","v7","v16",
"v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
@ -224,12 +224,12 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
if (inc_x == 1) {
BLASLONG n1 = n & -8;
BLASLONG n1 = n & -16;
if (n1 > 0) {
min = ziamin_kernel_8_TUNED(n1, x, &minf);
min = ziamin_kernel_16_TUNED(n1, x, &minf);
i = n1;
ix = n1 << 1;
}
else {
//assign minf

View File

@ -44,17 +44,17 @@ static FLOAT zasum_kernel_16(BLASLONG n, FLOAT *x) {
FLOAT asum;
__asm__ (
"pfd 1, 0(%3) \n\t"
"sllg %%r0,%2,4 \n\t"
"agr %%r0,%3 \n\t"
"pfd 1, 0(%[ptr_x]) \n\t"
"sllg %%r0,%[n],4 \n\t"
"agr %%r0,%[ptr_x] \n\t"
"vzero %%v0 \n\t"
"vzero %%v1 \n\t"
"vzero %%v22 \n\t"
"vzero %%v23 \n\t"
".align 16 \n\t"
"1: \n\t"
"pfd 1, 256(%1 ) \n\t"
"vlm %%v24,%%v31,0(%1) \n\t"
"pfd 1, 256(%[ptr_tmp] ) \n\t"
"vlm %%v24,%%v31,0(%[ptr_tmp]) \n\t"
"vflpdb %%v24, %%v24 \n\t"
"vflpdb %%v25, %%v25 \n\t"
@ -74,7 +74,7 @@ static FLOAT zasum_kernel_16(BLASLONG n, FLOAT *x) {
"vfadb %%v23,%%v23,%%v30 \n\t"
"vfadb %%v22,%%v22,%%v31 \n\t"
"vlm %%v24,%%v31, 128(%1 ) \n\t"
"vlm %%v24,%%v31, 128(%[ptr_tmp]) \n\t"
"vflpdb %%v24, %%v24 \n\t"
"vflpdb %%v25, %%v25 \n\t"
@ -84,7 +84,7 @@ static FLOAT zasum_kernel_16(BLASLONG n, FLOAT *x) {
"vflpdb %%v29, %%v29 \n\t"
"vflpdb %%v30, %%v30 \n\t"
"vflpdb %%v31, %%v31 \n\t"
"la %1,256(%1) \n\t"
"la %[ptr_tmp],256(%[ptr_tmp]) \n\t"
"vfadb %%v0,%%v0,%%v24 \n\t"
"vfadb %%v1,%%v1,%%v25 \n\t"
"vfadb %%v23,%%v23,%%v26 \n\t"
@ -94,15 +94,15 @@ static FLOAT zasum_kernel_16(BLASLONG n, FLOAT *x) {
"vfadb %%v23,%%v23,%%v30 \n\t"
"vfadb %%v22,%%v22,%%v31 \n\t"
"clgrjl %1,%%r0,1b \n\t"
"clgrjl %[ptr_tmp],%%r0,1b \n\t"
"vfadb %%v24,%%v0,%%v1 \n\t"
"vfadb %%v25,%%v23,%%v22 \n\t"
"vfadb %%v0,%%v25,%%v24 \n\t"
"vrepg %%v1,%%v0,1 \n\t"
"adbr %%f0,%%f1 \n\t"
"ldr %0 ,%%f0"
: "=f"(asum),"+&a"(x)
: "r"(n), "1"(x)
"ldr %[asum] ,%%f0"
: [asum] "=f"(asum),[ptr_tmp] "+&a"(x)
: [mem] "m"( *(const double (*)[2*n])x ), [n] "r"(n), [ptr_x] "a"(x)
: "cc", "r0","f0","f1","v0","v1","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
);
return asum;

View File

@ -28,36 +28,36 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h"
static void __attribute__ ((noinline)) zaxpy_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) {
__asm__ ("pfd 1, 0(%1) \n\t"
"pfd 2, 0(%2) \n\t"
"vlrepg %%v28 , 0(%3) \n\t"
"vlrepg %%v29, 8(%3) \n\t"
"srlg %3,%0,3 \n\t"
static void zaxpy_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT da_r,FLOAT da_i) {
__asm__ ("pfd 1, 0(%[x_tmp]) \n\t"
"pfd 2, 0(%[y_tmp]) \n\t"
"lgdr %%r1,%[alpha_r] \n\t"
"vlvgp %%v28,%%r1,%%r1 \n\t"
"lgdr %%r1,%[alpha_i] \n\t"
"vlvgp %%v29,%%r1,%%r1 \n\t"
"sllg %[tmp],%[tmp],4 \n\t"
"xgr %%r1,%%r1 \n\t"
".align 16 \n\t"
"1: \n\t"
"pfd 1, 256(%%r1,%1) \n\t"
"pfd 2, 256(%%r1,%2) \n\t"
"vleg %%v16 , 0(%%r1,%2),0 \n\t"
"vleg %%v17 , 8(%%r1,%2),0 \n\t"
"vleg %%v16 , 16(%%r1,%2),1 \n\t"
"vleg %%v17 , 24(%%r1,%2),1 \n\t"
"vleg %%v18 , 32(%%r1,%2),0 \n\t"
"vleg %%v19 , 40(%%r1,%2),0 \n\t"
"vleg %%v18 , 48(%%r1,%2),1 \n\t"
"vleg %%v19 , 56(%%r1,%2),1 \n\t"
"vleg %%v24 , 0(%%r1,%1),0 \n\t"
"vleg %%v25 , 8(%%r1,%1),0 \n\t"
"vleg %%v24 , 16(%%r1,%1),1 \n\t"
"vleg %%v25 , 24(%%r1,%1),1 \n\t"
"vleg %%v26 , 32(%%r1,%1),0 \n\t"
"vleg %%v27 , 40(%%r1,%1),0 \n\t"
"vleg %%v26 , 48(%%r1,%1),1 \n\t"
"vleg %%v27 , 56(%%r1,%1),1 \n\t"
"pfd 1, 256(%%r1,%[x_tmp]) \n\t"
"pfd 2, 256(%%r1,%[y_tmp]) \n\t"
"vleg %%v16 , 0(%%r1,%[y_tmp]),0 \n\t"
"vleg %%v17 , 8(%%r1,%[y_tmp]),0 \n\t"
"vleg %%v16 , 16(%%r1,%[y_tmp]),1 \n\t"
"vleg %%v17 , 24(%%r1,%[y_tmp]),1 \n\t"
"vleg %%v18 , 32(%%r1,%[y_tmp]),0 \n\t"
"vleg %%v19 , 40(%%r1,%[y_tmp]),0 \n\t"
"vleg %%v18 , 48(%%r1,%[y_tmp]),1 \n\t"
"vleg %%v19 , 56(%%r1,%[y_tmp]),1 \n\t"
"vleg %%v24 , 0(%%r1,%[x_tmp]),0 \n\t"
"vleg %%v25 , 8(%%r1,%[x_tmp]),0 \n\t"
"vleg %%v24 , 16(%%r1,%[x_tmp]),1 \n\t"
"vleg %%v25 , 24(%%r1,%[x_tmp]),1 \n\t"
"vleg %%v26 , 32(%%r1,%[x_tmp]),0 \n\t"
"vleg %%v27 , 40(%%r1,%[x_tmp]),0 \n\t"
"vleg %%v26 , 48(%%r1,%[x_tmp]),1 \n\t"
"vleg %%v27 , 56(%%r1,%[x_tmp]),1 \n\t"
#if !defined(CONJ)
"vfmsdb %%v16, %%v25, %%v29,%%v16 \n\t"
"vfmadb %%v17, %%v24, %%v29, %%v17 \n\t"
@ -79,35 +79,35 @@ static void __attribute__ ((noinline)) zaxpy_kernel_8(BLASLONG n, FLOAT *x, FLOA
"vfmsdb %%v19, %%v26, %%v29, %%v19 \n\t"
#endif
"vsteg %%v16 , 0(%%r1,%2),0 \n\t"
"vsteg %%v17 , 8(%%r1,%2),0 \n\t"
"vsteg %%v16 , 16(%%r1,%2),1 \n\t"
"vsteg %%v17 , 24(%%r1,%2),1 \n\t"
"vsteg %%v16 , 0(%%r1,%[y_tmp]),0 \n\t"
"vsteg %%v17 , 8(%%r1,%[y_tmp]),0 \n\t"
"vsteg %%v16 , 16(%%r1,%[y_tmp]),1 \n\t"
"vsteg %%v17 , 24(%%r1,%[y_tmp]),1 \n\t"
"vsteg %%v18 , 32(%%r1,%2),0 \n\t"
"vsteg %%v19 , 40(%%r1,%2),0 \n\t"
"vsteg %%v18 , 48(%%r1,%2),1 \n\t"
"vsteg %%v19 , 56(%%r1,%2),1 \n\t"
"vsteg %%v18 , 32(%%r1,%[y_tmp]),0 \n\t"
"vsteg %%v19 , 40(%%r1,%[y_tmp]),0 \n\t"
"vsteg %%v18 , 48(%%r1,%[y_tmp]),1 \n\t"
"vsteg %%v19 , 56(%%r1,%[y_tmp]),1 \n\t"
"vleg %%v20 , 64(%%r1,%2),0 \n\t"
"vleg %%v21 , 72(%%r1,%2),0 \n\t"
"vleg %%v20 , 80(%%r1,%2),1 \n\t"
"vleg %%v21 , 88(%%r1,%2),1 \n\t"
"vleg %%v20 , 64(%%r1,%[y_tmp]),0 \n\t"
"vleg %%v21 , 72(%%r1,%[y_tmp]),0 \n\t"
"vleg %%v20 , 80(%%r1,%[y_tmp]),1 \n\t"
"vleg %%v21 , 88(%%r1,%[y_tmp]),1 \n\t"
"vleg %%v22 , 96(%%r1,%2),0 \n\t"
"vleg %%v23 , 104(%%r1,%2),0 \n\t"
"vleg %%v22 , 112(%%r1,%2),1 \n\t"
"vleg %%v23 , 120(%%r1,%2),1 \n\t"
"vleg %%v22 , 96(%%r1,%[y_tmp]),0 \n\t"
"vleg %%v23 , 104(%%r1,%[y_tmp]),0 \n\t"
"vleg %%v22 , 112(%%r1,%[y_tmp]),1 \n\t"
"vleg %%v23 , 120(%%r1,%[y_tmp]),1 \n\t"
"vleg %%v24 , 64(%%r1,%1),0 \n\t"
"vleg %%v25 , 72(%%r1,%1),0 \n\t"
"vleg %%v24 , 80(%%r1,%1),1 \n\t"
"vleg %%v25 , 88(%%r1,%1),1 \n\t"
"vleg %%v24 , 64(%%r1,%[x_tmp]),0 \n\t"
"vleg %%v25 , 72(%%r1,%[x_tmp]),0 \n\t"
"vleg %%v24 , 80(%%r1,%[x_tmp]),1 \n\t"
"vleg %%v25 , 88(%%r1,%[x_tmp]),1 \n\t"
"vleg %%v26 , 96(%%r1,%1),0 \n\t"
"vleg %%v27 , 104(%%r1,%1),0 \n\t"
"vleg %%v26 , 112(%%r1,%1),1 \n\t"
"vleg %%v27 , 120(%%r1,%1),1 \n\t"
"vleg %%v26 , 96(%%r1,%[x_tmp]),0 \n\t"
"vleg %%v27 , 104(%%r1,%[x_tmp]),0 \n\t"
"vleg %%v26 , 112(%%r1,%[x_tmp]),1 \n\t"
"vleg %%v27 , 120(%%r1,%[x_tmp]),1 \n\t"
#if !defined(CONJ)
"vfmsdb %%v20, %%v25, %%v29,%%v20 \n\t"
"vfmadb %%v21, %%v24, %%v29, %%v21 \n\t"
@ -128,21 +128,21 @@ static void __attribute__ ((noinline)) zaxpy_kernel_8(BLASLONG n, FLOAT *x, FLOA
"vfmadb %%v22, %%v26, %%v28, %%v22 \n\t"
"vfmsdb %%v23, %%v26, %%v29, %%v23 \n\t"
#endif
"vsteg %%v20 , 64(%%r1,%2),0 \n\t"
"vsteg %%v21 , 72(%%r1,%2),0 \n\t"
"vsteg %%v20 , 80(%%r1,%2),1 \n\t"
"vsteg %%v21 , 88(%%r1,%2),1 \n\t"
"vsteg %%v20 , 64(%%r1,%[y_tmp]),0 \n\t"
"vsteg %%v21 , 72(%%r1,%[y_tmp]),0 \n\t"
"vsteg %%v20 , 80(%%r1,%[y_tmp]),1 \n\t"
"vsteg %%v21 , 88(%%r1,%[y_tmp]),1 \n\t"
"vsteg %%v22 , 96(%%r1,%2),0 \n\t"
"vsteg %%v23 , 104(%%r1,%2),0 \n\t"
"vsteg %%v22 , 112(%%r1,%2),1 \n\t"
"vsteg %%v23 , 120(%%r1,%2),1 \n\t"
"vsteg %%v22 , 96(%%r1,%[y_tmp]),0 \n\t"
"vsteg %%v23 , 104(%%r1,%[y_tmp]),0 \n\t"
"vsteg %%v22 , 112(%%r1,%[y_tmp]),1 \n\t"
"vsteg %%v23 , 120(%%r1,%[y_tmp]),1 \n\t"
"la %%r1,128(%%r1) \n\t"
"brctg %3,1b"
:
: "r"(n), "a"(x), "a"(y), "a"(alpha)
: "cc", "memory", "r1","v16",
"clgrjl %%r1,%[tmp],1b \n\t"
: [mem_y] "+m" (*(double (*)[2*n])y),[tmp]"+&r"(n)
: [mem_x] "m" (*(const double (*)[2*n])x), [x_tmp] "a"(x), [y_tmp] "a"(y), [alpha_r] "f"(da_r),[alpha_i] "f"(da_i)
: "cc", "r1","v16",
"v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29"
);
@ -151,7 +151,6 @@ static void __attribute__ ((noinline)) zaxpy_kernel_8(BLASLONG n, FLOAT *x, FLOA
int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) {
BLASLONG i = 0;
BLASLONG ix = 0, iy = 0;
FLOAT da[2];
if (n <= 0) return (0);
@ -160,9 +159,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i,
BLASLONG n1 = n & -8;
if (n1) {
da[0] = da_r;
da[1] = da_i;
zaxpy_kernel_8(n1, x, y, da);
zaxpy_kernel_8(n1, x, y, da_r,da_i);
ix = 2 * n1;
}
i = n1;

View File

@ -27,62 +27,62 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h"
static void __attribute__ ((noinline)) zcopy_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y) {
static void zcopy_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y) {
__asm__ volatile(
"pfd 1, 0(%1) \n\t"
"pfd 2, 0(%2) \n\t"
"srlg %%r0,%0,4 \n\t"
"pfd 1, 0(%[ptr_x]) \n\t"
"pfd 2, 0(%[ptr_y]) \n\t"
"srlg %[n_tmp],%[n_tmp],4 \n\t"
"xgr %%r1,%%r1 \n\t"
".align 16 \n\t"
"1: \n\t"
"pfd 1, 256(%%r1,%1) \n\t"
"pfd 2, 256(%%r1,%2) \n\t"
"pfd 1, 256(%%r1,%[ptr_x]) \n\t"
"pfd 2, 256(%%r1,%[ptr_y]) \n\t"
"vl %%v24, 0(%%r1,%1) \n\t"
"vst %%v24, 0(%%r1,%2) \n\t"
"vl %%v25, 16(%%r1,%1) \n\t"
"vst %%v25, 16(%%r1,%2) \n\t"
"vl %%v26, 32(%%r1,%1) \n\t"
"vst %%v26, 32(%%r1,%2) \n\t"
"vl %%v27, 48(%%r1,%1) \n\t"
"vst %%v27, 48(%%r1,%2) \n\t"
"vl %%v24, 0(%%r1,%[ptr_x]) \n\t"
"vst %%v24, 0(%%r1,%[ptr_y]) \n\t"
"vl %%v25, 16(%%r1,%[ptr_x]) \n\t"
"vst %%v25, 16(%%r1,%[ptr_y]) \n\t"
"vl %%v26, 32(%%r1,%[ptr_x]) \n\t"
"vst %%v26, 32(%%r1,%[ptr_y]) \n\t"
"vl %%v27, 48(%%r1,%[ptr_x]) \n\t"
"vst %%v27, 48(%%r1,%[ptr_y]) \n\t"
"vl %%v28, 64(%%r1,%1) \n\t"
"vst %%v28, 64(%%r1,%2) \n\t"
"vl %%v29, 80(%%r1,%1) \n\t"
"vst %%v29, 80(%%r1,%2) \n\t"
"vl %%v30, 96(%%r1,%1) \n\t"
"vst %%v30, 96(%%r1,%2) \n\t"
"vl %%v31,112(%%r1,%1) \n\t"
"vst %%v31,112(%%r1,%2) \n\t"
"vl %%v28, 64(%%r1,%[ptr_x]) \n\t"
"vst %%v28, 64(%%r1,%[ptr_y]) \n\t"
"vl %%v29, 80(%%r1,%[ptr_x]) \n\t"
"vst %%v29, 80(%%r1,%[ptr_y]) \n\t"
"vl %%v30, 96(%%r1,%[ptr_x]) \n\t"
"vst %%v30, 96(%%r1,%[ptr_y]) \n\t"
"vl %%v31, 112(%%r1,%[ptr_x]) \n\t"
"vst %%v31, 112(%%r1,%[ptr_y]) \n\t"
"vl %%v24,128(%%r1,%1) \n\t"
"vst %%v24,128(%%r1,%2) \n\t"
"vl %%v24, 128(%%r1,%[ptr_x]) \n\t"
"vst %%v24, 128(%%r1,%[ptr_y]) \n\t"
"vl %%v25,144(%%r1,%1) \n\t"
"vst %%v25,144(%%r1,%2) \n\t"
"vl %%v25, 144(%%r1,%[ptr_x]) \n\t"
"vst %%v25, 144(%%r1,%[ptr_y]) \n\t"
"vl %%v26,160(%%r1,%1) \n\t"
"vst %%v26,160(%%r1,%2) \n\t"
"vl %%v26, 160(%%r1,%[ptr_x]) \n\t"
"vst %%v26, 160(%%r1,%[ptr_y]) \n\t"
"vl %%v27,176(%%r1,%1) \n\t"
"vst %%v27,176(%%r1,%2) \n\t"
"vl %%v27, 176(%%r1,%[ptr_x]) \n\t"
"vst %%v27, 176(%%r1,%[ptr_y]) \n\t"
"vl %%v28, 192(%%r1,%1) \n\t"
"vst %%v28, 192(%%r1,%2) \n\t"
"vl %%v29, 208(%%r1,%1) \n\t"
"vst %%v29, 208(%%r1,%2) \n\t"
"vl %%v30, 224(%%r1,%1) \n\t"
"vst %%v30, 224(%%r1,%2) \n\t"
"vl %%v31, 240(%%r1,%1) \n\t"
"vst %%v31, 240(%%r1,%2) \n\t"
"vl %%v28, 192(%%r1,%[ptr_x]) \n\t"
"vst %%v28, 192(%%r1,%[ptr_y]) \n\t"
"vl %%v29, 208(%%r1,%[ptr_x]) \n\t"
"vst %%v29, 208(%%r1,%[ptr_y]) \n\t"
"vl %%v30, 224(%%r1,%[ptr_x]) \n\t"
"vst %%v30, 224(%%r1,%[ptr_y]) \n\t"
"vl %%v31, 240(%%r1,%[ptr_x]) \n\t"
"vst %%v31, 240(%%r1,%[ptr_y]) \n\t"
"la %%r1,256(%%r1) \n\t"
"brctg %%r0,1b"
:
: "r"(n), "a"(x), "a"(y)
: "cc", "memory","r0","r1","v24","v25","v26","v27","v28","v29","v30","v31"
"brctg %[n_tmp],1b"
: [mem_y] "=m" (*(double (*)[2*n])y), [n_tmp] "+&r"(n)
: [mem_x] "m" (*(const double (*)[2*n])x), [ptr_x] "a"(x), [ptr_y] "a"(y)
: "cc", "r1", "v24","v25","v26","v27","v28","v29","v30","v31"
);
return;

View File

@ -32,26 +32,26 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
static void zdot_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *d) {
__asm__ volatile(
"pfd 1, 0(%2) \n\t"
"pfd 1, 0(%3) \n\t"
"pfd 1, 0(%[ptr_x_tmp]) \n\t"
"pfd 1, 0(%[ptr_y_tmp]) \n\t"
"vzero %%v24 \n\t"
"vzero %%v25 \n\t"
"vzero %%v26 \n\t"
"vzero %%v27 \n\t"
"srlg %1,%1,3 \n\t"
"srlg %[n_tmp],%[n_tmp],3 \n\t"
"xgr %%r1,%%r1 \n\t"
".align 16 \n\t"
"1: \n\t"
"pfd 1, 256(%%r1,%2) \n\t"
"pfd 1, 256(%%r1,%3) \n\t"
"vl %%v16, 0(%%r1,%2) \n\t"
"vl %%v17, 16(%%r1,%2) \n\t"
"vl %%v18, 32(%%r1,%2) \n\t"
"vl %%v19, 48(%%r1,%2) \n\t"
"vl %%v28, 0(%%r1,%3) \n\t"
"vl %%v29, 16(%%r1,%3) \n\t"
"vl %%v30, 32(%%r1,%3) \n\t"
"vl %%v31, 48(%%r1,%3) \n\t"
"pfd 1, 256(%%r1,%[ptr_x_tmp]) \n\t"
"pfd 1, 256(%%r1,%[ptr_y_tmp]) \n\t"
"vl %%v16, 0(%%r1,%[ptr_x_tmp]) \n\t"
"vl %%v17, 16(%%r1,%[ptr_x_tmp]) \n\t"
"vl %%v18, 32(%%r1,%[ptr_x_tmp]) \n\t"
"vl %%v19, 48(%%r1,%[ptr_x_tmp]) \n\t"
"vl %%v28, 0(%%r1,%[ptr_y_tmp]) \n\t"
"vl %%v29, 16(%%r1,%[ptr_y_tmp]) \n\t"
"vl %%v30, 32(%%r1,%[ptr_y_tmp]) \n\t"
"vl %%v31, 48(%%r1,%[ptr_y_tmp]) \n\t"
"vpdi %%v20,%%v16,%%v16,4 \n\t"
"vpdi %%v21,%%v17,%%v17,4 \n\t"
"vpdi %%v22,%%v18,%%v18,4 \n\t"
@ -69,14 +69,14 @@ static void zdot_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *d) {
"vl %%v16, 64(%%r1,%2) \n\t"
"vl %%v17, 80(%%r1,%2) \n\t"
"vl %%v18, 96(%%r1,%2) \n\t"
"vl %%v19,112(%%r1,%2) \n\t"
"vl %%v28, 64(%%r1,%3) \n\t"
"vl %%v29, 80(%%r1,%3) \n\t"
"vl %%v30, 96(%%r1,%3) \n\t"
"vl %%v31,112(%%r1,%3) \n\t"
"vl %%v16, 64(%%r1,%[ptr_x_tmp]) \n\t"
"vl %%v17, 80(%%r1,%[ptr_x_tmp]) \n\t"
"vl %%v18, 96(%%r1,%[ptr_x_tmp]) \n\t"
"vl %%v19,112(%%r1,%[ptr_x_tmp]) \n\t"
"vl %%v28, 64(%%r1,%[ptr_y_tmp]) \n\t"
"vl %%v29, 80(%%r1,%[ptr_y_tmp]) \n\t"
"vl %%v30, 96(%%r1,%[ptr_y_tmp]) \n\t"
"vl %%v31,112(%%r1,%[ptr_y_tmp]) \n\t"
"vpdi %%v20,%%v16,%%v16,4 \n\t"
"vpdi %%v21,%%v17,%%v17,4 \n\t"
"vpdi %%v22,%%v18,%%v18,4 \n\t"
@ -92,15 +92,17 @@ static void zdot_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *d) {
"la %%r1,128(%%r1) \n\t"
"brctg %1,1b \n\t"
"brctg %[n_tmp],1b \n\t"
"vfadb %%v24,%%v26,%%v24 \n\t"
"vfadb %%v25,%%v25,%%v27 \n\t"
"vsteg %%v24,0(%4),0 \n\t"
"vsteg %%v24,8(%4),1 \n\t"
"vsteg %%v25,16(%4),1 \n\t"
"vsteg %%v25,24(%4),0 \n\t"
: "=m"(*d) ,"+&r"(n)
: "a"(x), "a"(y), "a"(d)
"vsteg %%v24, 0(%[ptr_d]),0 \n\t"
"vsteg %%v24, 8(%[ptr_d]),1 \n\t"
"vsteg %%v25,16(%[ptr_d]),1 \n\t"
"vsteg %%v25,24(%[ptr_d]),0 \n\t"
: [mem_out] "=m"(*(double (*)[4])d ) ,[n_tmp] "+&r"(n)
: [mem_x] "m"( *(const double (*)[2*n])x),
[mem_y] "m"( *(const double (*)[2*n])y),
[ptr_x_tmp] "a"(x), [ptr_y_tmp] "a"(y), [ptr_d] "a"(d)
: "cc", "r1","v16",
"v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
);
@ -150,8 +152,8 @@ static void zdot_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *d) {
#endif
OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) {
BLASLONG i;
BLASLONG ix, iy;
BLASLONG i = 0;
BLASLONG ix=0, iy=0;
OPENBLAS_COMPLEX_FLOAT result;
FLOAT dot[4] __attribute__ ((aligned(16))) = {0.0, 0.0, 0.0, 0.0};
@ -164,13 +166,15 @@ OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLA
if ((inc_x == 1) && (inc_y == 1)) {
BLASLONG n1 = n & -16;
BLASLONG n1 = n & -8;
BLASLONG j=0;
if (n1)
if (n1){
zdot_kernel_8(n1, x, y, dot);
i = n1;
BLASLONG j = i * 2;
j = n1 <<1;
}
while (i < n) {

View File

@ -25,31 +25,31 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "common.h"
static void __attribute__ ((noinline)) zrot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s)
static void zrot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT cosA, FLOAT sinA)
{
__asm__ (
"pfd 2, 0(%1) \n\t"
"pfd 2, 0(%2) \n\t"
"vlrepg %%v0,0(%3) \n\t"
"vlrepg %%v1,0(%4) \n\t"
"srlg %%r0,%0,4 \n\t"
"pfd 2, 0(%[ptr_x]) \n\t"
"pfd 2, 0(%[ptr_y]) \n\t"
"lgdr %%r1,%[cos] \n\t"
"vlvgp %%v0,%%r1,%%r1 \n\t"
"lgdr %%r1,%[sin] \n\t"
"vlvgp %%v1,%%r1,%%r1 \n\t"
"sllg %[tmp],%[tmp],4 \n\t"
"xgr %%r1,%%r1 \n\t"
".align 16 \n\t"
"1: \n\t"
"pfd 2, 256(%%r1,%1) \n\t"
"pfd 2, 256(%%r1,%2) \n\t"
"vl %%v24, 0(%%r1,%1) \n\t"
"vl %%v25, 16(%%r1,%1) \n\t"
"vl %%v26, 32(%%r1,%1) \n\t"
"vl %%v27, 48(%%r1,%1) \n\t"
"vl %%v16, 0(%%r1,%2) \n\t"
"vl %%v17, 16(%%r1,%2) \n\t"
"vl %%v18, 32(%%r1,%2) \n\t"
"vl %%v19, 48(%%r1,%2) \n\t"
"pfd 2, 256(%%r1,%[ptr_x]) \n\t"
"pfd 2, 256(%%r1,%[ptr_y]) \n\t"
"vl %%v24, 0(%%r1,%[ptr_x]) \n\t"
"vl %%v25, 16(%%r1,%[ptr_x]) \n\t"
"vl %%v26, 32(%%r1,%[ptr_x]) \n\t"
"vl %%v27, 48(%%r1,%[ptr_x]) \n\t"
"vl %%v16, 0(%%r1,%[ptr_y]) \n\t"
"vl %%v17, 16(%%r1,%[ptr_y]) \n\t"
"vl %%v18, 32(%%r1,%[ptr_y]) \n\t"
"vl %%v19, 48(%%r1,%[ptr_y]) \n\t"
"vfmdb %%v28,%%v24,%%v0 \n\t"
"vfmdb %%v29,%%v25,%%v0 \n\t"
@ -69,25 +69,23 @@ static void __attribute__ ((noinline)) zrot_kernel_16(BLASLONG n, FLOAT *x, FLO
"vfmadb %%v31,%%v19,%%v1,%%v31 \n\t"
"vfmsdb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */
"vst %%v28, 0(%%r1,%[ptr_x]) \n\t"
"vst %%v29, 16(%%r1,%[ptr_x]) \n\t"
"vst %%v30, 32(%%r1,%[ptr_x]) \n\t"
"vst %%v31, 48(%%r1,%[ptr_x]) \n\t"
"vst %%v20, 0(%%r1,%[ptr_y]) \n\t"
"vst %%v21, 16(%%r1,%[ptr_y]) \n\t"
"vst %%v22, 32(%%r1,%[ptr_y]) \n\t"
"vst %%v23, 48(%%r1,%[ptr_y]) \n\t"
"vst %%v28, 0(%%r1,%1) \n\t"
"vst %%v29, 16(%%r1,%1) \n\t"
"vst %%v30, 32(%%r1,%1) \n\t"
"vst %%v31, 48(%%r1,%1) \n\t"
"vst %%v20, 0(%%r1,%2) \n\t"
"vst %%v21, 16(%%r1,%2) \n\t"
"vst %%v22, 32(%%r1,%2) \n\t"
"vst %%v23, 48(%%r1,%2) \n\t"
"vl %%v24, 64(%%r1,%1) \n\t"
"vl %%v25, 80(%%r1,%1) \n\t"
"vl %%v26, 96(%%r1,%1) \n\t"
"vl %%v27,112(%%r1,%1) \n\t"
"vl %%v16, 64(%%r1,%2) \n\t"
"vl %%v17, 80(%%r1,%2) \n\t"
"vl %%v18, 96(%%r1,%2) \n\t"
"vl %%v19,112(%%r1,%2) \n\t"
"vl %%v24, 64(%%r1,%[ptr_x]) \n\t"
"vl %%v25, 80(%%r1,%[ptr_x]) \n\t"
"vl %%v26, 96(%%r1,%[ptr_x]) \n\t"
"vl %%v27,112(%%r1,%[ptr_x]) \n\t"
"vl %%v16, 64(%%r1,%[ptr_y]) \n\t"
"vl %%v17, 80(%%r1,%[ptr_y]) \n\t"
"vl %%v18, 96(%%r1,%[ptr_y]) \n\t"
"vl %%v19,112(%%r1,%[ptr_y]) \n\t"
"vfmdb %%v28,%%v24,%%v0 \n\t"
"vfmdb %%v29,%%v25,%%v0 \n\t"
@ -107,25 +105,23 @@ static void __attribute__ ((noinline)) zrot_kernel_16(BLASLONG n, FLOAT *x, FLO
"vfmadb %%v31,%%v19,%%v1,%%v31 \n\t"
"vfmsdb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */
"vst %%v28, 64(%%r1,%[ptr_x]) \n\t"
"vst %%v29, 80(%%r1,%[ptr_x]) \n\t"
"vst %%v30, 96(%%r1,%[ptr_x]) \n\t"
"vst %%v31, 112(%%r1,%[ptr_x]) \n\t"
"vst %%v20, 64(%%r1,%[ptr_y]) \n\t"
"vst %%v21, 80(%%r1,%[ptr_y]) \n\t"
"vst %%v22, 96(%%r1,%[ptr_y]) \n\t"
"vst %%v23, 112(%%r1,%[ptr_y]) \n\t"
"vst %%v28, 64(%%r1,%1) \n\t"
"vst %%v29, 80(%%r1,%1) \n\t"
"vst %%v30, 96(%%r1,%1) \n\t"
"vst %%v31, 112(%%r1,%1) \n\t"
"vst %%v20, 64(%%r1,%2) \n\t"
"vst %%v21, 80(%%r1,%2) \n\t"
"vst %%v22, 96(%%r1,%2) \n\t"
"vst %%v23, 112(%%r1,%2) \n\t"
"vl %%v24, 128(%%r1,%1) \n\t"
"vl %%v25, 144(%%r1,%1) \n\t"
"vl %%v26, 160(%%r1,%1) \n\t"
"vl %%v27, 176(%%r1,%1) \n\t"
"vl %%v16, 128(%%r1,%2) \n\t"
"vl %%v17, 144(%%r1,%2) \n\t"
"vl %%v18, 160(%%r1,%2) \n\t"
"vl %%v19, 176(%%r1,%2) \n\t"
"vl %%v24, 128(%%r1,%[ptr_x]) \n\t"
"vl %%v25, 144(%%r1,%[ptr_x]) \n\t"
"vl %%v26, 160(%%r1,%[ptr_x]) \n\t"
"vl %%v27, 176(%%r1,%[ptr_x]) \n\t"
"vl %%v16, 128(%%r1,%[ptr_y]) \n\t"
"vl %%v17, 144(%%r1,%[ptr_y]) \n\t"
"vl %%v18, 160(%%r1,%[ptr_y]) \n\t"
"vl %%v19, 176(%%r1,%[ptr_y]) \n\t"
"vfmdb %%v28,%%v24,%%v0 \n\t"
"vfmdb %%v29,%%v25,%%v0 \n\t"
@ -145,25 +141,23 @@ static void __attribute__ ((noinline)) zrot_kernel_16(BLASLONG n, FLOAT *x, FLO
"vfmadb %%v31,%%v19,%%v1,%%v31 \n\t"
"vfmsdb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */
"vst %%v28, 128(%%r1,%[ptr_x]) \n\t"
"vst %%v29, 144(%%r1,%[ptr_x]) \n\t"
"vst %%v30, 160(%%r1,%[ptr_x]) \n\t"
"vst %%v31, 176(%%r1,%[ptr_x]) \n\t"
"vst %%v20, 128(%%r1,%[ptr_y]) \n\t"
"vst %%v21, 144(%%r1,%[ptr_y]) \n\t"
"vst %%v22, 160(%%r1,%[ptr_y]) \n\t"
"vst %%v23, 176(%%r1,%[ptr_y]) \n\t"
"vst %%v28, 128(%%r1,%1) \n\t"
"vst %%v29, 144(%%r1,%1) \n\t"
"vst %%v30, 160(%%r1,%1) \n\t"
"vst %%v31, 176(%%r1,%1) \n\t"
"vst %%v20, 128(%%r1,%2) \n\t"
"vst %%v21, 144(%%r1,%2) \n\t"
"vst %%v22, 160(%%r1,%2) \n\t"
"vst %%v23, 176(%%r1,%2) \n\t"
"vl %%v24, 192(%%r1,%1) \n\t"
"vl %%v25, 208(%%r1,%1) \n\t"
"vl %%v26, 224(%%r1,%1) \n\t"
"vl %%v27, 240(%%r1,%1) \n\t"
"vl %%v16, 192(%%r1,%2) \n\t"
"vl %%v17, 208(%%r1,%2) \n\t"
"vl %%v18, 224(%%r1,%2) \n\t"
"vl %%v19, 240(%%r1,%2) \n\t"
"vl %%v24, 192(%%r1,%[ptr_x]) \n\t"
"vl %%v25, 208(%%r1,%[ptr_x]) \n\t"
"vl %%v26, 224(%%r1,%[ptr_x]) \n\t"
"vl %%v27, 240(%%r1,%[ptr_x]) \n\t"
"vl %%v16, 192(%%r1,%[ptr_y]) \n\t"
"vl %%v17, 208(%%r1,%[ptr_y]) \n\t"
"vl %%v18, 224(%%r1,%[ptr_y]) \n\t"
"vl %%v19, 240(%%r1,%[ptr_y]) \n\t"
"vfmdb %%v28,%%v24,%%v0 \n\t"
"vfmdb %%v29,%%v25,%%v0 \n\t"
@ -183,32 +177,28 @@ static void __attribute__ ((noinline)) zrot_kernel_16(BLASLONG n, FLOAT *x, FLO
"vfmadb %%v31,%%v19,%%v1,%%v31 \n\t"
"vfmsdb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */
"vst %%v28, 192(%%r1,%1) \n\t"
"vst %%v29, 208(%%r1,%1) \n\t"
"vst %%v30, 224(%%r1,%1) \n\t"
"vst %%v31, 240(%%r1,%1) \n\t"
"vst %%v20, 192(%%r1,%2) \n\t"
"vst %%v21, 208(%%r1,%2) \n\t"
"vst %%v22, 224(%%r1,%2) \n\t"
"vst %%v23, 240(%%r1,%2) \n\t"
"vst %%v28, 192(%%r1,%[ptr_x]) \n\t"
"vst %%v29, 208(%%r1,%[ptr_x]) \n\t"
"vst %%v30, 224(%%r1,%[ptr_x]) \n\t"
"vst %%v31, 240(%%r1,%[ptr_x]) \n\t"
"vst %%v20, 192(%%r1,%[ptr_y]) \n\t"
"vst %%v21, 208(%%r1,%[ptr_y]) \n\t"
"vst %%v22, 224(%%r1,%[ptr_y]) \n\t"
"vst %%v23, 240(%%r1,%[ptr_y]) \n\t"
"la %%r1,256(%%r1) \n\t"
"brctg %%r0,1b"
:
: "r"(n), "a"(x), "a"(y),"a"(c),"a"(s)
: "cc", "memory","r0","r1" ,"v0","v1","v16",
"clgrjl %%r1,%[tmp],1b \n\t"
: [mem_x] "+m" (*(double (*)[2*n])x),
[mem_y] "+m" (*(double (*)[2*n])y),
[tmp] "+&r"(n)
: [ptr_x] "a"(x), [ptr_y] "a"(y),[cos] "f"(cosA),[sin] "f"(sinA)
: "cc","r1" ,"v0","v1","v16",
"v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
);
return;
}
int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT c, FLOAT s)
{
BLASLONG i=0;
@ -225,10 +215,7 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT
BLASLONG n1 = n & -16;
if ( n1 > 0 )
{
FLOAT cosa,sina;
cosa=c;
sina=s;
zrot_kernel_16(n1, x, y, &cosa, &sina);
zrot_kernel_16(n1, x, y, c, s);
i=n1;
ix=2*n1;
}
@ -247,7 +234,6 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT
}
}
else
{
@ -273,4 +259,3 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT
}

View File

@ -29,229 +29,218 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
/*
 * zscal_kernel_8: multiply complex doubles, stored at x as interleaved
 * (re, im) pairs, by the complex scalar (alpha_r + i*alpha_i), in place.
 *
 * This listing carries two forms of the routine side by side:
 *   - lines using positional operands (%0, %1, %2) take the scalar through
 *     the `alpha` pointer and load it with vlrepg;
 *   - lines using named operands (%[x_ptr], %[alpha_r], ...) take da_r and
 *     da_i by value and copy each into both lanes of v24 / v25 with
 *     lgdr + vlvgp.
 *
 * Each pass of the loop handles 128 bytes (8 complex values). r0 holds the
 * end address x + 16*n and the loop repeats while the x pointer is below it.
 * The body runs once before the bound is tested, so n is presumably expected
 * to be a positive multiple of 8; the visible caller passes n & -8 and only
 * calls when that value is > 0.
 */
static void __attribute__ ((noinline)) zscal_kernel_8(BLASLONG n, FLOAT *alpha, FLOAT *x) {
static void zscal_kernel_8(BLASLONG n, FLOAT da_r,FLOAT da_i, FLOAT *x) {
__asm__(
"pfd 1, 0(%1) \n\t"
"sllg %%r0,%0,4 \n\t"
"agr %%r0,%2 \n\t"
"vlrepg %%v24,0(%1) \n\t"
"vlrepg %%v25,8(%1) \n\t"
"pfd 1, 0(%[x_ptr]) \n\t"
"lgdr %%r0,%[alpha_r] \n\t"
"vlvgp %%v24,%%r0,%%r0 \n\t"
"lgdr %%r0,%[alpha_i] \n\t"
"vlvgp %%v25,%%r0,%%r0 \n\t"
"sllg %%r0,%[n],4 \n\t"
"agr %%r0,%[x_ptr] \n\t"
".align 16 \n\t"
"1: \n\t"
/* De-interleave: real parts go to v20/v22, imaginary parts to v21/v23. */
"pfd 2, 256(%2 ) \n\t"
"vleg %%v20 , 0(%2),0 \n\t"
"vleg %%v21 , 8(%2),0 \n\t"
"vleg %%v20 , 16(%2),1 \n\t"
"vleg %%v21 , 24(%2),1 \n\t"
"vleg %%v22 , 32(%2),0 \n\t"
"vleg %%v23 , 40(%2),0 \n\t"
"vleg %%v22 , 48(%2),1 \n\t"
"vleg %%v23 , 56(%2),1 \n\t"
"pfd 2, 256(%[x_ptr] ) \n\t"
"vleg %%v20 , 0(%[x_ptr]),0 \n\t"
"vleg %%v21 , 8(%[x_ptr]),0 \n\t"
"vleg %%v20 , 16(%[x_ptr]),1 \n\t"
"vleg %%v21 , 24(%[x_ptr]),1 \n\t"
"vleg %%v22 , 32(%[x_ptr]),0 \n\t"
"vleg %%v23 , 40(%[x_ptr]),0 \n\t"
"vleg %%v22 , 48(%[x_ptr]),1 \n\t"
"vleg %%v23 , 56(%[x_ptr]),1 \n\t"
/* v16/v18 = re*alpha_r - im*alpha_i ; v17/v19 = im*alpha_r + re*alpha_i */
"vfmdb %%v16, %%v21, %%v25 \n\t"
"vfmdb %%v17, %%v20, %%v25 \n\t"
"vfmdb %%v18, %%v23, %%v25 \n\t"
"vfmdb %%v19, %%v22, %%v25 \n\t"
"vfmsdb %%v16, %%v20, %%v24 ,%%v16 \n\t"
"vfmadb %%v17, %%v21, %%v24, %%v17 \n\t"
"vfmsdb %%v18, %%v22, %%v24, %%v18 \n\t"
"vfmadb %%v19, %%v23, %%v24, %%v19 \n\t"
/* Re-interleave the results back into the (re, im) slots they came from. */
"vsteg %%v16 , 0(%2),0 \n\t"
"vsteg %%v17 , 8(%2),0 \n\t"
"vsteg %%v16 , 16(%2),1 \n\t"
"vsteg %%v17 , 24(%2),1 \n\t"
"vsteg %%v18 , 32(%2),0 \n\t"
"vsteg %%v19 , 40(%2),0 \n\t"
"vsteg %%v18 , 48(%2),1 \n\t"
"vsteg %%v19 , 56(%2),1 \n\t"
"vleg %%v20 , 64(%2),0 \n\t"
"vleg %%v21 , 72(%2),0 \n\t"
"vleg %%v20 , 80(%2),1 \n\t"
"vleg %%v21 , 88(%2),1 \n\t"
"vleg %%v22 , 96(%2),0 \n\t"
"vleg %%v23 , 104(%2),0 \n\t"
"vleg %%v22 , 112(%2),1 \n\t"
"vleg %%v23 , 120(%2),1 \n\t"
"vsteg %%v16 , 0(%[x_ptr]),0 \n\t"
"vsteg %%v17 , 8(%[x_ptr]),0 \n\t"
"vsteg %%v16 , 16(%[x_ptr]),1 \n\t"
"vsteg %%v17 , 24(%[x_ptr]),1 \n\t"
"vsteg %%v18 , 32(%[x_ptr]),0 \n\t"
"vsteg %%v19 , 40(%[x_ptr]),0 \n\t"
"vsteg %%v18 , 48(%[x_ptr]),1 \n\t"
"vsteg %%v19 , 56(%[x_ptr]),1 \n\t"
/* Second half of the pass: the same computation on bytes 64..127. */
"vleg %%v20 , 64(%[x_ptr]),0 \n\t"
"vleg %%v21 , 72(%[x_ptr]),0 \n\t"
"vleg %%v20 , 80(%[x_ptr]),1 \n\t"
"vleg %%v21 , 88(%[x_ptr]),1 \n\t"
"vleg %%v22 , 96(%[x_ptr]),0 \n\t"
"vleg %%v23 , 104(%[x_ptr]),0 \n\t"
"vleg %%v22 , 112(%[x_ptr]),1 \n\t"
"vleg %%v23 , 120(%[x_ptr]),1 \n\t"
"vfmdb %%v16, %%v21, %%v25 \n\t"
"vfmdb %%v17, %%v20, %%v25 \n\t"
"vfmdb %%v18, %%v23, %%v25 \n\t"
"vfmdb %%v19, %%v22, %%v25 \n\t"
"vfmsdb %%v16, %%v20, %%v24 ,%%v16 \n\t"
"vfmadb %%v17, %%v21, %%v24, %%v17 \n\t"
"vfmsdb %%v18, %%v22, %%v24, %%v18 \n\t"
"vfmadb %%v19, %%v23, %%v24, %%v19 \n\t"
"vsteg %%v16 , 64(%[x_ptr]),0 \n\t"
"vsteg %%v17 , 72(%[x_ptr]),0 \n\t"
"vsteg %%v16 , 80(%[x_ptr]),1 \n\t"
"vsteg %%v17 , 88(%[x_ptr]),1 \n\t"
"vsteg %%v18 , 96(%[x_ptr]),0 \n\t"
"vsteg %%v19 , 104(%[x_ptr]),0 \n\t"
"vsteg %%v18 , 112(%[x_ptr]),1 \n\t"
"vsteg %%v19 , 120(%[x_ptr]),1 \n\t"
"vsteg %%v16 , 64(%2),0 \n\t"
"vsteg %%v17 , 72(%2),0 \n\t"
"vsteg %%v16 , 80(%2),1 \n\t"
"vsteg %%v17 , 88(%2),1 \n\t"
"vsteg %%v18 , 96(%2),0 \n\t"
"vsteg %%v19 , 104(%2),0 \n\t"
"vsteg %%v18 , 112(%2),1 \n\t"
"vsteg %%v19 , 120(%2),1 \n\t"
/* Advance x by one 128-byte pass and loop while it is below the end in r0. */
"la %2,128(%2) \n\t"
"clgrjl %2,%%r0,1b \n\t"
:
: "r"(n), "a"(alpha), "a"(x)
"la %[x_ptr],128(%[x_ptr]) \n\t"
"clgrjl %[x_ptr],%%r0,1b \n\t"
/* The "+m" operand describes the 2*n doubles at x that are read and written. */
: [mem] "+m" (*(double (*)[2*n])x) ,[x_ptr] "+&a"(x)
: [n] "r"(n), [alpha_r] "f"(da_r),[alpha_i] "f"(da_i)
/* NOTE(review): "memory" is still listed although x is covered by [mem]. */
: "cc", "memory","r0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25"
);
}
/*
 * zscal_kernel_8_zero_r: in-place scaling of complex doubles by a purely
 * imaginary scalar (0 + i*da_i).
 *
 * For every element (re, im) stored at x the result is
 *     re' = -da_i * im
 *     im' =  da_i * re
 *
 * Parameters:
 *   n    - number of complex elements. The loop body runs before the bound
 *          is tested and consumes 8 elements (128 bytes) per pass, so n must
 *          be a positive multiple of 8.
 *   da_i - imaginary part of the scalar (the real part is taken as zero).
 *   x    - interleaved (re, im) pairs, updated in place.
 *
 * Fixes relative to the previous revision:
 *   - the 4th vector (bytes 48..63) was stored to offsets 40/48, clobbering
 *     the previous element's imaginary part and leaving offset 56 unscaled;
 *     it is now stored to 48/56;
 *   - the 7th vector is computed in v30 but v27 was stored to offsets
 *     96/104; v30 is now stored;
 *   - the template no longer overwrites an input-only operand: the negated
 *     scalar is computed in C and passed as its own input;
 *   - f0/f1 are no longer named as clobbers because the template does not
 *     touch them.
 */
static void zscal_kernel_8_zero_r(BLASLONG n, FLOAT da_i, FLOAT *x) {
    /* Sign flip done in C so the asm never writes to an input operand. */
    FLOAT neg_da_i = -da_i;

    __asm__ (
        "pfd 2, 0(%[x_ptr]) \n\t"
        /* v16 = { da_i, -da_i }; v17 is a second copy of the same factor. */
        "lgdr %%r0,%[alpha] \n\t"
        "vlvgg %%v16,%%r0,0 \n\t"
        "lgdr %%r0,%[neg_alpha] \n\t"
        "vlvgg %%v16,%%r0,1 \n\t"
        "vlr %%v17 ,%%v16 \n\t"
        /* r0 = end address = x + 16*n */
        "sllg %%r0,%[n],4 \n\t"
        "agr %%r0,%[x_ptr] \n\t"
        ".align 16 \n\t"
        "1: \n\t"
        /*
         * Per element: load (re, im), multiply lane-wise by (da_i, -da_i),
         * then store lane 1 (-da_i*im) to the real slot and lane 0
         * (da_i*re) to the imaginary slot.
         */
        "vl %%v24, 0(%[x_ptr]) \n\t"
        "vfmdb %%v24,%%v24,%%v16 \n\t"
        "vsteg %%v24, 0(%[x_ptr]),1 \n\t"
        "vsteg %%v24, 8(%[x_ptr]),0 \n\t"
        "vl %%v25, 16(%[x_ptr]) \n\t"
        "vfmdb %%v25,%%v25,%%v17 \n\t"
        "vsteg %%v25, 16(%[x_ptr]),1 \n\t"
        "vsteg %%v25, 24(%[x_ptr]),0 \n\t"
        "vl %%v26, 32(%[x_ptr]) \n\t"
        "vfmdb %%v26,%%v26,%%v16 \n\t"
        "vsteg %%v26, 32(%[x_ptr]),1 \n\t"
        "vsteg %%v26, 40(%[x_ptr]),0 \n\t"
        "vl %%v27, 48(%[x_ptr]) \n\t"
        "vfmdb %%v27,%%v27,%%v17 \n\t"
        "vsteg %%v27, 48(%[x_ptr]),1 \n\t"
        "vsteg %%v27, 56(%[x_ptr]),0 \n\t"
        "vl %%v28, 64(%[x_ptr]) \n\t"
        "vfmdb %%v28,%%v28,%%v16 \n\t"
        "vsteg %%v28, 64(%[x_ptr]),1 \n\t"
        "vsteg %%v28, 72(%[x_ptr]),0 \n\t"
        "vl %%v29, 80(%[x_ptr]) \n\t"
        "vfmdb %%v29,%%v29,%%v17 \n\t"
        "vsteg %%v29, 80(%[x_ptr]),1 \n\t"
        "vsteg %%v29, 88(%[x_ptr]),0 \n\t"
        "vl %%v30, 96(%[x_ptr]) \n\t"
        "vfmdb %%v30,%%v30,%%v16 \n\t"
        "vsteg %%v30, 96(%[x_ptr]),1 \n\t"
        "vsteg %%v30, 104(%[x_ptr]),0 \n\t"
        "vl %%v31, 112(%[x_ptr]) \n\t"
        "vfmdb %%v31,%%v31,%%v17 \n\t"
        "vsteg %%v31, 112(%[x_ptr]),1 \n\t"
        "vsteg %%v31, 120(%[x_ptr]),0 \n\t"
        /* Next 128-byte pass; repeat while x_ptr is below the end address. */
        "la %[x_ptr],128(%[x_ptr]) \n\t"
        "clgrjl %[x_ptr],%%r0,1b \n\t"
        : [mem] "+m" (*(double (*)[2*n])x), [x_ptr] "+&a"(x)
        : [n] "r"(n), [alpha] "f"(da_i), [neg_alpha] "f"(neg_da_i)
        : "cc", "r0", "v16", "v17",
          "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"
    );
}
/*
 * zscal_kernel_8_zero_i: scale the doubles at x by a single real factor, in
 * place. Every 16-byte vector loaded from x is multiplied lane-wise by a
 * vector holding the factor in both lanes, so real and imaginary parts are
 * scaled alike.
 *
 * This listing carries two forms of the routine side by side:
 *   - lines using positional operands (%0, %1, %2) read the factor from the
 *     `alpha` pointer with vlrepg;
 *   - lines using named operands take da_r by value and copy it into both
 *     lanes of v18 with lgdr + vlvgp.
 * v18 is then copied to v19, v16 and v17.
 *
 * Each pass handles 128 bytes; r0 holds the end address x + 16*n and the
 * loop repeats while the x pointer is below it. The body runs once before
 * the bound is tested; the visible caller passes n & -8 and only calls when
 * that value is > 0.
 */
static void __attribute__ ((noinline)) zscal_kernel_8_zero_i(BLASLONG n, FLOAT *alpha, FLOAT *x) {
__asm__ ("pfd 2, 0(%1) \n\t"
"vlrepg %%v18,0(%2) \n\t"
static void zscal_kernel_8_zero_i(BLASLONG n, FLOAT da_r, FLOAT *x) {
__asm__ ("pfd 2, 0(%[x_ptr]) \n\t"
"lgdr %%r0,%[alpha] \n\t"
"vlvgp %%v18,%%r0,%%r0 \n\t"
"vlr %%v19,%%v18 \n\t"
"vlr %%v16,%%v18 \n\t"
"vlr %%v17,%%v18 \n\t"
"sllg %%r0,%0,4 \n\t"
"agr %%r0,%1 \n\t"
"sllg %%r0,%[n],4 \n\t"
"agr %%r0,%[x_ptr] \n\t"
".align 16 \n\t"
"1: \n\t"
/* Eight load / multiply / store triples, one per 16-byte vector. */
"vl %%v24, 0(%1) \n\t"
"vl %%v24, 0(%[x_ptr]) \n\t"
"vfmdb %%v24,%%v24,%%v18 \n\t"
"vst %%v24, 0(%1) \n\t"
"vl %%v25, 16(%1) \n\t"
"vst %%v24, 0(%[x_ptr]) \n\t"
"vl %%v25, 16(%[x_ptr]) \n\t"
"vfmdb %%v25,%%v25,%%v19 \n\t"
"vst %%v25, 16(%1) \n\t"
"vl %%v26, 32(%1) \n\t"
"vst %%v25, 16(%[x_ptr]) \n\t"
"vl %%v26, 32(%[x_ptr]) \n\t"
"vfmdb %%v26,%%v26,%%v16 \n\t"
"vst %%v26, 32(%1) \n\t"
"vl %%v27, 48(%1) \n\t"
"vst %%v26, 32(%[x_ptr]) \n\t"
"vl %%v27, 48(%[x_ptr]) \n\t"
"vfmdb %%v27,%%v27,%%v17 \n\t"
"vst %%v27, 48(%1) \n\t"
"vl %%v28, 64(%1) \n\t"
"vst %%v27, 48(%[x_ptr]) \n\t"
"vl %%v28, 64(%[x_ptr]) \n\t"
"vfmdb %%v28,%%v28,%%v18 \n\t"
"vst %%v28, 64(%1) \n\t"
"vl %%v29, 80(%1) \n\t"
"vst %%v28, 64(%[x_ptr]) \n\t"
"vl %%v29, 80(%[x_ptr]) \n\t"
"vfmdb %%v29,%%v29,%%v19 \n\t"
"vst %%v29, 80(%1) \n\t"
"vl %%v30, 96(%1) \n\t"
"vst %%v29, 80(%[x_ptr]) \n\t"
"vl %%v30, 96(%[x_ptr]) \n\t"
"vfmdb %%v30,%%v30,%%v16 \n\t"
"vst %%v30, 96(%1) \n\t"
"vl %%v31, 112(%1) \n\t"
"vst %%v30, 96(%[x_ptr]) \n\t"
"vl %%v31,112(%[x_ptr]) \n\t"
"vfmdb %%v31,%%v31,%%v17 \n\t"
"vst %%v31, 112(%1) \n\t"
"la %1,128(%1) \n\t"
"clgrjl %1,%%r0,1b \n\t"
:
:"r"(n),"a"(x) ,"a"(alpha)
:"cc", "memory","r0","v16", "v17","v18","v19","v24","v25","v26","v27","v28","v29","v30","v31"
"vst %%v31,112(%[x_ptr]) \n\t"
/* Advance x by 128 bytes and loop while it is below the end address in r0. */
"la %[x_ptr],128(%[x_ptr]) \n\t"
"clgrjl %[x_ptr],%%r0,1b \n\t"
/* The "+m" operand describes the 2*n doubles at x that are read and written. */
: [mem] "+m" (*(double (*)[2*n])x) ,[x_ptr] "+&a"(x)
: [n] "r"(n),[alpha] "f"(da_r)
: "cc", "r0","v16", "v17","v18","v19","v24","v25","v26","v27","v28","v29","v30","v31"
);
}
/*
 * zscal_kernel_8_zero: overwrite the memory at x with zeros.
 *
 * v24..v27 are cleared with vzero and then stored repeatedly; each pass
 * writes 128 bytes. r0 holds the end address x + 16*n and the loop repeats
 * while the x pointer is below it. The body runs once before the bound is
 * tested; the visible caller passes n & -8 and only calls when that value
 * is > 0.
 *
 * This listing carries two forms of the routine side by side: lines using
 * positional operands (%0, %1) and lines using the named operands %[n] and
 * %[x_ptr].
 */
static void __attribute__ ((noinline)) zscal_kernel_8_zero(BLASLONG n, FLOAT *x) {
static void zscal_kernel_8_zero(BLASLONG n, FLOAT *x) {
__asm__ ( "pfd 2, 0(%1) \n\t"
__asm__ ( "pfd 2, 0(%[x_ptr]) \n\t"
"vzero %%v24 \n\t"
"vzero %%v25 \n\t"
"vzero %%v26 \n\t"
"vzero %%v27 \n\t"
"sllg %%r0,%0,4 \n\t"
"agr %%r0,%1 \n\t"
"sllg %%r0,%[n],4 \n\t"
"agr %%r0,%[x_ptr] \n\t"
".align 16 \n\t"
"1: \n\t"
"pfd 2, 256( %1) \n\t"
"vst %%v24, 0( %1) \n\t"
"vst %%v25, 16( %1) \n\t"
"vst %%v26, 32( %1) \n\t"
"vst %%v27, 48( %1) \n\t"
"vst %%v24, 64( %1) \n\t"
"vst %%v25, 80( %1) \n\t"
"vst %%v26, 96( %1) \n\t"
"vst %%v27,112( %1) \n\t"
/* Prefetch 256 bytes ahead, then store eight zero vectors (128 bytes). */
"pfd 2, 256( %[x_ptr]) \n\t"
"vst %%v24, 0( %[x_ptr]) \n\t"
"vst %%v25, 16( %[x_ptr]) \n\t"
"vst %%v26, 32( %[x_ptr]) \n\t"
"vst %%v27, 48( %[x_ptr]) \n\t"
"vst %%v24, 64( %[x_ptr]) \n\t"
"vst %%v25, 80( %[x_ptr]) \n\t"
"vst %%v26, 96( %[x_ptr]) \n\t"
"vst %%v27,112( %[x_ptr]) \n\t"
"la %1,128(%1) \n\t"
"clgrjl %1,%%r0,1b \n\t"
:
:"r"(n),"a"(x)
:"cc" , "memory" ,"r0","v24","v25","v26","v27"
/* Advance x by 128 bytes and loop while it is below the end address in r0. */
"la %[x_ptr],128(%[x_ptr]) \n\t"
"clgrjl %[x_ptr],%%r0,1b \n\t"
/* The "+m" operand describes the 2*n doubles at x as read and written. */
: [mem] "+m" (*(double (*)[2*n])x),[x_ptr] "+&a"(x)
: [n] "r"(n)
:"cc" ,"r0","v24","v25","v26","v27"
);
}
static void zscal_kernel_inc_8(BLASLONG n, FLOAT *alpha, FLOAT *x, BLASLONG inc_x) __attribute__ ((noinline));
static void zscal_kernel_inc_8(BLASLONG n, FLOAT *alpha, FLOAT *x, BLASLONG inc_x) {
static void zscal_kernel_inc_8(BLASLONG n, FLOAT da_r,FLOAT da_i, FLOAT *x, BLASLONG inc_x) {
BLASLONG i;
BLASLONG inc_x2 = 2 * inc_x;
BLASLONG inc_x3 = inc_x2 + inc_x;
FLOAT t0, t1, t2, t3;
FLOAT da_r = alpha[0];
FLOAT da_i = alpha[1];
for (i = 0; i < n; i += 4) {
t0 = da_r * x[0] - da_i * x[1];
@ -280,7 +269,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i,
BLASLONG i = 0, j = 0;
FLOAT temp0;
FLOAT temp1;
FLOAT alpha[2];
if (inc_x != 1) {
inc_x <<= 1;
@ -373,9 +362,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i,
BLASLONG n1 = n & -8;
if (n1 > 0) {
alpha[0] = da_r;
alpha[1] = da_i;
zscal_kernel_inc_8(n1, alpha, x, inc_x);
zscal_kernel_inc_8(n1, da_r,da_i, x, inc_x);
j = n1;
i = n1 * inc_x;
}
@ -401,19 +388,17 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i,
BLASLONG n1 = n & -8;
if (n1 > 0) {
alpha[0] = da_r;
alpha[1] = da_i;
if (da_r == 0.0)
if (da_i == 0)
zscal_kernel_8_zero(n1, x);
else
zscal_kernel_8_zero_r(n1, alpha, x);
zscal_kernel_8_zero_r(n1, da_i, x);
else
if (da_i == 0)
zscal_kernel_8_zero_i(n1, alpha, x);
zscal_kernel_8_zero_i(n1, da_r, x);
else
zscal_kernel_8(n1, alpha, x);
zscal_kernel_8(n1, da_r,da_i, x);
i = n1 << 1;
j = n1;

View File

@ -29,99 +29,211 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h"
static void __attribute__ ((noinline)) zswap_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y)
#if defined(Z13_SWAP_A)
/*
 * zswap_kernel_16 (Z13_SWAP_A variant): exchange the contents of x and y.
 *
 * The count is shifted right by 4 to give the number of passes, and each
 * pass swaps 256 bytes (sixteen 16-byte vectors) at byte offset r1 from both
 * pointers; r1 starts at 0 and grows by 256 per pass. With 16 bytes per
 * complex double this is 16 elements per pass. brctg decrements the pass
 * count and loops while it is non-zero, so the body runs at least once.
 *
 * In the named-operand lines every 16-byte slot is handled as
 * "load x, load y, store x-value to y, store y-value to x".
 *
 * This listing also carries lines that use positional operands (%0, %1, %2)
 * and %r0 as the counter; they appear to be the pre-change form of the same
 * routine, which loaded all sixteen x vectors first.
 */
static void zswap_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y)
{
__asm__ volatile(
"pfd 2, 0(%1) \n\t"
"pfd 2, 0(%2) \n\t"
"srlg %%r0,%0,4 \n\t"
"pfd 1, 0(%[ptr_x]) \n\t"
"pfd 2, 0(%[ptr_y]) \n\t"
"srlg %[n_tmp],%[n_tmp],4 \n\t"
"xgr %%r1,%%r1 \n\t"
".align 16 \n\t"
"1: \n\t"
"pfd 2, 256(%%r1,%1) \n\t"
"pfd 2, 256(%%r1,%2) \n\t"
"pfd 2, 256(%%r1,%[ptr_x]) \n\t"
"pfd 2, 256(%%r1,%[ptr_y]) \n\t"
"vl %%v16, 0(%%r1,%1) \n\t"
"vl %%v17, 16(%%r1,%1) \n\t"
"vl %%v18, 32(%%r1,%1) \n\t"
"vl %%v19, 48(%%r1,%1) \n\t"
"vl %%v20, 64(%%r1,%1) \n\t"
"vl %%v21, 80(%%r1,%1) \n\t"
"vl %%v22, 96(%%r1,%1) \n\t"
"vl %%v23, 112(%%r1,%1) \n\t"
"vl %%v24, 128(%%r1,%1) \n\t"
"vl %%v25, 144(%%r1,%1) \n\t"
"vl %%v26, 160(%%r1,%1) \n\t"
"vl %%v27, 176(%%r1,%1) \n\t"
"vl %%v28, 192(%%r1,%1) \n\t"
"vl %%v29, 208(%%r1,%1) \n\t"
"vl %%v30, 224(%%r1,%1) \n\t"
"vl %%v31, 240(%%r1,%1) \n\t"
/* First 128 bytes of the pass: x values in v24..v31, y values in v16..v23. */
"vl %%v24, 0(%%r1,%[ptr_x]) \n\t"
"vl %%v16, 0(%%r1,%[ptr_y]) \n\t"
"vst %%v24, 0(%%r1,%[ptr_y]) \n\t"
"vst %%v16, 0(%%r1,%[ptr_x]) \n\t"
"vl %%v25, 16(%%r1,%[ptr_x]) \n\t"
"vl %%v17, 16(%%r1,%[ptr_y]) \n\t"
"vst %%v25, 16(%%r1,%[ptr_y]) \n\t"
"vst %%v17, 16(%%r1,%[ptr_x]) \n\t"
"vl %%v26, 32(%%r1,%[ptr_x]) \n\t"
"vl %%v18, 32(%%r1,%[ptr_y]) \n\t"
"vst %%v26, 32(%%r1,%[ptr_y]) \n\t"
"vst %%v18, 32(%%r1,%[ptr_x]) \n\t"
"vl %%v27, 48(%%r1,%[ptr_x]) \n\t"
"vl %%v19, 48(%%r1,%[ptr_y]) \n\t"
"vst %%v27, 48(%%r1,%[ptr_y]) \n\t"
"vst %%v19, 48(%%r1,%[ptr_x]) \n\t"
"vl %%v28, 64(%%r1,%[ptr_x]) \n\t"
"vl %%v20, 64(%%r1,%[ptr_y]) \n\t"
"vst %%v28, 64(%%r1,%[ptr_y]) \n\t"
"vst %%v20, 64(%%r1,%[ptr_x]) \n\t"
"vl %%v29, 80(%%r1,%[ptr_x]) \n\t"
"vl %%v21, 80(%%r1,%[ptr_y]) \n\t"
"vst %%v29, 80(%%r1,%[ptr_y]) \n\t"
"vst %%v21, 80(%%r1,%[ptr_x]) \n\t"
"vl %%v30, 96(%%r1,%[ptr_x]) \n\t"
"vl %%v22, 96(%%r1,%[ptr_y]) \n\t"
"vst %%v30, 96(%%r1,%[ptr_y]) \n\t"
"vst %%v22, 96(%%r1,%[ptr_x]) \n\t"
"vl %%v31, 112(%%r1,%[ptr_x]) \n\t"
"vl %%v23, 112(%%r1,%[ptr_y]) \n\t"
"vst %%v31, 112(%%r1,%[ptr_y]) \n\t"
"vst %%v23, 112(%%r1,%[ptr_x]) \n\t"
/* Second 128 bytes of the pass, reusing the same registers. */
"vl %%v24, 128(%%r1,%[ptr_x]) \n\t"
"vl %%v16, 128(%%r1,%[ptr_y]) \n\t"
"vst %%v24, 128(%%r1,%[ptr_y]) \n\t"
"vst %%v16, 128(%%r1,%[ptr_x]) \n\t"
"vl %%v25, 144(%%r1,%[ptr_x]) \n\t"
"vl %%v17, 144(%%r1,%[ptr_y]) \n\t"
"vst %%v25, 144(%%r1,%[ptr_y]) \n\t"
"vst %%v17, 144(%%r1,%[ptr_x]) \n\t"
"vl %%v26, 160(%%r1,%[ptr_x]) \n\t"
"vl %%v18, 160(%%r1,%[ptr_y]) \n\t"
"vst %%v26, 160(%%r1,%[ptr_y]) \n\t"
"vst %%v18, 160(%%r1,%[ptr_x]) \n\t"
"vl %%v27, 176(%%r1,%[ptr_x]) \n\t"
"vl %%v19, 176(%%r1,%[ptr_y]) \n\t"
"vst %%v27, 176(%%r1,%[ptr_y]) \n\t"
"vst %%v19, 176(%%r1,%[ptr_x]) \n\t"
"vl %%v28, 192(%%r1,%[ptr_x]) \n\t"
"vl %%v20, 192(%%r1,%[ptr_y]) \n\t"
"vst %%v28, 192(%%r1,%[ptr_y]) \n\t"
"vst %%v20, 192(%%r1,%[ptr_x]) \n\t"
"vl %%v29, 208(%%r1,%[ptr_x]) \n\t"
"vl %%v21, 208(%%r1,%[ptr_y]) \n\t"
"vst %%v29, 208(%%r1,%[ptr_y]) \n\t"
"vst %%v21, 208(%%r1,%[ptr_x]) \n\t"
"vl %%v30, 224(%%r1,%[ptr_x]) \n\t"
"vl %%v22, 224(%%r1,%[ptr_y]) \n\t"
"vst %%v30, 224(%%r1,%[ptr_y]) \n\t"
"vst %%v22, 224(%%r1,%[ptr_x]) \n\t"
"vl %%v31, 240(%%r1,%[ptr_x]) \n\t"
"vl %%v23, 240(%%r1,%[ptr_y]) \n\t"
"vst %%v31, 240(%%r1,%[ptr_y]) \n\t"
"vst %%v23, 240(%%r1,%[ptr_x]) \n\t"
/* Step the shared byte offset and count down the remaining passes. */
"la %%r1,256(%%r1) \n\t"
"brctg %[n_tmp],1b"
/* Both arrays are declared read-write through "+m" operands of 2*n doubles. */
: [mem_x] "+m" (*(double (*)[2*n])x),
[mem_y] "+m" (*(double (*)[2*n])y),
[n_tmp] "+&r"(n)
: [ptr_x] "a"(x), [ptr_y] "a"(y)
: "cc", "r1", "v16","v17","v18","v19","v20","v21","v22","v23"
,"v24","v25","v26","v27","v28","v29","v30","v31"
);
return;
}
#else
/*
 * zswap_kernel_16 - exchange the contents of x and y with z13 vector
 * loads/stores (variant that buffers a whole 256-byte chunk of x).
 *
 * n : number of elements to swap, where one element is two doubles (the
 *     memory operands below cover 2*n doubles in each array).  The loop
 *     count is n >> 4 and every iteration moves 256 bytes per array, i.e.
 *     16 such elements.  brctg decrements the counter before testing it,
 *     so n is expected to be a positive multiple of 16 -- the caller must
 *     guarantee this; no check is made here.
 * x : first array, read and overwritten with the data of y.
 * y : second array, read and overwritten with the data of x.
 *
 * Per iteration:
 *   1. load 256 bytes of x into v16..v31,
 *   2. copy 256 bytes of y to x in two 128-byte halves through v0..v7,
 *   3. store the saved x data (v16..v31) to y.
 *
 * r1 holds the running byte offset into both arrays.  The iteration
 * counter lives in the named operand n_tmp, so no fixed register has to
 * be reserved for it.
 */
static void __attribute__ ((noinline)) zswap_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y)
{
    __asm__ volatile(
        /* Prefetch the first lines of both arrays for store access. */
        "pfd 2, 0(%[ptr_x]) \n\t"
        "pfd 2, 0(%[ptr_y]) \n\t"
        /* n_tmp = n / 16 iterations; r1 = 0 is the byte offset. */
        "srlg %[n_tmp],%[n_tmp],4 \n\t"
        "xgr %%r1,%%r1 \n\t"
        ".align 16 \n\t"
        "1: \n\t"
        "pfd 2, 256(%%r1,%[ptr_x]) \n\t"
        "pfd 2, 256(%%r1,%[ptr_y]) \n\t"
        /* Step 1: save the current 256-byte chunk of x in v16..v31. */
        "vl %%v16, 0(%%r1,%[ptr_x]) \n\t"
        "vl %%v17, 16(%%r1,%[ptr_x]) \n\t"
        "vl %%v18, 32(%%r1,%[ptr_x]) \n\t"
        "vl %%v19, 48(%%r1,%[ptr_x]) \n\t"
        "vl %%v20, 64(%%r1,%[ptr_x]) \n\t"
        "vl %%v21, 80(%%r1,%[ptr_x]) \n\t"
        "vl %%v22, 96(%%r1,%[ptr_x]) \n\t"
        "vl %%v23, 112(%%r1,%[ptr_x]) \n\t"
        "vl %%v24, 128(%%r1,%[ptr_x]) \n\t"
        "vl %%v25, 144(%%r1,%[ptr_x]) \n\t"
        "vl %%v26, 160(%%r1,%[ptr_x]) \n\t"
        "vl %%v27, 176(%%r1,%[ptr_x]) \n\t"
        "vl %%v28, 192(%%r1,%[ptr_x]) \n\t"
        "vl %%v29, 208(%%r1,%[ptr_x]) \n\t"
        "vl %%v30, 224(%%r1,%[ptr_x]) \n\t"
        "vl %%v31, 240(%%r1,%[ptr_x]) \n\t"
        /* Step 2a: copy the first 128 bytes of the y chunk to x. */
        "vl %%v0, 0(%%r1,%[ptr_y]) \n\t"
        "vl %%v1, 16(%%r1,%[ptr_y]) \n\t"
        "vl %%v2, 32(%%r1,%[ptr_y]) \n\t"
        "vl %%v3, 48(%%r1,%[ptr_y]) \n\t"
        "vl %%v4, 64(%%r1,%[ptr_y]) \n\t"
        "vl %%v5, 80(%%r1,%[ptr_y]) \n\t"
        "vl %%v6, 96(%%r1,%[ptr_y]) \n\t"
        "vl %%v7, 112(%%r1,%[ptr_y]) \n\t"
        "vst %%v0, 0(%%r1,%[ptr_x]) \n\t"
        "vst %%v1, 16(%%r1,%[ptr_x]) \n\t"
        "vst %%v2, 32(%%r1,%[ptr_x]) \n\t"
        "vst %%v3, 48(%%r1,%[ptr_x]) \n\t"
        "vst %%v4, 64(%%r1,%[ptr_x]) \n\t"
        "vst %%v5, 80(%%r1,%[ptr_x]) \n\t"
        "vst %%v6, 96(%%r1,%[ptr_x]) \n\t"
        "vst %%v7, 112(%%r1,%[ptr_x]) \n\t"
        /* Step 2b: copy the second 128 bytes of the y chunk to x. */
        "vl %%v0, 128(%%r1,%[ptr_y]) \n\t"
        "vl %%v1, 144(%%r1,%[ptr_y]) \n\t"
        "vl %%v2, 160(%%r1,%[ptr_y]) \n\t"
        "vl %%v3, 176(%%r1,%[ptr_y]) \n\t"
        "vl %%v4, 192(%%r1,%[ptr_y]) \n\t"
        "vl %%v5, 208(%%r1,%[ptr_y]) \n\t"
        "vl %%v6, 224(%%r1,%[ptr_y]) \n\t"
        "vl %%v7, 240(%%r1,%[ptr_y]) \n\t"
        "vst %%v0, 128(%%r1,%[ptr_x]) \n\t"
        "vst %%v1, 144(%%r1,%[ptr_x]) \n\t"
        "vst %%v2, 160(%%r1,%[ptr_x]) \n\t"
        "vst %%v3, 176(%%r1,%[ptr_x]) \n\t"
        "vst %%v4, 192(%%r1,%[ptr_x]) \n\t"
        "vst %%v5, 208(%%r1,%[ptr_x]) \n\t"
        "vst %%v6, 224(%%r1,%[ptr_x]) \n\t"
        "vst %%v7, 240(%%r1,%[ptr_x]) \n\t"
        /* Step 3: write the saved x chunk to y. */
        "vst %%v16, 0(%%r1,%[ptr_y]) \n\t"
        "vst %%v17, 16(%%r1,%[ptr_y]) \n\t"
        "vst %%v18, 32(%%r1,%[ptr_y]) \n\t"
        "vst %%v19, 48(%%r1,%[ptr_y]) \n\t"
        "vst %%v20, 64(%%r1,%[ptr_y]) \n\t"
        "vst %%v21, 80(%%r1,%[ptr_y]) \n\t"
        "vst %%v22, 96(%%r1,%[ptr_y]) \n\t"
        "vst %%v23, 112(%%r1,%[ptr_y]) \n\t"
        "vst %%v24, 128(%%r1,%[ptr_y]) \n\t"
        "vst %%v25, 144(%%r1,%[ptr_y]) \n\t"
        "vst %%v26, 160(%%r1,%[ptr_y]) \n\t"
        "vst %%v27, 176(%%r1,%[ptr_y]) \n\t"
        "vst %%v28, 192(%%r1,%[ptr_y]) \n\t"
        "vst %%v29, 208(%%r1,%[ptr_y]) \n\t"
        "vst %%v30, 224(%%r1,%[ptr_y]) \n\t"
        "vst %%v31, 240(%%r1,%[ptr_y]) \n\t"
        /* Advance the byte offset and loop while the counter is non-zero. */
        "la %%r1,256(%%r1) \n\t"
        "brctg %[n_tmp],1b"
        /* Both arrays are read and written; n is consumed as the counter. */
        : [mem_x] "+m" (*(double (*)[2*n])x),
          [mem_y] "+m" (*(double (*)[2*n])y),
          [n_tmp] "+&r"(n)
        : [ptr_x] "a"(x), [ptr_y] "a"(y)
        : "cc", "memory", "r1", "v0","v1","v2","v3","v4","v5","v6","v7","v16",
          "v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
    );
    return;
}
#endif