Power8 inline assembly fixes

Quoting patch author amodra from #1078 (a sketch of the corrected constraint style follows below):
Lots of issues here.
- The vsx regs weren't listed as clobbered.
- Poor choice of vsx regs, which along with the lack of clobbers led to
  trashing v0..v21 and fr14..fr23.  Ideally you'd let gcc choose all
  temp vsx regs, but asms currently have a limit of 30 i/o parms.
- Other regs were clobbered unnecessarily, seemingly in an attempt to
  clobber inputs, with gcc-7 complaining about the clobber of r2.
  (Changed inputs should also be listed as outputs or as an i/o.)
- "r" constraint used instead of "b" for gprs used in insns where the
  r0 encoding means zero rather than r0.
- There were unused asm inputs too.
- All memory was clobbered rather than hooking up memory outputs with
  proper memory constraints, and that and the lack of proper memory
  input constraints meant the asms needed to be volatile and their
  containing function noinline.
- Some parameters were being passed unnecessarily via memory.
- When a copy of a …
Authored by Martin Kroeker on 2017-02-13 23:38:50 +01:00; committed by GitHub.
parent e2489c9a92
commit 9e2f316ede
38 changed files with 2923 additions and 3249 deletions
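
Every diff below moves to the same extended-asm shape the quote describes: the hard-coded VSX registers the asm touches are named in the clobber list, pointers and counters the asm changes become read-write ("+b"/"+r") operands, scratch vector registers are left for gcc to choose via "=wa" outputs printed with the %x modifier, offsets sit in "b"-constrained registers because an r0 encoding of the RA field means zero, and the memory the asm reads or writes is described with "m" operands instead of a blanket "memory" clobber. The following is only a minimal sketch of that style, not code from this commit: a hypothetical abs_sum_sketch() helper, assuming gcc targeting POWER8 with VSX and n a positive multiple of 4.

/* Hypothetical example, not part of the commit: sum of |x[i]| for n a
   positive multiple of 4, written in the constraint style described above.
   Assumes gcc targeting POWER8 with VSX enabled.                          */
#include <altivec.h>

static double abs_sum_sketch (long n, double *x)
{
  double sum;
  __vector double t0, t1, acc0, acc1;   /* gcc picks the VSX temp regs */

  __asm__
    (
      "xxlxor  %x5, %x5, %x5        \n\t"   /* acc0 = 0                    */
      "xxlxor  %x6, %x6, %x6        \n\t"   /* acc1 = 0                    */
      ".p2align 5                   \n"
      "1:                           \n\t"
      "lxvd2x  %x3, 0, %2           \n\t"   /* t0 = x[0], x[1]             */
      "lxvd2x  %x4, %8, %2          \n\t"   /* t1 = x[2], x[3]; %8 is a    */
                                            /* "b" reg, never encoded as 0 */
      "addi    %2, %2, 32           \n\t"
      "xvabsdp %x3, %x3             \n\t"
      "xvabsdp %x4, %x4             \n\t"
      "xvadddp %x5, %x5, %x3        \n\t"
      "xvadddp %x6, %x6, %x4        \n\t"
      "addic.  %1, %1, -4           \n\t"
      "bgt     1b                   \n\t"
      "xvadddp %x5, %x5, %x6        \n\t"
      "xxswapd %x6, %x5             \n\t"
      "xsadddp %x0, %x5, %x6        \n"
      :
        "=d"  (sum),    /* 0: result, in an FPR                            */
        "+r"  (n),      /* 1: decremented by the asm -> i/o, not a clobber */
        "+b"  (x),      /* 2: advanced by the asm; "b" keeps it out of r0  */
        "=wa" (t0),     /* 3: scratch VSX regs chosen by gcc, so nothing   */
        "=wa" (t1),     /* 4:   outside the operand list gets trashed      */
        "=wa" (acc0),   /* 5 */
        "=wa" (acc1)    /* 6 */
      :
        "m"   (*x),     /* 7: memory input instead of a "memory" clobber,  */
                        /*    so the asm needs no volatile and the caller  */
                        /*    needs no noinline (same idiom as the patch)  */
        "b"   (16)      /* 8: offset register, "b" because r0 means zero   */
      :
        "cr0"           /* addic./bgt use cr0; nothing else is clobbered   */
    );
  return sum;
}

The kernels in the diffs below follow the same shape with more operands, and where the asm stores results they additionally hook up "=m"/"+m" outputs on y (or x) rather than clobbering all memory.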

View File

@ -53,7 +53,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#ifndef HAVE_KERNEL_16 #ifndef HAVE_KERNEL_16
static void casum_kernel_16(BLASLONG n, FLOAT *x1, FLOAT *svec) static FLOAT casum_kernel_16(BLASLONG n, FLOAT *x1)
{ {
BLASLONG i=0; BLASLONG i=0;
@ -92,11 +92,7 @@ static void casum_kernel_16(BLASLONG n, FLOAT *x1, FLOAT *svec)
} }
svec[0] = sum0+sum1+sum2+sum3; return sum0+sum1+sum2+sum3;
svec[1] = 0.0;
svec[2] = 0.0;
svec[3] = 0.0;
} }
#endif #endif
@ -106,7 +102,6 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
BLASLONG i=0; BLASLONG i=0;
BLASLONG ip=0; BLASLONG ip=0;
FLOAT sumf = 0.0; FLOAT sumf = 0.0;
FLOAT svec[4] __attribute__ ((aligned (16)));;
BLASLONG n1; BLASLONG n1;
BLASLONG inc_x2; BLASLONG inc_x2;
@ -119,8 +114,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
if ( n1 > 0 ) if ( n1 > 0 )
{ {
casum_kernel_16(n1, x, svec); sumf = casum_kernel_16(n1, x);
sumf = svec[0] + svec[1]+svec[2]+svec[3];
i=n1; i=n1;
ip = 2 * n1; ip = 2 * n1;
} }

View File

@ -34,113 +34,101 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
**************************************************************************************/ **************************************************************************************/
#define HAVE_KERNEL_16 1 #define HAVE_KERNEL_16 1
static void casum_kernel_16( BLASLONG n, FLOAT *x, FLOAT *svec) __attribute__ ((noinline));
static void casum_kernel_16( BLASLONG n, FLOAT *x, FLOAT *svec) static float casum_kernel_16 (long n, float *x)
{ {
float sum;
__vector float t0;
__vector float t1;
__vector float t2;
__vector float t3;
__asm__
BLASLONG i = n;
BLASLONG o16 = 16;
BLASLONG o32 = 32;
BLASLONG o48 = 48;
BLASLONG o64 = 64;
BLASLONG o80 = 80;
BLASLONG o96 = 96;
BLASLONG o112 = 112;
FLOAT *x1=x;
BLASLONG pre = 384;
__asm__ __volatile__
( (
"dcbt 0, %2 \n\t"
"dcbt %2 , %4 \n\t" "xxlxor 32, 32, 32 \n\t"
"xxlxor 33, 33, 33 \n\t"
"xxlxor 32,32,32 \n\t" "xxlxor 34, 34, 34 \n\t"
"xxlxor 33,33,33 \n\t" "xxlxor 35, 35, 35 \n\t"
"xxlxor 34,34,34 \n\t" "xxlxor 36, 36, 36 \n\t"
"xxlxor 35,35,35 \n\t" "xxlxor 37, 37, 37 \n\t"
"xxlxor 36,36,36 \n\t" "xxlxor 38, 38, 38 \n\t"
"xxlxor 37,37,37 \n\t" "xxlxor 39, 39, 39 \n\t"
"xxlxor 38,38,38 \n\t"
"xxlxor 39,39,39 \n\t"
"lxvw4x 40, 0, %2 \n\t" "lxvw4x 40, 0, %2 \n\t"
"lxvw4x 41, %5, %2 \n\t" "lxvw4x 41, %8, %2 \n\t"
"lxvw4x 42, %6, %2 \n\t" "lxvw4x 42, %9, %2 \n\t"
"lxvw4x 43, %7, %2 \n\t" "lxvw4x 43, %10, %2 \n\t"
"lxvw4x 44, %8, %2 \n\t" "lxvw4x 44, %11, %2 \n\t"
"lxvw4x 45, %9, %2 \n\t" "lxvw4x 45, %12, %2 \n\t"
"lxvw4x 46, %10, %2 \n\t" "lxvw4x 46, %13, %2 \n\t"
"lxvw4x 47, %11, %2 \n\t" "lxvw4x 47, %14, %2 \n\t"
"addi %2, %2, 128 \n\t" "addi %2, %2, 128 \n\t"
"addic. %0 , %0 , -16 \n\t" "addic. %1, %1, -16 \n\t"
"ble 2f \n\t" "ble 2f \n\t"
".align 5 \n\t" ".p2align 5 \n"
"1: \n\t" "1: \n\t"
"dcbt %2 , %4 \n\t"
"xvabssp 48, 40 \n\t" "xvabssp 48, 40 \n\t"
"xvabssp 49, 41 \n\t" "xvabssp 49, 41 \n\t"
"xvabssp 50, 42 \n\t" "xvabssp 50, 42 \n\t"
"xvabssp 51, 43 \n\t" "xvabssp 51, 43 \n\t"
"lxvw4x 40, 0, %2 \n\t" "lxvw4x 40, 0, %2 \n\t"
"lxvw4x 41, %5, %2 \n\t" "lxvw4x 41, %8, %2 \n\t"
"xvabssp 52, 44 \n\t" "xvabssp %x3, 44 \n\t"
"xvabssp 53, 45 \n\t" "xvabssp %x4, 45 \n\t"
"lxvw4x 42, %6, %2 \n\t" "lxvw4x 42, %9, %2 \n\t"
"lxvw4x 43, %7, %2 \n\t" "lxvw4x 43, %10, %2 \n\t"
"xvabssp 54, 46 \n\t" "xvabssp %x5, 46 \n\t"
"xvabssp 55, 47 \n\t" "xvabssp %x6, 47 \n\t"
"lxvw4x 44, %8, %2 \n\t" "lxvw4x 44, %11, %2 \n\t"
"lxvw4x 45, %9, %2 \n\t" "lxvw4x 45, %12, %2 \n\t"
"xvaddsp 32, 32, 48 \n\t" "xvaddsp 32, 32, 48 \n\t"
"xvaddsp 33, 33, 49 \n\t" "xvaddsp 33, 33, 49 \n\t"
"lxvw4x 46, %10, %2 \n\t" "lxvw4x 46, %13, %2 \n\t"
"lxvw4x 47, %11, %2 \n\t" "lxvw4x 47, %14, %2 \n\t"
"xvaddsp 34, 34, 50 \n\t" "xvaddsp 34, 34, 50 \n\t"
"xvaddsp 35, 35, 51 \n\t" "xvaddsp 35, 35, 51 \n\t"
"addi %2, %2, 128 \n\t" "addi %2, %2, 128 \n\t"
"xvaddsp 36, 36, 52 \n\t" "xvaddsp 36, 36, %x3 \n\t"
"xvaddsp 37, 37, 53 \n\t" "xvaddsp 37, 37, %x4 \n\t"
"addic. %0 , %0 , -16 \n\t" "addic. %1, %1, -16 \n\t"
"xvaddsp 38, 38, 54 \n\t" "xvaddsp 38, 38, %x5 \n\t"
"xvaddsp 39, 39, 55 \n\t" "xvaddsp 39, 39, %x6 \n\t"
"bgt 1b \n\t" "bgt 1b \n"
"2: \n\t" "2: \n\t"
"xvabssp 48, 40 \n\t" "xvabssp 48, 40 \n\t"
"xvabssp 49, 41 \n\t" "xvabssp 49, 41 \n\t"
"xvabssp 50, 42 \n\t" "xvabssp 50, 42 \n\t"
"xvabssp 51, 43 \n\t" "xvabssp 51, 43 \n\t"
"xvabssp 52, 44 \n\t" "xvabssp %x3, 44 \n\t"
"xvabssp 53, 45 \n\t" "xvabssp %x4, 45 \n\t"
"xvabssp 54, 46 \n\t" "xvabssp %x5, 46 \n\t"
"xvabssp 55, 47 \n\t" "xvabssp %x6, 47 \n\t"
"xvaddsp 32, 32, 48 \n\t" "xvaddsp 32, 32, 48 \n\t"
"xvaddsp 33, 33, 49 \n\t" "xvaddsp 33, 33, 49 \n\t"
"xvaddsp 34, 34, 50 \n\t" "xvaddsp 34, 34, 50 \n\t"
"xvaddsp 35, 35, 51 \n\t" "xvaddsp 35, 35, 51 \n\t"
"xvaddsp 36, 36, 52 \n\t" "xvaddsp 36, 36, %x3 \n\t"
"xvaddsp 37, 37, 53 \n\t" "xvaddsp 37, 37, %x4 \n\t"
"xvaddsp 38, 38, 54 \n\t" "xvaddsp 38, 38, %x5 \n\t"
"xvaddsp 39, 39, 55 \n\t" "xvaddsp 39, 39, %x6 \n\t"
"xvaddsp 32, 32, 33 \n\t" "xvaddsp 32, 32, 33 \n\t"
"xvaddsp 34, 34, 35 \n\t" "xvaddsp 34, 34, 35 \n\t"
@ -152,26 +140,39 @@ static void casum_kernel_16( BLASLONG n, FLOAT *x, FLOAT *svec)
"xvaddsp 32, 32, 36 \n\t" "xvaddsp 32, 32, 36 \n\t"
"xxsldwi 33, 32, 32, 2 \n\t"
"xvaddsp 32, 32, 33 \n\t"
"stxvw4x 32, 0, %3 \n\t" "xxsldwi 33, 32, 32, 1 \n\t"
"xvaddsp 32, 32, 33 \n\t"
"xscvspdp %0, 32 \n"
"#n=%1 x=%3=%2 sum=%0 o16=%8 o32=%9 o48=%10 o64=%11 o80=%12 o96=%13 o112=%14\n"
"#t0=%x3 t1=%x4 t2=%x5 t3=%x6"
: :
"=f" (sum), // 0
"+r" (n), // 1
"+b" (x), // 2
"=wa" (t0), // 3
"=wa" (t1), // 4
"=wa" (t2), // 5
"=wa" (t3) // 6
: :
"r" (i), // 0 "m" (*x),
"r" (n), // 1 "b" (16), // 8
"r" (x1), // 2 "b" (32), // 9
"r" (svec), // 3 "b" (48), // 10
"r" (pre), // 4 "b" (64), // 11
"r" (o16), // 5 "b" (80), // 12
"r" (o32), // 6 "b" (96), // 13
"r" (o48), // 7 "b" (112) // 14
"r" (o64), // 8 :
"r" (o80), // 9 "cr0",
"r" (o96), // 10 "vs32","vs33","vs34","vs35","vs36","vs37","vs38","vs39",
"r" (o112) // 11 "vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47",
: "cr0", "%0", "%2", "memory" "vs48","vs49","vs50","vs51"
); );
return sum;
} }

View File

@ -35,27 +35,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define HAVE_KERNEL_32 1 #define HAVE_KERNEL_32 1
static void ccopy_kernel_32( BLASLONG n, FLOAT *x, FLOAT *y) __attribute__ ((noinline)); static void ccopy_kernel_32 (long n, float *x, float *y)
static void ccopy_kernel_32( BLASLONG n, FLOAT *x, FLOAT *y)
{ {
__asm__
BLASLONG i = n;
BLASLONG o16 = 16;
BLASLONG o32 = 32;
BLASLONG o48 = 48;
BLASLONG o64 = 64;
BLASLONG o80 = 80;
BLASLONG o96 = 96;
BLASLONG o112 = 112;
FLOAT *x1=x;
FLOAT *y1=y;
BLASLONG pre = 384;
BLASLONG alpha=0;
__asm__ __volatile__
( (
"lxvw4x 32, 0, %2 \n\t"
"lxvw4x 33, %5, %2 \n\t"
"lxvw4x 34, %6, %2 \n\t"
"lxvw4x 35, %7, %2 \n\t"
"lxvw4x 36, %8, %2 \n\t"
"lxvw4x 37, %9, %2 \n\t"
"lxvw4x 38, %10, %2 \n\t"
"lxvw4x 39, %11, %2 \n\t"
"addi %2, %2, 128 \n\t"
"lxvw4x 40, 0, %2 \n\t" "lxvw4x 40, 0, %2 \n\t"
"lxvw4x 41, %5, %2 \n\t" "lxvw4x 41, %5, %2 \n\t"
@ -68,107 +61,95 @@ static void ccopy_kernel_32( BLASLONG n, FLOAT *x, FLOAT *y)
"addi %2, %2, 128 \n\t" "addi %2, %2, 128 \n\t"
"lxvw4x 50, 0, %2 \n\t" "addic. %1, %1, -32 \n\t"
"lxvw4x 51, %5, %2 \n\t"
"lxvw4x 52, %6, %2 \n\t"
"lxvw4x 53, %7, %2 \n\t"
"lxvw4x 54, %8, %2 \n\t"
"lxvw4x 55, %9, %2 \n\t"
"lxvw4x 56, %10, %2 \n\t"
"lxvw4x 57, %11, %2 \n\t"
"addi %2, %2, 128 \n\t"
"addic. %0 , %0 , -32 \n\t"
"ble 2f \n\t" "ble 2f \n\t"
".align 5 \n\t" ".p2align 5 \n"
"1: \n\t" "1: \n\t"
"stxvw4x 40, 0, %1 \n\t" "stxvw4x 32, 0, %3 \n\t"
"stxvw4x 41, %5, %1 \n\t" "stxvw4x 33, %5, %3 \n\t"
"lxvw4x 32, 0, %2 \n\t"
"lxvw4x 33, %5, %2 \n\t"
"stxvw4x 34, %6, %3 \n\t"
"stxvw4x 35, %7, %3 \n\t"
"lxvw4x 34, %6, %2 \n\t"
"lxvw4x 35, %7, %2 \n\t"
"stxvw4x 36, %8, %3 \n\t"
"stxvw4x 37, %9, %3 \n\t"
"lxvw4x 36, %8, %2 \n\t"
"lxvw4x 37, %9, %2 \n\t"
"stxvw4x 38, %10, %3 \n\t"
"stxvw4x 39, %11, %3 \n\t"
"lxvw4x 38, %10, %2 \n\t"
"lxvw4x 39, %11, %2 \n\t"
"addi %3, %3, 128 \n\t"
"addi %2, %2, 128 \n\t"
"stxvw4x 40, 0, %3 \n\t"
"stxvw4x 41, %5, %3 \n\t"
"lxvw4x 40, 0, %2 \n\t" "lxvw4x 40, 0, %2 \n\t"
"lxvw4x 41, %5, %2 \n\t" "lxvw4x 41, %5, %2 \n\t"
"stxvw4x 42, %6, %1 \n\t" "stxvw4x 42, %6, %3 \n\t"
"stxvw4x 43, %7, %1 \n\t" "stxvw4x 43, %7, %3 \n\t"
"lxvw4x 42, %6, %2 \n\t" "lxvw4x 42, %6, %2 \n\t"
"lxvw4x 43, %7, %2 \n\t" "lxvw4x 43, %7, %2 \n\t"
"stxvw4x 44, %8, %1 \n\t" "stxvw4x 44, %8, %3 \n\t"
"stxvw4x 45, %9, %1 \n\t" "stxvw4x 45, %9, %3 \n\t"
"lxvw4x 44, %8, %2 \n\t" "lxvw4x 44, %8, %2 \n\t"
"lxvw4x 45, %9, %2 \n\t" "lxvw4x 45, %9, %2 \n\t"
"stxvw4x 46, %10, %1 \n\t" "stxvw4x 46, %10, %3 \n\t"
"stxvw4x 47, %11, %1 \n\t" "stxvw4x 47, %11, %3 \n\t"
"lxvw4x 46, %10, %2 \n\t" "lxvw4x 46, %10, %2 \n\t"
"lxvw4x 47, %11, %2 \n\t" "lxvw4x 47, %11, %2 \n\t"
"addi %3, %3, 128 \n\t"
"addi %1, %1, 128 \n\t"
"addi %2, %2, 128 \n\t" "addi %2, %2, 128 \n\t"
"stxvw4x 50, 0, %1 \n\t" "addic. %1, %1, -32 \n\t"
"stxvw4x 51, %5, %1 \n\t" "bgt 1b \n"
"lxvw4x 50, 0, %2 \n\t"
"lxvw4x 51, %5, %2 \n\t"
"stxvw4x 52, %6, %1 \n\t"
"stxvw4x 53, %7, %1 \n\t"
"lxvw4x 52, %6, %2 \n\t"
"lxvw4x 53, %7, %2 \n\t"
"stxvw4x 54, %8, %1 \n\t"
"stxvw4x 55, %9, %1 \n\t"
"lxvw4x 54, %8, %2 \n\t"
"lxvw4x 55, %9, %2 \n\t"
"stxvw4x 56, %10, %1 \n\t"
"stxvw4x 57, %11, %1 \n\t"
"lxvw4x 56, %10, %2 \n\t"
"lxvw4x 57, %11, %2 \n\t"
"addi %1, %1, 128 \n\t"
"addi %2, %2, 128 \n\t"
"addic. %0 , %0 , -32 \n\t"
"bgt 1b \n\t"
"2: \n\t" "2: \n\t"
"stxvw4x 40, 0, %1 \n\t" "stxvw4x 32, 0, %3 \n\t"
"stxvw4x 41, %5, %1 \n\t" "stxvw4x 33, %5, %3 \n\t"
"stxvw4x 42, %6, %1 \n\t" "stxvw4x 34, %6, %3 \n\t"
"stxvw4x 43, %7, %1 \n\t" "stxvw4x 35, %7, %3 \n\t"
"stxvw4x 44, %8, %1 \n\t" "stxvw4x 36, %8, %3 \n\t"
"stxvw4x 45, %9, %1 \n\t" "stxvw4x 37, %9, %3 \n\t"
"stxvw4x 46, %10, %1 \n\t" "stxvw4x 38, %10, %3 \n\t"
"stxvw4x 47, %11, %1 \n\t" "stxvw4x 39, %11, %3 \n\t"
"addi %1, %1, 128 \n\t" "addi %3, %3, 128 \n\t"
"stxvw4x 50, 0, %1 \n\t"
"stxvw4x 51, %5, %1 \n\t"
"stxvw4x 52, %6, %1 \n\t"
"stxvw4x 53, %7, %1 \n\t"
"stxvw4x 54, %8, %1 \n\t"
"stxvw4x 55, %9, %1 \n\t"
"stxvw4x 56, %10, %1 \n\t"
"stxvw4x 57, %11, %1 \n\t"
"stxvw4x 40, 0, %3 \n\t"
"stxvw4x 41, %5, %3 \n\t"
"stxvw4x 42, %6, %3 \n\t"
"stxvw4x 43, %7, %3 \n\t"
"stxvw4x 44, %8, %3 \n\t"
"stxvw4x 45, %9, %3 \n\t"
"stxvw4x 46, %10, %3 \n\t"
"stxvw4x 47, %11, %3 \n"
"#n=%1 x=%4=%2 y=%0=%3 o16=%5 o32=%6 o48=%7 o64=%8 o80=%9 o96=%10 o112=%11"
: :
"=m" (*y),
"+r" (n), // 1
"+b" (x), // 2
"+b" (y) // 3
: :
"r" (i), // 0 "m" (*x),
"r" (y1), // 1 "b" (16), // 5
"r" (x1), // 2 "b" (32), // 6
"r" (alpha), // 3 "b" (48), // 7
"r" (pre), // 4 "b" (64), // 8
"r" (o16), // 5 "b" (80), // 9
"r" (o32), // 6 "b" (96), // 10
"r" (o48), // 7 "b" (112) // 11
"r" (o64), // 8 :
"r" (o80), // 9 "cr0",
"r" (o96), // 10 "vs32","vs33","vs34","vs35","vs36","vs37","vs38","vs39",
"r" (o112) // 11 "vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47"
: "cr0", "%0", "%2" , "%1", "memory"
); );
} }

View File

@ -35,79 +35,56 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define HAVE_KERNEL_32 1 #define HAVE_KERNEL_32 1
static void cswap_kernel_32( BLASLONG n, FLOAT *x, FLOAT *y) __attribute__ ((noinline)); static void cswap_kernel_32 (long n, float *x, float *y)
static void cswap_kernel_32( BLASLONG n, FLOAT *x, FLOAT *y)
{ {
__asm__
BLASLONG i = n;
BLASLONG o16 = 16;
BLASLONG o32 = 32;
BLASLONG o48 = 48;
BLASLONG o64 = 64;
BLASLONG o80 = 80;
BLASLONG o96 = 96;
BLASLONG o112 = 112;
FLOAT *x1=x;
FLOAT *y1=y;
FLOAT *x2=x+1;
FLOAT *y2=y+1;
BLASLONG pre = 384;
BLASLONG alpha=0;
__asm__ __volatile__
( (
".p2align 5 \n"
"addi %3, %3, -4 \n\t"
"addi %4, %4, -4 \n\t"
".align 5 \n\t"
"1: \n\t" "1: \n\t"
"lxvw4x 32, 0, %2 \n\t" "lxvw4x 32, 0, %4 \n\t"
"lxvw4x 33, %5, %2 \n\t" "lxvw4x 33, %5, %4 \n\t"
"lxvw4x 34, %6, %2 \n\t" "lxvw4x 34, %6, %4 \n\t"
"lxvw4x 35, %7, %2 \n\t" "lxvw4x 35, %7, %4 \n\t"
"lxvw4x 36, %8, %2 \n\t" "lxvw4x 36, %8, %4 \n\t"
"lxvw4x 37, %9, %2 \n\t" "lxvw4x 37, %9, %4 \n\t"
"lxvw4x 38, %10, %2 \n\t" "lxvw4x 38, %10, %4 \n\t"
"lxvw4x 39, %11, %2 \n\t" "lxvw4x 39, %11, %4 \n\t"
"addi %2, %2, 128 \n\t" "addi %4, %4, 128 \n\t"
"lxvw4x 40, 0, %2 \n\t" "lxvw4x 40, 0, %4 \n\t"
"lxvw4x 41, %5, %2 \n\t" "lxvw4x 41, %5, %4 \n\t"
"lxvw4x 42, %6, %2 \n\t" "lxvw4x 42, %6, %4 \n\t"
"lxvw4x 43, %7, %2 \n\t" "lxvw4x 43, %7, %4 \n\t"
"lxvw4x 44, %8, %2 \n\t" "lxvw4x 44, %8, %4 \n\t"
"lxvw4x 45, %9, %2 \n\t" "lxvw4x 45, %9, %4 \n\t"
"lxvw4x 46, %10, %2 \n\t" "lxvw4x 46, %10, %4 \n\t"
"lxvw4x 47, %11, %2 \n\t" "lxvw4x 47, %11, %4 \n\t"
"addi %2, %2, 128 \n\t" "addi %4, %4, -128 \n\t"
"lxvw4x 48, 0, %1 \n\t" "lxvw4x 48, 0, %3 \n\t"
"lxvw4x 49, %5, %1 \n\t" "lxvw4x 49, %5, %3 \n\t"
"lxvw4x 50, %6, %1 \n\t" "lxvw4x 50, %6, %3 \n\t"
"lxvw4x 51, %7, %1 \n\t" "lxvw4x 51, %7, %3 \n\t"
"lxvw4x 52, %8, %1 \n\t" "lxvw4x 0, %8, %3 \n\t"
"lxvw4x 53, %9, %1 \n\t" "lxvw4x 1, %9, %3 \n\t"
"lxvw4x 54, %10, %1 \n\t" "lxvw4x 2, %10, %3 \n\t"
"lxvw4x 55, %11, %1 \n\t" "lxvw4x 3, %11, %3 \n\t"
"addi %1, %1, 128 \n\t" "addi %3, %3, 128 \n\t"
"lxvw4x 56, 0, %1 \n\t" "lxvw4x 4, 0, %3 \n\t"
"lxvw4x 57, %5, %1 \n\t" "lxvw4x 5, %5, %3 \n\t"
"lxvw4x 58, %6, %1 \n\t" "lxvw4x 6, %6, %3 \n\t"
"lxvw4x 59, %7, %1 \n\t" "lxvw4x 7, %7, %3 \n\t"
"lxvw4x 60, %8, %1 \n\t" "lxvw4x 8, %8, %3 \n\t"
"lxvw4x 61, %9, %1 \n\t" "lxvw4x 9, %9, %3 \n\t"
"lxvw4x 62, %10, %1 \n\t" "lxvw4x 10, %10, %3 \n\t"
"lxvw4x 63, %11, %1 \n\t" "lxvw4x 11, %11, %3 \n\t"
"addi %1, %1, 128 \n\t" "addi %3, %3, -128 \n\t"
"stxvw4x 32, 0, %3 \n\t" "stxvw4x 32, 0, %3 \n\t"
"stxvw4x 33, %5, %3 \n\t" "stxvw4x 33, %5, %3 \n\t"
@ -135,46 +112,47 @@ static void cswap_kernel_32( BLASLONG n, FLOAT *x, FLOAT *y)
"stxvw4x 49, %5, %4 \n\t" "stxvw4x 49, %5, %4 \n\t"
"stxvw4x 50, %6, %4 \n\t" "stxvw4x 50, %6, %4 \n\t"
"stxvw4x 51, %7, %4 \n\t" "stxvw4x 51, %7, %4 \n\t"
"stxvw4x 52, %8, %4 \n\t" "stxvw4x 0, %8, %4 \n\t"
"stxvw4x 53, %9, %4 \n\t" "stxvw4x 1, %9, %4 \n\t"
"stxvw4x 54, %10, %4 \n\t" "stxvw4x 2, %10, %4 \n\t"
"stxvw4x 55, %11, %4 \n\t" "stxvw4x 3, %11, %4 \n\t"
"addi %4, %4, 128 \n\t" "addi %4, %4, 128 \n\t"
"stxvw4x 56, 0, %4 \n\t" "stxvw4x 4, 0, %4 \n\t"
"stxvw4x 57, %5, %4 \n\t" "stxvw4x 5, %5, %4 \n\t"
"stxvw4x 58, %6, %4 \n\t" "stxvw4x 6, %6, %4 \n\t"
"stxvw4x 59, %7, %4 \n\t" "stxvw4x 7, %7, %4 \n\t"
"stxvw4x 60, %8, %4 \n\t" "stxvw4x 8, %8, %4 \n\t"
"stxvw4x 61, %9, %4 \n\t" "stxvw4x 9, %9, %4 \n\t"
"stxvw4x 62, %10, %4 \n\t" "stxvw4x 10, %10, %4 \n\t"
"stxvw4x 63, %11, %4 \n\t" "stxvw4x 11, %11, %4 \n\t"
"addi %4, %4, 128 \n\t" "addi %4, %4, 128 \n\t"
"addic. %0 , %0 , -32 \n\t" "addic. %2, %2, -32 \n\t"
"bgt 1b \n\t" "bgt 1b \n"
"2: \n\t"
"#n=%2 x=%0=%3 y=%1=%4 o16=%5 o32=%6 o48=%7 o64=%8 o80=%9 o96=%10 o112=%11"
: :
"+m" (*x),
"+m" (*y),
"+r" (n), // 2
"+b" (x), // 3
"+b" (y) // 4
: :
"r" (i), // 0 "b" (16), // 5
"r" (y1), // 1 "b" (32), // 6
"r" (x1), // 2 "b" (48), // 7
"r" (y2), // 3 "b" (64), // 8
"r" (x2), // 4 "b" (80), // 9
"r" (o16), // 5 "b" (96), // 10
"r" (o32), // 6 "b" (112) // 11
"r" (o48), // 7 :
"r" (o64), // 8 "cr0",
"r" (o80), // 9 "vs32","vs33","vs34","vs35","vs36","vs37","vs38","vs39",
"r" (o96), // 10 "vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47",
"r" (o112) // 11 "vs48","vs49","vs50","vs51","vs0","vs1","vs2","vs3",
: "cr0", "%0", "%2" , "%1", "%3", "%4", "memory" "vs4","vs5","vs6","vs7","vs8","vs9","vs10","vs11"
); );
} }

View File

@ -42,7 +42,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#else #else
#define ABS fabsf #error supports double only
#endif #endif
@ -53,7 +53,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#ifndef HAVE_KERNEL_16 #ifndef HAVE_KERNEL_16
static void dasum_kernel_16(BLASLONG n, FLOAT *x1, FLOAT *svec) static FLOAT dasum_kernel_16(BLASLONG n, FLOAT *x1)
{ {
BLASLONG i=0; BLASLONG i=0;
@ -92,9 +92,7 @@ static void dasum_kernel_16(BLASLONG n, FLOAT *x1, FLOAT *svec)
} }
svec[0] = sum0+sum1+sum2+sum3; return sum0+sum1+sum2+sum3;
svec[1] = 0.0;
} }
#endif #endif
@ -103,7 +101,6 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
{ {
BLASLONG i=0; BLASLONG i=0;
FLOAT sumf = 0.0; FLOAT sumf = 0.0;
FLOAT svec[2] __attribute__ ((aligned (16)));;
BLASLONG n1; BLASLONG n1;
if (n <= 0 || inc_x <= 0) return(sumf); if (n <= 0 || inc_x <= 0) return(sumf);
@ -115,8 +112,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
if ( n1 > 0 ) if ( n1 > 0 )
{ {
dasum_kernel_16(n1, x, svec); sumf = dasum_kernel_16(n1, x);
sumf = svec[0] + svec[1];
i=n1; i=n1;
} }

View File

@ -34,113 +34,101 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
**************************************************************************************/ **************************************************************************************/
#define HAVE_KERNEL_16 1 #define HAVE_KERNEL_16 1
static void dasum_kernel_16( BLASLONG n, FLOAT *x, FLOAT *svec) __attribute__ ((noinline));
static void dasum_kernel_16( BLASLONG n, FLOAT *x, FLOAT *svec) static double dasum_kernel_16 (long n, double *x)
{ {
double sum;
__vector double t0;
__vector double t1;
__vector double t2;
__vector double t3;
__asm__
BLASLONG i = n;
BLASLONG o16 = 16;
BLASLONG o32 = 32;
BLASLONG o48 = 48;
BLASLONG o64 = 64;
BLASLONG o80 = 80;
BLASLONG o96 = 96;
BLASLONG o112 = 112;
FLOAT *x1=x;
BLASLONG pre = 384;
__asm__ __volatile__
( (
"dcbt 0, %2 \n\t"
"dcbt %2 , %4 \n\t" "xxlxor 32, 32, 32 \n\t"
"xxlxor 33, 33, 33 \n\t"
"xxlxor 32,32,32 \n\t" "xxlxor 34, 34, 34 \n\t"
"xxlxor 33,33,33 \n\t" "xxlxor 35, 35, 35 \n\t"
"xxlxor 34,34,34 \n\t" "xxlxor 36, 36, 36 \n\t"
"xxlxor 35,35,35 \n\t" "xxlxor 37, 37, 37 \n\t"
"xxlxor 36,36,36 \n\t" "xxlxor 38, 38, 38 \n\t"
"xxlxor 37,37,37 \n\t" "xxlxor 39, 39, 39 \n\t"
"xxlxor 38,38,38 \n\t"
"xxlxor 39,39,39 \n\t"
"lxvd2x 40, 0, %2 \n\t" "lxvd2x 40, 0, %2 \n\t"
"lxvd2x 41, %5, %2 \n\t" "lxvd2x 41, %8, %2 \n\t"
"lxvd2x 42, %6, %2 \n\t" "lxvd2x 42, %9, %2 \n\t"
"lxvd2x 43, %7, %2 \n\t" "lxvd2x 43, %10, %2 \n\t"
"lxvd2x 44, %8, %2 \n\t" "lxvd2x 44, %11, %2 \n\t"
"lxvd2x 45, %9, %2 \n\t" "lxvd2x 45, %12, %2 \n\t"
"lxvd2x 46, %10, %2 \n\t" "lxvd2x 46, %13, %2 \n\t"
"lxvd2x 47, %11, %2 \n\t" "lxvd2x 47, %14, %2 \n\t"
"addi %2, %2, 128 \n\t" "addi %2, %2, 128 \n\t"
"addic. %0 , %0 , -16 \n\t" "addic. %1, %1, -16 \n\t"
"ble 2f \n\t" "ble 2f \n\t"
".align 5 \n\t" ".p2align 5 \n"
"1: \n\t" "1: \n\t"
"dcbt %2 , %4 \n\t"
"xvabsdp 48, 40 \n\t" "xvabsdp 48, 40 \n\t"
"xvabsdp 49, 41 \n\t" "xvabsdp 49, 41 \n\t"
"xvabsdp 50, 42 \n\t" "xvabsdp 50, 42 \n\t"
"xvabsdp 51, 43 \n\t" "xvabsdp 51, 43 \n\t"
"lxvd2x 40, 0, %2 \n\t" "lxvd2x 40, 0, %2 \n\t"
"lxvd2x 41, %5, %2 \n\t" "lxvd2x 41, %8, %2 \n\t"
"xvabsdp 52, 44 \n\t" "xvabsdp %x3, 44 \n\t"
"xvabsdp 53, 45 \n\t" "xvabsdp %x4, 45 \n\t"
"lxvd2x 42, %6, %2 \n\t" "lxvd2x 42, %9, %2 \n\t"
"lxvd2x 43, %7, %2 \n\t" "lxvd2x 43, %10, %2 \n\t"
"xvabsdp 54, 46 \n\t" "xvabsdp %x5, 46 \n\t"
"xvabsdp 55, 47 \n\t" "xvabsdp %x6, 47 \n\t"
"lxvd2x 44, %8, %2 \n\t" "lxvd2x 44, %11, %2 \n\t"
"lxvd2x 45, %9, %2 \n\t" "lxvd2x 45, %12, %2 \n\t"
"xvadddp 32, 32, 48 \n\t" "xvadddp 32, 32, 48 \n\t"
"xvadddp 33, 33, 49 \n\t" "xvadddp 33, 33, 49 \n\t"
"lxvd2x 46, %10, %2 \n\t" "lxvd2x 46, %13, %2 \n\t"
"lxvd2x 47, %11, %2 \n\t" "lxvd2x 47, %14, %2 \n\t"
"xvadddp 34, 34, 50 \n\t" "xvadddp 34, 34, 50 \n\t"
"xvadddp 35, 35, 51 \n\t" "xvadddp 35, 35, 51 \n\t"
"addi %2, %2, 128 \n\t" "addi %2, %2, 128 \n\t"
"xvadddp 36, 36, 52 \n\t" "xvadddp 36, 36, %x3 \n\t"
"xvadddp 37, 37, 53 \n\t" "xvadddp 37, 37, %x4 \n\t"
"addic. %0 , %0 , -16 \n\t" "addic. %1, %1, -16 \n\t"
"xvadddp 38, 38, 54 \n\t" "xvadddp 38, 38, %x5 \n\t"
"xvadddp 39, 39, 55 \n\t" "xvadddp 39, 39, %x6 \n\t"
"bgt 1b \n\t" "bgt 1b \n"
"2: \n\t" "2: \n\t"
"xvabsdp 48, 40 \n\t" "xvabsdp 48, 40 \n\t"
"xvabsdp 49, 41 \n\t" "xvabsdp 49, 41 \n\t"
"xvabsdp 50, 42 \n\t" "xvabsdp 50, 42 \n\t"
"xvabsdp 51, 43 \n\t" "xvabsdp 51, 43 \n\t"
"xvabsdp 52, 44 \n\t" "xvabsdp %x3, 44 \n\t"
"xvabsdp 53, 45 \n\t" "xvabsdp %x4, 45 \n\t"
"xvabsdp 54, 46 \n\t" "xvabsdp %x5, 46 \n\t"
"xvabsdp 55, 47 \n\t" "xvabsdp %x6, 47 \n\t"
"xvadddp 32, 32, 48 \n\t" "xvadddp 32, 32, 48 \n\t"
"xvadddp 33, 33, 49 \n\t" "xvadddp 33, 33, 49 \n\t"
"xvadddp 34, 34, 50 \n\t" "xvadddp 34, 34, 50 \n\t"
"xvadddp 35, 35, 51 \n\t" "xvadddp 35, 35, 51 \n\t"
"xvadddp 36, 36, 52 \n\t" "xvadddp 36, 36, %x3 \n\t"
"xvadddp 37, 37, 53 \n\t" "xvadddp 37, 37, %x4 \n\t"
"xvadddp 38, 38, 54 \n\t" "xvadddp 38, 38, %x5 \n\t"
"xvadddp 39, 39, 55 \n\t" "xvadddp 39, 39, %x6 \n\t"
"xvadddp 32, 32, 33 \n\t" "xvadddp 32, 32, 33 \n\t"
"xvadddp 34, 34, 35 \n\t" "xvadddp 34, 34, 35 \n\t"
@ -152,26 +140,36 @@ static void dasum_kernel_16( BLASLONG n, FLOAT *x, FLOAT *svec)
"xvadddp 32, 32, 36 \n\t" "xvadddp 32, 32, 36 \n\t"
"xxswapd 33, 32 \n\t"
"xsadddp %x0, 32, 33 \n"
"stxvd2x 32, 0, %3 \n\t" "#n=%1 x=%3=%2 sum=%0 o16=%8 o32=%9 o48=%10 o64=%11 o80=%12 o96=%13 o112=%14\n"
"#t0=%x3 t1=%x4 t2=%x5 t3=%x6"
: :
"=d" (sum), // 0
"+r" (n), // 1
"+b" (x), // 2
"=wa" (t0), // 3
"=wa" (t1), // 4
"=wa" (t2), // 5
"=wa" (t3) // 6
: :
"r" (i), // 0 "m" (*x),
"r" (n), // 1 "b" (16), // 8
"r" (x1), // 2 "b" (32), // 9
"r" (svec), // 3 "b" (48), // 10
"r" (pre), // 4 "b" (64), // 11
"r" (o16), // 5 "b" (80), // 12
"r" (o32), // 6 "b" (96), // 13
"r" (o48), // 7 "b" (112) // 14
"r" (o64), // 8 :
"r" (o80), // 9 "cr0",
"r" (o96), // 10 "vs32","vs33","vs34","vs35","vs36","vs37","vs38","vs39",
"r" (o112) // 11 "vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47",
: "cr0", "%0", "%2", "memory" "vs48","vs49","vs50","vs51"
); );
return sum;
} }

View File

@ -43,21 +43,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#ifndef HAVE_KERNEL_8 #ifndef HAVE_KERNEL_8
static void daxpy_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) static void daxpy_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT alpha)
{ {
BLASLONG register i = 0; BLASLONG register i = 0;
FLOAT a = *alpha;
while(i < n) while(i < n)
{ {
y[i] += a * x[i]; y[i] += alpha * x[i];
y[i+1] += a * x[i+1]; y[i+1] += alpha * x[i+1];
y[i+2] += a * x[i+2]; y[i+2] += alpha * x[i+2];
y[i+3] += a * x[i+3]; y[i+3] += alpha * x[i+3];
y[i+4] += a * x[i+4]; y[i+4] += alpha * x[i+4];
y[i+5] += a * x[i+5]; y[i+5] += alpha * x[i+5];
y[i+6] += a * x[i+6]; y[i+6] += alpha * x[i+6];
y[i+7] += a * x[i+7]; y[i+7] += alpha * x[i+7];
i+=8 ; i+=8 ;
} }
@ -70,11 +69,6 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS
{ {
BLASLONG i=0; BLASLONG i=0;
BLASLONG ix=0,iy=0; BLASLONG ix=0,iy=0;
FLOAT a2[4];
a2[0]=da;
a2[1]=da;
a2[2]=da;
a2[3]=da;
if ( n <= 0 ) return(0); if ( n <= 0 ) return(0);
@ -84,7 +78,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS
BLASLONG n1 = n & -16; BLASLONG n1 = n & -16;
if ( n1 ) if ( n1 )
daxpy_kernel_8(n1, x, y , a2 ); daxpy_kernel_8(n1, x, y, da);
i = n1; i = n1;
while(i < n) while(i < n)

View File

@ -35,165 +35,181 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define HAVE_KERNEL_8 1 #define HAVE_KERNEL_8 1
static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y , FLOAT *alpha) __attribute__ ((noinline));
static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) static void daxpy_kernel_8 (long n, double *x, double *y, double alpha)
{ {
__vector double t0;
__vector double t1;
__vector double t2;
__vector double t3;
__vector double t4;
__vector double t5;
__vector double t6;
__vector double t7;
__vector double t8;
__vector double t9;
__vector double t10;
__vector double t11;
__vector double t12;
__vector double t13;
__vector double t14;
__vector double t15;
__vector double t16;
__asm__
BLASLONG i = n;
BLASLONG o16 = 16;
BLASLONG o32 = 32;
BLASLONG o48 = 48;
FLOAT *x1=x;
FLOAT *y1=y;
FLOAT *y2=y+1;
BLASLONG pre = 384;
__asm__ __volatile__
( (
"xxspltd %x4, %x22, 0 \n\t"
"lxsdx 33, %5, %4 \n\t" "dcbt 0, %2 \n\t"
"xxspltd 32, 33, 0 \n\t" "dcbt 0, %3 \n\t"
"addi %8, %8, -8 \n\t"
"dcbt %2, %9 \n\t" "lxvd2x %x5, 0, %2 \n\t"
"dcbt %3, %9 \n\t" "lxvd2x %x6, %23, %2 \n\t"
"lxvd2x %x7, %24, %2 \n\t"
"lxvd2x %x8, %25, %2 \n\t"
"lxvd2x 40, 0, %2 \n\t" "lxvd2x %x13, 0, %3 \n\t"
"lxvd2x 41, %5, %2 \n\t" "lxvd2x %x14, %23, %3 \n\t"
"lxvd2x 42, %6, %2 \n\t" "lxvd2x %x15, %24, %3 \n\t"
"lxvd2x 43, %7, %2 \n\t" "lxvd2x %x16, %25, %3 \n\t"
"lxvd2x 48, 0, %3 \n\t"
"lxvd2x 49, %5, %3 \n\t"
"lxvd2x 50, %6, %3 \n\t"
"lxvd2x 51, %7, %3 \n\t"
"addi %2, %2, 64 \n\t" "addi %2, %2, 64 \n\t"
"addi %3, %3, 64 \n\t" "addi %3, %3, 64 \n\t"
"lxvd2x 44, 0, %2 \n\t" "lxvd2x %x9, 0, %2 \n\t"
"lxvd2x 45, %5, %2 \n\t" "lxvd2x %x10, %23, %2 \n\t"
"lxvd2x 46, %6, %2 \n\t" "lxvd2x %x11, %24, %2 \n\t"
"lxvd2x 47, %7, %2 \n\t" "lxvd2x %x12, %25, %2 \n\t"
"lxvd2x 52, 0, %3 \n\t" "lxvd2x %x17, 0, %3 \n\t"
"lxvd2x 53, %5, %3 \n\t" "lxvd2x %x18, %23, %3 \n\t"
"lxvd2x 54, %6, %3 \n\t" "lxvd2x %x19, %24, %3 \n\t"
"lxvd2x 55, %7, %3 \n\t" "lxvd2x %x20, %25, %3 \n\t"
"addi %2, %2, 64 \n\t" "addi %2, %2, 64 \n\t"
"addi %3, %3, 64 \n\t" "addi %3, %3, -64 \n\t"
"addic. %0 , %0 , -16 \n\t" "addic. %1, %1, -16 \n\t"
"ble 2f \n\t" "ble 2f \n\t"
".align 5 \n\t" ".align 5 \n"
"1: \n\t" "1: \n\t"
"dcbt %2, %9 \n\t" "xvmaddadp %x13, %x5, %x4 \n\t"
"dcbt %3, %9 \n\t" "xvmaddadp %x14, %x6, %x4 \n\t"
"xvmaddadp 48, 40, 32 \n\t" "lxvd2x %x5, 0, %2 \n\t"
"xvmaddadp 49, 41, 32 \n\t" "lxvd2x %x6, %23, %2 \n\t"
"lxvd2x 40, 0, %2 \n\t" "stxvd2x %x13, 0, %3 \n\t"
"lxvd2x 41, %5, %2 \n\t" "stxvd2x %x14, %23, %3 \n\t"
"stxvd2x 48, 0, %8 \n\t" "xvmaddadp %x15, %x7, %x4 \n\t"
"stxvd2x 49, %5, %8 \n\t" "xvmaddadp %x16, %x8, %x4 \n\t"
"xvmaddadp 50, 42, 32 \n\t" "lxvd2x %x7, %24, %2 \n\t"
"xvmaddadp 51, 43, 32 \n\t" "lxvd2x %x8, %25, %2 \n\t"
"lxvd2x 42, %6, %2 \n\t" "stxvd2x %x15, %24, %3 \n\t"
"lxvd2x 43, %7, %2 \n\t" "stxvd2x %x16, %25, %3 \n\t"
"stxvd2x 50, %6, %8 \n\t"
"stxvd2x 51, %7, %8 \n\t"
"lxvd2x 48, 0, %3 \n\t"
"lxvd2x 49, %5, %3 \n\t"
"lxvd2x 50, %6, %3 \n\t"
"lxvd2x 51, %7, %3 \n\t"
"addi %2, %2, 64 \n\t" "addi %2, %2, 64 \n\t"
"addi %8, %8, 64 \n\t" "addi %3, %3, 128 \n\t"
"xvmaddadp 52, 44, 32 \n\t" "lxvd2x %x13, 0, %3 \n\t"
"addi %3, %3, 64 \n\t" "lxvd2x %x14, %23, %3 \n\t"
"xvmaddadp 53, 45, 32 \n\t" "lxvd2x %x15, %24, %3 \n\t"
"lxvd2x %x16, %25, %3 \n\t"
"lxvd2x 44, 0, %2 \n\t" "addi %3, %3, -64 \n\t"
"lxvd2x 45, %5, %2 \n\t"
"stxvd2x 52, 0, %8 \n\t" "xvmaddadp %x17, %x9, %x4 \n\t"
"stxvd2x 53, %5, %8 \n\t" "xvmaddadp %x18, %x10, %x4 \n\t"
"xvmaddadp 54, 46, 32 \n\t" "lxvd2x %x9, 0, %2 \n\t"
"xvmaddadp 55, 47, 32 \n\t" "lxvd2x %x10, %23, %2 \n\t"
"lxvd2x 46, %6, %2 \n\t" "stxvd2x %x17, 0, %3 \n\t"
"lxvd2x 47, %7, %2 \n\t" "stxvd2x %x18, %23, %3 \n\t"
"stxvd2x 54, %6, %8 \n\t" "xvmaddadp %x19, %x11, %x4 \n\t"
"stxvd2x 55, %7, %8 \n\t" "xvmaddadp %x20, %x12, %x4 \n\t"
"lxvd2x %x11, %24, %2 \n\t"
"lxvd2x %x12, %25, %2 \n\t"
"stxvd2x %x19, %24, %3 \n\t"
"stxvd2x %x20, %25, %3 \n\t"
"addi %2, %2, 64 \n\t" "addi %2, %2, 64 \n\t"
"addi %8, %8, 64 \n\t" "addi %3, %3, 128 \n\t"
"lxvd2x 52, 0, %3 \n\t" "lxvd2x %x17, 0, %3 \n\t"
"lxvd2x 53, %5, %3 \n\t" "lxvd2x %x18, %23, %3 \n\t"
"lxvd2x 54, %6, %3 \n\t" "lxvd2x %x19, %24, %3 \n\t"
"lxvd2x 55, %7, %3 \n\t" "lxvd2x %x20, %25, %3 \n\t"
"addi %3, %3, 64 \n\t" "addi %3, %3, -64 \n\t"
"addic. %1, %1, -16 \n\t"
"addic. %0 , %0 , -16 \n\t" "bgt 1b \n"
"bgt 1b \n\t"
"2: \n\t" "2: \n\t"
"xvmaddadp %x13, %x5, %x4 \n\t"
"xvmaddadp %x14, %x6, %x4 \n\t"
"xvmaddadp %x15, %x7, %x4 \n\t"
"xvmaddadp %x16, %x8, %x4 \n\t"
"xvmaddadp 48, 40, 32 \n\t" "xvmaddadp %x17, %x9, %x4 \n\t"
"xvmaddadp 49, 41, 32 \n\t" "xvmaddadp %x18, %x10, %x4 \n\t"
"xvmaddadp 50, 42, 32 \n\t" "xvmaddadp %x19, %x11, %x4 \n\t"
"xvmaddadp 51, 43, 32 \n\t" "xvmaddadp %x20, %x12, %x4 \n\t"
"xvmaddadp 52, 44, 32 \n\t" "stxvd2x %x13, 0, %3 \n\t"
"xvmaddadp 53, 45, 32 \n\t" "stxvd2x %x14, %23, %3 \n\t"
"xvmaddadp 54, 46, 32 \n\t" "stxvd2x %x15, %24, %3 \n\t"
"xvmaddadp 55, 47, 32 \n\t" "stxvd2x %x16, %25, %3 \n\t"
"stxvd2x 48, 0, %8 \n\t" "addi %3, %3, 64 \n\t"
"stxvd2x 49, %5, %8 \n\t"
"stxvd2x 50, %6, %8 \n\t"
"stxvd2x 51, %7, %8 \n\t"
"addi %8, %8, 64 \n\t" "stxvd2x %x17, 0, %3 \n\t"
"stxvd2x %x18, %23, %3 \n\t"
"stxvd2x 52, 0, %8 \n\t" "stxvd2x %x19, %24, %3 \n\t"
"stxvd2x 53, %5, %8 \n\t" "stxvd2x %x20, %25, %3 \n"
"stxvd2x 54, %6, %8 \n\t"
"stxvd2x 55, %7, %8 \n\t"
"addi %8, %8, 64 \n\t"
"#n=%1 x=%21=%2 y=%0=%3 alpha=%22 o16=%23 o32=%24 o48=%25\n"
"#t0=%x4 t1=%x5 t2=%x6 t3=%x7 t4=%x8 t5=%x9 t6=%x10 t7=%x11 t8=%x12 t9=%x13 t10=%x14 t11=%x15 t12=%x16 t13=%x17 t14=%x18 t15=%x19 t16=%x20"
: :
"+m" (*y),
"+r" (n), // 1
"+b" (x), // 2
"+b" (y), // 3
"=wa" (t0), // 4
"=wa" (t1), // 5
"=wa" (t2), // 6
"=wa" (t3), // 7
"=wa" (t4), // 8
"=wa" (t5), // 9
"=wa" (t6), // 10
"=wa" (t7), // 11
"=wa" (t8), // 12
"=wa" (t9), // 13
"=wa" (t10), // 14
"=wa" (t11), // 15
"=wa" (t12), // 16
"=wa" (t13), // 17
"=wa" (t14), // 18
"=wa" (t15), // 19
"=wa" (t16) // 20
: :
"r" (i), // 0 "m" (*x),
"r" (n), // 1 "d" (alpha), // 22
"r" (x1), // 2 "b" (16), // 23
"r" (y1), // 3 "b" (32), // 24
"r" (alpha), // 4 "b" (48) // 25
"r" (o16), // 5 :
"r" (o32), // 6 "cr0"
"r" (o48), // 7
"r" (y2), // 8
"r" (pre) // 9
: "cr0", "%0", "%2" , "%3", "%8", "memory"
); );
} }

View File

@ -35,27 +35,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define HAVE_KERNEL_32 1 #define HAVE_KERNEL_32 1
static void dcopy_kernel_32( BLASLONG n, FLOAT *x, FLOAT *y) __attribute__ ((noinline)); static void dcopy_kernel_32 (long n, double *x, double *y)
static void dcopy_kernel_32( BLASLONG n, FLOAT *x, FLOAT *y)
{ {
__asm__
BLASLONG i = n;
BLASLONG o16 = 16;
BLASLONG o32 = 32;
BLASLONG o48 = 48;
BLASLONG o64 = 64;
BLASLONG o80 = 80;
BLASLONG o96 = 96;
BLASLONG o112 = 112;
FLOAT *x1=x;
FLOAT *y1=y;
BLASLONG pre = 384;
BLASLONG alpha=0;
__asm__ __volatile__
( (
"lxvd2x 32, 0, %2 \n\t"
"lxvd2x 33, %5, %2 \n\t"
"lxvd2x 34, %6, %2 \n\t"
"lxvd2x 35, %7, %2 \n\t"
"lxvd2x 36, %8, %2 \n\t"
"lxvd2x 37, %9, %2 \n\t"
"lxvd2x 38, %10, %2 \n\t"
"lxvd2x 39, %11, %2 \n\t"
"addi %2, %2, 128 \n\t"
"lxvd2x 40, 0, %2 \n\t" "lxvd2x 40, 0, %2 \n\t"
"lxvd2x 41, %5, %2 \n\t" "lxvd2x 41, %5, %2 \n\t"
@ -68,107 +61,95 @@ static void dcopy_kernel_32( BLASLONG n, FLOAT *x, FLOAT *y)
"addi %2, %2, 128 \n\t" "addi %2, %2, 128 \n\t"
"lxvd2x 50, 0, %2 \n\t" "addic. %1, %1, -32 \n\t"
"lxvd2x 51, %5, %2 \n\t"
"lxvd2x 52, %6, %2 \n\t"
"lxvd2x 53, %7, %2 \n\t"
"lxvd2x 54, %8, %2 \n\t"
"lxvd2x 55, %9, %2 \n\t"
"lxvd2x 56, %10, %2 \n\t"
"lxvd2x 57, %11, %2 \n\t"
"addi %2, %2, 128 \n\t"
"addic. %0 , %0 , -32 \n\t"
"ble 2f \n\t" "ble 2f \n\t"
".align 5 \n\t" ".p2align 5 \n"
"1: \n\t" "1: \n\t"
"stxvd2x 40, 0, %1 \n\t" "stxvd2x 32, 0, %3 \n\t"
"stxvd2x 41, %5, %1 \n\t" "stxvd2x 33, %5, %3 \n\t"
"lxvd2x 32, 0, %2 \n\t"
"lxvd2x 33, %5, %2 \n\t"
"stxvd2x 34, %6, %3 \n\t"
"stxvd2x 35, %7, %3 \n\t"
"lxvd2x 34, %6, %2 \n\t"
"lxvd2x 35, %7, %2 \n\t"
"stxvd2x 36, %8, %3 \n\t"
"stxvd2x 37, %9, %3 \n\t"
"lxvd2x 36, %8, %2 \n\t"
"lxvd2x 37, %9, %2 \n\t"
"stxvd2x 38, %10, %3 \n\t"
"stxvd2x 39, %11, %3 \n\t"
"lxvd2x 38, %10, %2 \n\t"
"lxvd2x 39, %11, %2 \n\t"
"addi %3, %3, 128 \n\t"
"addi %2, %2, 128 \n\t"
"stxvd2x 40, 0, %3 \n\t"
"stxvd2x 41, %5, %3 \n\t"
"lxvd2x 40, 0, %2 \n\t" "lxvd2x 40, 0, %2 \n\t"
"lxvd2x 41, %5, %2 \n\t" "lxvd2x 41, %5, %2 \n\t"
"stxvd2x 42, %6, %1 \n\t" "stxvd2x 42, %6, %3 \n\t"
"stxvd2x 43, %7, %1 \n\t" "stxvd2x 43, %7, %3 \n\t"
"lxvd2x 42, %6, %2 \n\t" "lxvd2x 42, %6, %2 \n\t"
"lxvd2x 43, %7, %2 \n\t" "lxvd2x 43, %7, %2 \n\t"
"stxvd2x 44, %8, %1 \n\t" "stxvd2x 44, %8, %3 \n\t"
"stxvd2x 45, %9, %1 \n\t" "stxvd2x 45, %9, %3 \n\t"
"lxvd2x 44, %8, %2 \n\t" "lxvd2x 44, %8, %2 \n\t"
"lxvd2x 45, %9, %2 \n\t" "lxvd2x 45, %9, %2 \n\t"
"stxvd2x 46, %10, %1 \n\t" "stxvd2x 46, %10, %3 \n\t"
"stxvd2x 47, %11, %1 \n\t" "stxvd2x 47, %11, %3 \n\t"
"lxvd2x 46, %10, %2 \n\t" "lxvd2x 46, %10, %2 \n\t"
"lxvd2x 47, %11, %2 \n\t" "lxvd2x 47, %11, %2 \n\t"
"addi %3, %3, 128 \n\t"
"addi %1, %1, 128 \n\t"
"addi %2, %2, 128 \n\t" "addi %2, %2, 128 \n\t"
"stxvd2x 50, 0, %1 \n\t" "addic. %1, %1, -32 \n\t"
"stxvd2x 51, %5, %1 \n\t" "bgt 1b \n"
"lxvd2x 50, 0, %2 \n\t"
"lxvd2x 51, %5, %2 \n\t"
"stxvd2x 52, %6, %1 \n\t"
"stxvd2x 53, %7, %1 \n\t"
"lxvd2x 52, %6, %2 \n\t"
"lxvd2x 53, %7, %2 \n\t"
"stxvd2x 54, %8, %1 \n\t"
"stxvd2x 55, %9, %1 \n\t"
"lxvd2x 54, %8, %2 \n\t"
"lxvd2x 55, %9, %2 \n\t"
"stxvd2x 56, %10, %1 \n\t"
"stxvd2x 57, %11, %1 \n\t"
"lxvd2x 56, %10, %2 \n\t"
"lxvd2x 57, %11, %2 \n\t"
"addi %1, %1, 128 \n\t"
"addi %2, %2, 128 \n\t"
"addic. %0 , %0 , -32 \n\t"
"bgt 1b \n\t"
"2: \n\t" "2: \n\t"
"stxvd2x 40, 0, %1 \n\t" "stxvd2x 32, 0, %3 \n\t"
"stxvd2x 41, %5, %1 \n\t" "stxvd2x 33, %5, %3 \n\t"
"stxvd2x 42, %6, %1 \n\t" "stxvd2x 34, %6, %3 \n\t"
"stxvd2x 43, %7, %1 \n\t" "stxvd2x 35, %7, %3 \n\t"
"stxvd2x 44, %8, %1 \n\t" "stxvd2x 36, %8, %3 \n\t"
"stxvd2x 45, %9, %1 \n\t" "stxvd2x 37, %9, %3 \n\t"
"stxvd2x 46, %10, %1 \n\t" "stxvd2x 38, %10, %3 \n\t"
"stxvd2x 47, %11, %1 \n\t" "stxvd2x 39, %11, %3 \n\t"
"addi %1, %1, 128 \n\t" "addi %3, %3, 128 \n\t"
"stxvd2x 50, 0, %1 \n\t"
"stxvd2x 51, %5, %1 \n\t"
"stxvd2x 52, %6, %1 \n\t"
"stxvd2x 53, %7, %1 \n\t"
"stxvd2x 54, %8, %1 \n\t"
"stxvd2x 55, %9, %1 \n\t"
"stxvd2x 56, %10, %1 \n\t"
"stxvd2x 57, %11, %1 \n\t"
"stxvd2x 40, 0, %3 \n\t"
"stxvd2x 41, %5, %3 \n\t"
"stxvd2x 42, %6, %3 \n\t"
"stxvd2x 43, %7, %3 \n\t"
"stxvd2x 44, %8, %3 \n\t"
"stxvd2x 45, %9, %3 \n\t"
"stxvd2x 46, %10, %3 \n\t"
"stxvd2x 47, %11, %3 \n"
"#n=%1 x=%4=%2 y=%0=%3 o16=%5 o32=%6 o48=%7 o64=%8 o80=%9 o96=%10 o112=%11"
: :
"=m" (*y),
"+r" (n), // 1
"+b" (x), // 2
"+b" (y) // 3
: :
"r" (i), // 0 "m" (*x),
"r" (y1), // 1 "b" (16), // 5
"r" (x1), // 2 "b" (32), // 6
"r" (alpha), // 3 "b" (48), // 7
"r" (pre), // 4 "b" (64), // 8
"r" (o16), // 5 "b" (80), // 9
"r" (o32), // 6 "b" (96), // 10
"r" (o48), // 7 "b" (112) // 11
"r" (o64), // 8 :
"r" (o80), // 9 "cr0",
"r" (o96), // 10 "vs32","vs33","vs34","vs35","vs36","vs37","vs38","vs39",
"r" (o112) // 11 "vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47"
: "cr0", "%0", "%2" , "%1", "memory"
); );
} }

View File

@ -43,7 +43,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#ifndef HAVE_KERNEL_8 #ifndef HAVE_KERNEL_8
static void ddot_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *d) static FLOAT ddot_kernel_8 (BLASLONG n, FLOAT *x, FLOAT *y)
{ {
BLASLONG register i = 0; BLASLONG register i = 0;
FLOAT dot = 0.0; FLOAT dot = 0.0;
@ -62,8 +62,7 @@ static void ddot_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *d)
i+=8 ; i+=8 ;
} }
*d += dot; return dot;
} }
#endif #endif
@ -83,7 +82,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
BLASLONG n1 = n & -16; BLASLONG n1 = n & -16;
if ( n1 ) if ( n1 )
ddot_kernel_8(n1, x, y , &dot ); dot = ddot_kernel_8(n1, x, y);
i = n1; i = n1;
while(i < n) while(i < n)

View File

@ -34,99 +34,85 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
**************************************************************************************/ **************************************************************************************/
#define HAVE_KERNEL_8 1 #define HAVE_KERNEL_8 1
static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y , FLOAT *dot) __attribute__ ((noinline));
static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) static double ddot_kernel_8 (long n, double *x, double *y)
{ {
double dot;
__vector double t0;
__vector double t1;
__vector double t2;
__vector double t3;
__asm__
BLASLONG i = n;
BLASLONG o16 = 16;
BLASLONG o32 = 32;
BLASLONG o48 = 48;
BLASLONG o64 = 64;
BLASLONG o80 = 80;
BLASLONG o96 = 96;
BLASLONG o112 = 112;
FLOAT *x1=x;
FLOAT *y1=y;
BLASLONG pre = 384;
__asm__ __volatile__
( (
"xxlxor 32,32,32 \n\t" "dcbt 0, %2 \n\t"
"xxlxor 33,33,33 \n\t" "dcbt 0, %3 \n\t"
"xxlxor 34,34,34 \n\t"
"xxlxor 35,35,35 \n\t"
"xxlxor 36,36,36 \n\t"
"xxlxor 37,37,37 \n\t"
"xxlxor 38,38,38 \n\t"
"xxlxor 39,39,39 \n\t"
"dcbt %2, %12 \n\t" "xxlxor 32, 32, 32 \n\t"
"dcbt %3, %12 \n\t" "xxlxor 33, 33, 33 \n\t"
"xxlxor 34, 34, 34 \n\t"
"xxlxor 35, 35, 35 \n\t"
"xxlxor 36, 36, 36 \n\t"
"xxlxor 37, 37, 37 \n\t"
"xxlxor 38, 38, 38 \n\t"
"xxlxor 39, 39, 39 \n\t"
"lxvd2x 40, 0, %2 \n\t" "lxvd2x 40, 0, %2 \n\t"
"lxvd2x 48, 0, %3 \n\t" "lxvd2x 48, 0, %3 \n\t"
"lxvd2x 41, %5, %2 \n\t" "lxvd2x 41, %10, %2 \n\t"
"lxvd2x 49, %5, %3 \n\t" "lxvd2x 49, %10, %3 \n\t"
"lxvd2x 42, %6, %2 \n\t" "lxvd2x 42, %11, %2 \n\t"
"lxvd2x 50, %6, %3 \n\t" "lxvd2x 50, %11, %3 \n\t"
"lxvd2x 43, %7, %2 \n\t" "lxvd2x 43, %12, %2 \n\t"
"lxvd2x 51, %7, %3 \n\t" "lxvd2x 51, %12, %3 \n\t"
"lxvd2x 44, %8, %2 \n\t" "lxvd2x 44, %13, %2 \n\t"
"lxvd2x 52, %8, %3 \n\t" "lxvd2x %x4, %13, %3 \n\t"
"lxvd2x 45, %9, %2 \n\t" "lxvd2x 45, %14, %2 \n\t"
"lxvd2x 53, %9, %3 \n\t" "lxvd2x %x5, %14, %3 \n\t"
"lxvd2x 46, %10, %2 \n\t" "lxvd2x 46, %15, %2 \n\t"
"lxvd2x 54, %10, %3 \n\t" "lxvd2x %x6, %15, %3 \n\t"
"lxvd2x 47, %11, %2 \n\t" "lxvd2x 47, %16, %2 \n\t"
"lxvd2x 55, %11, %3 \n\t" "lxvd2x %x7, %16, %3 \n\t"
"addi %2, %2, 128 \n\t" "addi %2, %2, 128 \n\t"
"addi %3, %3, 128 \n\t" "addi %3, %3, 128 \n\t"
"addic. %0 , %0 , -16 \n\t" "addic. %1, %1, -16 \n\t"
"ble 2f \n\t" "ble 2f \n\t"
".align 5 \n\t" ".p2align 5 \n"
"1: \n\t" "1: \n\t"
"dcbt %2, %12 \n\t"
"dcbt %3, %12 \n\t"
"xvmaddadp 32, 40, 48 \n\t" "xvmaddadp 32, 40, 48 \n\t"
"lxvd2x 40, 0, %2 \n\t" "lxvd2x 40, 0, %2 \n\t"
"lxvd2x 48, 0, %3 \n\t" "lxvd2x 48, 0, %3 \n\t"
"xvmaddadp 33, 41, 49 \n\t" "xvmaddadp 33, 41, 49 \n\t"
"lxvd2x 41, %5, %2 \n\t" "lxvd2x 41, %10, %2 \n\t"
"lxvd2x 49, %5, %3 \n\t" "lxvd2x 49, %10, %3 \n\t"
"xvmaddadp 34, 42, 50 \n\t" "xvmaddadp 34, 42, 50 \n\t"
"lxvd2x 42, %6, %2 \n\t" "lxvd2x 42, %11, %2 \n\t"
"lxvd2x 50, %6, %3 \n\t" "lxvd2x 50, %11, %3 \n\t"
"xvmaddadp 35, 43, 51 \n\t" "xvmaddadp 35, 43, 51 \n\t"
"lxvd2x 43, %7, %2 \n\t" "lxvd2x 43, %12, %2 \n\t"
"lxvd2x 51, %7, %3 \n\t" "lxvd2x 51, %12, %3 \n\t"
"xvmaddadp 36, 44, 52 \n\t" "xvmaddadp 36, 44, %x4 \n\t"
"lxvd2x 44, %8, %2 \n\t" "lxvd2x 44, %13, %2 \n\t"
"lxvd2x 52, %8, %3 \n\t" "lxvd2x %x4, %13, %3 \n\t"
"xvmaddadp 37, 45, 53 \n\t" "xvmaddadp 37, 45, %x5 \n\t"
"lxvd2x 45, %9, %2 \n\t" "lxvd2x 45, %14, %2 \n\t"
"lxvd2x 53, %9, %3 \n\t" "lxvd2x %x5, %14, %3 \n\t"
"xvmaddadp 38, 46, 54 \n\t" "xvmaddadp 38, 46, %x6 \n\t"
"lxvd2x 46, %10, %2 \n\t" "lxvd2x 46, %15, %2 \n\t"
"lxvd2x 54, %10, %3 \n\t" "lxvd2x %x6, %15, %3 \n\t"
"xvmaddadp 39, 47, 55 \n\t" "xvmaddadp 39, 47, %x7 \n\t"
"lxvd2x 47, %16, %2 \n\t"
"lxvd2x 47, %11, %2 \n\t" "lxvd2x %x7, %16, %3 \n\t"
"lxvd2x 55, %11, %3 \n\t"
"addi %2, %2, 128 \n\t" "addi %2, %2, 128 \n\t"
"addi %3, %3, 128 \n\t" "addi %3, %3, 128 \n\t"
"addic. %0 , %0 , -16 \n\t" "addic. %1, %1, -16 \n\t"
"bgt 1b \n\t" "bgt 1b \n"
"2: \n\t" "2: \n\t"
@ -134,10 +120,10 @@ static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot)
"xvmaddadp 33, 41, 49 \n\t" "xvmaddadp 33, 41, 49 \n\t"
"xvmaddadp 34, 42, 50 \n\t" "xvmaddadp 34, 42, 50 \n\t"
"xvmaddadp 35, 43, 51 \n\t" "xvmaddadp 35, 43, 51 \n\t"
"xvmaddadp 36, 44, 52 \n\t" "xvmaddadp 36, 44, %x4 \n\t"
"xvmaddadp 37, 45, 53 \n\t" "xvmaddadp 37, 45, %x5 \n\t"
"xvmaddadp 38, 46, 54 \n\t" "xvmaddadp 38, 46, %x6 \n\t"
"xvmaddadp 39, 47, 55 \n\t" "xvmaddadp 39, 47, %x7 \n\t"
"xvadddp 32, 32, 33 \n\t" "xvadddp 32, 32, 33 \n\t"
"xvadddp 34, 34, 35 \n\t" "xvadddp 34, 34, 35 \n\t"
@ -151,28 +137,35 @@ static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot)
"xxswapd 33, 32 \n\t" "xxswapd 33, 32 \n\t"
"xsadddp 32, 32, 33 \n\t" "xsadddp %x0, 32, 33 \n"
"stxsdx 32, 0, %4 \n\t"
"#dot=%0 n=%1 x=%8=%2 y=%9=%3 o16=%10 o32=%11 o48=%12 o64=%13 o80=%14 o96=%15 o122=%16\n"
"#t0=%x4 t1=%x5 t2=%x6 t3=%x7"
: :
"=d" (dot), // 0
"+r" (n), // 1
"+b" (x), // 2
"+b" (y), // 3
"=wa" (t0), // 4
"=wa" (t1), // 5
"=wa" (t2), // 6
"=wa" (t3) // 7
: :
"r" (i), // 0 "m" (*x),
"r" (n), // 1 "m" (*y),
"r" (x1), // 2 "b" (16), // 10
"r" (y1), // 3 "b" (32), // 11
"r" (dot), // 4 "b" (48), // 12
"r" (o16), // 5 "b" (64), // 13
"r" (o32), // 6 "b" (80), // 14
"r" (o48), // 7 "b" (96), // 15
"r" (o64), // 8 "b" (112) // 16
"r" (o80), // 9 :
"r" (o96), // 10 "cr0",
"r" (o112), // 11 "vs32","vs33","vs34","vs35","vs36","vs37","vs38","vs39",
"r" (pre) // 12 "vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47",
: "cr0", "%0", "%2" , "%3", "memory" "vs48","vs49","vs50","vs51"
); );
return dot;
} }

View File

@ -47,18 +47,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#ifndef HAVE_KERNEL_4x4 #ifndef HAVE_KERNEL_4x4
static void dgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *xo, FLOAT *y, FLOAT *alpha) static void dgemv_kernel_4x4(BLASLONG n, FLOAT *a_ptr, BLASLONG lda, FLOAT *xo, FLOAT *y, FLOAT alpha)
{ {
BLASLONG i; BLASLONG i;
FLOAT *a0,*a1,*a2,*a3;
FLOAT x[4] __attribute__ ((aligned (16)));; FLOAT x[4] __attribute__ ((aligned (16)));;
a0 = ap[0]; FLOAT *a0 = a_ptr;
a1 = ap[1]; FLOAT *a1 = a0 + lda;
a2 = ap[2]; FLOAT *a2 = a1 + lda;
a3 = ap[3]; FLOAT *a3 = a2 + lda;
for ( i=0; i<4; i++) for ( i=0; i<4; i++)
x[i] = xo[i] * *alpha; x[i] = xo[i] * alpha;
for ( i=0; i< n; i+=4 ) for ( i=0; i< n; i+=4 )
{ {
@ -73,16 +73,13 @@ static void dgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *xo, FLOAT *y, FLOAT
#ifndef HAVE_KERNEL_4x2 #ifndef HAVE_KERNEL_4x2
static void dgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *xo, FLOAT *y, FLOAT *alpha) static void dgemv_kernel_4x2(BLASLONG n, FLOAT *a0, FLOAT *a1, FLOAT *xo, FLOAT *y, FLOAT alpha)
{ {
BLASLONG i; BLASLONG i;
FLOAT *a0,*a1;
FLOAT x[4] __attribute__ ((aligned (16)));; FLOAT x[4] __attribute__ ((aligned (16)));;
a0 = ap[0];
a1 = ap[1];
for ( i=0; i<2; i++) for ( i=0; i<2; i++)
x[i] = xo[i] * *alpha; x[i] = xo[i] * alpha;
for ( i=0; i< n; i+=4 ) for ( i=0; i< n; i+=4 )
{ {
@ -98,15 +95,13 @@ static void dgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *xo, FLOAT *y, FLOAT
#ifndef HAVE_KERNEL_4x1 #ifndef HAVE_KERNEL_4x1
static void dgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *xo, FLOAT *y, FLOAT *alpha) static void dgemv_kernel_4x1(BLASLONG n, FLOAT *a0, FLOAT *xo, FLOAT *y, FLOAT alpha)
{ {
BLASLONG i; BLASLONG i;
FLOAT *a0;
FLOAT x[4] __attribute__ ((aligned (16)));; FLOAT x[4] __attribute__ ((aligned (16)));;
a0 = ap;
for ( i=0; i<1; i++) for ( i=0; i<1; i++)
x[i] = xo[i] * *alpha; x[i] = xo[i] * alpha;
for ( i=0; i< n; i+=4 ) for ( i=0; i< n; i+=4 )
{ {
@ -141,7 +136,6 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO
{ {
BLASLONG i; BLASLONG i;
BLASLONG j;
FLOAT *a_ptr; FLOAT *a_ptr;
FLOAT *x_ptr; FLOAT *x_ptr;
FLOAT *y_ptr; FLOAT *y_ptr;
@ -151,13 +145,9 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO
BLASLONG m3; BLASLONG m3;
BLASLONG n2; BLASLONG n2;
BLASLONG lda4 = lda << 2; BLASLONG lda4 = lda << 2;
FLOAT *ap[4] __attribute__ ((aligned (16)));;
FLOAT xbuffer[8] __attribute__ ((aligned (16)));; FLOAT xbuffer[8] __attribute__ ((aligned (16)));;
FLOAT alpha_r[4] __attribute__ ((aligned (16)));;
FLOAT *ybuffer; FLOAT *ybuffer;
alpha_r[0] = alpha;
if ( m < 1 ) return(0); if ( m < 1 ) return(0);
if ( n < 1 ) return(0); if ( n < 1 ) return(0);
@ -187,11 +177,6 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO
a_ptr = a; a_ptr = a;
x_ptr = x; x_ptr = x;
ap[0] = a_ptr;
ap[1] = a_ptr + lda;
ap[2] = ap[1] + lda;
ap[3] = ap[2] + lda;
if ( inc_y != 1 ) if ( inc_y != 1 )
memset(ybuffer,0,NB*8); memset(ybuffer,0,NB*8);
else else
@ -203,18 +188,14 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO
for( i = 0; i < n1 ; i++) for( i = 0; i < n1 ; i++)
{ {
dgemv_kernel_4x4(NB,ap,x_ptr,ybuffer,alpha_r); dgemv_kernel_4x4(NB,a_ptr,lda,x_ptr,ybuffer,alpha);
ap[0] += lda4;
ap[1] += lda4;
ap[2] += lda4;
ap[3] += lda4;
a_ptr += lda4; a_ptr += lda4;
x_ptr += 4; x_ptr += 4;
} }
if ( n2 & 2 ) if ( n2 & 2 )
{ {
dgemv_kernel_4x2(NB,ap,x_ptr,ybuffer,alpha_r); dgemv_kernel_4x2(NB,a_ptr,a_ptr+lda,x_ptr,ybuffer,alpha);
a_ptr += lda*2; a_ptr += lda*2;
x_ptr += 2; x_ptr += 2;
} }
@ -222,7 +203,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO
if ( n2 & 1 ) if ( n2 & 1 )
{ {
dgemv_kernel_4x1(NB,a_ptr,x_ptr,ybuffer,alpha_r); dgemv_kernel_4x1(NB,a_ptr,x_ptr,ybuffer,alpha);
a_ptr += lda; a_ptr += lda;
x_ptr += 1; x_ptr += 1;
@ -243,11 +224,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO
x_ptr += inc_x; x_ptr += inc_x;
xbuffer[3] = x_ptr[0]; xbuffer[3] = x_ptr[0];
x_ptr += inc_x; x_ptr += inc_x;
dgemv_kernel_4x4(NB,ap,xbuffer,ybuffer,alpha_r); dgemv_kernel_4x4(NB,a_ptr,lda,xbuffer,ybuffer,alpha);
ap[0] += lda4;
ap[1] += lda4;
ap[2] += lda4;
ap[3] += lda4;
a_ptr += lda4; a_ptr += lda4;
} }
@ -255,7 +232,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO
{ {
xbuffer[0] = x_ptr[0]; xbuffer[0] = x_ptr[0];
x_ptr += inc_x; x_ptr += inc_x;
dgemv_kernel_4x1(NB,a_ptr,xbuffer,ybuffer,alpha_r); dgemv_kernel_4x1(NB,a_ptr,xbuffer,ybuffer,alpha);
a_ptr += lda; a_ptr += lda;
} }

View File

@ -35,267 +35,264 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define HAVE_KERNEL_4x4 1 #define HAVE_KERNEL_4x4 1
static void dgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *xo, FLOAT *y, FLOAT *alpha) __attribute__ ((noinline)); static void dgemv_kernel_4x4 (long n, double *ap, long lda, double *x, double *y, double alpha)
static void dgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *xo, FLOAT *y, FLOAT *alpha)
{ {
BLASLONG i=n; double *a0;
BLASLONG o8 = 8; double *a1;
BLASLONG o16 = 16; double *a2;
BLASLONG o24 = 24; double *a3;
BLASLONG pre = 384;
FLOAT *a0,*a1,*a2,*a3; __asm__
FLOAT *y1=y+1;
FLOAT x[4] __attribute__ ((aligned (16)));;
a0 = ap[0]+1;
a1 = ap[1]+1;
a2 = ap[2]+1;
a3 = ap[3]+1;
x[0]=xo[0] * *alpha;
x[1]=xo[1] * *alpha;
x[2]=xo[2] * *alpha;
x[3]=xo[3] * *alpha;
__asm__ __volatile__
( (
"lxvdsx 32, 0 , %1 \n\t" // x0 "lxvd2x 34, 0, %9 \n\t" // x0, x1
"lxvdsx 33,%3 , %1 \n\t" // x1 "lxvd2x 35, %10, %9 \n\t" // x2, x3
"lxvdsx 34,%4 , %1 \n\t" // x2 "xxspltd 32, %x8, 0 \n\t" // alpha, alpha
"lxvdsx 35,%5 , %1 \n\t" // x3
"addi %2 , %2 , -8 \n\t"
"addi %6 , %6 , -8 \n\t"
"addi %7 , %7 , -8 \n\t"
"addi %8 , %8 , -8 \n\t"
"addi %9 , %9 , -8 \n\t"
"lxvd2x 48, 0, %6 \n\t" // a0[0], a0[1] "sldi %6, %4, 3 \n\t" // lda * sizeof (double)
"lxvd2x 49,%4, %6 \n\t" // a0[2], a0[3]
"lxvd2x 50, 0, %7 \n\t" // a1[0], a1[1] "xvmuldp 34, 34, 32 \n\t" // x0 * alpha, x1 * alpha
"lxvd2x 51,%4, %7 \n\t" // a1[2], a1[3] "xvmuldp 35, 35, 32 \n\t" // x2 * alpha, x3 * alpha
"lxvd2x 52, 0, %8 \n\t" // a2[0], a2[1] "add %4, %3, %6 \n\t" // a1 = a0 + lda
"lxvd2x 53,%4, %8 \n\t" // a2[2], a2[3] "add %6, %6, %6 \n\t" // 2 * lda
"lxvd2x 54, 0, %9 \n\t" // a3[0], a3[1] "xxspltd 32, 34, 0 \n\t" // x0 * alpha, x0 * alpha
"lxvd2x 55,%4, %9 \n\t" // a3[2], a3[3] "xxspltd 33, 34, 1 \n\t" // x1 * alpha, x1 * alpha
"xxspltd 34, 35, 0 \n\t" // x2 * alpha, x2 * alpha
"xxspltd 35, 35, 1 \n\t" // x3 * alpha, x3 * alpha
"add %5, %3, %6 \n\t" // a2 = a0 + 2 * lda
"add %6, %4, %6 \n\t" // a3 = a1 + 2 * lda
"dcbt 0, %3 \n\t"
"dcbt 0, %4 \n\t"
"dcbt 0, %5 \n\t"
"dcbt 0, %6 \n\t"
"lxvd2x 40, 0, %3 \n\t" // a0[0], a0[1]
"lxvd2x 41, %10, %3 \n\t" // a0[2], a0[3]
"lxvd2x 42, 0, %4 \n\t" // a1[0], a1[1]
"lxvd2x 43, %10, %4 \n\t" // a1[2], a1[3]
"lxvd2x 44, 0, %5 \n\t" // a2[0], a2[1]
"lxvd2x 45, %10, %5 \n\t" // a2[2], a2[3]
"lxvd2x 46, 0, %6 \n\t" // a3[0], a3[1]
"lxvd2x 47, %10, %6 \n\t" // a3[2], a3[3]
"dcbt 0, %2 \n\t"
"addi %3, %3, 32 \n\t"
"addi %4, %4, 32 \n\t"
"addi %5, %5, 32 \n\t"
"addi %6, %6, 32 \n\t" "addi %6, %6, 32 \n\t"
"addi %7, %7, 32 \n\t"
"addi %8, %8, 32 \n\t"
"addi %9, %9, 32 \n\t"
"addic. %0 , %0 , -4 \n\t" "addic. %1, %1, -4 \n\t"
"ble 2f \n\t" "ble 2f \n\t"
".align 5 \n\t" ".p2align 5 \n"
"1: \n\t" "1: \n\t"
"dcbt %2, %10 \n\t" "lxvd2x 36, 0, %2 \n\t" // y0, y1
"lxvd2x 37, %10, %2 \n\t" // y2, y3
"lxvd2x 40, 0, %2 \n\t" // y0, y1 "xvmaddadp 36, 40, 32 \n\t"
"lxvd2x 41,%4, %2 \n\t" // y2, y3 "xvmaddadp 37, 41, 32 \n\t"
"dcbt %6, %10 \n\t" "lxvd2x 40, 0, %3 \n\t" // a0[0], a0[1]
"dcbt %7, %10 \n\t" "lxvd2x 41, %10, %3 \n\t" // a0[2], a0[3]
"dcbt %8, %10 \n\t"
"dcbt %9, %10 \n\t"
"xvmaddadp 40, 48, 32 \n\t" "xvmaddadp 36, 42, 33 \n\t"
"xvmaddadp 41, 49, 32 \n\t" "addi %3, %3, 32 \n\t"
"xvmaddadp 37, 43, 33 \n\t"
"lxvd2x 48, 0, %6 \n\t" // a0[0], a0[1] "lxvd2x 42, 0, %4 \n\t" // a1[0], a1[1]
"lxvd2x 49,%4, %6 \n\t" // a0[2], a0[3] "lxvd2x 43, %10, %4 \n\t" // a1[2], a1[3]
"xvmaddadp 36, 44, 34 \n\t"
"addi %4, %4, 32 \n\t"
"xvmaddadp 37, 45, 34 \n\t"
"lxvd2x 44, 0, %5 \n\t" // a2[0], a2[1]
"lxvd2x 45, %10, %5 \n\t" // a2[2], a2[3]
"xvmaddadp 36, 46, 35 \n\t"
"addi %5, %5, 32 \n\t"
"xvmaddadp 37, 47, 35 \n\t"
"stxvd2x 36, 0, %2 \n\t" // y0, y1
"stxvd2x 37, %10, %2 \n\t" // y2, y3
"lxvd2x 46, 0, %6 \n\t" // a3[0], a3[1]
"lxvd2x 47, %10, %6 \n\t" // a3[2], a3[3]
"xvmaddadp 40, 50, 33 \n\t"
"addi %6, %6, 32 \n\t" "addi %6, %6, 32 \n\t"
"xvmaddadp 41, 51, 33 \n\t"
"lxvd2x 50, 0, %7 \n\t" // a1[0], a1[1]
"lxvd2x 51,%4, %7 \n\t" // a1[2], a1[3]
"xvmaddadp 40, 52, 34 \n\t"
"addi %7, %7, 32 \n\t"
"xvmaddadp 41, 53, 34 \n\t"
"lxvd2x 52, 0, %8 \n\t" // a2[0], a2[1]
"lxvd2x 53,%4, %8 \n\t" // a2[2], a2[3]
"xvmaddadp 40, 54, 35 \n\t"
"addi %8, %8, 32 \n\t"
"xvmaddadp 41, 55, 35 \n\t"
"stxvd2x 40, 0, %2 \n\t" // y0, y1
"stxvd2x 41,%4, %2 \n\t" // y2, y3
"lxvd2x 54, 0, %9 \n\t" // a3[0], a3[1]
"lxvd2x 55,%4, %9 \n\t" // a3[2], a3[3]
"addi %9, %9, 32 \n\t"
"addi %2, %2, 32 \n\t" "addi %2, %2, 32 \n\t"
"addic. %0 , %0 , -4 \n\t" "addic. %1, %1, -4 \n\t"
"ble 2f \n\t" "ble 2f \n\t"
"lxvd2x 40, 0, %2 \n\t" // y0, y1 "lxvd2x 36, 0, %2 \n\t" // y0, y1
"lxvd2x 41,%4, %2 \n\t" // y2, y3 "lxvd2x 37, %10, %2 \n\t" // y2, y3
"xvmaddadp 40, 48, 32 \n\t" "xvmaddadp 36, 40, 32 \n\t"
"xvmaddadp 41, 49, 32 \n\t" "xvmaddadp 37, 41, 32 \n\t"
"lxvd2x 48, 0, %6 \n\t" // a0[0], a0[1] "lxvd2x 40, 0, %3 \n\t" // a0[0], a0[1]
"lxvd2x 49,%4, %6 \n\t" // a0[2], a0[3] "lxvd2x 41, %10, %3 \n\t" // a0[2], a0[3]
"xvmaddadp 36, 42, 33 \n\t"
"addi %3, %3, 32 \n\t"
"xvmaddadp 37, 43, 33 \n\t"
"lxvd2x 42, 0, %4 \n\t" // a1[0], a1[1]
"lxvd2x 43, %10, %4 \n\t" // a1[2], a1[3]
"xvmaddadp 36, 44, 34 \n\t"
"addi %4, %4, 32 \n\t"
"xvmaddadp 37, 45, 34 \n\t"
"lxvd2x 44, 0, %5 \n\t" // a2[0], a2[1]
"lxvd2x 45, %10, %5 \n\t" // a2[2], a2[3]
"xvmaddadp 36, 46, 35 \n\t"
"addi %5, %5, 32 \n\t"
"xvmaddadp 37, 47, 35 \n\t"
"stxvd2x 36, 0, %2 \n\t" // y0, y1
"stxvd2x 37, %10, %2 \n\t" // y2, y3
"lxvd2x 46, 0, %6 \n\t" // a3[0], a3[1]
"lxvd2x 47, %10, %6 \n\t" // a3[2], a3[3]
"xvmaddadp 40, 50, 33 \n\t"
"addi %6, %6, 32 \n\t" "addi %6, %6, 32 \n\t"
"xvmaddadp 41, 51, 33 \n\t"
"lxvd2x 50, 0, %7 \n\t" // a1[0], a1[1]
"lxvd2x 51,%4, %7 \n\t" // a1[2], a1[3]
"xvmaddadp 40, 52, 34 \n\t"
"addi %7, %7, 32 \n\t"
"xvmaddadp 41, 53, 34 \n\t"
"lxvd2x 52, 0, %8 \n\t" // a2[0], a2[1]
"lxvd2x 53,%4, %8 \n\t" // a2[2], a2[3]
"xvmaddadp 40, 54, 35 \n\t"
"addi %8, %8, 32 \n\t"
"xvmaddadp 41, 55, 35 \n\t"
"stxvd2x 40, 0, %2 \n\t" // y0, y1
"stxvd2x 41,%4, %2 \n\t" // y2, y3
"lxvd2x 54, 0, %9 \n\t" // a3[0], a3[1]
"lxvd2x 55,%4, %9 \n\t" // a3[2], a3[3]
"addi %9, %9, 32 \n\t"
"addi %2, %2, 32 \n\t" "addi %2, %2, 32 \n\t"
"addic. %0 , %0 , -4 \n\t" "addic. %1, %1, -4 \n\t"
"ble 2f \n\t" "ble 2f \n\t"
"lxvd2x 40, 0, %2 \n\t" // y0, y1 "lxvd2x 36, 0, %2 \n\t" // y0, y1
"lxvd2x 41,%4, %2 \n\t" // y2, y3 "lxvd2x 37, %10, %2 \n\t" // y2, y3
"xvmaddadp 40, 48, 32 \n\t" "xvmaddadp 36, 40, 32 \n\t"
"xvmaddadp 41, 49, 32 \n\t" "xvmaddadp 37, 41, 32 \n\t"
"lxvd2x 48, 0, %6 \n\t" // a0[0], a0[1] "lxvd2x 40, 0, %3 \n\t" // a0[0], a0[1]
"lxvd2x 49,%4, %6 \n\t" // a0[2], a0[3] "lxvd2x 41, %10, %3 \n\t" // a0[2], a0[3]
"xvmaddadp 36, 42, 33 \n\t"
"addi %3, %3, 32 \n\t"
"xvmaddadp 37, 43, 33 \n\t"
"lxvd2x 42, 0, %4 \n\t" // a1[0], a1[1]
"lxvd2x 43, %10, %4 \n\t" // a1[2], a1[3]
"xvmaddadp 36, 44, 34 \n\t"
"addi %4, %4, 32 \n\t"
"xvmaddadp 37, 45, 34 \n\t"
"lxvd2x 44, 0, %5 \n\t" // a2[0], a2[1]
"lxvd2x 45, %10, %5 \n\t" // a2[2], a2[3]
"xvmaddadp 36, 46, 35 \n\t"
"addi %5, %5, 32 \n\t"
"xvmaddadp 37, 47, 35 \n\t"
"stxvd2x 36, 0, %2 \n\t" // y0, y1
"stxvd2x 37, %10, %2 \n\t" // y2, y3
"lxvd2x 46, 0, %6 \n\t" // a3[0], a3[1]
"lxvd2x 47, %10, %6 \n\t" // a3[2], a3[3]
"xvmaddadp 40, 50, 33 \n\t"
"addi %6, %6, 32 \n\t" "addi %6, %6, 32 \n\t"
"xvmaddadp 41, 51, 33 \n\t"
"lxvd2x 50, 0, %7 \n\t" // a1[0], a1[1]
"lxvd2x 51,%4, %7 \n\t" // a1[2], a1[3]
"xvmaddadp 40, 52, 34 \n\t"
"addi %7, %7, 32 \n\t"
"xvmaddadp 41, 53, 34 \n\t"
"lxvd2x 52, 0, %8 \n\t" // a2[0], a2[1]
"lxvd2x 53,%4, %8 \n\t" // a2[2], a2[3]
"xvmaddadp 40, 54, 35 \n\t"
"addi %8, %8, 32 \n\t"
"xvmaddadp 41, 55, 35 \n\t"
"stxvd2x 40, 0, %2 \n\t" // y0, y1
"stxvd2x 41,%4, %2 \n\t" // y2, y3
"lxvd2x 54, 0, %9 \n\t" // a3[0], a3[1]
"lxvd2x 55,%4, %9 \n\t" // a3[2], a3[3]
"addi %9, %9, 32 \n\t"
"addi %2, %2, 32 \n\t" "addi %2, %2, 32 \n\t"
"addic. %0 , %0 , -4 \n\t" "addic. %1, %1, -4 \n\t"
"ble 2f \n\t" "ble 2f \n\t"
"lxvd2x 40, 0, %2 \n\t" // y0, y1 "lxvd2x 36, 0, %2 \n\t" // y0, y1
"lxvd2x 41,%4, %2 \n\t" // y2, y3 "lxvd2x 37, %10, %2 \n\t" // y2, y3
"xvmaddadp 40, 48, 32 \n\t" "xvmaddadp 36, 40, 32 \n\t"
"xvmaddadp 41, 49, 32 \n\t" "xvmaddadp 37, 41, 32 \n\t"
"lxvd2x 48, 0, %6 \n\t" // a0[0], a0[1] "lxvd2x 40, 0, %3 \n\t" // a0[0], a0[1]
"lxvd2x 49,%4, %6 \n\t" // a0[2], a0[3] "lxvd2x 41, %10, %3 \n\t" // a0[2], a0[3]
"xvmaddadp 36, 42, 33 \n\t"
"addi %3, %3, 32 \n\t"
"xvmaddadp 37, 43, 33 \n\t"
"lxvd2x 42, 0, %4 \n\t" // a1[0], a1[1]
"lxvd2x 43, %10, %4 \n\t" // a1[2], a1[3]
"xvmaddadp 36, 44, 34 \n\t"
"addi %4, %4, 32 \n\t"
"xvmaddadp 37, 45, 34 \n\t"
"lxvd2x 44, 0, %5 \n\t" // a2[0], a2[1]
"lxvd2x 45, %10, %5 \n\t" // a2[2], a2[3]
"xvmaddadp 36, 46, 35 \n\t"
"addi %5, %5, 32 \n\t"
"xvmaddadp 37, 47, 35 \n\t"
"stxvd2x 36, 0, %2 \n\t" // y0, y1
"stxvd2x 37, %10, %2 \n\t" // y2, y3
"lxvd2x 46, 0, %6 \n\t" // a3[0], a3[1]
"lxvd2x 47, %10, %6 \n\t" // a3[2], a3[3]
"xvmaddadp 40, 50, 33 \n\t"
"addi %6, %6, 32 \n\t" "addi %6, %6, 32 \n\t"
"xvmaddadp 41, 51, 33 \n\t"
"lxvd2x 50, 0, %7 \n\t" // a1[0], a1[1]
"lxvd2x 51,%4, %7 \n\t" // a1[2], a1[3]
"xvmaddadp 40, 52, 34 \n\t"
"addi %7, %7, 32 \n\t"
"xvmaddadp 41, 53, 34 \n\t"
"lxvd2x 52, 0, %8 \n\t" // a2[0], a2[1]
"lxvd2x 53,%4, %8 \n\t" // a2[2], a2[3]
"xvmaddadp 40, 54, 35 \n\t"
"addi %8, %8, 32 \n\t"
"xvmaddadp 41, 55, 35 \n\t"
"stxvd2x 40, 0, %2 \n\t" // y0, y1
"stxvd2x 41,%4, %2 \n\t" // y2, y3
"lxvd2x 54, 0, %9 \n\t" // a3[0], a3[1]
"lxvd2x 55,%4, %9 \n\t" // a3[2], a3[3]
"addi %9, %9, 32 \n\t"
"addi %2, %2, 32 \n\t" "addi %2, %2, 32 \n\t"
"addic. %0 , %0 , -4 \n\t" "addic. %1, %1, -4 \n\t"
"bgt 1b \n\t" "bgt 1b \n"
"2: \n\t" "2: \n\t"
"lxvd2x 40, 0, %2 \n\t" // y0, y1 "lxvd2x 36, 0, %2 \n\t" // y0, y1
"lxvd2x 41,%4, %2 \n\t" // y2, y3 "lxvd2x 37, %10, %2 \n\t" // y2, y3
"xvmaddadp 40, 48, 32 \n\t" "xvmaddadp 36, 40, 32 \n\t"
"xvmaddadp 41, 49, 32 \n\t" "xvmaddadp 37, 41, 32 \n\t"
"xvmaddadp 40, 50, 33 \n\t" "xvmaddadp 36, 42, 33 \n\t"
"xvmaddadp 41, 51, 33 \n\t" "xvmaddadp 37, 43, 33 \n\t"
"xvmaddadp 40, 52, 34 \n\t" "xvmaddadp 36, 44, 34 \n\t"
"xvmaddadp 41, 53, 34 \n\t" "xvmaddadp 37, 45, 34 \n\t"
"xvmaddadp 40, 54, 35 \n\t" "xvmaddadp 36, 46, 35 \n\t"
"xvmaddadp 41, 55, 35 \n\t" "xvmaddadp 37, 47, 35 \n\t"
"stxvd2x 40, 0, %2 \n\t" // y0, y1 "stxvd2x 36, 0, %2 \n\t" // y0, y1
"stxvd2x 41,%4, %2 \n\t" // y2, y3 "stxvd2x 37, %10, %2 \n" // y2, y3
"#n=%1 ap=%11 lda=%12 x=%7=%9 y=%0=%2 alpha=%8 o16=%10\n"
"#a0=%3 a1=%4 a2=%5 a3=%6"
: :
"=m" (*y),
"+r" (n), // 1
"+b" (y), // 2
"=b" (a0), // 3
"=b" (a1), // 4
"=&b" (a2), // 5
"=&b" (a3) // 6
: :
"r" (i), // 0 "m" (*x),
"r" (x), // 1 "d" (alpha), // 8
"r" (y1), // 2 "r" (x), // 9
"r" (o8), // 3 "b" (16), // 10
"r" (o16), // 4 "3" (ap), // 11
"r" (o24), // 5 "4" (lda) // 12
"r" (a0), // 6 :
"r" (a1), // 7 "cr0",
"r" (a2), // 8 "vs32","vs33","vs34","vs35","vs36","vs37",
"r" (a3), // 9 "vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47"
"r" (pre) // 10
: "cr0", "%0", "%2" , "%6", "%7", "%8", "%9", "memory"
); );
} }
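A minimal stand-alone sketch of the operand style used in the rewritten kernel above (illustrative only, not part of this commit; assumes gcc targeting POWER8 with VSX enabled): pointers come in through "b" operands because r0 would encode as a literal zero in these X-form loads and stores, the arrays appear as genuine memory operands instead of a blanket "memory" clobber, and every VSX register the template touches is named in the clobber list, so the asm needs neither __volatile__ nor a noinline wrapper.

static void copy_4_doubles (double *y, const double *x)
{
    __asm__
    (
        "lxvd2x  32, 0, %2        \n\t"    // x[0], x[1]
        "lxvd2x  33, %3, %2       \n\t"    // x[2], x[3]
        "stxvd2x 32, 0, %1        \n\t"
        "stxvd2x 33, %3, %1       \n"
        :
        "=m" (*(double (*)[4]) y)
        :
        "b" (y),        // 1
        "b" (x),        // 2
        "b" (16),       // 3
        "m" (*(const double (*)[4]) x)
        :
        "vs32","vs33"
    );
}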

View File

@ -46,7 +46,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#ifndef HAVE_KERNEL_16 #ifndef HAVE_KERNEL_16
static void drot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s) static void drot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT c, FLOAT s)
{ {
BLASLONG i=0; BLASLONG i=0;
@ -56,8 +56,6 @@ static void drot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s)
FLOAT y00, y01, y02, y03; FLOAT y00, y01, y02, y03;
FLOAT *x1=x; FLOAT *x1=x;
FLOAT *y1=y; FLOAT *y1=y;
FLOAT c1=*c;
FLOAT s1=*s;
while ( i<n ) while ( i<n )
{ {
@ -71,14 +69,14 @@ static void drot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s)
x03 = x1[3]; x03 = x1[3];
y03 = y1[3]; y03 = y1[3];
f0 = c1*x00 + s1*y00; f0 = c*x00 + s*y00;
g0 = c1*y00 - s1*x00; g0 = c*y00 - s*x00;
f1 = c1*x01 + s1*y01; f1 = c*x01 + s*y01;
g1 = c1*y01 - s1*x01; g1 = c*y01 - s*x01;
f2 = c1*x02 + s1*y02; f2 = c*x02 + s*y02;
g2 = c1*y02 - s1*x02; g2 = c*y02 - s*x02;
f3 = c1*x03 + s1*y03; f3 = c*x03 + s*y03;
g3 = c1*y03 - s1*x03; g3 = c*y03 - s*x03;
x1[0] = f0; x1[0] = f0;
y1[0] = g0; y1[0] = g0;
@ -106,8 +104,6 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT
{ {
BLASLONG i=0; BLASLONG i=0;
BLASLONG ix=0,iy=0; BLASLONG ix=0,iy=0;
FLOAT c1[4] __attribute__ ((aligned (16)));;
FLOAT s1[4] __attribute__ ((aligned (16)));;
FLOAT *x1=x; FLOAT *x1=x;
FLOAT *y1=y; FLOAT *y1=y;
FLOAT temp; FLOAT temp;
@ -120,15 +116,7 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT
BLASLONG n1 = n & -16; BLASLONG n1 = n & -16;
if ( n1 > 0 ) if ( n1 > 0 )
{ {
c1[0]=c; drot_kernel_16(n1, x1, y1, c, s);
c1[1]=c;
c1[2]=c;
c1[3]=c;
s1[0]=s;
s1[1]=s;
s1[2]=s;
s1[3]=s;
drot_kernel_16(n1, x1, y1, c1, s1);
i=n1; i=n1;
} }

View File

@ -38,174 +38,176 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define HAVE_KERNEL_16 1 #define HAVE_KERNEL_16 1
static void drot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s) __attribute__ ((noinline)); static void drot_kernel_16 (long n, double *x, double *y, double c, double s)
static void drot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s)
{ {
__vector double t0;
__vector double t1;
__vector double t2;
__vector double t3;
__vector double t4;
__vector double t5;
__vector double t6;
__vector double t7;
__asm__
BLASLONG i = n;
BLASLONG o16 = 16;
BLASLONG o32 = 32;
BLASLONG o48 = 48;
FLOAT *x1=x;
FLOAT *y1=y;
FLOAT *x2=x+1;
FLOAT *y2=y+1;
__asm__ __volatile__
( (
"xxspltd 36, %x13, 0 \n\t" // load c to both dwords
"xxspltd 37, %x14, 0 \n\t" // load s to both dwords
"lxsdx 36 , %5, %3 \n\t" // load c "lxvd2x 32, 0, %3 \n\t" // load x
"lxsdx 37 , %5, %4 \n\t" // load s "lxvd2x 33, %15, %3 \n\t"
"addi %8 , %8, -8 \n\t" "lxvd2x 34, %16, %3 \n\t"
"addi %9 , %9, -8 \n\t" "lxvd2x 35, %17, %3 \n\t"
"xxspltd 36 , 36, 0 \n\t" "lxvd2x 48, 0, %4 \n\t" // load y
"xxspltd 37 , 37, 0 \n\t" "lxvd2x 49, %15, %4 \n\t"
"lxvd2x 50, %16, %4 \n\t"
"lxvd2x 51, %17, %4 \n\t"
"lxvd2x 32, 0, %1 \n\t" // load x "addi %3, %3, 64 \n\t"
"lxvd2x 33, %5, %1 \n\t" "addi %4, %4, 64 \n\t"
"lxvd2x 34, %6, %1 \n\t"
"lxvd2x 35, %7, %1 \n\t"
"lxvd2x 40, 0, %2 \n\t" // load y "addic. %2, %2, -8 \n\t"
"lxvd2x 41, %5, %2 \n\t"
"lxvd2x 42, %6, %2 \n\t"
"lxvd2x 43, %7, %2 \n\t"
"addi %1, %1, 64 \n\t"
"addi %2, %2, 64 \n\t"
"addic. %0 , %0 , -8 \n\t"
"ble 2f \n\t" "ble 2f \n\t"
".align 5 \n\t" ".p2align 5 \n"
"1: \n\t" "1: \n\t"
"xvmuldp 48, 32, 36 \n\t" // c * x "xvmuldp 40, 32, 36 \n\t" // c * x
"xvmuldp 49, 33, 36 \n\t" "xvmuldp 41, 33, 36 \n\t"
"xvmuldp 50, 34, 36 \n\t" "xvmuldp 42, 34, 36 \n\t"
"xvmuldp 51, 35, 36 \n\t" "xvmuldp 43, 35, 36 \n\t"
"xvmuldp 56, 40, 36 \n\t" // c * y "xvmuldp %x5, 48, 36 \n\t" // c * y
"xvmuldp 57, 41, 36 \n\t" "xvmuldp %x6, 49, 36 \n\t"
"xvmuldp 58, 42, 36 \n\t" "xvmuldp %x7, 50, 36 \n\t"
"xvmuldp 59, 43, 36 \n\t" "xvmuldp %x8, 51, 36 \n\t"
"xvmuldp 52, 32, 37 \n\t" // s * x "xvmuldp 44, 32, 37 \n\t" // s * x
"xvmuldp 53, 33, 37 \n\t" "xvmuldp 45, 33, 37 \n\t"
"lxvd2x 32, 0, %1 \n\t" // load x "lxvd2x 32, 0, %3 \n\t" // load x
"lxvd2x 33, %5, %1 \n\t" "lxvd2x 33, %15, %3 \n\t"
"xvmuldp 54, 34, 37 \n\t" "xvmuldp 46, 34, 37 \n\t"
"xvmuldp 55, 35, 37 \n\t" "xvmuldp 47, 35, 37 \n\t"
"lxvd2x 34, %6, %1 \n\t" "lxvd2x 34, %16, %3 \n\t"
"lxvd2x 35, %7, %1 \n\t" "lxvd2x 35, %17, %3 \n\t"
"xvmuldp 60, 40, 37 \n\t" // s * y "xvmuldp %x9, 48, 37 \n\t" // s * y
"xvmuldp 61, 41, 37 \n\t" "xvmuldp %x10, 49, 37 \n\t"
"lxvd2x 40, 0, %2 \n\t" // load y "lxvd2x 48, 0, %4 \n\t" // load y
"lxvd2x 41, %5, %2 \n\t" "lxvd2x 49, %15, %4 \n\t"
"xvmuldp 62, 42, 37 \n\t" "xvmuldp %x11, 50, 37 \n\t"
"xvmuldp 63, 43, 37 \n\t" "xvmuldp %x12, 51, 37 \n\t"
"lxvd2x 42, %6, %2 \n\t" "lxvd2x 50, %16, %4 \n\t"
"lxvd2x 43, %7, %2 \n\t" "lxvd2x 51, %17, %4 \n\t"
"xvadddp 48, 48 , 60 \n\t" // c * x + s * y "xvadddp 40, 40, %x9 \n\t" // c * x + s * y
"xvadddp 49, 49 , 61 \n\t" // c * x + s * y "xvadddp 41, 41, %x10 \n\t" // c * x + s * y
"addi %1, %1, 64 \n\t" "addi %3, %3, -64 \n\t"
"addi %2, %2, 64 \n\t" "addi %4, %4, -64 \n\t"
"xvadddp 50, 50 , 62 \n\t" // c * x + s * y "xvadddp 42, 42, %x11 \n\t" // c * x + s * y
"xvadddp 51, 51 , 63 \n\t" // c * x + s * y "xvadddp 43, 43, %x12 \n\t" // c * x + s * y
"xvsubdp 56, 56 , 52 \n\t" // c * y - s * x "xvsubdp %x5, %x5, 44 \n\t" // c * y - s * x
"xvsubdp 57, 57 , 53 \n\t" // c * y - s * x "xvsubdp %x6, %x6, 45 \n\t" // c * y - s * x
"xvsubdp 58, 58 , 54 \n\t" // c * y - s * x "xvsubdp %x7, %x7, 46 \n\t" // c * y - s * x
"xvsubdp 59, 59 , 55 \n\t" // c * y - s * x "xvsubdp %x8, %x8, 47 \n\t" // c * y - s * x
"stxvd2x 48, 0, %8 \n\t" // store x "stxvd2x 40, 0, %3 \n\t" // store x
"stxvd2x 49, %5, %8 \n\t" "stxvd2x 41, %15, %3 \n\t"
"stxvd2x 50, %6, %8 \n\t" "stxvd2x 42, %16, %3 \n\t"
"stxvd2x 51, %7, %8 \n\t" "stxvd2x 43, %17, %3 \n\t"
"stxvd2x 56, 0, %9 \n\t" // store y "stxvd2x %x5, 0, %4 \n\t" // store y
"stxvd2x 57, %5, %9 \n\t" "stxvd2x %x6, %15, %4 \n\t"
"stxvd2x 58, %6, %9 \n\t" "stxvd2x %x7, %16, %4 \n\t"
"stxvd2x 59, %7, %9 \n\t" "stxvd2x %x8, %17, %4 \n\t"
"addi %8, %8, 64 \n\t" "addi %3, %3, 128 \n\t"
"addi %9, %9, 64 \n\t" "addi %4, %4, 128 \n\t"
"addic. %0 , %0 , -8 \n\t" "addic. %2, %2, -8 \n\t"
"bgt 1b \n\t" "bgt 1b \n"
"2: \n\t" "2: \n\t"
"xvmuldp 48, 32, 36 \n\t" // c * x "xvmuldp 40, 32, 36 \n\t" // c * x
"xvmuldp 49, 33, 36 \n\t" "xvmuldp 41, 33, 36 \n\t"
"xvmuldp 50, 34, 36 \n\t" "xvmuldp 42, 34, 36 \n\t"
"xvmuldp 51, 35, 36 \n\t" "xvmuldp 43, 35, 36 \n\t"
"xvmuldp 56, 40, 36 \n\t" // c * y "xvmuldp %x5, 48, 36 \n\t" // c * y
"xvmuldp 57, 41, 36 \n\t" "xvmuldp %x6, 49, 36 \n\t"
"xvmuldp 58, 42, 36 \n\t" "xvmuldp %x7, 50, 36 \n\t"
"xvmuldp 59, 43, 36 \n\t" "xvmuldp %x8, 51, 36 \n\t"
"xvmuldp 52, 32, 37 \n\t" // s * x "xvmuldp 44, 32, 37 \n\t" // s * x
"xvmuldp 53, 33, 37 \n\t" "xvmuldp 45, 33, 37 \n\t"
"xvmuldp 54, 34, 37 \n\t" "xvmuldp 46, 34, 37 \n\t"
"xvmuldp 55, 35, 37 \n\t" "xvmuldp 47, 35, 37 \n\t"
"xvmuldp 60, 40, 37 \n\t" // s * y "xvmuldp %x9, 48, 37 \n\t" // s * y
"xvmuldp 61, 41, 37 \n\t" "xvmuldp %x10, 49, 37 \n\t"
"xvmuldp 62, 42, 37 \n\t" "xvmuldp %x11, 50, 37 \n\t"
"xvmuldp 63, 43, 37 \n\t" "xvmuldp %x12, 51, 37 \n\t"
"xvadddp 48, 48 , 60 \n\t" // c * x + s * y "addi %3, %3, -64 \n\t"
"xvadddp 49, 49 , 61 \n\t" // c * x + s * y "addi %4, %4, -64 \n\t"
"xvadddp 50, 50 , 62 \n\t" // c * x + s * y
"xvadddp 51, 51 , 63 \n\t" // c * x + s * y
"xvsubdp 56, 56 , 52 \n\t" // c * y - s * x "xvadddp 40, 40, %x9 \n\t" // c * x + s * y
"xvsubdp 57, 57 , 53 \n\t" // c * y - s * x "xvadddp 41, 41, %x10 \n\t" // c * x + s * y
"xvsubdp 58, 58 , 54 \n\t" // c * y - s * x "xvadddp 42, 42, %x11 \n\t" // c * x + s * y
"xvsubdp 59, 59 , 55 \n\t" // c * y - s * x "xvadddp 43, 43, %x12 \n\t" // c * x + s * y
"stxvd2x 48, 0, %8 \n\t" // store x "xvsubdp %x5, %x5, 44 \n\t" // c * y - s * x
"stxvd2x 49, %5, %8 \n\t" "xvsubdp %x6, %x6, 45 \n\t" // c * y - s * x
"stxvd2x 50, %6, %8 \n\t" "xvsubdp %x7, %x7, 46 \n\t" // c * y - s * x
"stxvd2x 51, %7, %8 \n\t" "xvsubdp %x8, %x8, 47 \n\t" // c * y - s * x
"stxvd2x 56, 0, %9 \n\t" // store y
"stxvd2x 57, %5, %9 \n\t"
"stxvd2x 58, %6, %9 \n\t"
"stxvd2x 59, %7, %9 \n\t"
"stxvd2x 40, 0, %3 \n\t" // store x
"stxvd2x 41, %15, %3 \n\t"
"stxvd2x 42, %16, %3 \n\t"
"stxvd2x 43, %17, %3 \n\t"
"stxvd2x %x5, 0, %4 \n\t" // store y
"stxvd2x %x6, %15, %4 \n\t"
"stxvd2x %x7, %16, %4 \n\t"
"stxvd2x %x8, %17, %4 \n"
"#n=%2 x=%0=%3 y=%1=%4 c=%13 s=%14 o16=%15 o32=%16 o48=%17\n"
"#t0=%x5 t1=%x6 t2=%x7 t3=%x8 t4=%x9 t5=%x10 t6=%x11 t7=%x12"
: :
"+m" (*x),
"+m" (*y),
"+r" (n), // 2
"+b" (x), // 3
"+b" (y), // 4
"=wa" (t0), // 5
"=wa" (t1), // 6
"=wa" (t2), // 7
"=wa" (t3), // 8
"=wa" (t4), // 9
"=wa" (t5), // 10
"=wa" (t6), // 11
"=wa" (t7) // 12
: :
"r" (i), // 0 "d" (c), // 13
"r" (x1), // 1 "d" (s), // 14
"r" (y1), // 2 "b" (16), // 15
"r" (c), // 3 "b" (32), // 16
"r" (s), // 4 "b" (48) // 17
"r" (o16), // 5 :
"r" (o32), // 6 "cr0",
"r" (o48), // 7 "vs32","vs33","vs34","vs35","vs36","vs37",
"r" (x2), // 8 "vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47",
"r" (y2) // 9 "vs48","vs49","vs50","vs51"
: "cr0", "%0", "%1" , "%2", "%8", "%9", "memory"
); );
} }
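The eight "=wa" outputs (t0..t7) in the kernel above let gcc choose the temporary VSX registers instead of hard-coding vs52-vs63, and the %x print modifier expands to the full 0-63 VSX number of whatever register was picked. A tiny illustration of that mechanism (hypothetical function, not part of the patch; assumes gcc with VSX enabled):

static double vsx_twice (double a)
{
    __vector double t;    // gcc picks the VSX register for this temporary
    double r;
    __asm__
    (
        "xxspltd %x1, %x2, 0      \n\t"    // t = { a, a }
        "xvadddp %x1, %x1, %x1    \n\t"    // t = { 2a, 2a }
        "xxlor   %x0, %x1, %x1    \n"      // r = 2a (doubleword 0 of t)
        :
        "=d" (r),       // 0
        "=&wa" (t)      // 1
        :
        "d" (a)         // 2
    );
    return r;
}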

View File

@ -41,11 +41,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#if !defined(HAVE_KERNEL_8) #if !defined(HAVE_KERNEL_8)
static void dscal_kernel_8( BLASLONG n, FLOAT *da , FLOAT *x ) static void dscal_kernel_8 (BLASLONG n, FLOAT *x, FLOAT alpha)
{ {
BLASLONG i; BLASLONG i;
FLOAT alpha = *da;
for( i=0; i<n; i+=8 ) for( i=0; i<n; i+=8 )
{ {
@ -62,7 +61,7 @@ static void dscal_kernel_8( BLASLONG n, FLOAT *da , FLOAT *x )
} }
static void dscal_kernel_8_zero( BLASLONG n, FLOAT *da , FLOAT *x ) static void dscal_kernel_8_zero (BLASLONG n, FLOAT *x)
{ {
BLASLONG i; BLASLONG i;
@ -102,10 +101,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS
BLASLONG n1 = n & -16; BLASLONG n1 = n & -16;
if ( n1 > 0 ) if ( n1 > 0 )
{ {
FLOAT alpha[2]; dscal_kernel_8_zero(n1, x);
alpha[0]=da;
alpha[1]=da;
dscal_kernel_8_zero(n1 , alpha , x);
j=n1; j=n1;
} }
@ -123,10 +119,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS
BLASLONG n1 = n & -16; BLASLONG n1 = n & -16;
if ( n1 > 0 ) if ( n1 > 0 )
{ {
FLOAT alpha[2]; dscal_kernel_8(n1, x, da);
alpha[0]=da;
alpha[1]=da;
dscal_kernel_8(n1 , alpha , x);
j=n1; j=n1;
} }
while(j < n) while(j < n)

View File

@ -35,185 +35,149 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define HAVE_KERNEL_8 1 #define HAVE_KERNEL_8 1
static void dscal_kernel_8( BLASLONG n, FLOAT *alpha, FLOAT *x) __attribute__ ((noinline)); static void dscal_kernel_8 (long n, double *x, double alpha)
static void dscal_kernel_8( BLASLONG n, FLOAT *alpha, FLOAT *x)
{ {
__asm__
BLASLONG i = n;
BLASLONG o16 = 16;
BLASLONG o32 = 32;
BLASLONG o48 = 48;
BLASLONG o64 = 64;
BLASLONG o80 = 80;
BLASLONG o96 = 96;
BLASLONG o112 = 112;
FLOAT *x1=x;
FLOAT *x2=x+1;
BLASLONG pre = 384;
__asm__ __volatile__
( (
"dcbt 0, %2 \n\t"
"lxsdx 33, 0, %3 \n\t" "xxspltd %x3, %x3, 0 \n\t"
"xxspltd 32, 33, 0 \n\t"
"addi %1, %1, -8 \n\t"
"dcbt %2, %4 \n\t" "lxvd2x 32, 0, %2 \n\t"
"lxvd2x 33, %4, %2 \n\t"
"lxvd2x 40, 0, %2 \n\t" "lxvd2x 34, %5, %2 \n\t"
"lxvd2x 41, %5, %2 \n\t" "lxvd2x 35, %6, %2 \n\t"
"lxvd2x 42, %6, %2 \n\t" "lxvd2x 36, %7, %2 \n\t"
"lxvd2x 43, %7, %2 \n\t" "lxvd2x 37, %8, %2 \n\t"
"lxvd2x 44, %8, %2 \n\t" "lxvd2x 38, %9, %2 \n\t"
"lxvd2x 45, %9, %2 \n\t" "lxvd2x 39, %10, %2 \n\t"
"lxvd2x 46, %10, %2 \n\t"
"lxvd2x 47, %11, %2 \n\t"
"addi %2, %2, 128 \n\t" "addi %2, %2, 128 \n\t"
"addic. %0 , %0 , -16 \n\t" "addic. %1, %1, -16 \n\t"
"ble 2f \n\t" "ble 2f \n\t"
".align 5 \n\t" ".p2align 5 \n"
"1: \n\t" "1: \n\t"
"dcbt %2, %4 \n\t" "xvmuldp 40, 32, %x3 \n\t"
"xvmuldp 41, 33, %x3 \n\t"
"lxvd2x 32, 0, %2 \n\t"
"lxvd2x 33, %4, %2 \n\t"
"xvmuldp 42, 34, %x3 \n\t"
"xvmuldp 43, 35, %x3 \n\t"
"lxvd2x 34, %5, %2 \n\t"
"lxvd2x 35, %6, %2 \n\t"
"xvmuldp 44, 36, %x3 \n\t"
"xvmuldp 45, 37, %x3 \n\t"
"lxvd2x 36, %7, %2 \n\t"
"lxvd2x 37, %8, %2 \n\t"
"xvmuldp 46, 38, %x3 \n\t"
"xvmuldp 47, 39, %x3 \n\t"
"lxvd2x 38, %9, %2 \n\t"
"lxvd2x 39, %10, %2 \n\t"
"xvmuldp 48, 40, 32 \n\t" "addi %2, %2, -128 \n\t"
"xvmuldp 49, 41, 32 \n\t"
"lxvd2x 40, 0, %2 \n\t"
"lxvd2x 41, %5, %2 \n\t"
"xvmuldp 50, 42, 32 \n\t"
"xvmuldp 51, 43, 32 \n\t"
"lxvd2x 42, %6, %2 \n\t"
"lxvd2x 43, %7, %2 \n\t"
"xvmuldp 52, 44, 32 \n\t"
"xvmuldp 53, 45, 32 \n\t"
"lxvd2x 44, %8, %2 \n\t"
"lxvd2x 45, %9, %2 \n\t"
"xvmuldp 54, 46, 32 \n\t"
"xvmuldp 55, 47, 32 \n\t"
"lxvd2x 46, %10, %2 \n\t"
"lxvd2x 47, %11, %2 \n\t"
"stxvd2x 48, 0, %1 \n\t" "stxvd2x 40, 0, %2 \n\t"
"stxvd2x 49, %5, %1 \n\t" "stxvd2x 41, %4, %2 \n\t"
"stxvd2x 50, %6, %1 \n\t" "stxvd2x 42, %5, %2 \n\t"
"stxvd2x 51, %7, %1 \n\t" "stxvd2x 43, %6, %2 \n\t"
"stxvd2x 52, %8, %1 \n\t" "stxvd2x 44, %7, %2 \n\t"
"stxvd2x 53, %9, %1 \n\t" "stxvd2x 45, %8, %2 \n\t"
"stxvd2x 54, %10, %1 \n\t" "stxvd2x 46, %9, %2 \n\t"
"stxvd2x 55, %11, %1 \n\t" "stxvd2x 47, %10, %2 \n\t"
"addi %2, %2, 256 \n\t"
"addic. %1, %1, -16 \n\t"
"bgt 1b \n"
"2: \n\t"
"xvmuldp 40, 32, %x3 \n\t"
"xvmuldp 41, 33, %x3 \n\t"
"xvmuldp 42, 34, %x3 \n\t"
"xvmuldp 43, 35, %x3 \n\t"
"addi %2, %2, -128 \n\t"
"xvmuldp 44, 36, %x3 \n\t"
"xvmuldp 45, 37, %x3 \n\t"
"xvmuldp 46, 38, %x3 \n\t"
"xvmuldp 47, 39, %x3 \n\t"
"stxvd2x 40, 0, %2 \n\t"
"stxvd2x 41, %4, %2 \n\t"
"stxvd2x 42, %5, %2 \n\t"
"stxvd2x 43, %6, %2 \n\t"
"stxvd2x 44, %7, %2 \n\t"
"stxvd2x 45, %8, %2 \n\t"
"stxvd2x 46, %9, %2 \n\t"
"stxvd2x 47, %10, %2 \n"
"#n=%1 alpha=%3 x=%0=%2 o16=%4 o32=%5 o48=%6 o64=%7 o80=%8 o96=%9 o112=%10"
:
"+m" (*x),
"+r" (n), // 1
"+b" (x) // 2
:
"d" (alpha), // 3
"b" (16), // 4
"b" (32), // 5
"b" (48), // 6
"b" (64), // 7
"b" (80), // 8
"b" (96), // 9
"b" (112) // 10
:
"cr0",
"vs32","vs33","vs34","vs35","vs36","vs37","vs38","vs39",
"vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47"
);
}
static void dscal_kernel_8_zero (long n, double *x)
{
__vector double t0;
__asm__
(
"xxlxor %x3, %x3, %x3 \n\t"
".p2align 5 \n"
"1: \n\t"
"stxvd2x %x3, 0, %2 \n\t"
"stxvd2x %x3, %4, %2 \n\t"
"stxvd2x %x3, %5, %2 \n\t"
"stxvd2x %x3, %6, %2 \n\t"
"stxvd2x %x3, %7, %2 \n\t"
"stxvd2x %x3, %8, %2 \n\t"
"stxvd2x %x3, %9, %2 \n\t"
"stxvd2x %x3, %10, %2 \n\t"
"addi %1, %1, 128 \n\t"
"addi %2, %2, 128 \n\t" "addi %2, %2, 128 \n\t"
"addic. %0 , %0 , -16 \n\t" "addic. %1, %1, -16 \n\t"
"bgt 1b \n\t" "bgt 1b \n"
"2: \n\t"
"xvmuldp 48, 40, 32 \n\t"
"xvmuldp 49, 41, 32 \n\t"
"xvmuldp 50, 42, 32 \n\t"
"xvmuldp 51, 43, 32 \n\t"
"xvmuldp 52, 44, 32 \n\t"
"xvmuldp 53, 45, 32 \n\t"
"xvmuldp 54, 46, 32 \n\t"
"xvmuldp 55, 47, 32 \n\t"
"stxvd2x 48, 0, %1 \n\t"
"stxvd2x 49, %5, %1 \n\t"
"stxvd2x 50, %6, %1 \n\t"
"stxvd2x 51, %7, %1 \n\t"
"stxvd2x 52, %8, %1 \n\t"
"stxvd2x 53, %9, %1 \n\t"
"stxvd2x 54, %10, %1 \n\t"
"stxvd2x 55, %11, %1 \n\t"
"#n=%1 x=%0=%2 t0=%x3 o16=%4 o32=%5 o48=%6 o64=%7 o80=%8 o96=%9 o112=%10"
: :
"=m" (*x),
"+r" (n), // 1
"+b" (x), // 2
"=wa" (t0) // 3
: :
"r" (i), // 0 "b" (16), // 4
"r" (x2), // 1 "b" (32), // 5
"r" (x1), // 2 "b" (48), // 6
"r" (alpha), // 3 "b" (64), // 7
"r" (pre), // 4 "b" (80), // 8
"r" (o16), // 5 "b" (96), // 9
"r" (o32), // 6 "b" (112) // 10
"r" (o48), // 7 :
"r" (o64), // 8 "cr0"
"r" (o80), // 9
"r" (o96), // 10
"r" (o112) // 11
: "cr0", "%0", "%2" , "%1", "memory"
); );
} }
static void dscal_kernel_8_zero( BLASLONG n, FLOAT *alpha, FLOAT *x) __attribute__ ((noinline));
static void dscal_kernel_8_zero( BLASLONG n, FLOAT *alpha, FLOAT *x)
{
BLASLONG i = n;
BLASLONG o16 = 16;
BLASLONG o32 = 32;
BLASLONG o48 = 48;
BLASLONG o64 = 64;
BLASLONG o80 = 80;
BLASLONG o96 = 96;
BLASLONG o112 = 112;
FLOAT *x1=x;
FLOAT *x2=x+1;
BLASLONG pre = 384;
__asm__ __volatile__
(
"xxlxor 32 , 32 , 32 \n\t"
"addi %1, %1, -8 \n\t"
".align 5 \n\t"
"1: \n\t"
"stxvd2x 32, 0, %1 \n\t"
"stxvd2x 32, %5, %1 \n\t"
"stxvd2x 32, %6, %1 \n\t"
"stxvd2x 32, %7, %1 \n\t"
"stxvd2x 32, %8, %1 \n\t"
"stxvd2x 32, %9, %1 \n\t"
"stxvd2x 32, %10, %1 \n\t"
"stxvd2x 32, %11, %1 \n\t"
"addi %1, %1, 128 \n\t"
"addic. %0 , %0 , -16 \n\t"
"bgt 1b \n\t"
"2: \n\t"
:
:
"r" (i), // 0
"r" (x2), // 1
"r" (x1), // 2
"r" (alpha), // 3
"r" (pre), // 4
"r" (o16), // 5
"r" (o32), // 6
"r" (o48), // 7
"r" (o64), // 8
"r" (o80), // 9
"r" (o96), // 10
"r" (o112) // 11
: "cr0", "%0", "%2" , "%1", "memory"
);
}
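With alpha arriving in a floating-point register ("d") and broadcast by xxspltd, and with the vector handed to gcc as an in/out memory operand, the scal kernels no longer need a staging array for alpha, __volatile__, or the noinline attribute. A minimal sketch of that calling pattern (illustrative, not part of the patch; assumes gcc with VSX):

static void scale_2_doubles (double *x, double alpha)
{
    __asm__
    (
        "xxspltd 32, %x2, 0       \n\t"    // vs32 = { alpha, alpha }
        "lxvd2x  33, 0, %1        \n\t"
        "xvmuldp 33, 33, 32       \n\t"
        "stxvd2x 33, 0, %1        \n"
        :
        "+m" (*(double (*)[2]) x)
        :
        "b" (x),        // 1
        "d" (alpha)     // 2
        :
        "vs32","vs33"
    );
}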

View File

@ -35,79 +35,56 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define HAVE_KERNEL_32 1 #define HAVE_KERNEL_32 1
static void dswap_kernel_32( BLASLONG n, FLOAT *x, FLOAT *y) __attribute__ ((noinline)); static void dswap_kernel_32 (long n, double *x, double *y)
static void dswap_kernel_32( BLASLONG n, FLOAT *x, FLOAT *y)
{ {
__asm__
BLASLONG i = n;
BLASLONG o16 = 16;
BLASLONG o32 = 32;
BLASLONG o48 = 48;
BLASLONG o64 = 64;
BLASLONG o80 = 80;
BLASLONG o96 = 96;
BLASLONG o112 = 112;
FLOAT *x1=x;
FLOAT *y1=y;
FLOAT *x2=x+1;
FLOAT *y2=y+1;
BLASLONG pre = 384;
BLASLONG alpha=0;
__asm__ __volatile__
( (
".p2align 5 \n"
"addi %3, %3, -8 \n\t"
"addi %4, %4, -8 \n\t"
".align 5 \n\t"
"1: \n\t" "1: \n\t"
"lxvd2x 32, 0, %2 \n\t" "lxvd2x 32, 0, %4 \n\t"
"lxvd2x 33, %5, %2 \n\t" "lxvd2x 33, %5, %4 \n\t"
"lxvd2x 34, %6, %2 \n\t" "lxvd2x 34, %6, %4 \n\t"
"lxvd2x 35, %7, %2 \n\t" "lxvd2x 35, %7, %4 \n\t"
"lxvd2x 36, %8, %2 \n\t" "lxvd2x 36, %8, %4 \n\t"
"lxvd2x 37, %9, %2 \n\t" "lxvd2x 37, %9, %4 \n\t"
"lxvd2x 38, %10, %2 \n\t" "lxvd2x 38, %10, %4 \n\t"
"lxvd2x 39, %11, %2 \n\t" "lxvd2x 39, %11, %4 \n\t"
"addi %2, %2, 128 \n\t" "addi %4, %4, 128 \n\t"
"lxvd2x 40, 0, %2 \n\t" "lxvd2x 40, 0, %4 \n\t"
"lxvd2x 41, %5, %2 \n\t" "lxvd2x 41, %5, %4 \n\t"
"lxvd2x 42, %6, %2 \n\t" "lxvd2x 42, %6, %4 \n\t"
"lxvd2x 43, %7, %2 \n\t" "lxvd2x 43, %7, %4 \n\t"
"lxvd2x 44, %8, %2 \n\t" "lxvd2x 44, %8, %4 \n\t"
"lxvd2x 45, %9, %2 \n\t" "lxvd2x 45, %9, %4 \n\t"
"lxvd2x 46, %10, %2 \n\t" "lxvd2x 46, %10, %4 \n\t"
"lxvd2x 47, %11, %2 \n\t" "lxvd2x 47, %11, %4 \n\t"
"addi %2, %2, 128 \n\t" "addi %4, %4, -128 \n\t"
"lxvd2x 48, 0, %1 \n\t" "lxvd2x 48, 0, %3 \n\t"
"lxvd2x 49, %5, %1 \n\t" "lxvd2x 49, %5, %3 \n\t"
"lxvd2x 50, %6, %1 \n\t" "lxvd2x 50, %6, %3 \n\t"
"lxvd2x 51, %7, %1 \n\t" "lxvd2x 51, %7, %3 \n\t"
"lxvd2x 52, %8, %1 \n\t" "lxvd2x 0, %8, %3 \n\t"
"lxvd2x 53, %9, %1 \n\t" "lxvd2x 1, %9, %3 \n\t"
"lxvd2x 54, %10, %1 \n\t" "lxvd2x 2, %10, %3 \n\t"
"lxvd2x 55, %11, %1 \n\t" "lxvd2x 3, %11, %3 \n\t"
"addi %1, %1, 128 \n\t" "addi %3, %3, 128 \n\t"
"lxvd2x 56, 0, %1 \n\t" "lxvd2x 4, 0, %3 \n\t"
"lxvd2x 57, %5, %1 \n\t" "lxvd2x 5, %5, %3 \n\t"
"lxvd2x 58, %6, %1 \n\t" "lxvd2x 6, %6, %3 \n\t"
"lxvd2x 59, %7, %1 \n\t" "lxvd2x 7, %7, %3 \n\t"
"lxvd2x 60, %8, %1 \n\t" "lxvd2x 8, %8, %3 \n\t"
"lxvd2x 61, %9, %1 \n\t" "lxvd2x 9, %9, %3 \n\t"
"lxvd2x 62, %10, %1 \n\t" "lxvd2x 10, %10, %3 \n\t"
"lxvd2x 63, %11, %1 \n\t" "lxvd2x 11, %11, %3 \n\t"
"addi %1, %1, 128 \n\t" "addi %3, %3, -128 \n\t"
"stxvd2x 32, 0, %3 \n\t" "stxvd2x 32, 0, %3 \n\t"
"stxvd2x 33, %5, %3 \n\t" "stxvd2x 33, %5, %3 \n\t"
@ -135,46 +112,47 @@ static void dswap_kernel_32( BLASLONG n, FLOAT *x, FLOAT *y)
"stxvd2x 49, %5, %4 \n\t" "stxvd2x 49, %5, %4 \n\t"
"stxvd2x 50, %6, %4 \n\t" "stxvd2x 50, %6, %4 \n\t"
"stxvd2x 51, %7, %4 \n\t" "stxvd2x 51, %7, %4 \n\t"
"stxvd2x 52, %8, %4 \n\t" "stxvd2x 0, %8, %4 \n\t"
"stxvd2x 53, %9, %4 \n\t" "stxvd2x 1, %9, %4 \n\t"
"stxvd2x 54, %10, %4 \n\t" "stxvd2x 2, %10, %4 \n\t"
"stxvd2x 55, %11, %4 \n\t" "stxvd2x 3, %11, %4 \n\t"
"addi %4, %4, 128 \n\t" "addi %4, %4, 128 \n\t"
"stxvd2x 56, 0, %4 \n\t" "stxvd2x 4, 0, %4 \n\t"
"stxvd2x 57, %5, %4 \n\t" "stxvd2x 5, %5, %4 \n\t"
"stxvd2x 58, %6, %4 \n\t" "stxvd2x 6, %6, %4 \n\t"
"stxvd2x 59, %7, %4 \n\t" "stxvd2x 7, %7, %4 \n\t"
"stxvd2x 60, %8, %4 \n\t" "stxvd2x 8, %8, %4 \n\t"
"stxvd2x 61, %9, %4 \n\t" "stxvd2x 9, %9, %4 \n\t"
"stxvd2x 62, %10, %4 \n\t" "stxvd2x 10, %10, %4 \n\t"
"stxvd2x 63, %11, %4 \n\t" "stxvd2x 11, %11, %4 \n\t"
"addi %4, %4, 128 \n\t" "addi %4, %4, 128 \n\t"
"addic. %0 , %0 , -32 \n\t" "addic. %2, %2, -32 \n\t"
"bgt 1b \n\t" "bgt 1b \n"
"2: \n\t"
"#n=%2 x=%0=%3 y=%1=%4 o16=%5 o32=%6 o48=%7 o64=%8 o80=%9 o96=%10 o112=%11"
: :
"+m" (*x),
"+m" (*y),
"+r" (n), // 2
"+b" (x), // 3
"+b" (y) // 4
: :
"r" (i), // 0 "b" (16), // 5
"r" (y1), // 1 "b" (32), // 6
"r" (x1), // 2 "b" (48), // 7
"r" (y2), // 3 "b" (64), // 8
"r" (x2), // 4 "b" (80), // 9
"r" (o16), // 5 "b" (96), // 10
"r" (o32), // 6 "b" (112) // 11
"r" (o48), // 7 :
"r" (o64), // 8 "cr0",
"r" (o80), // 9 "vs32","vs33","vs34","vs35","vs36","vs37","vs38","vs39",
"r" (o96), // 10 "vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47",
"r" (o112) // 11 "vs48","vs49","vs50","vs51","vs0","vs1","vs2","vs3",
: "cr0", "%0", "%2" , "%1", "%3", "%4", "memory" "vs4","vs5","vs6","vs7","vs8","vs9","vs10","vs11"
); );
} }
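The rewritten swap kernel takes the extra temporaries it needs from vs0-vs11, which alias the volatile fr0-fr11, rather than from vs52-vs63, which alias the nonvolatile v20-v31, and it names every scratch register in the clobber list. A minimal sketch of that discipline (illustrative, not part of the patch; assumes gcc with VSX and the usual ELFv2 register conventions):

static void swap_2_doubles (double *x, double *y)
{
    __asm__
    (
        "lxvd2x   0, 0, %2        \n\t"    // vs0  <- x[0], x[1]  (volatile fr0 slot)
        "lxvd2x  32, 0, %3        \n\t"    // vs32 <- y[0], y[1]  (volatile v0 slot)
        "stxvd2x  0, 0, %3        \n\t"
        "stxvd2x 32, 0, %2        \n"
        :
        "+m" (*(double (*)[2]) x),    // 0
        "+m" (*(double (*)[2]) y)     // 1
        :
        "b" (x),        // 2
        "b" (y)         // 3
        :
        "vs0","vs32"
    );
}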

View File

@ -38,7 +38,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#if defined(DOUBLE) #if defined(DOUBLE)
#define ABS fabs #error supports float only
#else #else
@ -53,7 +53,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#ifndef HAVE_KERNEL_32 #ifndef HAVE_KERNEL_32
static void sasum_kernel_32(BLASLONG n, FLOAT *x1, FLOAT *svec) static FLOAT sasum_kernel_32(BLASLONG n, FLOAT *x1)
{ {
BLASLONG i=0; BLASLONG i=0;
@ -92,11 +92,7 @@ static void sasum_kernel_32(BLASLONG n, FLOAT *x1, FLOAT *svec)
} }
svec[0] = sum0+sum1+sum2+sum3; return sum0+sum1+sum2+sum3;
svec[1] = 0.0;
svec[2] = 0.0;
svec[3] = 0.0;
} }
#endif #endif
@ -105,7 +101,6 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
{ {
BLASLONG i=0; BLASLONG i=0;
FLOAT sumf = 0.0; FLOAT sumf = 0.0;
FLOAT svec[4] __attribute__ ((aligned (16)));;
BLASLONG n1; BLASLONG n1;
if (n <= 0 || inc_x <= 0) return(sumf); if (n <= 0 || inc_x <= 0) return(sumf);
@ -117,8 +112,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
if ( n1 > 0 ) if ( n1 > 0 )
{ {
sasum_kernel_32(n1, x, svec); sumf = sasum_kernel_32(n1, x);
sumf = svec[0] + svec[1]+svec[2]+svec[3];
i=n1; i=n1;
} }

View File

@ -34,113 +34,101 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
**************************************************************************************/ **************************************************************************************/
#define HAVE_KERNEL_32 1 #define HAVE_KERNEL_32 1
static void sasum_kernel_32( BLASLONG n, FLOAT *x, FLOAT *svec) __attribute__ ((noinline));
static void sasum_kernel_32( BLASLONG n, FLOAT *x, FLOAT *svec) static float sasum_kernel_32 (long n, float *x)
{ {
float sum;
__vector float t0;
__vector float t1;
__vector float t2;
__vector float t3;
__asm__
BLASLONG i = n;
BLASLONG o16 = 16;
BLASLONG o32 = 32;
BLASLONG o48 = 48;
BLASLONG o64 = 64;
BLASLONG o80 = 80;
BLASLONG o96 = 96;
BLASLONG o112 = 112;
FLOAT *x1=x;
BLASLONG pre = 384;
__asm__ __volatile__
( (
"dcbt 0, %2 \n\t"
"dcbt %2 , %4 \n\t" "xxlxor 32, 32, 32 \n\t"
"xxlxor 33, 33, 33 \n\t"
"xxlxor 32,32,32 \n\t" "xxlxor 34, 34, 34 \n\t"
"xxlxor 33,33,33 \n\t" "xxlxor 35, 35, 35 \n\t"
"xxlxor 34,34,34 \n\t" "xxlxor 36, 36, 36 \n\t"
"xxlxor 35,35,35 \n\t" "xxlxor 37, 37, 37 \n\t"
"xxlxor 36,36,36 \n\t" "xxlxor 38, 38, 38 \n\t"
"xxlxor 37,37,37 \n\t" "xxlxor 39, 39, 39 \n\t"
"xxlxor 38,38,38 \n\t"
"xxlxor 39,39,39 \n\t"
"lxvw4x 40, 0, %2 \n\t" "lxvw4x 40, 0, %2 \n\t"
"lxvw4x 41, %5, %2 \n\t" "lxvw4x 41, %8, %2 \n\t"
"lxvw4x 42, %6, %2 \n\t" "lxvw4x 42, %9, %2 \n\t"
"lxvw4x 43, %7, %2 \n\t" "lxvw4x 43, %10, %2 \n\t"
"lxvw4x 44, %8, %2 \n\t" "lxvw4x 44, %11, %2 \n\t"
"lxvw4x 45, %9, %2 \n\t" "lxvw4x 45, %12, %2 \n\t"
"lxvw4x 46, %10, %2 \n\t" "lxvw4x 46, %13, %2 \n\t"
"lxvw4x 47, %11, %2 \n\t" "lxvw4x 47, %14, %2 \n\t"
"addi %2, %2, 128 \n\t" "addi %2, %2, 128 \n\t"
"addic. %0 , %0 , -32 \n\t" "addic. %1, %1, -32 \n\t"
"ble 2f \n\t" "ble 2f \n\t"
".align 5 \n\t" ".p2align 5 \n"
"1: \n\t" "1: \n\t"
"dcbt %2 , %4 \n\t"
"xvabssp 48, 40 \n\t" "xvabssp 48, 40 \n\t"
"xvabssp 49, 41 \n\t" "xvabssp 49, 41 \n\t"
"xvabssp 50, 42 \n\t" "xvabssp 50, 42 \n\t"
"xvabssp 51, 43 \n\t" "xvabssp 51, 43 \n\t"
"lxvw4x 40, 0, %2 \n\t" "lxvw4x 40, 0, %2 \n\t"
"lxvw4x 41, %5, %2 \n\t" "lxvw4x 41, %8, %2 \n\t"
"xvabssp 52, 44 \n\t" "xvabssp %x3, 44 \n\t"
"xvabssp 53, 45 \n\t" "xvabssp %x4, 45 \n\t"
"lxvw4x 42, %6, %2 \n\t" "lxvw4x 42, %9, %2 \n\t"
"lxvw4x 43, %7, %2 \n\t" "lxvw4x 43, %10, %2 \n\t"
"xvabssp 54, 46 \n\t" "xvabssp %x5, 46 \n\t"
"xvabssp 55, 47 \n\t" "xvabssp %x6, 47 \n\t"
"lxvw4x 44, %8, %2 \n\t" "lxvw4x 44, %11, %2 \n\t"
"lxvw4x 45, %9, %2 \n\t" "lxvw4x 45, %12, %2 \n\t"
"xvaddsp 32, 32, 48 \n\t" "xvaddsp 32, 32, 48 \n\t"
"xvaddsp 33, 33, 49 \n\t" "xvaddsp 33, 33, 49 \n\t"
"lxvw4x 46, %10, %2 \n\t" "lxvw4x 46, %13, %2 \n\t"
"lxvw4x 47, %11, %2 \n\t" "lxvw4x 47, %14, %2 \n\t"
"xvaddsp 34, 34, 50 \n\t" "xvaddsp 34, 34, 50 \n\t"
"xvaddsp 35, 35, 51 \n\t" "xvaddsp 35, 35, 51 \n\t"
"addi %2, %2, 128 \n\t" "addi %2, %2, 128 \n\t"
"xvaddsp 36, 36, 52 \n\t" "xvaddsp 36, 36, %x3 \n\t"
"xvaddsp 37, 37, 53 \n\t" "xvaddsp 37, 37, %x4 \n\t"
"addic. %0 , %0 , -32 \n\t" "addic. %1, %1, -32 \n\t"
"xvaddsp 38, 38, 54 \n\t" "xvaddsp 38, 38, %x5 \n\t"
"xvaddsp 39, 39, 55 \n\t" "xvaddsp 39, 39, %x6 \n\t"
"bgt 1b \n\t" "bgt 1b \n"
"2: \n\t" "2: \n\t"
"xvabssp 48, 40 \n\t" "xvabssp 48, 40 \n\t"
"xvabssp 49, 41 \n\t" "xvabssp 49, 41 \n\t"
"xvabssp 50, 42 \n\t" "xvabssp 50, 42 \n\t"
"xvabssp 51, 43 \n\t" "xvabssp 51, 43 \n\t"
"xvabssp 52, 44 \n\t" "xvabssp %x3, 44 \n\t"
"xvabssp 53, 45 \n\t" "xvabssp %x4, 45 \n\t"
"xvabssp 54, 46 \n\t" "xvabssp %x5, 46 \n\t"
"xvabssp 55, 47 \n\t" "xvabssp %x6, 47 \n\t"
"xvaddsp 32, 32, 48 \n\t" "xvaddsp 32, 32, 48 \n\t"
"xvaddsp 33, 33, 49 \n\t" "xvaddsp 33, 33, 49 \n\t"
"xvaddsp 34, 34, 50 \n\t" "xvaddsp 34, 34, 50 \n\t"
"xvaddsp 35, 35, 51 \n\t" "xvaddsp 35, 35, 51 \n\t"
"xvaddsp 36, 36, 52 \n\t" "xvaddsp 36, 36, %x3 \n\t"
"xvaddsp 37, 37, 53 \n\t" "xvaddsp 37, 37, %x4 \n\t"
"xvaddsp 38, 38, 54 \n\t" "xvaddsp 38, 38, %x5 \n\t"
"xvaddsp 39, 39, 55 \n\t" "xvaddsp 39, 39, %x6 \n\t"
"xvaddsp 32, 32, 33 \n\t" "xvaddsp 32, 32, 33 \n\t"
"xvaddsp 34, 34, 35 \n\t" "xvaddsp 34, 34, 35 \n\t"
@ -152,26 +140,39 @@ static void sasum_kernel_32( BLASLONG n, FLOAT *x, FLOAT *svec)
"xvaddsp 32, 32, 36 \n\t" "xvaddsp 32, 32, 36 \n\t"
"xxsldwi 33, 32, 32, 2 \n\t"
"xvaddsp 32, 32, 33 \n\t"
"stxvw4x 32, 0, %3 \n\t" "xxsldwi 33, 32, 32, 1 \n\t"
"xvaddsp 32, 32, 33 \n\t"
"xscvspdp %0, 32 \n"
"#n=%1 x=%3=%2 sum=%0 o16=%8 o32=%9 o48=%10 o64=%11 o80=%12 o96=%13 o112=%14\n"
"#t0=%x3 t1=%x4 t2=%x5 t3=%x6"
: :
"=f" (sum), // 0
"+r" (n), // 1
"+b" (x), // 2
"=wa" (t0), // 3
"=wa" (t1), // 4
"=wa" (t2), // 5
"=wa" (t3) // 6
: :
"r" (i), // 0 "m" (*x),
"r" (n), // 1 "b" (16), // 8
"r" (x1), // 2 "b" (32), // 9
"r" (svec), // 3 "b" (48), // 10
"r" (pre), // 4 "b" (64), // 11
"r" (o16), // 5 "b" (80), // 12
"r" (o32), // 6 "b" (96), // 13
"r" (o48), // 7 "b" (112) // 14
"r" (o64), // 8 :
"r" (o80), // 9 "cr0",
"r" (o96), // 10 "vs32","vs33","vs34","vs35","vs36","vs37","vs38","vs39",
"r" (o112) // 11 "vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47",
: "cr0", "%0", "%2", "memory" "vs48","vs49","vs50","vs51"
); );
return sum;
} }
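The tail of the rewritten asum (and dot) kernels replaces the store-to-a-stack-array trick with an in-register reduction: two xxsldwi rotate-and-add steps leave the total in every word, and xscvspdp converts word 0 into the double-precision layout gcc expects for a scalar "=f" output. A stand-alone sketch of just that reduction (illustrative, not part of the patch; assumes gcc with VSX):

static float hsum_4_floats (__vector float v)
{
    float sum;
    __vector float t;
    __asm__
    (
        "xxsldwi  %x2, %x1, %x1, 2    \n\t"    // rotate by two words and add
        "xvaddsp  %x1, %x1, %x2       \n\t"
        "xxsldwi  %x2, %x1, %x1, 1    \n\t"    // rotate by one word and add
        "xvaddsp  %x1, %x1, %x2       \n\t"
        "xscvspdp %x0, %x1            \n"      // word 0 -> scalar result
        :
        "=f" (sum),     // 0
        "+wa" (v),      // 1
        "=&wa" (t)      // 2
    );
    return sum;
}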

View File

@ -35,28 +35,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define HAVE_KERNEL_32 1 #define HAVE_KERNEL_32 1
static void scopy_kernel_32( BLASLONG n, FLOAT *x, FLOAT *y) __attribute__ ((noinline)); static void scopy_kernel_32 (long n, float *x, float *y)
static void scopy_kernel_32( BLASLONG n, FLOAT *x, FLOAT *y)
{ {
__asm__
BLASLONG i = n;
BLASLONG o16 = 16;
BLASLONG o32 = 32;
BLASLONG o48 = 48;
BLASLONG o64 = 64;
BLASLONG o80 = 80;
BLASLONG o96 = 96;
BLASLONG o112 = 112;
FLOAT *x1=x;
FLOAT *y1=y;
BLASLONG pre = 384;
BLASLONG alpha=0;
__asm__ __volatile__
( (
"lxvw4x 40, 0, %2 \n\t" "lxvw4x 40, 0, %2 \n\t"
"lxvw4x 41, %5, %2 \n\t" "lxvw4x 41, %5, %2 \n\t"
"lxvw4x 42, %6, %2 \n\t" "lxvw4x 42, %6, %2 \n\t"
@ -68,64 +50,63 @@ static void scopy_kernel_32( BLASLONG n, FLOAT *x, FLOAT *y)
"addi %2, %2, 128 \n\t" "addi %2, %2, 128 \n\t"
"addic. %0 , %0 , -32 \n\t" "addic. %1, %1, -32 \n\t"
"ble 2f \n\t" "ble 2f \n\t"
".align 5 \n\t" ".p2align 5 \n"
"1: \n\t" "1: \n\t"
"stxvw4x 40, 0, %1 \n\t" "stxvw4x 40, 0, %3 \n\t"
"stxvw4x 41, %5, %1 \n\t" "stxvw4x 41, %5, %3 \n\t"
"lxvw4x 40, 0, %2 \n\t" "lxvw4x 40, 0, %2 \n\t"
"lxvw4x 41, %5, %2 \n\t" "lxvw4x 41, %5, %2 \n\t"
"stxvw4x 42, %6, %1 \n\t" "stxvw4x 42, %6, %3 \n\t"
"stxvw4x 43, %7, %1 \n\t" "stxvw4x 43, %7, %3 \n\t"
"lxvw4x 42, %6, %2 \n\t" "lxvw4x 42, %6, %2 \n\t"
"lxvw4x 43, %7, %2 \n\t" "lxvw4x 43, %7, %2 \n\t"
"stxvw4x 44, %8, %1 \n\t" "stxvw4x 44, %8, %3 \n\t"
"stxvw4x 45, %9, %1 \n\t" "stxvw4x 45, %9, %3 \n\t"
"lxvw4x 44, %8, %2 \n\t" "lxvw4x 44, %8, %2 \n\t"
"lxvw4x 45, %9, %2 \n\t" "lxvw4x 45, %9, %2 \n\t"
"stxvw4x 46, %10, %1 \n\t" "stxvw4x 46, %10, %3 \n\t"
"stxvw4x 47, %11, %1 \n\t" "stxvw4x 47, %11, %3 \n\t"
"lxvw4x 46, %10, %2 \n\t" "lxvw4x 46, %10, %2 \n\t"
"lxvw4x 47, %11, %2 \n\t" "lxvw4x 47, %11, %2 \n\t"
"addi %3, %3, 128 \n\t"
"addi %1, %1, 128 \n\t"
"addi %2, %2, 128 \n\t" "addi %2, %2, 128 \n\t"
"addic. %0 , %0 , -32 \n\t" "addic. %1, %1, -32 \n\t"
"bgt 1b \n\t" "bgt 1b \n"
"2: \n\t" "2: \n\t"
"stxvw4x 40, 0, %1 \n\t" "stxvw4x 40, 0, %3 \n\t"
"stxvw4x 41, %5, %1 \n\t" "stxvw4x 41, %5, %3 \n\t"
"stxvw4x 42, %6, %1 \n\t" "stxvw4x 42, %6, %3 \n\t"
"stxvw4x 43, %7, %1 \n\t" "stxvw4x 43, %7, %3 \n\t"
"stxvw4x 44, %8, %1 \n\t" "stxvw4x 44, %8, %3 \n\t"
"stxvw4x 45, %9, %1 \n\t" "stxvw4x 45, %9, %3 \n\t"
"stxvw4x 46, %10, %1 \n\t" "stxvw4x 46, %10, %3 \n\t"
"stxvw4x 47, %11, %1 \n\t" "stxvw4x 47, %11, %3 \n"
"#n=%1 x=%4=%2 y=%0=%3 o16=%5 o32=%6 o48=%7 o64=%8 o80=%9 o96=%10 o112=%11"
: :
"=m" (*y),
"+r" (n), // 1
"+b" (x), // 2
"+b" (y) // 3
: :
"r" (i), // 0 "m" (*x),
"r" (y1), // 1 "b" (16), // 5
"r" (x1), // 2 "b" (32), // 6
"r" (alpha), // 3 "b" (48), // 7
"r" (pre), // 4 "b" (64), // 8
"r" (o16), // 5 "b" (80), // 9
"r" (o32), // 6 "b" (96), // 10
"r" (o48), // 7 "b" (112) // 11
"r" (o64), // 8 :
"r" (o80), // 9 "cr0",
"r" (o96), // 10 "vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47"
"r" (o112) // 11
: "cr0", "%0", "%2" , "%1", "memory"
); );
} }

View File

@ -42,7 +42,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#ifndef HAVE_KERNEL_16 #ifndef HAVE_KERNEL_16
static void sdot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *d) static FLOAT sdot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y)
{ {
BLASLONG register i = 0; BLASLONG register i = 0;
FLOAT dot = 0.0; FLOAT dot = 0.0;
@ -61,8 +61,7 @@ static void sdot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *d)
i+=8 ; i+=8 ;
} }
*d += dot; return dot;
} }
#endif #endif
@ -82,8 +81,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
BLASLONG n1 = n & -32; BLASLONG n1 = n & -32;
if ( n1 ) if ( n1 )
sdot_kernel_16(n1, x, y , &dot ); dot = sdot_kernel_16(n1, x, y);
i = n1; i = n1;
while(i < n) while(i < n)

View File

@ -34,101 +34,85 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
**************************************************************************************/ **************************************************************************************/
#define HAVE_KERNEL_16 1 #define HAVE_KERNEL_16 1
static void sdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y , FLOAT *dot) __attribute__ ((noinline));
static void sdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) static float sdot_kernel_16 (long n, float *x, float *y)
{ {
float dot;
__vector float t0;
__vector float t1;
__vector float t2;
__vector float t3;
__asm__
BLASLONG i = n;
BLASLONG o16 = 16;
BLASLONG o32 = 32;
BLASLONG o48 = 48;
BLASLONG o64 = 64;
BLASLONG o80 = 80;
BLASLONG o96 = 96;
BLASLONG o112 = 112;
FLOAT *x1=x;
FLOAT *y1=y;
BLASLONG pre = 384;
FLOAT tempdot[4];
__asm__ __volatile__
( (
"xxlxor 32,32,32 \n\t" "dcbt 0, %2 \n\t"
"xxlxor 33,33,33 \n\t" "dcbt 0, %3 \n\t"
"xxlxor 34,34,34 \n\t"
"xxlxor 35,35,35 \n\t"
"xxlxor 36,36,36 \n\t"
"xxlxor 37,37,37 \n\t"
"xxlxor 38,38,38 \n\t"
"xxlxor 39,39,39 \n\t"
"dcbt %2, %12 \n\t" "xxlxor 32, 32, 32 \n\t"
"dcbt %3, %12 \n\t" "xxlxor 33, 33, 33 \n\t"
"xxlxor 34, 34, 34 \n\t"
"xxlxor 35, 35, 35 \n\t"
"xxlxor 36, 36, 36 \n\t"
"xxlxor 37, 37, 37 \n\t"
"xxlxor 38, 38, 38 \n\t"
"xxlxor 39, 39, 39 \n\t"
"lxvw4x 40, 0, %2 \n\t" "lxvw4x 40, 0, %2 \n\t"
"lxvw4x 48, 0, %3 \n\t" "lxvw4x 48, 0, %3 \n\t"
"lxvw4x 41, %5, %2 \n\t" "lxvw4x 41, %10, %2 \n\t"
"lxvw4x 49, %5, %3 \n\t" "lxvw4x 49, %10, %3 \n\t"
"lxvw4x 42, %6, %2 \n\t" "lxvw4x 42, %11, %2 \n\t"
"lxvw4x 50, %6, %3 \n\t" "lxvw4x 50, %11, %3 \n\t"
"lxvw4x 43, %7, %2 \n\t" "lxvw4x 43, %12, %2 \n\t"
"lxvw4x 51, %7, %3 \n\t" "lxvw4x 51, %12, %3 \n\t"
"lxvw4x 44, %8, %2 \n\t" "lxvw4x 44, %13, %2 \n\t"
"lxvw4x 52, %8, %3 \n\t" "lxvw4x %x4, %13, %3 \n\t"
"lxvw4x 45, %9, %2 \n\t" "lxvw4x 45, %14, %2 \n\t"
"lxvw4x 53, %9, %3 \n\t" "lxvw4x %x5, %14, %3 \n\t"
"lxvw4x 46, %10, %2 \n\t" "lxvw4x 46, %15, %2 \n\t"
"lxvw4x 54, %10, %3 \n\t" "lxvw4x %x6, %15, %3 \n\t"
"lxvw4x 47, %11, %2 \n\t" "lxvw4x 47, %16, %2 \n\t"
"lxvw4x 55, %11, %3 \n\t" "lxvw4x %x7, %16, %3 \n\t"
"addi %2, %2, 128 \n\t" "addi %2, %2, 128 \n\t"
"addi %3, %3, 128 \n\t" "addi %3, %3, 128 \n\t"
"addic. %0 , %0 , -32 \n\t" "addic. %1, %1, -32 \n\t"
"ble 2f \n\t" "ble 2f \n\t"
".align 5 \n\t" ".p2align 5 \n"
"1: \n\t" "1: \n\t"
"dcbt %2, %12 \n\t"
"dcbt %3, %12 \n\t"
"xvmaddasp 32, 40, 48 \n\t" "xvmaddasp 32, 40, 48 \n\t"
"lxvw4x 40, 0, %2 \n\t" "lxvw4x 40, 0, %2 \n\t"
"lxvw4x 48, 0, %3 \n\t" "lxvw4x 48, 0, %3 \n\t"
"xvmaddasp 33, 41, 49 \n\t" "xvmaddasp 33, 41, 49 \n\t"
"lxvw4x 41, %5, %2 \n\t" "lxvw4x 41, %10, %2 \n\t"
"lxvw4x 49, %5, %3 \n\t" "lxvw4x 49, %10, %3 \n\t"
"xvmaddasp 34, 42, 50 \n\t" "xvmaddasp 34, 42, 50 \n\t"
"lxvw4x 42, %6, %2 \n\t" "lxvw4x 42, %11, %2 \n\t"
"lxvw4x 50, %6, %3 \n\t" "lxvw4x 50, %11, %3 \n\t"
"xvmaddasp 35, 43, 51 \n\t" "xvmaddasp 35, 43, 51 \n\t"
"lxvw4x 43, %7, %2 \n\t" "lxvw4x 43, %12, %2 \n\t"
"lxvw4x 51, %7, %3 \n\t" "lxvw4x 51, %12, %3 \n\t"
"xvmaddasp 36, 44, 52 \n\t" "xvmaddasp 36, 44, %x4 \n\t"
"lxvw4x 44, %8, %2 \n\t" "lxvw4x 44, %13, %2 \n\t"
"lxvw4x 52, %8, %3 \n\t" "lxvw4x %x4, %13, %3 \n\t"
"xvmaddasp 37, 45, 53 \n\t" "xvmaddasp 37, 45, %x5 \n\t"
"lxvw4x 45, %9, %2 \n\t" "lxvw4x 45, %14, %2 \n\t"
"lxvw4x 53, %9, %3 \n\t" "lxvw4x %x5, %14, %3 \n\t"
"xvmaddasp 38, 46, 54 \n\t" "xvmaddasp 38, 46, %x6 \n\t"
"lxvw4x 46, %10, %2 \n\t" "lxvw4x 46, %15, %2 \n\t"
"lxvw4x 54, %10, %3 \n\t" "lxvw4x %x6, %15, %3 \n\t"
"xvmaddasp 39, 47, 55 \n\t" "xvmaddasp 39, 47, %x7 \n\t"
"lxvw4x 47, %16, %2 \n\t"
"lxvw4x 47, %11, %2 \n\t" "lxvw4x %x7, %16, %3 \n\t"
"lxvw4x 55, %11, %3 \n\t"
"addi %2, %2, 128 \n\t" "addi %2, %2, 128 \n\t"
"addi %3, %3, 128 \n\t" "addi %3, %3, 128 \n\t"
"addic. %0 , %0 , -32 \n\t" "addic. %1, %1, -32 \n\t"
"bgt 1b \n\t" "bgt 1b \n"
"2: \n\t" "2: \n\t"
@ -136,44 +120,56 @@ static void sdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot)
"xvmaddasp 33, 41, 49 \n\t" "xvmaddasp 33, 41, 49 \n\t"
"xvmaddasp 34, 42, 50 \n\t" "xvmaddasp 34, 42, 50 \n\t"
"xvmaddasp 35, 43, 51 \n\t" "xvmaddasp 35, 43, 51 \n\t"
"xvmaddasp 36, 44, 52 \n\t" "xvmaddasp 36, 44, %x4 \n\t"
"xvmaddasp 37, 45, 53 \n\t" "xvmaddasp 37, 45, %x5 \n\t"
"xvmaddasp 38, 46, 54 \n\t" "xvmaddasp 38, 46, %x6 \n\t"
"xvmaddasp 39, 47, 55 \n\t" "xvmaddasp 39, 47, %x7 \n\t"
"xvaddsp 32, 32 , 33 \n\t" "xvaddsp 32, 32, 33 \n\t"
"xvaddsp 34, 34 , 35 \n\t" "xvaddsp 34, 34, 35 \n\t"
"xvaddsp 36, 36 , 37 \n\t" "xvaddsp 36, 36, 37 \n\t"
"xvaddsp 38, 38 , 39 \n\t" "xvaddsp 38, 38, 39 \n\t"
"xvaddsp 32, 32 , 34 \n\t" "xvaddsp 32, 32, 34 \n\t"
"xvaddsp 36, 36 , 38 \n\t" "xvaddsp 36, 36, 38 \n\t"
"xvaddsp 32, 32 , 36 \n\t" "xvaddsp 32, 32, 36 \n\t"
"stxvw4x 32, 0 , %4 \n\t" "xxsldwi 33, 32, 32, 2 \n\t"
"xvaddsp 32, 32, 33 \n\t"
"xxsldwi 33, 32, 32, 1 \n\t"
"xvaddsp 32, 32, 33 \n\t"
"xscvspdp %x0, 32 \n"
"#dot=%0 n=%1 x=%8=%2 y=%9=%3 o16=%10 o32=%11 o48=%12 o64=%13 o80=%14 o96=%15 o122=%16\n"
"#t0=%x4 t1=%x5 t2=%x6 t3=%x7"
: :
"=f" (dot), // 0
"+r" (n), // 1
"+b" (x), // 2
"+b" (y), // 3
"=wa" (t0), // 4
"=wa" (t1), // 5
"=wa" (t2), // 6
"=wa" (t3) // 7
: :
"r" (i), // 0 "m" (*x),
"r" (n), // 1 "m" (*y),
"r" (x1), // 2 "b" (16), // 10
"r" (y1), // 3 "b" (32), // 11
"r" (tempdot), // 4 "b" (48), // 12
"r" (o16), // 5 "b" (64), // 13
"r" (o32), // 6 "b" (80), // 14
"r" (o48), // 7 "b" (96), // 15
"r" (o64), // 8 "b" (112) // 16
"r" (o80), // 9 :
"r" (o96), // 10 "cr0",
"r" (o112), // 11 "vs32","vs33","vs34","vs35","vs36","vs37","vs38","vs39",
"r" (pre) // 12 "vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47",
: "cr0", "%0", "%2" , "%3", "memory" "vs48","vs49","vs50","vs51"
); );
*dot = tempdot[0] + tempdot[1] + tempdot[2] + tempdot[3]; return dot;
} }

View File

@ -46,7 +46,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#ifndef HAVE_KERNEL_16 #ifndef HAVE_KERNEL_16
static void srot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s) static void srot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT c, FLOAT s)
{ {
BLASLONG i=0; BLASLONG i=0;
@ -56,8 +56,6 @@ static void srot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s)
FLOAT y00, y01, y02, y03; FLOAT y00, y01, y02, y03;
FLOAT *x1=x; FLOAT *x1=x;
FLOAT *y1=y; FLOAT *y1=y;
FLOAT c1=*c;
FLOAT s1=*s;
while ( i<n ) while ( i<n )
{ {
@ -71,14 +69,14 @@ static void srot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s)
x03 = x1[3]; x03 = x1[3];
y03 = y1[3]; y03 = y1[3];
f0 = c1*x00 + s1*y00; f0 = c*x00 + s*y00;
g0 = c1*y00 - s1*x00; g0 = c*y00 - s*x00;
f1 = c1*x01 + s1*y01; f1 = c*x01 + s*y01;
g1 = c1*y01 - s1*x01; g1 = c*y01 - s*x01;
f2 = c1*x02 + s1*y02; f2 = c*x02 + s*y02;
g2 = c1*y02 - s1*x02; g2 = c*y02 - s*x02;
f3 = c1*x03 + s1*y03; f3 = c*x03 + s*y03;
g3 = c1*y03 - s1*x03; g3 = c*y03 - s*x03;
x1[0] = f0; x1[0] = f0;
y1[0] = g0; y1[0] = g0;
@ -106,8 +104,6 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT
{ {
BLASLONG i=0; BLASLONG i=0;
BLASLONG ix=0,iy=0; BLASLONG ix=0,iy=0;
FLOAT c1[4] __attribute__ ((aligned (16)));;
FLOAT s1[4] __attribute__ ((aligned (16)));;
FLOAT *x1=x; FLOAT *x1=x;
FLOAT *y1=y; FLOAT *y1=y;
FLOAT temp; FLOAT temp;
@ -120,15 +116,7 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT
BLASLONG n1 = n & -16; BLASLONG n1 = n & -16;
if ( n1 > 0 ) if ( n1 > 0 )
{ {
c1[0]=c; srot_kernel_16(n1, x1, y1, c, s);
c1[1]=c;
c1[2]=c;
c1[3]=c;
s1[0]=s;
s1[1]=s;
s1[2]=s;
s1[3]=s;
srot_kernel_16(n1, x1, y1, c1, s1);
i=n1; i=n1;
} }

View File

@ -38,171 +38,179 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define HAVE_KERNEL_16 1 #define HAVE_KERNEL_16 1
static void srot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s) __attribute__ ((noinline)); static void srot_kernel_16 (long n, float *x, float *y, float c, float s)
static void srot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s)
{ {
__vector float t0;
__vector float t1;
__vector float t2;
__vector float t3;
__vector float t4;
__vector float t5;
__vector float t6;
__vector float t7;
__asm__
BLASLONG i = n;
BLASLONG o16 = 16;
BLASLONG o32 = 32;
BLASLONG o48 = 48;
FLOAT *x1=x;
FLOAT *y1=y;
FLOAT *x2=x+1;
FLOAT *y2=y+1;
__asm__ __volatile__
( (
"xscvdpspn 36, %x13 \n\t" // load c to all words
"xxspltw 36, 36, 0 \n\t"
"lxvw4x 36 , 0, %3 \n\t" // load c "xscvdpspn 37, %x14 \n\t" // load s to all words
"lxvw4x 37 , 0, %4 \n\t" // load s "xxspltw 37, 37, 0 \n\t"
"addi %8 , %8, -4 \n\t"
"addi %9 , %9, -4 \n\t"
"lxvw4x 32, 0, %1 \n\t" // load x "lxvw4x 32, 0, %3 \n\t" // load x
"lxvw4x 33, %5, %1 \n\t" "lxvw4x 33, %15, %3 \n\t"
"lxvw4x 34, %6, %1 \n\t" "lxvw4x 34, %16, %3 \n\t"
"lxvw4x 35, %7, %1 \n\t" "lxvw4x 35, %17, %3 \n\t"
"lxvw4x 40, 0, %2 \n\t" // load y "lxvw4x 48, 0, %4 \n\t" // load y
"lxvw4x 41, %5, %2 \n\t" "lxvw4x 49, %15, %4 \n\t"
"lxvw4x 42, %6, %2 \n\t" "lxvw4x 50, %16, %4 \n\t"
"lxvw4x 43, %7, %2 \n\t" "lxvw4x 51, %17, %4 \n\t"
"addi %1, %1, 64 \n\t" "addi %3, %3, 64 \n\t"
"addi %2, %2, 64 \n\t" "addi %4, %4, 64 \n\t"
"addic. %0 , %0 , -16 \n\t" "addic. %2, %2, -16 \n\t"
"ble 2f \n\t" "ble 2f \n\t"
".align 5 \n\t" ".p2align 5 \n"
"1: \n\t" "1: \n\t"
"xvmulsp 48, 32, 36 \n\t" // c * x "xvmulsp 40, 32, 36 \n\t" // c * x
"xvmulsp 49, 33, 36 \n\t" "xvmulsp 41, 33, 36 \n\t"
"xvmulsp 50, 34, 36 \n\t" "xvmulsp 42, 34, 36 \n\t"
"xvmulsp 51, 35, 36 \n\t" "xvmulsp 43, 35, 36 \n\t"
"xvmulsp 56, 40, 36 \n\t" // c * y "xvmulsp %x5, 48, 36 \n\t" // c * y
"xvmulsp 57, 41, 36 \n\t" "xvmulsp %x6, 49, 36 \n\t"
"xvmulsp 58, 42, 36 \n\t" "xvmulsp %x7, 50, 36 \n\t"
"xvmulsp 59, 43, 36 \n\t" "xvmulsp %x8, 51, 36 \n\t"
"xvmulsp 52, 32, 37 \n\t" // s * x "xvmulsp 44, 32, 37 \n\t" // s * x
"xvmulsp 53, 33, 37 \n\t" "xvmulsp 45, 33, 37 \n\t"
"lxvw4x 32, 0, %1 \n\t" // load x "lxvw4x 32, 0, %3 \n\t" // load x
"lxvw4x 33, %5, %1 \n\t" "lxvw4x 33, %15, %3 \n\t"
"xvmulsp 54, 34, 37 \n\t" "xvmulsp 46, 34, 37 \n\t"
"xvmulsp 55, 35, 37 \n\t" "xvmulsp 47, 35, 37 \n\t"
"lxvw4x 34, %6, %1 \n\t" "lxvw4x 34, %16, %3 \n\t"
"lxvw4x 35, %7, %1 \n\t" "lxvw4x 35, %17, %3 \n\t"
"xvmulsp 60, 40, 37 \n\t" // s * y "xvmulsp %x9, 48, 37 \n\t" // s * y
"xvmulsp 61, 41, 37 \n\t" "xvmulsp %x10, 49, 37 \n\t"
"lxvw4x 40, 0, %2 \n\t" // load y "lxvw4x 48, 0, %4 \n\t" // load y
"lxvw4x 41, %5, %2 \n\t" "lxvw4x 49, %15, %4 \n\t"
"xvmulsp 62, 42, 37 \n\t" "xvmulsp %x11, 50, 37 \n\t"
"xvmulsp 63, 43, 37 \n\t" "xvmulsp %x12, 51, 37 \n\t"
"lxvw4x 42, %6, %2 \n\t" "lxvw4x 50, %16, %4 \n\t"
"lxvw4x 43, %7, %2 \n\t" "lxvw4x 51, %17, %4 \n\t"
"xvaddsp 48, 48 , 60 \n\t" // c * x + s * y "xvaddsp 40, 40, %x9 \n\t" // c * x + s * y
"xvaddsp 49, 49 , 61 \n\t" // c * x + s * y "xvaddsp 41, 41, %x10 \n\t" // c * x + s * y
"addi %1, %1, 64 \n\t" "addi %3, %3, -64 \n\t"
"addi %2, %2, 64 \n\t" "addi %4, %4, -64 \n\t"
"xvaddsp 50, 50 , 62 \n\t" // c * x + s * y "xvaddsp 42, 42, %x11 \n\t" // c * x + s * y
"xvaddsp 51, 51 , 63 \n\t" // c * x + s * y "xvaddsp 43, 43, %x12 \n\t" // c * x + s * y
"xvsubsp 56, 56 , 52 \n\t" // c * y - s * x "xvsubsp %x5, %x5, 44 \n\t" // c * y - s * x
"xvsubsp 57, 57 , 53 \n\t" // c * y - s * x "xvsubsp %x6, %x6, 45 \n\t" // c * y - s * x
"xvsubsp 58, 58 , 54 \n\t" // c * y - s * x "xvsubsp %x7, %x7, 46 \n\t" // c * y - s * x
"xvsubsp 59, 59 , 55 \n\t" // c * y - s * x "xvsubsp %x8, %x8, 47 \n\t" // c * y - s * x
"stxvw4x 48, 0, %8 \n\t" // store x "stxvw4x 40, 0, %3 \n\t" // store x
"stxvw4x 49, %5, %8 \n\t" "stxvw4x 41, %15, %3 \n\t"
"stxvw4x 50, %6, %8 \n\t" "stxvw4x 42, %16, %3 \n\t"
"stxvw4x 51, %7, %8 \n\t" "stxvw4x 43, %17, %3 \n\t"
"stxvw4x 56, 0, %9 \n\t" // store y "stxvw4x %x5, 0, %4 \n\t" // store y
"stxvw4x 57, %5, %9 \n\t" "stxvw4x %x6, %15, %4 \n\t"
"stxvw4x 58, %6, %9 \n\t" "stxvw4x %x7, %16, %4 \n\t"
"stxvw4x 59, %7, %9 \n\t" "stxvw4x %x8, %17, %4 \n\t"
"addi %8, %8, 64 \n\t" "addi %3, %3, 128 \n\t"
"addi %9, %9, 64 \n\t" "addi %4, %4, 128 \n\t"
"addic. %0 , %0 , -16 \n\t" "addic. %2, %2, -16 \n\t"
"bgt 1b \n\t" "bgt 1b \n"
"2: \n\t" "2: \n\t"
"xvmulsp 48, 32, 36 \n\t" // c * x "xvmulsp 40, 32, 36 \n\t" // c * x
"xvmulsp 49, 33, 36 \n\t" "xvmulsp 41, 33, 36 \n\t"
"xvmulsp 50, 34, 36 \n\t" "xvmulsp 42, 34, 36 \n\t"
"xvmulsp 51, 35, 36 \n\t" "xvmulsp 43, 35, 36 \n\t"
"xvmulsp 56, 40, 36 \n\t" // c * y "xvmulsp %x5, 48, 36 \n\t" // c * y
"xvmulsp 57, 41, 36 \n\t" "xvmulsp %x6, 49, 36 \n\t"
"xvmulsp 58, 42, 36 \n\t" "xvmulsp %x7, 50, 36 \n\t"
"xvmulsp 59, 43, 36 \n\t" "xvmulsp %x8, 51, 36 \n\t"
"xvmulsp 52, 32, 37 \n\t" // s * x "xvmulsp 44, 32, 37 \n\t" // s * x
"xvmulsp 53, 33, 37 \n\t" "xvmulsp 45, 33, 37 \n\t"
"xvmulsp 54, 34, 37 \n\t" "xvmulsp 46, 34, 37 \n\t"
"xvmulsp 55, 35, 37 \n\t" "xvmulsp 47, 35, 37 \n\t"
"xvmulsp 60, 40, 37 \n\t" // s * y "xvmulsp %x9, 48, 37 \n\t" // s * y
"xvmulsp 61, 41, 37 \n\t" "xvmulsp %x10, 49, 37 \n\t"
"xvmulsp 62, 42, 37 \n\t" "xvmulsp %x11, 50, 37 \n\t"
"xvmulsp 63, 43, 37 \n\t" "xvmulsp %x12, 51, 37 \n\t"
"xvaddsp 48, 48 , 60 \n\t" // c * x + s * y "addi %3, %3, -64 \n\t"
"xvaddsp 49, 49 , 61 \n\t" // c * x + s * y "addi %4, %4, -64 \n\t"
"xvaddsp 50, 50 , 62 \n\t" // c * x + s * y
"xvaddsp 51, 51 , 63 \n\t" // c * x + s * y
"xvsubsp 56, 56 , 52 \n\t" // c * y - s * x "xvaddsp 40, 40, %x9 \n\t" // c * x + s * y
"xvsubsp 57, 57 , 53 \n\t" // c * y - s * x "xvaddsp 41, 41, %x10 \n\t" // c * x + s * y
"xvsubsp 58, 58 , 54 \n\t" // c * y - s * x "xvaddsp 42, 42, %x11 \n\t" // c * x + s * y
"xvsubsp 59, 59 , 55 \n\t" // c * y - s * x "xvaddsp 43, 43, %x12 \n\t" // c * x + s * y
"stxvw4x 48, 0, %8 \n\t" // store x "xvsubsp %x5, %x5, 44 \n\t" // c * y - s * x
"stxvw4x 49, %5, %8 \n\t" "xvsubsp %x6, %x6, 45 \n\t" // c * y - s * x
"stxvw4x 50, %6, %8 \n\t" "xvsubsp %x7, %x7, 46 \n\t" // c * y - s * x
"stxvw4x 51, %7, %8 \n\t" "xvsubsp %x8, %x8, 47 \n\t" // c * y - s * x
"stxvw4x 56, 0, %9 \n\t" // store y
"stxvw4x 57, %5, %9 \n\t"
"stxvw4x 58, %6, %9 \n\t"
"stxvw4x 59, %7, %9 \n\t"
"stxvw4x 40, 0, %3 \n\t" // store x
"stxvw4x 41, %15, %3 \n\t"
"stxvw4x 42, %16, %3 \n\t"
"stxvw4x 43, %17, %3 \n\t"
"stxvw4x %x5, 0, %4 \n\t" // store y
"stxvw4x %x6, %15, %4 \n\t"
"stxvw4x %x7, %16, %4 \n\t"
"stxvw4x %x8, %17, %4 \n"
"#n=%2 x=%0=%3 y=%1=%4 c=%13 s=%14 o16=%15 o32=%16 o48=%17\n"
"#t0=%x5 t1=%x6 t2=%x7 t3=%x8 t4=%x9 t5=%x10 t6=%x11 t7=%x12"
: :
"+m" (*x),
"+m" (*y),
"+r" (n), // 2
"+b" (x), // 3
"+b" (y), // 4
"=wa" (t0), // 5
"=wa" (t1), // 6
"=wa" (t2), // 7
"=wa" (t3), // 8
"=wa" (t4), // 9
"=wa" (t5), // 10
"=wa" (t6), // 11
"=wa" (t7) // 12
: :
"r" (i), // 0 "f" (c), // 13
"r" (x1), // 1 "f" (s), // 14
"r" (y1), // 2 "b" (16), // 15
"r" (c), // 3 "b" (32), // 16
"r" (s), // 4 "b" (48) // 17
"r" (o16), // 5 :
"r" (o32), // 6 "cr0",
"r" (o48), // 7 "vs32","vs33","vs34","vs35","vs36","vs37",
"r" (x2), // 8 "vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47",
"r" (y2) // 9 "vs48","vs49","vs50","vs51"
: "cr0", "%0", "%1" , "%2", "%8", "%9", "memory"
); );
} }

View File

@ -42,11 +42,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#if !defined(HAVE_KERNEL_16) #if !defined(HAVE_KERNEL_16)
static void sscal_kernel_16( BLASLONG n, FLOAT *da , FLOAT *x ) static void sscal_kernel_16 (BLASLONG n, FLOAT *x, FLOAT alpha)
{ {
BLASLONG i; BLASLONG i;
FLOAT alpha = *da;
for( i=0; i<n; i+=8 ) for( i=0; i<n; i+=8 )
{ {
@ -63,7 +62,7 @@ static void sscal_kernel_16( BLASLONG n, FLOAT *da , FLOAT *x )
} }
static void sscal_kernel_16_zero( BLASLONG n, FLOAT *da , FLOAT *x ) static void sscal_kernel_16_zero( BLASLONG n, FLOAT *x )
{ {
BLASLONG i; BLASLONG i;
@ -90,7 +89,6 @@ static void sscal_kernel_16_zero( BLASLONG n, FLOAT *da , FLOAT *x )
int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2)
{ {
BLASLONG i=0,j=0; BLASLONG i=0,j=0;
FLOAT alpha[4] __attribute__ ((aligned (16)));;
if ( n <= 0 || inc_x <=0 ) if ( n <= 0 || inc_x <=0 )
return(0); return(0);
@ -105,11 +103,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS
BLASLONG n1 = n & -32; BLASLONG n1 = n & -32;
if ( n1 > 0 ) if ( n1 > 0 )
{ {
alpha[0]=da; sscal_kernel_16_zero(n1, x);
alpha[1]=da;
alpha[2]=da;
alpha[3]=da;
sscal_kernel_16_zero(n1 , alpha , x);
j=n1; j=n1;
} }
@ -127,11 +121,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS
BLASLONG n1 = n & -32; BLASLONG n1 = n & -32;
if ( n1 > 0 ) if ( n1 > 0 )
{ {
alpha[0]=da; sscal_kernel_16(n1, x, da);
alpha[1]=da;
alpha[2]=da;
alpha[3]=da;
sscal_kernel_16(n1 , alpha , x);
j=n1; j=n1;
} }
while(j < n) while(j < n)

View File

@ -35,184 +35,150 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define HAVE_KERNEL_16 1 #define HAVE_KERNEL_16 1
static void sscal_kernel_16( BLASLONG n, FLOAT *alpha, FLOAT *x) __attribute__ ((noinline)); static void sscal_kernel_16 (long n, float *x, float alpha)
static void sscal_kernel_16( BLASLONG n, FLOAT *alpha, FLOAT *x)
{ {
__asm__
BLASLONG i = n;
BLASLONG o16 = 16;
BLASLONG o32 = 32;
BLASLONG o48 = 48;
BLASLONG o64 = 64;
BLASLONG o80 = 80;
BLASLONG o96 = 96;
BLASLONG o112 = 112;
FLOAT *x1=x;
FLOAT *x2=x+1;
BLASLONG pre = 384;
__asm__ __volatile__
( (
"dcbt 0, %2 \n\t"
"lxvw4x 32, 0, %3 \n\t" "xscvdpspn %x3, %x3 \n\t"
"addi %1, %1, -4 \n\t" "xxspltw %x3, %x3, 0 \n\t"
"dcbt %2, %4 \n\t" "lxvw4x 32, 0, %2 \n\t"
"lxvw4x 33, %4, %2 \n\t"
"lxvw4x 40, 0, %2 \n\t" "lxvw4x 34, %5, %2 \n\t"
"lxvw4x 41, %5, %2 \n\t" "lxvw4x 35, %6, %2 \n\t"
"lxvw4x 42, %6, %2 \n\t" "lxvw4x 36, %7, %2 \n\t"
"lxvw4x 43, %7, %2 \n\t" "lxvw4x 37, %8, %2 \n\t"
"lxvw4x 44, %8, %2 \n\t" "lxvw4x 38, %9, %2 \n\t"
"lxvw4x 45, %9, %2 \n\t" "lxvw4x 39, %10, %2 \n\t"
"lxvw4x 46, %10, %2 \n\t"
"lxvw4x 47, %11, %2 \n\t"
"addi %2, %2, 128 \n\t" "addi %2, %2, 128 \n\t"
"addic. %0 , %0 , -32 \n\t" "addic. %1, %1, -32 \n\t"
"ble 2f \n\t" "ble 2f \n\t"
".align 5 \n\t" ".p2align 5 \n"
"1: \n\t" "1: \n\t"
"dcbt %2, %4 \n\t" "xvmulsp 40, 32, %x3 \n\t"
"xvmulsp 41, 33, %x3 \n\t"
"lxvw4x 32, 0, %2 \n\t"
"lxvw4x 33, %4, %2 \n\t"
"xvmulsp 42, 34, %x3 \n\t"
"xvmulsp 43, 35, %x3 \n\t"
"lxvw4x 34, %5, %2 \n\t"
"lxvw4x 35, %6, %2 \n\t"
"xvmulsp 44, 36, %x3 \n\t"
"xvmulsp 45, 37, %x3 \n\t"
"lxvw4x 36, %7, %2 \n\t"
"lxvw4x 37, %8, %2 \n\t"
"xvmulsp 46, 38, %x3 \n\t"
"xvmulsp 47, 39, %x3 \n\t"
"lxvw4x 38, %9, %2 \n\t"
"lxvw4x 39, %10, %2 \n\t"
"xvmulsp 48, 40, 32 \n\t" "addi %2, %2, -128 \n\t"
"xvmulsp 49, 41, 32 \n\t"
"lxvw4x 40, 0, %2 \n\t"
"lxvw4x 41, %5, %2 \n\t"
"xvmulsp 50, 42, 32 \n\t"
"xvmulsp 51, 43, 32 \n\t"
"lxvw4x 42, %6, %2 \n\t"
"lxvw4x 43, %7, %2 \n\t"
"xvmulsp 52, 44, 32 \n\t"
"xvmulsp 53, 45, 32 \n\t"
"lxvw4x 44, %8, %2 \n\t"
"lxvw4x 45, %9, %2 \n\t"
"xvmulsp 54, 46, 32 \n\t"
"xvmulsp 55, 47, 32 \n\t"
"lxvw4x 46, %10, %2 \n\t"
"lxvw4x 47, %11, %2 \n\t"
"stxvw4x 48, 0, %1 \n\t" "stxvw4x 40, 0, %2 \n\t"
"stxvw4x 49, %5, %1 \n\t" "stxvw4x 41, %4, %2 \n\t"
"stxvw4x 50, %6, %1 \n\t" "stxvw4x 42, %5, %2 \n\t"
"stxvw4x 51, %7, %1 \n\t" "stxvw4x 43, %6, %2 \n\t"
"stxvw4x 52, %8, %1 \n\t" "stxvw4x 44, %7, %2 \n\t"
"stxvw4x 53, %9, %1 \n\t" "stxvw4x 45, %8, %2 \n\t"
"stxvw4x 54, %10, %1 \n\t" "stxvw4x 46, %9, %2 \n\t"
"stxvw4x 55, %11, %1 \n\t" "stxvw4x 47, %10, %2 \n\t"
"addi %2, %2, 256 \n\t"
"addic. %1, %1, -32 \n\t"
"bgt 1b \n"
"2: \n\t"
"xvmulsp 40, 32, %x3 \n\t"
"xvmulsp 41, 33, %x3 \n\t"
"xvmulsp 42, 34, %x3 \n\t"
"xvmulsp 43, 35, %x3 \n\t"
"addi %2, %2, -128 \n\t"
"xvmulsp 44, 36, %x3 \n\t"
"xvmulsp 45, 37, %x3 \n\t"
"xvmulsp 46, 38, %x3 \n\t"
"xvmulsp 47, 39, %x3 \n\t"
"stxvw4x 40, 0, %2 \n\t"
"stxvw4x 41, %4, %2 \n\t"
"stxvw4x 42, %5, %2 \n\t"
"stxvw4x 43, %6, %2 \n\t"
"stxvw4x 44, %7, %2 \n\t"
"stxvw4x 45, %8, %2 \n\t"
"stxvw4x 46, %9, %2 \n\t"
"stxvw4x 47, %10, %2 \n"
"#n=%1 alpha=%3 x=%0=%2 o16=%4 o32=%5 o48=%6 o64=%7 o80=%8 o96=%9 o112=%10"
:
"+m" (*x),
"+r" (n), // 1
"+b" (x), // 2
"+f" (alpha) // 3
:
"b" (16), // 4
"b" (32), // 5
"b" (48), // 6
"b" (64), // 7
"b" (80), // 8
"b" (96), // 9
"b" (112) // 10
:
"cr0",
"vs32","vs33","vs34","vs35","vs36","vs37","vs38","vs39",
"vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47"
);
}
static void sscal_kernel_16_zero (long n, float *x)
{
__vector float t0;
__asm__
(
"xxlxor %x3, %x3, %x3 \n\t"
".p2align 5 \n"
"1: \n\t"
"stxvw4x %x3, 0, %2 \n\t"
"stxvw4x %x3, %4, %2 \n\t"
"stxvw4x %x3, %5, %2 \n\t"
"stxvw4x %x3, %6, %2 \n\t"
"stxvw4x %x3, %7, %2 \n\t"
"stxvw4x %x3, %8, %2 \n\t"
"stxvw4x %x3, %9, %2 \n\t"
"stxvw4x %x3, %10, %2 \n\t"
"addi %1, %1, 128 \n\t"
"addi %2, %2, 128 \n\t" "addi %2, %2, 128 \n\t"
"addic. %0 , %0 , -32 \n\t" "addic. %1, %1, -32 \n\t"
"bgt 1b \n\t" "bgt 1b \n"
"2: \n\t"
"xvmulsp 48, 40, 32 \n\t"
"xvmulsp 49, 41, 32 \n\t"
"xvmulsp 50, 42, 32 \n\t"
"xvmulsp 51, 43, 32 \n\t"
"xvmulsp 52, 44, 32 \n\t"
"xvmulsp 53, 45, 32 \n\t"
"xvmulsp 54, 46, 32 \n\t"
"xvmulsp 55, 47, 32 \n\t"
"stxvw4x 48, 0, %1 \n\t"
"stxvw4x 49, %5, %1 \n\t"
"stxvw4x 50, %6, %1 \n\t"
"stxvw4x 51, %7, %1 \n\t"
"stxvw4x 52, %8, %1 \n\t"
"stxvw4x 53, %9, %1 \n\t"
"stxvw4x 54, %10, %1 \n\t"
"stxvw4x 55, %11, %1 \n\t"
"#n=%1 x=%0=%2 t0=%x3 o16=%4 o32=%5 o48=%6 o64=%7 o80=%8 o96=%9 o112=%10"
: :
"=m" (*x),
"+r" (n), // 1
"+b" (x), // 2
"=wa" (t0) // 3
: :
"r" (i), // 0 "b" (16), // 4
"r" (x2), // 1 "b" (32), // 5
"r" (x1), // 2 "b" (48), // 6
"r" (alpha), // 3 "b" (64), // 7
"r" (pre), // 4 "b" (80), // 8
"r" (o16), // 5 "b" (96), // 9
"r" (o32), // 6 "b" (112) // 10
"r" (o48), // 7 :
"r" (o64), // 8 "cr0"
"r" (o80), // 9
"r" (o96), // 10
"r" (o112) // 11
: "cr0", "%0", "%2" , "%1", "memory"
); );
} }
static void sscal_kernel_16_zero( BLASLONG n, FLOAT *alpha, FLOAT *x) __attribute__ ((noinline));
static void sscal_kernel_16_zero( BLASLONG n, FLOAT *alpha, FLOAT *x)
{
BLASLONG i = n;
BLASLONG o16 = 16;
BLASLONG o32 = 32;
BLASLONG o48 = 48;
BLASLONG o64 = 64;
BLASLONG o80 = 80;
BLASLONG o96 = 96;
BLASLONG o112 = 112;
FLOAT *x1=x;
FLOAT *x2=x+1;
BLASLONG pre = 384;
__asm__ __volatile__
(
"xxlxor 32 , 32 , 32 \n\t"
"addi %1, %1, -4 \n\t"
".align 5 \n\t"
"1: \n\t"
"stxvw4x 32, 0, %1 \n\t"
"stxvw4x 32, %5, %1 \n\t"
"stxvw4x 32, %6, %1 \n\t"
"stxvw4x 32, %7, %1 \n\t"
"stxvw4x 32, %8, %1 \n\t"
"stxvw4x 32, %9, %1 \n\t"
"stxvw4x 32, %10, %1 \n\t"
"stxvw4x 32, %11, %1 \n\t"
"addi %1, %1, 128 \n\t"
"addic. %0 , %0 , -32 \n\t"
"bgt 1b \n\t"
"2: \n\t"
:
:
"r" (i), // 0
"r" (x2), // 1
"r" (x1), // 2
"r" (alpha), // 3
"r" (pre), // 4
"r" (o16), // 5
"r" (o32), // 6
"r" (o48), // 7
"r" (o64), // 8
"r" (o80), // 9
"r" (o96), // 10
"r" (o112) // 11
: "cr0", "%0", "%2" , "%1", "memory"
);
}
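Because the template above converts and splats alpha inside the register it arrived in, alpha is declared as an in/out operand ("+f") rather than a plain input, since an input-only operand must never be modified by the asm. A minimal sketch of the same pattern (illustrative, not part of the patch; assumes gcc with VSX):

static void scale_4_floats (float *x, float alpha)
{
    __asm__
    (
        "xscvdpspn %x1, %x1       \n\t"    // alpha -> single precision in word 0, in place
        "xxspltw   %x1, %x1, 0    \n\t"    // broadcast to all four words
        "lxvw4x    32, 0, %2      \n\t"
        "xvmulsp   32, 32, %x1    \n\t"
        "stxvw4x   32, 0, %2      \n"
        :
        "+m" (*(float (*)[4]) x),    // 0
        "+f" (alpha)                 // 1
        :
        "b" (x)         // 2
        :
        "vs32"
    );
}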

View File

@ -35,57 +35,30 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define HAVE_KERNEL_32 1 #define HAVE_KERNEL_32 1
static void sswap_kernel_32( BLASLONG n, FLOAT *x, FLOAT *y) __attribute__ ((noinline)); static void sswap_kernel_32 (long n, float *x, float *y)
static void sswap_kernel_32( BLASLONG n, FLOAT *x, FLOAT *y)
{ {
__asm__
BLASLONG i = n;
BLASLONG o16 = 16;
BLASLONG o32 = 32;
BLASLONG o48 = 48;
BLASLONG o64 = 64;
BLASLONG o80 = 80;
BLASLONG o96 = 96;
BLASLONG o112 = 112;
FLOAT *x1=x;
FLOAT *y1=y;
FLOAT *x2=x+1;
FLOAT *y2=y+1;
BLASLONG pre = 384;
BLASLONG alpha=0;
__asm__ __volatile__
( (
".p2align 5 \n"
"addi %3, %3, -4 \n\t"
"addi %4, %4, -4 \n\t"
".align 5 \n\t"
"1: \n\t" "1: \n\t"
"lxvw4x 32, 0, %2 \n\t" "lxvw4x 32, 0, %4 \n\t"
"lxvw4x 33, %5, %2 \n\t" "lxvw4x 33, %5, %4 \n\t"
"lxvw4x 34, %6, %2 \n\t" "lxvw4x 34, %6, %4 \n\t"
"lxvw4x 35, %7, %2 \n\t" "lxvw4x 35, %7, %4 \n\t"
"lxvw4x 36, %8, %2 \n\t" "lxvw4x 36, %8, %4 \n\t"
"lxvw4x 37, %9, %2 \n\t" "lxvw4x 37, %9, %4 \n\t"
"lxvw4x 38, %10, %2 \n\t" "lxvw4x 38, %10, %4 \n\t"
"lxvw4x 39, %11, %2 \n\t" "lxvw4x 39, %11, %4 \n\t"
"addi %2, %2, 128 \n\t" "lxvw4x 40, 0, %3 \n\t"
"lxvw4x 41, %5, %3 \n\t"
"lxvw4x 48, 0, %1 \n\t" "lxvw4x 42, %6, %3 \n\t"
"lxvw4x 49, %5, %1 \n\t" "lxvw4x 43, %7, %3 \n\t"
"lxvw4x 50, %6, %1 \n\t" "lxvw4x 44, %8, %3 \n\t"
"lxvw4x 51, %7, %1 \n\t" "lxvw4x 45, %9, %3 \n\t"
"lxvw4x 52, %8, %1 \n\t" "lxvw4x 46, %10, %3 \n\t"
"lxvw4x 53, %9, %1 \n\t" "lxvw4x 47, %11, %3 \n\t"
"lxvw4x 54, %10, %1 \n\t"
"lxvw4x 55, %11, %1 \n\t"
"addi %1, %1, 128 \n\t"
"stxvw4x 32, 0, %3 \n\t" "stxvw4x 32, 0, %3 \n\t"
"stxvw4x 33, %5, %3 \n\t" "stxvw4x 33, %5, %3 \n\t"
@ -98,39 +71,38 @@ static void sswap_kernel_32( BLASLONG n, FLOAT *x, FLOAT *y)
"addi %3, %3, 128 \n\t" "addi %3, %3, 128 \n\t"
"stxvw4x 48, 0, %4 \n\t" "stxvw4x 40, 0, %4 \n\t"
"stxvw4x 49, %5, %4 \n\t" "stxvw4x 41, %5, %4 \n\t"
"stxvw4x 50, %6, %4 \n\t" "stxvw4x 42, %6, %4 \n\t"
"stxvw4x 51, %7, %4 \n\t" "stxvw4x 43, %7, %4 \n\t"
"stxvw4x 52, %8, %4 \n\t" "stxvw4x 44, %8, %4 \n\t"
"stxvw4x 53, %9, %4 \n\t" "stxvw4x 45, %9, %4 \n\t"
"stxvw4x 54, %10, %4 \n\t" "stxvw4x 46, %10, %4 \n\t"
"stxvw4x 55, %11, %4 \n\t" "stxvw4x 47, %11, %4 \n\t"
"addi %4, %4, 128 \n\t" "addi %4, %4, 128 \n\t"
"addic. %0 , %0 , -32 \n\t" "addic. %2, %2, -32 \n\t"
"bgt 1b \n\t" "bgt 1b \n"
"2: \n\t"
"#n=%2 x=%0=%3 y=%1=%4 o16=%5 o32=%6 o48=%7 o64=%8 o80=%9 o96=%10 o112=%11"
: :
"+m" (*x),
"+m" (*y),
"+r" (n), // 2
"+b" (x), // 3
"+b" (y) // 4
: :
"r" (i), // 0 "b" (16), // 5
"r" (y1), // 1 "b" (32), // 6
"r" (x1), // 2 "b" (48), // 7
"r" (y2), // 3 "b" (64), // 8
"r" (x2), // 4 "b" (80), // 9
"r" (o16), // 5 "b" (96), // 10
"r" (o32), // 6 "b" (112) // 11
"r" (o48), // 7 :
"r" (o64), // 8 "cr0",
"r" (o80), // 9 "vs32","vs33","vs34","vs35","vs36","vs37","vs38","vs39",
"r" (o96), // 10 "vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47"
"r" (o112) // 11
: "cr0", "%0", "%2" , "%1", "%3", "%4", "memory"
); );
} }

View File

@ -53,7 +53,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#ifndef HAVE_KERNEL_8 #ifndef HAVE_KERNEL_8
static void zasum_kernel_8(BLASLONG n, FLOAT *x1, FLOAT *svec) static FLOAT zasum_kernel_8(BLASLONG n, FLOAT *x1)
{ {
BLASLONG i=0; BLASLONG i=0;
@ -92,9 +92,7 @@ static void zasum_kernel_8(BLASLONG n, FLOAT *x1, FLOAT *svec)
} }
svec[0] = sum0+sum1+sum2+sum3; return sum0+sum1+sum2+sum3;
svec[1] = 0.0;
} }
#endif #endif
@ -104,7 +102,6 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
BLASLONG i=0; BLASLONG i=0;
BLASLONG ip=0; BLASLONG ip=0;
FLOAT sumf = 0.0; FLOAT sumf = 0.0;
FLOAT svec[2] __attribute__ ((aligned (16)));;
BLASLONG n1; BLASLONG n1;
BLASLONG inc_x2; BLASLONG inc_x2;
@ -117,8 +114,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
if ( n1 > 0 ) if ( n1 > 0 )
{ {
zasum_kernel_8(n1, x, svec); sumf = zasum_kernel_8(n1, x);
sumf = svec[0] + svec[1];
i=n1; i=n1;
ip=2*n1; ip=2*n1;
} }


@ -34,113 +34,101 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
**************************************************************************************/ **************************************************************************************/
#define HAVE_KERNEL_8 1 #define HAVE_KERNEL_8 1
static void zasum_kernel_8( BLASLONG n, FLOAT *x, FLOAT *svec) __attribute__ ((noinline));
static void zasum_kernel_8( BLASLONG n, FLOAT *x, FLOAT *svec) static double zasum_kernel_8 (long n, double *x)
{ {
double sum;
__vector double t0;
__vector double t1;
__vector double t2;
__vector double t3;
__asm__
BLASLONG i = n;
BLASLONG o16 = 16;
BLASLONG o32 = 32;
BLASLONG o48 = 48;
BLASLONG o64 = 64;
BLASLONG o80 = 80;
BLASLONG o96 = 96;
BLASLONG o112 = 112;
FLOAT *x1=x;
BLASLONG pre = 384;
__asm__ __volatile__
( (
"dcbt 0, %2 \n\t"
"dcbt %2 , %4 \n\t" "xxlxor 32, 32, 32 \n\t"
"xxlxor 33, 33, 33 \n\t"
"xxlxor 32,32,32 \n\t" "xxlxor 34, 34, 34 \n\t"
"xxlxor 33,33,33 \n\t" "xxlxor 35, 35, 35 \n\t"
"xxlxor 34,34,34 \n\t" "xxlxor 36, 36, 36 \n\t"
"xxlxor 35,35,35 \n\t" "xxlxor 37, 37, 37 \n\t"
"xxlxor 36,36,36 \n\t" "xxlxor 38, 38, 38 \n\t"
"xxlxor 37,37,37 \n\t" "xxlxor 39, 39, 39 \n\t"
"xxlxor 38,38,38 \n\t"
"xxlxor 39,39,39 \n\t"
"lxvd2x 40, 0, %2 \n\t" "lxvd2x 40, 0, %2 \n\t"
"lxvd2x 41, %5, %2 \n\t" "lxvd2x 41, %8, %2 \n\t"
"lxvd2x 42, %6, %2 \n\t" "lxvd2x 42, %9, %2 \n\t"
"lxvd2x 43, %7, %2 \n\t" "lxvd2x 43, %10, %2 \n\t"
"lxvd2x 44, %8, %2 \n\t" "lxvd2x 44, %11, %2 \n\t"
"lxvd2x 45, %9, %2 \n\t" "lxvd2x 45, %12, %2 \n\t"
"lxvd2x 46, %10, %2 \n\t" "lxvd2x 46, %13, %2 \n\t"
"lxvd2x 47, %11, %2 \n\t" "lxvd2x 47, %14, %2 \n\t"
"addi %2, %2, 128 \n\t" "addi %2, %2, 128 \n\t"
"addic. %0 , %0 , -8 \n\t" "addic. %1, %1, -8 \n\t"
"ble 2f \n\t" "ble 2f \n\t"
".align 5 \n\t" ".p2align 5 \n"
"1: \n\t" "1: \n\t"
"dcbt %2 , %4 \n\t"
"xvabsdp 48, 40 \n\t" "xvabsdp 48, 40 \n\t"
"xvabsdp 49, 41 \n\t" "xvabsdp 49, 41 \n\t"
"xvabsdp 50, 42 \n\t" "xvabsdp 50, 42 \n\t"
"xvabsdp 51, 43 \n\t" "xvabsdp 51, 43 \n\t"
"lxvd2x 40, 0, %2 \n\t" "lxvd2x 40, 0, %2 \n\t"
"lxvd2x 41, %5, %2 \n\t" "lxvd2x 41, %8, %2 \n\t"
"xvabsdp 52, 44 \n\t" "xvabsdp %x3, 44 \n\t"
"xvabsdp 53, 45 \n\t" "xvabsdp %x4, 45 \n\t"
"lxvd2x 42, %6, %2 \n\t" "lxvd2x 42, %9, %2 \n\t"
"lxvd2x 43, %7, %2 \n\t" "lxvd2x 43, %10, %2 \n\t"
"xvabsdp 54, 46 \n\t" "xvabsdp %x5, 46 \n\t"
"xvabsdp 55, 47 \n\t" "xvabsdp %x6, 47 \n\t"
"lxvd2x 44, %8, %2 \n\t" "lxvd2x 44, %11, %2 \n\t"
"lxvd2x 45, %9, %2 \n\t" "lxvd2x 45, %12, %2 \n\t"
"xvadddp 32, 32, 48 \n\t" "xvadddp 32, 32, 48 \n\t"
"xvadddp 33, 33, 49 \n\t" "xvadddp 33, 33, 49 \n\t"
"lxvd2x 46, %10, %2 \n\t" "lxvd2x 46, %13, %2 \n\t"
"lxvd2x 47, %11, %2 \n\t" "lxvd2x 47, %14, %2 \n\t"
"xvadddp 34, 34, 50 \n\t" "xvadddp 34, 34, 50 \n\t"
"xvadddp 35, 35, 51 \n\t" "xvadddp 35, 35, 51 \n\t"
"addi %2, %2, 128 \n\t" "addi %2, %2, 128 \n\t"
"xvadddp 36, 36, 52 \n\t" "xvadddp 36, 36, %x3 \n\t"
"xvadddp 37, 37, 53 \n\t" "xvadddp 37, 37, %x4 \n\t"
"addic. %0 , %0 , -8 \n\t" "addic. %1, %1, -8 \n\t"
"xvadddp 38, 38, 54 \n\t" "xvadddp 38, 38, %x5 \n\t"
"xvadddp 39, 39, 55 \n\t" "xvadddp 39, 39, %x6 \n\t"
"bgt 1b \n\t" "bgt 1b \n"
"2: \n\t" "2: \n\t"
"xvabsdp 48, 40 \n\t" "xvabsdp 48, 40 \n\t"
"xvabsdp 49, 41 \n\t" "xvabsdp 49, 41 \n\t"
"xvabsdp 50, 42 \n\t" "xvabsdp 50, 42 \n\t"
"xvabsdp 51, 43 \n\t" "xvabsdp 51, 43 \n\t"
"xvabsdp 52, 44 \n\t" "xvabsdp %x3, 44 \n\t"
"xvabsdp 53, 45 \n\t" "xvabsdp %x4, 45 \n\t"
"xvabsdp 54, 46 \n\t" "xvabsdp %x5, 46 \n\t"
"xvabsdp 55, 47 \n\t" "xvabsdp %x6, 47 \n\t"
"xvadddp 32, 32, 48 \n\t" "xvadddp 32, 32, 48 \n\t"
"xvadddp 33, 33, 49 \n\t" "xvadddp 33, 33, 49 \n\t"
"xvadddp 34, 34, 50 \n\t" "xvadddp 34, 34, 50 \n\t"
"xvadddp 35, 35, 51 \n\t" "xvadddp 35, 35, 51 \n\t"
"xvadddp 36, 36, 52 \n\t" "xvadddp 36, 36, %x3 \n\t"
"xvadddp 37, 37, 53 \n\t" "xvadddp 37, 37, %x4 \n\t"
"xvadddp 38, 38, 54 \n\t" "xvadddp 38, 38, %x5 \n\t"
"xvadddp 39, 39, 55 \n\t" "xvadddp 39, 39, %x6 \n\t"
"xvadddp 32, 32, 33 \n\t" "xvadddp 32, 32, 33 \n\t"
"xvadddp 34, 34, 35 \n\t" "xvadddp 34, 34, 35 \n\t"
@ -152,26 +140,34 @@ static void zasum_kernel_8( BLASLONG n, FLOAT *x, FLOAT *svec)
"xvadddp 32, 32, 36 \n\t" "xvadddp 32, 32, 36 \n\t"
"xxswapd 33, 32 \n\t"
"xsadddp %x0, 32, 33 \n"
"stxvd2x 32, 0, %3 \n\t" "#n=%1 x=%3=%2 sum=%0 o16=%8 o32=%9 o48=%10 o64=%11 o80=%12 o96=%13 o112=%14\n"
"#t0=%x3 t1=%x4 t2=%x5 t3=%x6"
: :
"=d" (sum), // 0
"+r" (n), // 1
"+b" (x), // 2
"=wa" (t0), // 3
"=wa" (t1), // 4
"=wa" (t2), // 5
"=wa" (t3) // 6
: :
"r" (i), // 0 "m" (*x),
"r" (n), // 1 "b" (16), // 8
"r" (x1), // 2 "b" (32), // 9
"r" (svec), // 3 "b" (48), // 10
"r" (pre), // 4 "b" (64), // 11
"r" (o16), // 5 "b" (80), // 12
"r" (o32), // 6 "b" (96), // 13
"r" (o48), // 7 "b" (112) // 14
"r" (o64), // 8 :
"r" (o80), // 9 "cr0",
"r" (o96), // 10 "vs32","vs33","vs34","vs35","vs36","vs37","vs38","vs39",
"r" (o112) // 11 "vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47",
: "cr0", "%0", "%2", "memory" "vs48","vs49","vs50","vs51"
); );
return sum;
} }


@ -78,7 +78,6 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i,
{ {
BLASLONG i=0; BLASLONG i=0;
BLASLONG ix=0,iy=0; BLASLONG ix=0,iy=0;
FLOAT da[4];
if ( n <= 0 ) return(0); if ( n <= 0 ) return(0);
@ -89,11 +88,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i,
if ( n1 ) if ( n1 )
{ {
da[0] = da_r; zaxpy_kernel_4 (n1, x, y, da_r, da_i);
da[1] = da_r;
da[2] = da_i;
da[3] = da_i;
zaxpy_kernel_4(n1, x, y , da );
ix = 2 * n1; ix = 2 * n1;
} }
i = n1; i = n1;


@ -35,162 +35,157 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define HAVE_KERNEL_4 1 #define HAVE_KERNEL_4 1
static void zaxpy_kernel_4( BLASLONG n, FLOAT *x, FLOAT *y , FLOAT *alpha) __attribute__ ((noinline)); static void zaxpy_kernel_4 (long n, double *x, double *y,
double alpha_r, double alpha_i)
static void zaxpy_kernel_4( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
{ {
BLASLONG i = n;
BLASLONG o16 = 16;
BLASLONG o32 = 32;
BLASLONG o48 = 48;
FLOAT *x1=x;
FLOAT *y1=y;
FLOAT *y2=y+1;
BLASLONG pre = 384;
#if !defined(CONJ) #if !defined(CONJ)
FLOAT mvec[2] = { -1.0, 1.0 }; static const double mvec[2] = { -1.0, 1.0 };
#else #else
FLOAT mvec[2] = { 1.0, -1.0 }; static const double mvec[2] = { 1.0, -1.0 };
#endif #endif
const double *mvecp = mvec;
__vector double t0;
__vector double t1;
__vector double t2;
__vector double t3;
__vector double t4;
__vector double t5;
__vector double t6;
__vector double t7;
__vector double t8;
__vector double t9;
__vector double t10;
__vector double t11;
long ytmp;
__asm__ __volatile__ __asm__
( (
"xxspltd 32, %x19, 0 \n\t" // alpha_r
"xxspltd 33, %x20, 0 \n\t" // alpha_i
"lxsdx 34, 0 , %4 \n\t" // alpha_r "lxvd2x 36, 0, %21 \n\t" // mvec
"lxsdx 35, %5, %4 \n\t" // alpha_i
"xxspltd 32, 34, 0 \n\t"
"xxspltd 33, 35, 0 \n\t"
"lxvd2x 36, 0, %9 \n\t" // mvec
#if !defined(CONJ) #if !defined(CONJ)
"xvmuldp 33, 33 , 36 \n\t" // alpha_i * mvec "xvmuldp 33, 33, 36 \n\t" // alpha_i * mvec
#else #else
"xvmuldp 32, 32 , 36 \n\t" // alpha_r * mvec "xvmuldp 32, 32, 36 \n\t" // alpha_r * mvec
#endif #endif
"addi %8, %8, -8 \n\t" "mr %16, %3 \n\t"
"dcbt 0, %2 \n\t"
"dcbt %2, %10 \n\t" "dcbt 0, %3 \n\t"
"dcbt %3, %10 \n\t"
"lxvd2x 40, 0, %2 \n\t" // x0 "lxvd2x 40, 0, %2 \n\t" // x0
"lxvd2x 41, %5, %2 \n\t" // x1 "lxvd2x 41, %22, %2 \n\t" // x1
"lxvd2x 42, %6, %2 \n\t" // x2 "lxvd2x 42, %23, %2 \n\t" // x2
"lxvd2x 43, %7, %2 \n\t" // x3 "lxvd2x 43, %24, %2 \n\t" // x3
"lxvd2x 48, 0, %3 \n\t" // y0 "lxvd2x 48, 0, %3 \n\t" // y0
"lxvd2x 49, %5, %3 \n\t" // y1 "lxvd2x 49, %22, %3 \n\t" // y1
"lxvd2x 50, %6, %3 \n\t" // y2 "lxvd2x 50, %23, %3 \n\t" // y2
"lxvd2x 51, %7, %3 \n\t" // y3 "lxvd2x 51, %24, %3 \n\t" // y3
"xxswapd 56, 40 \n\t" // exchange real and imag part "xxswapd %x8, 40 \n\t" // exchange real and imag part
"xxswapd 57, 41 \n\t" // exchange real and imag part "xxswapd %x9, 41 \n\t" // exchange real and imag part
"xxswapd 58, 42 \n\t" // exchange real and imag part "xxswapd %x10, 42 \n\t" // exchange real and imag part
"xxswapd 59, 43 \n\t" // exchange real and imag part "xxswapd %x11, 43 \n\t" // exchange real and imag part
"addi %2, %2, 64 \n\t" "addi %2, %2, 64 \n\t"
"addi %3, %3, 64 \n\t" "addi %3, %3, 64 \n\t"
"lxvd2x 44, 0, %2 \n\t" // x4 "lxvd2x 44, 0, %2 \n\t" // x4
"lxvd2x 45, %5, %2 \n\t" // x5 "lxvd2x 45, %22, %2 \n\t" // x5
"lxvd2x 46, %6, %2 \n\t" // x6 "lxvd2x 46, %23, %2 \n\t" // x6
"lxvd2x 47, %7, %2 \n\t" // x7 "lxvd2x 47, %24, %2 \n\t" // x7
"lxvd2x 52, 0, %3 \n\t" // y4 "lxvd2x %x4, 0, %3 \n\t" // y4
"lxvd2x 53, %5, %3 \n\t" // y5 "lxvd2x %x5, %22, %3 \n\t" // y5
"lxvd2x 54, %6, %3 \n\t" // y6 "lxvd2x %x6, %23, %3 \n\t" // y6
"lxvd2x 55, %7, %3 \n\t" // y7 "lxvd2x %x7, %24, %3 \n\t" // y7
"xxswapd 60, 44 \n\t" // exchange real and imag part "xxswapd %x12, 44 \n\t" // exchange real and imag part
"xxswapd 61, 45 \n\t" // exchange real and imag part "xxswapd %x13, 45 \n\t" // exchange real and imag part
"xxswapd 62, 46 \n\t" // exchange real and imag part "xxswapd %x14, 46 \n\t" // exchange real and imag part
"xxswapd 63, 47 \n\t" // exchange real and imag part "xxswapd %x15, 47 \n\t" // exchange real and imag part
"addi %2, %2, 64 \n\t" "addi %2, %2, 64 \n\t"
"addi %3, %3, 64 \n\t" "addi %3, %3, 64 \n\t"
"addic. %0 , %0 , -8 \n\t" "addic. %1, %1, -8 \n\t"
"ble 2f \n\t" "ble 2f \n\t"
".align 5 \n\t" ".p2align 5 \n"
"1: \n\t" "1: \n\t"
"dcbt %2, %10 \n\t"
"dcbt %3, %10 \n\t"
"xvmaddadp 48, 40, 32 \n\t" // alpha_r * x0_r , alpha_r * x0_i "xvmaddadp 48, 40, 32 \n\t" // alpha_r * x0_r , alpha_r * x0_i
"xvmaddadp 49, 41, 32 \n\t" "xvmaddadp 49, 41, 32 \n\t"
"lxvd2x 40, 0, %2 \n\t" // x0 "lxvd2x 40, 0, %2 \n\t" // x0
"lxvd2x 41, %5, %2 \n\t" // x1 "lxvd2x 41, %22, %2 \n\t" // x1
"xvmaddadp 50, 42, 32 \n\t" "xvmaddadp 50, 42, 32 \n\t"
"xvmaddadp 51, 43, 32 \n\t" "xvmaddadp 51, 43, 32 \n\t"
"lxvd2x 42, %6, %2 \n\t" // x2 "lxvd2x 42, %23, %2 \n\t" // x2
"lxvd2x 43, %7, %2 \n\t" // x3 "lxvd2x 43, %24, %2 \n\t" // x3
"xvmaddadp 52, 44, 32 \n\t" "xvmaddadp %x4, 44, 32 \n\t"
"addi %2, %2, 64 \n\t" "addi %2, %2, 64 \n\t"
"xvmaddadp 53, 45, 32 \n\t" "xvmaddadp %x5, 45, 32 \n\t"
"lxvd2x 44, 0, %2 \n\t" // x4 "lxvd2x 44, 0, %2 \n\t" // x4
"lxvd2x 45, %5, %2 \n\t" // x5 "lxvd2x 45, %22, %2 \n\t" // x5
"xvmaddadp 54, 46, 32 \n\t" "xvmaddadp %x6, 46, 32 \n\t"
"xvmaddadp 55, 47, 32 \n\t" "xvmaddadp %x7, 47, 32 \n\t"
"lxvd2x 46, %6, %2 \n\t" // x6 "lxvd2x 46, %23, %2 \n\t" // x6
"lxvd2x 47, %7, %2 \n\t" // x7 "lxvd2x 47, %24, %2 \n\t" // x7
"xvmaddadp 48, 56, 33 \n\t" // alpha_i * x0_i , alpha_i * x0_r "xvmaddadp 48, %x8, 33 \n\t" // alpha_i * x0_i , alpha_i * x0_r
"addi %2, %2, 64 \n\t" "addi %2, %2, 64 \n\t"
"xvmaddadp 49, 57, 33 \n\t" "xvmaddadp 49, %x9, 33 \n\t"
"xvmaddadp 50, 58, 33 \n\t" "xvmaddadp 50, %x10, 33 \n\t"
"xvmaddadp 51, 59, 33 \n\t" "xvmaddadp 51, %x11, 33 \n\t"
"xvmaddadp 52, 60, 33 \n\t" "xvmaddadp %x4, %x12, 33 \n\t"
"xvmaddadp 53, 61, 33 \n\t" "xvmaddadp %x5, %x13, 33 \n\t"
"xvmaddadp 54, 62, 33 \n\t" "xvmaddadp %x6, %x14, 33 \n\t"
"xvmaddadp 55, 63, 33 \n\t" "xvmaddadp %x7, %x15, 33 \n\t"
"stxvd2x 48, 0, %8 \n\t" "stxvd2x 48, 0, %16 \n\t"
"stxvd2x 49, %5, %8 \n\t" "stxvd2x 49, %22, %16 \n\t"
"stxvd2x 50, %6, %8 \n\t" "stxvd2x 50, %23, %16 \n\t"
"stxvd2x 51, %7, %8 \n\t" "stxvd2x 51, %24, %16 \n\t"
"addi %8, %8, 64 \n\t" "addi %16, %16, 64 \n\t"
"stxvd2x 52, 0, %8 \n\t" "stxvd2x %x4, 0, %16 \n\t"
"stxvd2x 53, %5, %8 \n\t" "stxvd2x %x5, %22, %16 \n\t"
"stxvd2x 54, %6, %8 \n\t" "stxvd2x %x6, %23, %16 \n\t"
"stxvd2x 55, %7, %8 \n\t" "stxvd2x %x7, %24, %16 \n\t"
"addi %8, %8, 64 \n\t" "addi %16, %16, 64 \n\t"
"xxswapd 56, 40 \n\t" // exchange real and imag part "xxswapd %x8, 40 \n\t" // exchange real and imag part
"xxswapd 57, 41 \n\t" // exchange real and imag part "xxswapd %x9, 41 \n\t" // exchange real and imag part
"lxvd2x 48, 0, %3 \n\t" // y0 "lxvd2x 48, 0, %3 \n\t" // y0
"lxvd2x 49, %5, %3 \n\t" // y1 "lxvd2x 49, %22, %3 \n\t" // y1
"xxswapd 58, 42 \n\t" // exchange real and imag part "xxswapd %x10, 42 \n\t" // exchange real and imag part
"xxswapd 59, 43 \n\t" // exchange real and imag part "xxswapd %x11, 43 \n\t" // exchange real and imag part
"lxvd2x 50, %6, %3 \n\t" // y2 "lxvd2x 50, %23, %3 \n\t" // y2
"lxvd2x 51, %7, %3 \n\t" // y3 "lxvd2x 51, %24, %3 \n\t" // y3
"xxswapd 60, 44 \n\t" // exchange real and imag part "xxswapd %x12, 44 \n\t" // exchange real and imag part
"addi %3, %3, 64 \n\t" "addi %3, %3, 64 \n\t"
"xxswapd 61, 45 \n\t" // exchange real and imag part "xxswapd %x13, 45 \n\t" // exchange real and imag part
"lxvd2x 52, 0, %3 \n\t" // y4 "lxvd2x %x4, 0, %3 \n\t" // y4
"lxvd2x 53, %5, %3 \n\t" // y5 "lxvd2x %x5, %22, %3 \n\t" // y5
"xxswapd 62, 46 \n\t" // exchange real and imag part "xxswapd %x14, 46 \n\t" // exchange real and imag part
"xxswapd 63, 47 \n\t" // exchange real and imag part "xxswapd %x15, 47 \n\t" // exchange real and imag part
"lxvd2x 54, %6, %3 \n\t" // y6 "lxvd2x %x6, %23, %3 \n\t" // y6
"lxvd2x 55, %7, %3 \n\t" // y7 "lxvd2x %x7, %24, %3 \n\t" // y7
"addi %3, %3, 64 \n\t" "addi %3, %3, 64 \n\t"
"addic. %0 , %0 , -8 \n\t" "addic. %1, %1, -8 \n\t"
"bgt 1b \n\t" "bgt 1b \n"
"2: \n\t" "2: \n\t"
@ -199,52 +194,66 @@ static void zaxpy_kernel_4( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
"xvmaddadp 50, 42, 32 \n\t" "xvmaddadp 50, 42, 32 \n\t"
"xvmaddadp 51, 43, 32 \n\t" "xvmaddadp 51, 43, 32 \n\t"
"xvmaddadp 52, 44, 32 \n\t" "xvmaddadp %x4, 44, 32 \n\t"
"xvmaddadp 53, 45, 32 \n\t" "xvmaddadp %x5, 45, 32 \n\t"
"xvmaddadp 54, 46, 32 \n\t" "xvmaddadp %x6, 46, 32 \n\t"
"xvmaddadp 55, 47, 32 \n\t" "xvmaddadp %x7, 47, 32 \n\t"
"xvmaddadp 48, 56, 33 \n\t" // alpha_i * x0_i , alpha_i * x0_r "xvmaddadp 48, %x8, 33 \n\t" // alpha_i * x0_i , alpha_i * x0_r
"xvmaddadp 49, 57, 33 \n\t" "xvmaddadp 49, %x9, 33 \n\t"
"xvmaddadp 50, 58, 33 \n\t" "xvmaddadp 50, %x10, 33 \n\t"
"xvmaddadp 51, 59, 33 \n\t" "xvmaddadp 51, %x11, 33 \n\t"
"xvmaddadp 52, 60, 33 \n\t" "xvmaddadp %x4, %x12, 33 \n\t"
"xvmaddadp 53, 61, 33 \n\t" "xvmaddadp %x5, %x13, 33 \n\t"
"xvmaddadp 54, 62, 33 \n\t" "xvmaddadp %x6, %x14, 33 \n\t"
"xvmaddadp 55, 63, 33 \n\t" "xvmaddadp %x7, %x15, 33 \n\t"
"stxvd2x 48, 0, %16 \n\t"
"stxvd2x 49, %22, %16 \n\t"
"stxvd2x 50, %23, %16 \n\t"
"stxvd2x 51, %24, %16 \n\t"
"stxvd2x 48, 0, %8 \n\t" "addi %16, %16, 64 \n\t"
"stxvd2x 49, %5, %8 \n\t"
"stxvd2x 50, %6, %8 \n\t"
"stxvd2x 51, %7, %8 \n\t"
"addi %8, %8, 64 \n\t" "stxvd2x %x4, 0, %16 \n\t"
"stxvd2x %x5, %22, %16 \n\t"
"stxvd2x 52, 0, %8 \n\t" "stxvd2x %x6, %23, %16 \n\t"
"stxvd2x 53, %5, %8 \n\t" "stxvd2x %x7, %24, %16 \n"
"stxvd2x 54, %6, %8 \n\t"
"stxvd2x 55, %7, %8 \n\t"
"addi %8, %8, 64 \n\t"
"#n=%1 x=%17=%2 y=%0=%3 alpha=(%19,%20) mvecp=%18=%16 o16=%22 o32=%23 o48=%24 ytmp=%16\n"
"#t0=%x4 t1=%x5 t2=%x6 t3=%x7 t4=%x8 t5=%x9 t6=%x10 t7=%x11 t8=%x12 t9=%x13 t10=%x14 t11=%x15"
: :
"+m" (*y),
"+r" (n), // 1
"+b" (x), // 2
"+b" (y), // 3
"=wa" (t0), // 4
"=wa" (t1), // 5
"=wa" (t2), // 6
"=wa" (t3), // 7
"=wa" (t4), // 8
"=wa" (t5), // 9
"=wa" (t6), // 10
"=wa" (t7), // 11
"=wa" (t8), // 12
"=wa" (t9), // 13
"=wa" (t10), // 14
"=wa" (t11), // 15
"=b" (ytmp) // 16
: :
"r" (i), // 0 "m" (*x),
"r" (n), // 1 "m" (*mvecp),
"r" (x1), // 2 "d" (alpha_r), // 19
"r" (y1), // 3 "d" (alpha_i), // 20
"r" (alpha), // 4 "16" (mvecp), // 21
"r" (o16), // 5 "b" (16), // 22
"r" (o32), // 6 "b" (32), // 23
"r" (o48), // 7 "b" (48) // 24
"r" (y2), // 8 :
"r" (mvec), // 9 "cr0",
"r" (pre) // 10 "vs32","vs33","vs34","vs35","vs36","vs37","vs38","vs39",
: "cr0", "%0", "%2" , "%3", "%8", "memory" "vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47",
"vs48","vs49","vs50","vs51"
); );
} }
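
In the new zaxpy_kernel_4 above, alpha_r and alpha_i come in as plain doubles through "d" (floating-point register) inputs and are splatted in place with xxspltd on the %x-printed register, so the caller no longer stages them in an aligned array; only the conjugation mask mvec still lives in memory. A reduced sketch of that scalar-in, splat-in-register pattern — scale2 is a hypothetical helper, not part of this commit, assuming a POWER8 target (-mcpu=power8) and GCC extended asm:

static void scale2 (double *x, double alpha)
{
    __vector double va, vx;

    __asm__
    (
        "xxspltd %x1, %x4, 0            \n\t"   // { alpha, alpha }
        "lxvd2x  %x2, 0, %3             \n\t"   // x[0], x[1]
        "xvmuldp %x2, %x2, %x1          \n\t"
        "stxvd2x %x2, 0, %3             \n"
      :
        "+m" (*(double (*)[2]) x),
        "=wa" (va),                     // alpha splat, register chosen by GCC
        "=wa" (vx)
      :
        "b" (x),
        "d" (alpha)                     // scalar passed straight in an FP register
    );
}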


@ -35,27 +35,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define HAVE_KERNEL_16 1 #define HAVE_KERNEL_16 1
static void zcopy_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y) __attribute__ ((noinline)); static void zcopy_kernel_16 (long n, FLOAT *x, FLOAT *y)
static void zcopy_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y)
{ {
__asm__
BLASLONG i = n;
BLASLONG o16 = 16;
BLASLONG o32 = 32;
BLASLONG o48 = 48;
BLASLONG o64 = 64;
BLASLONG o80 = 80;
BLASLONG o96 = 96;
BLASLONG o112 = 112;
FLOAT *x1=x;
FLOAT *y1=y;
BLASLONG pre = 384;
BLASLONG alpha=0;
__asm__ __volatile__
( (
"lxvd2x 32, 0, %2 \n\t"
"lxvd2x 33, %5, %2 \n\t"
"lxvd2x 34, %6, %2 \n\t"
"lxvd2x 35, %7, %2 \n\t"
"lxvd2x 36, %8, %2 \n\t"
"lxvd2x 37, %9, %2 \n\t"
"lxvd2x 38, %10, %2 \n\t"
"lxvd2x 39, %11, %2 \n\t"
"addi %2, %2, 128 \n\t"
"lxvd2x 40, 0, %2 \n\t" "lxvd2x 40, 0, %2 \n\t"
"lxvd2x 41, %5, %2 \n\t" "lxvd2x 41, %5, %2 \n\t"
@ -68,107 +61,95 @@ static void zcopy_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y)
"addi %2, %2, 128 \n\t" "addi %2, %2, 128 \n\t"
"lxvd2x 50, 0, %2 \n\t" "addic. %1, %1, -16 \n\t"
"lxvd2x 51, %5, %2 \n\t"
"lxvd2x 52, %6, %2 \n\t"
"lxvd2x 53, %7, %2 \n\t"
"lxvd2x 54, %8, %2 \n\t"
"lxvd2x 55, %9, %2 \n\t"
"lxvd2x 56, %10, %2 \n\t"
"lxvd2x 57, %11, %2 \n\t"
"addi %2, %2, 128 \n\t"
"addic. %0 , %0 , -16 \n\t"
"ble 2f \n\t" "ble 2f \n\t"
".align 5 \n\t" ".p2align 5 \n"
"1: \n\t" "1: \n\t"
"stxvd2x 40, 0, %1 \n\t" "stxvd2x 32, 0, %3 \n\t"
"stxvd2x 41, %5, %1 \n\t" "stxvd2x 33, %5, %3 \n\t"
"lxvd2x 32, 0, %2 \n\t"
"lxvd2x 33, %5, %2 \n\t"
"stxvd2x 34, %6, %3 \n\t"
"stxvd2x 35, %7, %3 \n\t"
"lxvd2x 34, %6, %2 \n\t"
"lxvd2x 35, %7, %2 \n\t"
"stxvd2x 36, %8, %3 \n\t"
"stxvd2x 37, %9, %3 \n\t"
"lxvd2x 36, %8, %2 \n\t"
"lxvd2x 37, %9, %2 \n\t"
"stxvd2x 38, %10, %3 \n\t"
"stxvd2x 39, %11, %3 \n\t"
"lxvd2x 38, %10, %2 \n\t"
"lxvd2x 39, %11, %2 \n\t"
"addi %3, %3, 128 \n\t"
"addi %2, %2, 128 \n\t"
"stxvd2x 40, 0, %3 \n\t"
"stxvd2x 41, %5, %3 \n\t"
"lxvd2x 40, 0, %2 \n\t" "lxvd2x 40, 0, %2 \n\t"
"lxvd2x 41, %5, %2 \n\t" "lxvd2x 41, %5, %2 \n\t"
"stxvd2x 42, %6, %1 \n\t" "stxvd2x 42, %6, %3 \n\t"
"stxvd2x 43, %7, %1 \n\t" "stxvd2x 43, %7, %3 \n\t"
"lxvd2x 42, %6, %2 \n\t" "lxvd2x 42, %6, %2 \n\t"
"lxvd2x 43, %7, %2 \n\t" "lxvd2x 43, %7, %2 \n\t"
"stxvd2x 44, %8, %1 \n\t" "stxvd2x 44, %8, %3 \n\t"
"stxvd2x 45, %9, %1 \n\t" "stxvd2x 45, %9, %3 \n\t"
"lxvd2x 44, %8, %2 \n\t" "lxvd2x 44, %8, %2 \n\t"
"lxvd2x 45, %9, %2 \n\t" "lxvd2x 45, %9, %2 \n\t"
"stxvd2x 46, %10, %1 \n\t" "stxvd2x 46, %10, %3 \n\t"
"stxvd2x 47, %11, %1 \n\t" "stxvd2x 47, %11, %3 \n\t"
"lxvd2x 46, %10, %2 \n\t" "lxvd2x 46, %10, %2 \n\t"
"lxvd2x 47, %11, %2 \n\t" "lxvd2x 47, %11, %2 \n\t"
"addi %3, %3, 128 \n\t"
"addi %1, %1, 128 \n\t"
"addi %2, %2, 128 \n\t" "addi %2, %2, 128 \n\t"
"stxvd2x 50, 0, %1 \n\t" "addic. %1, %1, -16 \n\t"
"stxvd2x 51, %5, %1 \n\t" "bgt 1b \n"
"lxvd2x 50, 0, %2 \n\t"
"lxvd2x 51, %5, %2 \n\t"
"stxvd2x 52, %6, %1 \n\t"
"stxvd2x 53, %7, %1 \n\t"
"lxvd2x 52, %6, %2 \n\t"
"lxvd2x 53, %7, %2 \n\t"
"stxvd2x 54, %8, %1 \n\t"
"stxvd2x 55, %9, %1 \n\t"
"lxvd2x 54, %8, %2 \n\t"
"lxvd2x 55, %9, %2 \n\t"
"stxvd2x 56, %10, %1 \n\t"
"stxvd2x 57, %11, %1 \n\t"
"lxvd2x 56, %10, %2 \n\t"
"lxvd2x 57, %11, %2 \n\t"
"addi %1, %1, 128 \n\t"
"addi %2, %2, 128 \n\t"
"addic. %0 , %0 , -16 \n\t"
"bgt 1b \n\t"
"2: \n\t" "2: \n\t"
"stxvd2x 40, 0, %1 \n\t" "stxvd2x 32, 0, %3 \n\t"
"stxvd2x 41, %5, %1 \n\t" "stxvd2x 33, %5, %3 \n\t"
"stxvd2x 42, %6, %1 \n\t" "stxvd2x 34, %6, %3 \n\t"
"stxvd2x 43, %7, %1 \n\t" "stxvd2x 35, %7, %3 \n\t"
"stxvd2x 44, %8, %1 \n\t" "stxvd2x 36, %8, %3 \n\t"
"stxvd2x 45, %9, %1 \n\t" "stxvd2x 37, %9, %3 \n\t"
"stxvd2x 46, %10, %1 \n\t" "stxvd2x 38, %10, %3 \n\t"
"stxvd2x 47, %11, %1 \n\t" "stxvd2x 39, %11, %3 \n\t"
"addi %1, %1, 128 \n\t" "addi %3, %3, 128 \n\t"
"stxvd2x 50, 0, %1 \n\t"
"stxvd2x 51, %5, %1 \n\t"
"stxvd2x 52, %6, %1 \n\t"
"stxvd2x 53, %7, %1 \n\t"
"stxvd2x 54, %8, %1 \n\t"
"stxvd2x 55, %9, %1 \n\t"
"stxvd2x 56, %10, %1 \n\t"
"stxvd2x 57, %11, %1 \n\t"
"stxvd2x 40, 0, %3 \n\t"
"stxvd2x 41, %5, %3 \n\t"
"stxvd2x 42, %6, %3 \n\t"
"stxvd2x 43, %7, %3 \n\t"
"stxvd2x 44, %8, %3 \n\t"
"stxvd2x 45, %9, %3 \n\t"
"stxvd2x 46, %10, %3 \n\t"
"stxvd2x 47, %11, %3 \n"
"#n=%1 x=%4=%2 y=%0=%3 o16=%5 o32=%6 o48=%7 o64=%8 o80=%9 o96=%10 o112=%11"
: :
"=m" (*y),
"+r" (n), // 1
"+b" (x), // 2
"+b" (y) // 3
: :
"r" (i), // 0 "m" (*x),
"r" (y1), // 1 "b" (16), // 5
"r" (x1), // 2 "b" (32), // 6
"r" (alpha), // 3 "b" (48), // 7
"r" (pre), // 4 "b" (64), // 8
"r" (o16), // 5 "b" (80), // 9
"r" (o32), // 6 "b" (96), // 10
"r" (o48), // 7 "b" (112) // 11
"r" (o64), // 8 :
"r" (o80), // 9 "cr0",
"r" (o96), // 10 "vs32","vs33","vs34","vs35","vs36","vs37","vs38","vs39",
"r" (o112) // 11 "vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47"
: "cr0", "%0", "%2" , "%1", "memory"
); );
} }
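
The zcopy rewrite above separates the two sides of the transfer: *x is an input-only memory operand and *y an output, which is exactly the information GCC needs to keep the loads and stores ordered without a blanket "memory" clobber, a __volatile__ qualifier, or a noinline wrapper. A small sketch of the same constraint layout — copy4 is a hypothetical helper, not the kernel itself; it assumes a POWER8 target (-mcpu=power8), GCC extended asm, and n a positive multiple of 4:

static void copy4 (long n, const double *x, double *y)
{
    __asm__
    (
        "1:                             \n\t"
        "lxvd2x  32, 0, %2              \n\t"   // x[0], x[1]
        "lxvd2x  33, %4, %2             \n\t"   // x[2], x[3]
        "stxvd2x 32, 0, %3              \n\t"
        "stxvd2x 33, %4, %3             \n\t"
        "addi    %2, %2, 32             \n\t"
        "addi    %3, %3, 32             \n\t"
        "addic.  %1, %1, -4             \n\t"
        "bgt     1b                     \n"
      :
        "=m" (*(double (*)[]) y),       // written only
        "+r" (n),
        "+b" (x),
        "+b" (y)
      :
        "b" (16),
        "m" (*(const double (*)[]) x)   // read only
      :
        "cr0", "vs32", "vs33"
    );
}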


@ -43,8 +43,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#ifndef HAVE_KERNEL_8 #ifndef HAVE_KERNEL_8
static void zdot_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *d) __attribute__ ((noinline));
static void zdot_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *d) static void zdot_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *d)
{ {
BLASLONG register i = 0; BLASLONG register i = 0;


@ -34,136 +34,121 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
**************************************************************************************/ **************************************************************************************/
#define HAVE_KERNEL_8 1 #define HAVE_KERNEL_8 1
static void zdot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y , FLOAT *dot) __attribute__ ((noinline));
static void zdot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) static void zdot_kernel_8 (long n, double *x, double *y, double *dot)
{ {
__asm__
BLASLONG i = n;
BLASLONG o16 = 16;
BLASLONG o32 = 32;
BLASLONG o48 = 48;
FLOAT *x1=x;
FLOAT *y1=y;
BLASLONG pre = 384;
__asm__ __volatile__
( (
"xxlxor 32,32,32 \n\t" "dcbt 0, %2 \n\t"
"xxlxor 33,33,33 \n\t" "dcbt 0, %3 \n\t"
"xxlxor 34,34,34 \n\t"
"xxlxor 35,35,35 \n\t"
"xxlxor 36,36,36 \n\t"
"xxlxor 37,37,37 \n\t"
"xxlxor 38,38,38 \n\t"
"xxlxor 39,39,39 \n\t"
"dcbt %2, %8 \n\t" "xxlxor 32, 32, 32 \n\t"
"dcbt %3, %8 \n\t" "xxlxor 33, 33, 33 \n\t"
"xxlxor 34, 34, 34 \n\t"
"xxlxor 35, 35, 35 \n\t"
"xxlxor 36, 36, 36 \n\t"
"xxlxor 37, 37, 37 \n\t"
"xxlxor 38, 38, 38 \n\t"
"xxlxor 39, 39, 39 \n\t"
"lxvd2x 40, 0, %2 \n\t" // x0_r, x0_i "lxvd2x 40, 0, %2 \n\t" // x0_r, x0_i
"lxvd2x 48, 0, %3 \n\t" // y0_r, y0_i "lxvd2x 48, 0, %3 \n\t" // y0_r, y0_i
"lxvd2x 41, %5, %2 \n\t" // x1_r, x1_i "lxvd2x 41, %7, %2 \n\t" // x1_r, x1_i
"lxvd2x 49, %5, %3 \n\t" // y1_r, y1_i "lxvd2x 49, %7, %3 \n\t" // y1_r, y1_i
"lxvd2x 42, %6, %2 \n\t" // x2_r, x2_i "lxvd2x 42, %8, %2 \n\t" // x2_r, x2_i
"lxvd2x 50, %6, %3 \n\t" // y2_r, y2_i "lxvd2x 50, %8, %3 \n\t" // y2_r, y2_i
"lxvd2x 43, %7, %2 \n\t" // x3_r, x3_i "lxvd2x 43, %9, %2 \n\t" // x3_r, x3_i
"lxvd2x 51, %7, %3 \n\t" // y3_r, y3_i "lxvd2x 51, %9, %3 \n\t" // y3_r, y3_i
"xxswapd 52,48 \n\t" // y0_i, y0_r "xxswapd 0, 48 \n\t" // y0_i, y0_r
"xxswapd 53,49 \n\t" // y1_i, y1_r "xxswapd 1, 49 \n\t" // y1_i, y1_r
"xxswapd 54,50 \n\t" // y2_i, y2_r "xxswapd 2, 50 \n\t" // y2_i, y2_r
"xxswapd 55,51 \n\t" // y3_i, y3_r "xxswapd 3, 51 \n\t" // y3_i, y3_r
"addi %2, %2, 64 \n\t" "addi %2, %2, 64 \n\t"
"addi %3, %3, 64 \n\t" "addi %3, %3, 64 \n\t"
"lxvd2x 44, 0, %2 \n\t" // x0_r, x0_i "lxvd2x 44, 0, %2 \n\t" // x0_r, x0_i
"lxvd2x 56, 0, %3 \n\t" // y0_r, y0_i "lxvd2x 4, 0, %3 \n\t" // y0_r, y0_i
"lxvd2x 45, %5, %2 \n\t" // x1_r, x1_i "lxvd2x 45, %7, %2 \n\t" // x1_r, x1_i
"lxvd2x 57, %5, %3 \n\t" // y1_r, y1_i "lxvd2x 5, %7, %3 \n\t" // y1_r, y1_i
"lxvd2x 46, %6, %2 \n\t" // x2_r, x2_i "lxvd2x 46, %8, %2 \n\t" // x2_r, x2_i
"lxvd2x 58, %6, %3 \n\t" // y2_r, y2_i "lxvd2x 6, %8, %3 \n\t" // y2_r, y2_i
"lxvd2x 47, %7, %2 \n\t" // x3_r, x3_i "lxvd2x 47, %9, %2 \n\t" // x3_r, x3_i
"lxvd2x 59, %7, %3 \n\t" // y3_r, y3_i "lxvd2x 7, %9, %3 \n\t" // y3_r, y3_i
"xxswapd 60,56 \n\t" // y0_i, y0_r "xxswapd 8, 4 \n\t" // y0_i, y0_r
"xxswapd 61,57 \n\t" // y1_i, y1_r "xxswapd 9, 5 \n\t" // y1_i, y1_r
"xxswapd 62,58 \n\t" // y2_i, y2_r "xxswapd 10, 6 \n\t" // y2_i, y2_r
"xxswapd 63,59 \n\t" // y3_i, y3_r "xxswapd 11, 7 \n\t" // y3_i, y3_r
"addi %2, %2, 64 \n\t" "addi %2, %2, 64 \n\t"
"addi %3, %3, 64 \n\t" "addi %3, %3, 64 \n\t"
"addic. %0 , %0 , -8 \n\t" "addic. %1, %1, -8 \n\t"
"ble 2f \n\t" "ble 2f \n\t"
".align 5 \n\t" ".p2align 5 \n"
"1: \n\t" "1: \n\t"
"dcbt %2, %8 \n\t"
"dcbt %3, %8 \n\t"
"xvmaddadp 32, 40, 48 \n\t" // x0_r * y0_r , x0_i * y0_i "xvmaddadp 32, 40, 48 \n\t" // x0_r * y0_r , x0_i * y0_i
"lxvd2x 48, 0, %3 \n\t" // y0_r, y0_i "lxvd2x 48, 0, %3 \n\t" // y0_r, y0_i
"xvmaddadp 34, 41, 49 \n\t" // x1_r * y1_r , x1_i * y1_i "xvmaddadp 34, 41, 49 \n\t" // x1_r * y1_r , x1_i * y1_i
"lxvd2x 49, %5, %3 \n\t" // y1_r, y1_i "lxvd2x 49, %7, %3 \n\t" // y1_r, y1_i
"xvmaddadp 36, 42, 50 \n\t" // x2_r * y2_r , x2_i * y2_i "xvmaddadp 36, 42, 50 \n\t" // x2_r * y2_r , x2_i * y2_i
"lxvd2x 50, %6, %3 \n\t" // y2_r, y2_i "lxvd2x 50, %8, %3 \n\t" // y2_r, y2_i
"xvmaddadp 38, 43, 51 \n\t" // x3_r * y3_r , x3_i * y3_i "xvmaddadp 38, 43, 51 \n\t" // x3_r * y3_r , x3_i * y3_i
"lxvd2x 51, %7, %3 \n\t" // y3_r, y3_i "lxvd2x 51, %9, %3 \n\t" // y3_r, y3_i
"xvmaddadp 33, 40, 52 \n\t" // x0_r * y0_i , x0_i * y0_r "xvmaddadp 33, 40, 0 \n\t" // x0_r * y0_i , x0_i * y0_r
"lxvd2x 40, 0, %2 \n\t" // x0_r, x0_i "lxvd2x 40, 0, %2 \n\t" // x0_r, x0_i
"xvmaddadp 35, 41, 53 \n\t" // x1_r * y1_i , x1_i * y1_r "xvmaddadp 35, 41, 1 \n\t" // x1_r * y1_i , x1_i * y1_r
"lxvd2x 41, %5, %2 \n\t" // x1_r, x1_i "lxvd2x 41, %7, %2 \n\t" // x1_r, x1_i
"xvmaddadp 37, 42, 54 \n\t" // x2_r * y2_i , x2_i * y2_r "xvmaddadp 37, 42, 2 \n\t" // x2_r * y2_i , x2_i * y2_r
"lxvd2x 42, %6, %2 \n\t" // x2_r, x2_i "lxvd2x 42, %8, %2 \n\t" // x2_r, x2_i
"xvmaddadp 39, 43, 55 \n\t" // x3_r * y3_i , x3_i * y3_r "xvmaddadp 39, 43, 3 \n\t" // x3_r * y3_i , x3_i * y3_r
"lxvd2x 43, %7, %2 \n\t" // x3_r, x3_i "lxvd2x 43, %9, %2 \n\t" // x3_r, x3_i
"xxswapd 52,48 \n\t" // y0_i, y0_r "xxswapd 0,48 \n\t" // y0_i, y0_r
"xxswapd 53,49 \n\t" // y1_i, y1_r "xxswapd 1,49 \n\t" // y1_i, y1_r
"addi %2, %2, 64 \n\t" "addi %2, %2, 64 \n\t"
"addi %3, %3, 64 \n\t" "addi %3, %3, 64 \n\t"
"xxswapd 54,50 \n\t" // y2_i, y2_r "xxswapd 2,50 \n\t" // y2_i, y2_r
"xxswapd 55,51 \n\t" // y3_i, y3_r "xxswapd 3,51 \n\t" // y3_i, y3_r
"xvmaddadp 32, 44, 56 \n\t" // x0_r * y0_r , x0_i * y0_i "xvmaddadp 32, 44, 4 \n\t" // x0_r * y0_r , x0_i * y0_i
"lxvd2x 56, 0, %3 \n\t" // y0_r, y0_i "lxvd2x 4, 0, %3 \n\t" // y0_r, y0_i
"xvmaddadp 34, 45, 57 \n\t" // x1_r * y1_r , x1_i * y1_i "xvmaddadp 34, 45, 5 \n\t" // x1_r * y1_r , x1_i * y1_i
"lxvd2x 57, %5, %3 \n\t" // y1_r, y1_i "lxvd2x 5, %7, %3 \n\t" // y1_r, y1_i
"xvmaddadp 36, 46, 58 \n\t" // x2_r * y2_r , x2_i * y2_i "xvmaddadp 36, 46, 6 \n\t" // x2_r * y2_r , x2_i * y2_i
"lxvd2x 58, %6, %3 \n\t" // y2_r, y2_i "lxvd2x 6, %8, %3 \n\t" // y2_r, y2_i
"xvmaddadp 38, 47, 59 \n\t" // x3_r * y3_r , x3_i * y3_i "xvmaddadp 38, 47, 7 \n\t" // x3_r * y3_r , x3_i * y3_i
"lxvd2x 59, %7, %3 \n\t" // y3_r, y3_i "lxvd2x 7, %9, %3 \n\t" // y3_r, y3_i
"xvmaddadp 33, 44, 60 \n\t" // x0_r * y0_i , x0_i * y0_r "xvmaddadp 33, 44, 8 \n\t" // x0_r * y0_i , x0_i * y0_r
"lxvd2x 44, 0, %2 \n\t" // x0_r, x0_i "lxvd2x 44, 0, %2 \n\t" // x0_r, x0_i
"xvmaddadp 35, 45, 61 \n\t" // x1_r * y1_i , x1_i * y1_r "xvmaddadp 35, 45, 9 \n\t" // x1_r * y1_i , x1_i * y1_r
"lxvd2x 45, %5, %2 \n\t" // x1_r, x1_i "lxvd2x 45, %7, %2 \n\t" // x1_r, x1_i
"xvmaddadp 37, 46, 62 \n\t" // x2_r * y2_i , x2_i * y2_r "xvmaddadp 37, 46, 10 \n\t" // x2_r * y2_i , x2_i * y2_r
"lxvd2x 46, %6, %2 \n\t" // x2_r, x2_i "lxvd2x 46, %8, %2 \n\t" // x2_r, x2_i
"xvmaddadp 39, 47, 63 \n\t" // x3_r * y3_i , x3_i * y3_r "xvmaddadp 39, 47, 11 \n\t" // x3_r * y3_i , x3_i * y3_r
"lxvd2x 47, %7, %2 \n\t" // x3_r, x3_i "lxvd2x 47, %9, %2 \n\t" // x3_r, x3_i
"xxswapd 60,56 \n\t" // y0_i, y0_r "xxswapd 8,4 \n\t" // y0_i, y0_r
"xxswapd 61,57 \n\t" // y1_i, y1_r "xxswapd 9,5 \n\t" // y1_i, y1_r
"addi %2, %2, 64 \n\t" "addi %2, %2, 64 \n\t"
"addi %3, %3, 64 \n\t" "addi %3, %3, 64 \n\t"
"xxswapd 62,58 \n\t" // y2_i, y2_r "xxswapd 10,6 \n\t" // y2_i, y2_r
"xxswapd 63,59 \n\t" // y3_i, y3_r "xxswapd 11,7 \n\t" // y3_i, y3_r
"addic. %0 , %0 , -8 \n\t" "addic. %1, %1, -8 \n\t"
"bgt 1b \n\t" "bgt 1b \n"
"2: \n\t" "2: \n\t"
@ -172,21 +157,20 @@ static void zdot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot)
"xvmaddadp 36, 42, 50 \n\t" // x2_r * y2_r , x2_i * y2_i "xvmaddadp 36, 42, 50 \n\t" // x2_r * y2_r , x2_i * y2_i
"xvmaddadp 38, 43, 51 \n\t" // x3_r * y3_r , x3_i * y3_i "xvmaddadp 38, 43, 51 \n\t" // x3_r * y3_r , x3_i * y3_i
"xvmaddadp 33, 40, 52 \n\t" // x0_r * y0_i , x0_i * y0_r "xvmaddadp 33, 40, 0 \n\t" // x0_r * y0_i , x0_i * y0_r
"xvmaddadp 35, 41, 53 \n\t" // x1_r * y1_i , x1_i * y1_r "xvmaddadp 35, 41, 1 \n\t" // x1_r * y1_i , x1_i * y1_r
"xvmaddadp 37, 42, 54 \n\t" // x2_r * y2_i , x2_i * y2_r "xvmaddadp 37, 42, 2 \n\t" // x2_r * y2_i , x2_i * y2_r
"xvmaddadp 39, 43, 55 \n\t" // x3_r * y3_i , x3_i * y3_r "xvmaddadp 39, 43, 3 \n\t" // x3_r * y3_i , x3_i * y3_r
"xvmaddadp 32, 44, 56 \n\t" // x0_r * y0_r , x0_i * y0_i "xvmaddadp 32, 44, 4 \n\t" // x0_r * y0_r , x0_i * y0_i
"xvmaddadp 34, 45, 57 \n\t" // x1_r * y1_r , x1_i * y1_i "xvmaddadp 34, 45, 5 \n\t" // x1_r * y1_r , x1_i * y1_i
"xvmaddadp 36, 46, 58 \n\t" // x2_r * y2_r , x2_i * y2_i "xvmaddadp 36, 46, 6 \n\t" // x2_r * y2_r , x2_i * y2_i
"xvmaddadp 38, 47, 59 \n\t" // x3_r * y3_r , x3_i * y3_i "xvmaddadp 38, 47, 7 \n\t" // x3_r * y3_r , x3_i * y3_i
"xvmaddadp 33, 44, 60 \n\t" // x0_r * y0_i , x0_i * y0_r
"xvmaddadp 35, 45, 61 \n\t" // x1_r * y1_i , x1_i * y1_r
"xvmaddadp 37, 46, 62 \n\t" // x2_r * y2_i , x2_i * y2_r
"xvmaddadp 39, 47, 63 \n\t" // x3_r * y3_i , x3_i * y3_r
"xvmaddadp 33, 44, 8 \n\t" // x0_r * y0_i , x0_i * y0_r
"xvmaddadp 35, 45, 9 \n\t" // x1_r * y1_i , x1_i * y1_r
"xvmaddadp 37, 46, 10 \n\t" // x2_r * y2_i , x2_i * y2_r
"xvmaddadp 39, 47, 11 \n\t" // x3_r * y3_i , x3_i * y3_r
"xvadddp 32, 32, 34 \n\t" "xvadddp 32, 32, 34 \n\t"
"xvadddp 36, 36, 38 \n\t" "xvadddp 36, 36, 38 \n\t"
@ -197,23 +181,27 @@ static void zdot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot)
"xvadddp 32, 32, 36 \n\t" "xvadddp 32, 32, 36 \n\t"
"xvadddp 33, 33, 37 \n\t" "xvadddp 33, 33, 37 \n\t"
"stxvd2x 32, 0, %4 \n\t" "stxvd2x 32, 0, %6 \n\t"
"stxvd2x 33, %5, %4 \n\t" "stxvd2x 33, %7, %6 \n"
"#n=%1 x=%4=%2 y=%5=%3 dot=%0=%6 o16=%7 o32=%8 o48=%9"
: :
"=m" (*dot),
"+r" (n), // 1
"+b" (x), // 2
"+b" (y) // 3
: :
"r" (i), // 0 "m" (*x),
"r" (n), // 1 "m" (*y),
"r" (x1), // 2 "b" (dot), // 6
"r" (y1), // 3 "b" (16), // 7
"r" (dot), // 4 "b" (32), // 8
"r" (o16), // 5 "b" (48) // 9
"r" (o32), // 6 :
"r" (o48), // 7 "cr0",
"r" (pre) // 8 "vs32","vs33","vs34","vs35","vs36","vs37","vs38","vs39",
: "cr0", "%0", "%2" , "%3", "memory" "vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47",
"vs48","vs49","vs50","vs51","vs0","vs1","vs2","vs3",
"vs4","vs5","vs6","vs7","vs8","vs9","vs10","vs11"
); );
} }
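
Besides the constraint changes, the zdot rewrite above also moves its second set of scratch values out of vs52..vs63 — which alias the non-volatile Altivec registers v20..v31 and therefore belong to the caller — and into vs0..vs11, with every register it touches named in the clobber list. A tiny sketch of using a low, volatile VSX register as scratch and declaring it — dot2 is a hypothetical helper, not part of this commit, assuming a POWER8 target (-mcpu=power8) and GCC extended asm:

static double dot2 (const double *x, const double *y)
{
    double d;
    __asm__
    (
        "lxvd2x  32, 0, %1              \n\t"   // x[0], x[1]
        "lxvd2x  0,  0, %2              \n\t"   // y[0], y[1] in vs0 (volatile)
        "xvmuldp 0, 0, 32               \n\t"   // x[i] * y[i]
        "xxswapd 32, 0                  \n\t"
        "xsadddp %x0, 0, 32             \n"     // add the two products
      :
        "=d" (d)
      :
        "b" (x),
        "b" (y),
        "m" (*(const double (*)[2]) x),
        "m" (*(const double (*)[2]) y)
      :
        "vs0", "vs32"                   // every scratch register, named
    );
    return d;
}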


@ -47,15 +47,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#ifndef HAVE_KERNEL_8 #ifndef HAVE_KERNEL_8
static void zscal_kernel_8(BLASLONG n, FLOAT *x, FLOAT *alpha) static void zscal_kernel_8(BLASLONG n, FLOAT *x, FLOAT da_r, FLOAT da_i)
{ {
BLASLONG i=0; BLASLONG i=0;
FLOAT *x1=x; FLOAT *x1=x;
FLOAT alpha_r1=alpha[0]; FLOAT alpha_r1=da_r;
FLOAT alpha_r2=alpha[1]; FLOAT alpha_r2=da_r;
FLOAT alpha_i1=alpha[2]; FLOAT alpha_i1=-da_i;
FLOAT alpha_i2=alpha[3]; FLOAT alpha_i2=da_i;
FLOAT temp00, temp01, temp10, temp11, temp20, temp21, temp30, temp31; FLOAT temp00, temp01, temp10, temp11, temp20, temp21, temp30, temp31;
FLOAT x0_r, x0_i, x1_r, x1_i, x2_r, x2_i, x3_r, x3_i; FLOAT x0_r, x0_i, x1_r, x1_i, x2_r, x2_i, x3_r, x3_i;
@ -116,7 +116,6 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r,FLOAT da_i, F
BLASLONG inc_x2; BLASLONG inc_x2;
BLASLONG ip = 0; BLASLONG ip = 0;
FLOAT temp; FLOAT temp;
FLOAT alpha[4] __attribute__ ((aligned (16)));;
BLASLONG n1; BLASLONG n1;
if ( n <= 0 ) if ( n <= 0 )
@ -147,11 +146,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r,FLOAT da_i, F
n1 = n & -8; n1 = n & -8;
if ( n1 > 0 ) if ( n1 > 0 )
{ {
alpha[0] = da_r; zscal_kernel_8(n1, x, da_r, da_i);
alpha[1] = da_r;
alpha[2] = -da_i;
alpha[3] = da_i;
zscal_kernel_8(n1, x, alpha);
i=n1; i=n1;
ip = n1 * 2; ip = n1 * 2;


@ -38,121 +38,116 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define HAVE_KERNEL_8 1 #define HAVE_KERNEL_8 1
static void zscal_kernel_8( BLASLONG n, FLOAT *x, FLOAT *alpha) __attribute__ ((noinline)); static void zscal_kernel_8 (long n, double *x, double alpha_r, double alpha_i)
static void zscal_kernel_8( BLASLONG n, FLOAT *x, FLOAT *alpha)
{ {
__vector double t0;
__vector double t1;
__vector double t2;
__vector double t3;
__vector double t4;
__vector double t5;
__vector double t6;
__vector double t7;
__vector double t8;
__vector double t9;
__vector double t10;
__vector double t11;
__asm__
BLASLONG i = n;
BLASLONG o16 = 16;
BLASLONG o32 = 32;
BLASLONG o48 = 48;
BLASLONG o64 = 64;
BLASLONG o80 = 80;
BLASLONG o96 = 96;
BLASLONG o112 = 112;
FLOAT *x1=x;
FLOAT *x2=x+1;
BLASLONG pre = 384;
__asm__ __volatile__
( (
"dcbt 0, %2 \n\t"
"lxvd2x 32, 0, %3 \n\t" // alpha_r , alpha_r "xsnegdp 33, %x16 \n\t" // -alpha_i
"lxvd2x 33, %5, %3 \n\t" // -alpha_i , alpha_i "xxspltd 32, %x15, 0 \n\t" // alpha_r , alpha_r
"addi %1, %1, -8 \n\t" "xxmrghd 33, 33, %x16 \n\t" // -alpha_i , alpha_i
"dcbt %2, %4 \n\t"
"lxvd2x 40, 0, %2 \n\t" // x0_r, x0_i "lxvd2x 40, 0, %2 \n\t" // x0_r, x0_i
"lxvd2x 41, %5, %2 \n\t" "lxvd2x 41, %17, %2 \n\t"
"lxvd2x 42, %6, %2 \n\t" "lxvd2x 42, %18, %2 \n\t"
"lxvd2x 43, %7, %2 \n\t" "lxvd2x 43, %19, %2 \n\t"
"lxvd2x 44, %8, %2 \n\t" "lxvd2x 44, %20, %2 \n\t"
"lxvd2x 45, %9, %2 \n\t" "lxvd2x 45, %21, %2 \n\t"
"lxvd2x 46, %10, %2 \n\t" "lxvd2x 46, %22, %2 \n\t"
"lxvd2x 47, %11, %2 \n\t" "lxvd2x 47, %23, %2 \n\t"
"addi %2, %2, 128 \n\t" "addi %2, %2, 128 \n\t"
"addic. %0 , %0 , -8 \n\t" "addic. %1, %1, -8 \n\t"
"ble 2f \n\t" "ble 2f \n\t"
".align 5 \n\t" ".p2align 5 \n"
"1: \n\t" "1: \n\t"
"dcbt %2, %4 \n\t"
"xvmuldp 48, 40, 32 \n\t" // x0_r * alpha_r, x0_i * alpha_r "xvmuldp 48, 40, 32 \n\t" // x0_r * alpha_r, x0_i * alpha_r
"xvmuldp 49, 41, 32 \n\t" "xvmuldp 49, 41, 32 \n\t"
"xvmuldp 50, 42, 32 \n\t" "xvmuldp 50, 42, 32 \n\t"
"xvmuldp 51, 43, 32 \n\t" "xvmuldp 51, 43, 32 \n\t"
"xvmuldp 52, 44, 32 \n\t" "xvmuldp %x3, 44, 32 \n\t"
"xvmuldp 53, 45, 32 \n\t" "xvmuldp %x4, 45, 32 \n\t"
"xvmuldp 54, 46, 32 \n\t" "xvmuldp %x5, 46, 32 \n\t"
"xvmuldp 55, 47, 32 \n\t" "xvmuldp %x6, 47, 32 \n\t"
"xxswapd 56, 40 \n\t" "xxswapd %x7, 40 \n\t"
"xxswapd 57, 41 \n\t" "xxswapd %x8, 41 \n\t"
"xxswapd 58, 42 \n\t" "xxswapd %x9, 42 \n\t"
"xxswapd 59, 43 \n\t" "xxswapd %x10, 43 \n\t"
"xxswapd 60, 44 \n\t" "xxswapd %x11, 44 \n\t"
"xxswapd 61, 45 \n\t" "xxswapd %x12, 45 \n\t"
"xxswapd 62, 46 \n\t" "xxswapd %x13, 46 \n\t"
"xxswapd 63, 47 \n\t" "xxswapd %x14, 47 \n\t"
"xvmuldp 56, 56, 33 \n\t" // x0_i * -alpha_i, x0_r * alpha_i "xvmuldp %x7, %x7, 33 \n\t" // x0_i * -alpha_i, x0_r * alpha_i
"xvmuldp 57, 57, 33 \n\t" "xvmuldp %x8, %x8, 33 \n\t"
"lxvd2x 40, 0, %2 \n\t" // x0_r, x0_i "lxvd2x 40, 0, %2 \n\t" // x0_r, x0_i
"lxvd2x 41, %5, %2 \n\t" "lxvd2x 41, %17, %2 \n\t"
"xvmuldp 58, 58, 33 \n\t" "xvmuldp %x9, %x9, 33 \n\t"
"xvmuldp 59, 59, 33 \n\t" "xvmuldp %x10, %x10, 33 \n\t"
"lxvd2x 42, %6, %2 \n\t" "lxvd2x 42, %18, %2 \n\t"
"lxvd2x 43, %7, %2 \n\t" "lxvd2x 43, %19, %2 \n\t"
"xvmuldp 60, 60, 33 \n\t" "xvmuldp %x11, %x11, 33 \n\t"
"xvmuldp 61, 61, 33 \n\t" "xvmuldp %x12, %x12, 33 \n\t"
"lxvd2x 44, %8, %2 \n\t" "lxvd2x 44, %20, %2 \n\t"
"lxvd2x 45, %9, %2 \n\t" "lxvd2x 45, %21, %2 \n\t"
"xvmuldp 62, 62, 33 \n\t" "xvmuldp %x13, %x13, 33 \n\t"
"xvmuldp 63, 63, 33 \n\t" "xvmuldp %x14, %x14, 33 \n\t"
"lxvd2x 46, %10, %2 \n\t" "lxvd2x 46, %22, %2 \n\t"
"lxvd2x 47, %11, %2 \n\t" "lxvd2x 47, %23, %2 \n\t"
"xvadddp 48, 48 , 56 \n\t" "addi %2, %2, -128 \n\t"
"xvadddp 49, 49 , 57 \n\t"
"xvadddp 50, 50 , 58 \n\t"
"xvadddp 51, 51 , 59 \n\t"
"stxvd2x 48, 0, %1 \n\t" "xvadddp 48, 48, %x7 \n\t"
"stxvd2x 49, %5, %1 \n\t" "xvadddp 49, 49, %x8 \n\t"
"xvadddp 50, 50, %x9 \n\t"
"xvadddp 51, 51, %x10 \n\t"
"xvadddp 52, 52 , 60 \n\t" "stxvd2x 48, 0, %2 \n\t"
"xvadddp 53, 53 , 61 \n\t" "stxvd2x 49, %17, %2 \n\t"
"stxvd2x 50, %6, %1 \n\t" "xvadddp %x3, %x3, %x11 \n\t"
"stxvd2x 51, %7, %1 \n\t" "xvadddp %x4, %x4, %x12 \n\t"
"xvadddp 54, 54 , 62 \n\t" "stxvd2x 50, %18, %2 \n\t"
"xvadddp 55, 55 , 63 \n\t" "stxvd2x 51, %19, %2 \n\t"
"stxvd2x 52, %8, %1 \n\t" "xvadddp %x5, %x5, %x13 \n\t"
"stxvd2x 53, %9, %1 \n\t" "xvadddp %x6, %x6, %x14 \n\t"
"stxvd2x 54, %10, %1 \n\t"
"stxvd2x 55, %11, %1 \n\t"
"addi %1, %1, 128 \n\t" "stxvd2x %x3, %20, %2 \n\t"
"addi %2, %2, 128 \n\t" "stxvd2x %x4, %21, %2 \n\t"
"stxvd2x %x5, %22, %2 \n\t"
"stxvd2x %x6, %23, %2 \n\t"
"addic. %0 , %0 , -8 \n\t" "addi %2, %2, 256 \n\t"
"bgt 1b \n\t"
"addic. %1, %1, -8 \n\t"
"bgt 1b \n"
"2: \n\t" "2: \n\t"
@ -160,65 +155,85 @@ static void zscal_kernel_8( BLASLONG n, FLOAT *x, FLOAT *alpha)
"xvmuldp 49, 41, 32 \n\t" "xvmuldp 49, 41, 32 \n\t"
"xvmuldp 50, 42, 32 \n\t" "xvmuldp 50, 42, 32 \n\t"
"xvmuldp 51, 43, 32 \n\t" "xvmuldp 51, 43, 32 \n\t"
"xvmuldp 52, 44, 32 \n\t" "xvmuldp %x3, 44, 32 \n\t"
"xvmuldp 53, 45, 32 \n\t" "xvmuldp %x4, 45, 32 \n\t"
"xvmuldp 54, 46, 32 \n\t" "xvmuldp %x5, 46, 32 \n\t"
"xvmuldp 55, 47, 32 \n\t" "xvmuldp %x6, 47, 32 \n\t"
"xxswapd 56, 40 \n\t" "xxswapd %x7, 40 \n\t"
"xxswapd 57, 41 \n\t" "xxswapd %x8, 41 \n\t"
"xxswapd 58, 42 \n\t" "xxswapd %x9, 42 \n\t"
"xxswapd 59, 43 \n\t" "xxswapd %x10, 43 \n\t"
"xxswapd 60, 44 \n\t" "xxswapd %x11, 44 \n\t"
"xxswapd 61, 45 \n\t" "xxswapd %x12, 45 \n\t"
"xxswapd 62, 46 \n\t" "xxswapd %x13, 46 \n\t"
"xxswapd 63, 47 \n\t" "xxswapd %x14, 47 \n\t"
"xvmuldp 56, 56, 33 \n\t" // x0_i * -alpha_i, x0_r * alpha_i "addi %2, %2, -128 \n\t"
"xvmuldp 57, 57, 33 \n\t"
"xvmuldp 58, 58, 33 \n\t"
"xvmuldp 59, 59, 33 \n\t"
"xvmuldp 60, 60, 33 \n\t"
"xvmuldp 61, 61, 33 \n\t"
"xvmuldp 62, 62, 33 \n\t"
"xvmuldp 63, 63, 33 \n\t"
"xvadddp 48, 48 , 56 \n\t" "xvmuldp %x7, %x7, 33 \n\t" // x0_i * -alpha_i, x0_r * alpha_i
"xvadddp 49, 49 , 57 \n\t" "xvmuldp %x8, %x8, 33 \n\t"
"xvadddp 50, 50 , 58 \n\t" "xvmuldp %x9, %x9, 33 \n\t"
"xvadddp 51, 51 , 59 \n\t" "xvmuldp %x10, %x10, 33 \n\t"
"xvadddp 52, 52 , 60 \n\t" "xvmuldp %x11, %x11, 33 \n\t"
"xvadddp 53, 53 , 61 \n\t" "xvmuldp %x12, %x12, 33 \n\t"
"xvadddp 54, 54 , 62 \n\t" "xvmuldp %x13, %x13, 33 \n\t"
"xvadddp 55, 55 , 63 \n\t" "xvmuldp %x14, %x14, 33 \n\t"
"stxvd2x 48, 0, %1 \n\t" "xvadddp 48, 48, %x7 \n\t"
"stxvd2x 49, %5, %1 \n\t" "xvadddp 49, 49, %x8 \n\t"
"stxvd2x 50, %6, %1 \n\t" "xvadddp 50, 50, %x9 \n\t"
"stxvd2x 51, %7, %1 \n\t" "xvadddp 51, 51, %x10 \n\t"
"stxvd2x 52, %8, %1 \n\t"
"stxvd2x 53, %9, %1 \n\t"
"stxvd2x 54, %10, %1 \n\t"
"stxvd2x 55, %11, %1 \n\t"
"stxvd2x 48, 0, %2 \n\t"
"stxvd2x 49, %17, %2 \n\t"
"xvadddp %x3, %x3, %x11 \n\t"
"xvadddp %x4, %x4, %x12 \n\t"
"stxvd2x 50, %18, %2 \n\t"
"stxvd2x 51, %19, %2 \n\t"
"xvadddp %x5, %x5, %x13 \n\t"
"xvadddp %x6, %x6, %x14 \n\t"
"stxvd2x %x3, %20, %2 \n\t"
"stxvd2x %x4, %21, %2 \n\t"
"stxvd2x %x5, %22, %2 \n\t"
"stxvd2x %x6, %23, %2 \n"
"#n=%1 x=%0=%2 alpha=(%15,%16) o16=%17 o32=%18 o48=%19 o64=%20 o80=%21 o96=%22 o112=%23\n"
"#t0=%x3 t1=%x4 t2=%x5 t3=%x6 t4=%x7 t5=%x8 t6=%x9 t7=%x10 t8=%x11 t9=%x12 t10=%x13 t11=%x14"
: :
"+m" (*x),
"+r" (n), // 1
"+b" (x), // 2
"=wa" (t0), // 3
"=wa" (t1), // 4
"=wa" (t2), // 5
"=wa" (t3), // 6
"=wa" (t4), // 7
"=wa" (t5), // 8
"=wa" (t6), // 9
"=wa" (t7), // 10
"=wa" (t8), // 11
"=wa" (t9), // 12
"=wa" (t10), // 13
"=wa" (t11) // 14
: :
"r" (i), // 0 "d" (alpha_r), // 15
"r" (x2), // 1 "d" (alpha_i), // 16
"r" (x1), // 2 "b" (16), // 17
"r" (alpha), // 3 "b" (32), // 18
"r" (pre), // 4 "b" (48), // 19
"r" (o16), // 5 "b" (64), // 20
"r" (o32), // 6 "b" (80), // 21
"r" (o48), // 7 "b" (96), // 22
"r" (o64), // 8 "b" (112) // 23
"r" (o80), // 9 :
"r" (o96), // 10 "cr0",
"r" (o112) // 11 "vs32","vs33","vs34","vs35","vs36","vs37","vs38","vs39",
: "cr0", "%0", "%2" , "%1", "memory" "vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47",
"vs48","vs49","vs50","vs51"
); );
} }
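
The new zscal_kernel_8 above goes one step further than zaxpy: it needs no memory-resident constant at all, building { alpha_r, alpha_r } with xxspltd and { -alpha_i, alpha_i } with xsnegdp plus xxmrghd straight from the incoming scalars, which is why the aligned alpha[4] staging array disappears from the caller. A sketch of just that construction — alpha_i_vec is a hypothetical helper, not part of this commit, assuming a POWER8 target (-mcpu=power8), GCC extended asm, and the doubleword order the kernel's own comments use:

static __vector double alpha_i_vec (double alpha_i)
{
    __vector double v;
    __asm__
    (
        "xsnegdp %x0, %x1               \n\t"   // -alpha_i in doubleword 0
        "xxmrghd %x0, %x0, %x1          \n"     // { -alpha_i, alpha_i }
      :
        "=&wa" (v)                      // early clobber: written before %x1 is done
      :
        "d" (alpha_i)
    );
    return v;
}

The early-clobber "&" matters here: the output is written by xsnegdp before xxmrghd reads the alpha_i input again, so the two must not share a register.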


@ -35,79 +35,56 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define HAVE_KERNEL_16 1 #define HAVE_KERNEL_16 1
static void zswap_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y) __attribute__ ((noinline)); static void
zswap_kernel_16 (long n, double *x, double *y)
static void zswap_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y)
{ {
__asm__
BLASLONG i = n;
BLASLONG o16 = 16;
BLASLONG o32 = 32;
BLASLONG o48 = 48;
BLASLONG o64 = 64;
BLASLONG o80 = 80;
BLASLONG o96 = 96;
BLASLONG o112 = 112;
FLOAT *x1=x;
FLOAT *y1=y;
FLOAT *x2=x+1;
FLOAT *y2=y+1;
BLASLONG pre = 384;
BLASLONG alpha=0;
__asm__ __volatile__
( (
".p2align 5 \n"
"addi %3, %3, -8 \n\t"
"addi %4, %4, -8 \n\t"
".align 5 \n\t"
"1: \n\t" "1: \n\t"
"lxvd2x 32, 0, %4 \n\t"
"lxvd2x 33, %5, %4 \n\t"
"lxvd2x 34, %6, %4 \n\t"
"lxvd2x 35, %7, %4 \n\t"
"lxvd2x 36, %8, %4 \n\t"
"lxvd2x 37, %9, %4 \n\t"
"lxvd2x 38, %10, %4 \n\t"
"lxvd2x 39, %11, %4 \n\t"
"lxvd2x 32, 0, %2 \n\t" "addi %4, %4, 128 \n\t"
"lxvd2x 33, %5, %2 \n\t"
"lxvd2x 34, %6, %2 \n\t"
"lxvd2x 35, %7, %2 \n\t"
"lxvd2x 36, %8, %2 \n\t"
"lxvd2x 37, %9, %2 \n\t"
"lxvd2x 38, %10, %2 \n\t"
"lxvd2x 39, %11, %2 \n\t"
"addi %2, %2, 128 \n\t" "lxvd2x 40, 0, %4 \n\t"
"lxvd2x 41, %5, %4 \n\t"
"lxvd2x 42, %6, %4 \n\t"
"lxvd2x 43, %7, %4 \n\t"
"lxvd2x 44, %8, %4 \n\t"
"lxvd2x 45, %9, %4 \n\t"
"lxvd2x 46, %10, %4 \n\t"
"lxvd2x 47, %11, %4 \n\t"
"lxvd2x 40, 0, %2 \n\t" "addi %4, %4, -128 \n\t"
"lxvd2x 41, %5, %2 \n\t"
"lxvd2x 42, %6, %2 \n\t"
"lxvd2x 43, %7, %2 \n\t"
"lxvd2x 44, %8, %2 \n\t"
"lxvd2x 45, %9, %2 \n\t"
"lxvd2x 46, %10, %2 \n\t"
"lxvd2x 47, %11, %2 \n\t"
"addi %2, %2, 128 \n\t" "lxvd2x 48, 0, %3 \n\t"
"lxvd2x 49, %5, %3 \n\t"
"lxvd2x 50, %6, %3 \n\t"
"lxvd2x 51, %7, %3 \n\t"
"lxvd2x 0, %8, %3 \n\t"
"lxvd2x 1, %9, %3 \n\t"
"lxvd2x 2, %10, %3 \n\t"
"lxvd2x 3, %11, %3 \n\t"
"lxvd2x 48, 0, %1 \n\t" "addi %3, %3, 128 \n\t"
"lxvd2x 49, %5, %1 \n\t"
"lxvd2x 50, %6, %1 \n\t"
"lxvd2x 51, %7, %1 \n\t"
"lxvd2x 52, %8, %1 \n\t"
"lxvd2x 53, %9, %1 \n\t"
"lxvd2x 54, %10, %1 \n\t"
"lxvd2x 55, %11, %1 \n\t"
"addi %1, %1, 128 \n\t" "lxvd2x 4, 0, %3 \n\t"
"lxvd2x 5, %5, %3 \n\t"
"lxvd2x 6, %6, %3 \n\t"
"lxvd2x 7, %7, %3 \n\t"
"lxvd2x 8, %8, %3 \n\t"
"lxvd2x 9, %9, %3 \n\t"
"lxvd2x 10, %10, %3 \n\t"
"lxvd2x 11, %11, %3 \n\t"
"lxvd2x 56, 0, %1 \n\t" "addi %3, %3, -128 \n\t"
"lxvd2x 57, %5, %1 \n\t"
"lxvd2x 58, %6, %1 \n\t"
"lxvd2x 59, %7, %1 \n\t"
"lxvd2x 60, %8, %1 \n\t"
"lxvd2x 61, %9, %1 \n\t"
"lxvd2x 62, %10, %1 \n\t"
"lxvd2x 63, %11, %1 \n\t"
"addi %1, %1, 128 \n\t"
"stxvd2x 32, 0, %3 \n\t" "stxvd2x 32, 0, %3 \n\t"
"stxvd2x 33, %5, %3 \n\t" "stxvd2x 33, %5, %3 \n\t"
@ -135,46 +112,46 @@ static void zswap_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y)
"stxvd2x 49, %5, %4 \n\t" "stxvd2x 49, %5, %4 \n\t"
"stxvd2x 50, %6, %4 \n\t" "stxvd2x 50, %6, %4 \n\t"
"stxvd2x 51, %7, %4 \n\t" "stxvd2x 51, %7, %4 \n\t"
"stxvd2x 52, %8, %4 \n\t" "stxvd2x 0, %8, %4 \n\t"
"stxvd2x 53, %9, %4 \n\t" "stxvd2x 1, %9, %4 \n\t"
"stxvd2x 54, %10, %4 \n\t" "stxvd2x 2, %10, %4 \n\t"
"stxvd2x 55, %11, %4 \n\t" "stxvd2x 3, %11, %4 \n\t"
"addi %4, %4, 128 \n\t" "addi %4, %4, 128 \n\t"
"stxvd2x 56, 0, %4 \n\t" "stxvd2x 4, 0, %4 \n\t"
"stxvd2x 57, %5, %4 \n\t" "stxvd2x 5, %5, %4 \n\t"
"stxvd2x 58, %6, %4 \n\t" "stxvd2x 6, %6, %4 \n\t"
"stxvd2x 59, %7, %4 \n\t" "stxvd2x 7, %7, %4 \n\t"
"stxvd2x 60, %8, %4 \n\t" "stxvd2x 8, %8, %4 \n\t"
"stxvd2x 61, %9, %4 \n\t" "stxvd2x 9, %9, %4 \n\t"
"stxvd2x 62, %10, %4 \n\t" "stxvd2x 10, %10, %4 \n\t"
"stxvd2x 63, %11, %4 \n\t" "stxvd2x 11, %11, %4 \n\t"
"addi %4, %4, 128 \n\t" "addi %4, %4, 128 \n\t"
"addic. %2, %2, -16 \n\t"
"bgt 1b \n"
"addic. %0 , %0 , -16 \n\t" "#n=%2 x=%0=%3 y=%1=%4 o16=%5 o32=%6 o48=%7 o64=%8 o80=%9 o96=%10 o112=%11"
"bgt 1b \n\t"
"2: \n\t"
: :
"+m" (*x),
"+m" (*y),
"+r" (n), // 2
"+b" (x), // 3
"+b" (y) // 4
: :
"r" (i), // 0 "b" (16), // 5
"r" (y1), // 1 "b" (32), // 6
"r" (x1), // 2 "b" (48), // 7
"r" (y2), // 3 "b" (64), // 8
"r" (x2), // 4 "b" (80), // 9
"r" (o16), // 5 "b" (96), // 10
"r" (o32), // 6 "b" (112) // 11
"r" (o48), // 7 :
"r" (o64), // 8 "cr0",
"r" (o80), // 9 "vs32","vs33","vs34","vs35","vs36","vs37","vs38","vs39",
"r" (o96), // 10 "vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47",
"r" (o112) // 11 "vs48","vs49","vs50","vs51","vs0","vs1","vs2","vs3",
: "cr0", "%0", "%2" , "%1", "%3", "%4", "memory" "vs4","vs5","vs6","vs7","vs8","vs9","vs10","vs11"
); );
} }