Merge pull request #1371 from martin-frbg/develop

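Add trivially optimized DSDOT for POWER8: reuse the SDOT kernel (sdot.c) on 32-element chunks and accumulate the partial sums in double precision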
This commit is contained in:
Martin Kroeker 2017-11-29 19:55:21 +01:00 committed by GitHub
commit db00a51e6b
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 47 additions and 9 deletions

View File

@ -122,6 +122,7 @@ ZCOPYKERNEL = zcopy.c
# #
SDOTKERNEL = sdot.c SDOTKERNEL = sdot.c
DDOTKERNEL = ddot.c DDOTKERNEL = ddot.c
DSDOTKERNEL = sdot.c
#CDOTKERNEL = ../arm/zdot.c #CDOTKERNEL = ../arm/zdot.c
ZDOTKERNEL = zdot.c ZDOTKERNEL = zdot.c
# #

View File

@ -1,5 +1,5 @@
/*************************************************************************** /***************************************************************************
Copyright (c) 2013-2016, The OpenBLAS Project Copyright (c) 2013-2017, The OpenBLAS Project
All rights reserved. All rights reserved.
Redistribution and use in source and binary forms, with or without Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are modification, are permitted provided that the following conditions are
@ -66,42 +66,76 @@ static FLOAT sdot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y)
#endif #endif
#if defined (DSDOT)
double CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
#else
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
#endif
{ {
BLASLONG i=0; BLASLONG i=0;
BLASLONG ix=0,iy=0; BLASLONG ix=0,iy=0;
double dot = 0.0 ;
FLOAT dot = 0.0 ; #if defined (DSDOT)
double mydot = 0.0;
FLOAT asmdot = 0.0;
#else
FLOAT mydot=0.0;
#endif
BLASLONG n1;
if ( n <= 0 ) return(dot); if ( n <= 0 ) return(dot);
if ( (inc_x == 1) && (inc_y == 1) ) if ( (inc_x == 1) && (inc_y == 1) )
{ {
BLASLONG n1 = n & -32; n1 = n & (BLASLONG)(-32);
if ( n1 ) if ( n1 )
dot = sdot_kernel_16(n1, x, y); #if defined(DSDOT)
{
FLOAT *x1=x;
FLOAT *y1=y;
BLASLONG n2 = 32;
while (i<n1) {
asmdot = sdot_kernel_16(n2, x1, y1);
mydot += (double)asmdot;
asmdot=0.;
x1+=32;
y1+=32;
i+=32;
}
}
#else
mydot = sdot_kernel_16(n1, x, y);
#endif
i = n1; i = n1;
while(i < n) while(i < n)
{ {
#if defined(DSDOT)
dot += (double)y[i] * (double)x[i] ;
#else
dot += y[i] * x[i] ; dot += y[i] * x[i] ;
#endif
i++ ; i++ ;
} }
dot+=mydot;
return(dot); return(dot);
} }
BLASLONG n1 = n & -2; n1 = n & (BLASLONG)(-2);
while(i < n1) while(i < n1)
{ {
#if defined (DSDOT)
dot += (double)y[iy] * (double)x[ix] + (double)y[iy+inc_y] * (double)x[ix+inc_x];
#else
dot += y[iy] * x[ix] + y[iy+inc_y] * x[ix+inc_x]; dot += y[iy] * x[ix] + y[iy+inc_y] * x[ix+inc_x];
#endif
ix += inc_x*2 ; ix += inc_x*2 ;
iy += inc_y*2 ; iy += inc_y*2 ;
i+=2 ; i+=2 ;
@ -110,8 +144,11 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
while(i < n) while(i < n)
{ {
#if defined (DSDOT)
dot += (double)y[iy] * (double)x[ix] ;
#else
dot += y[iy] * x[ix] ; dot += y[iy] * x[ix] ;
#endif
ix += inc_x ; ix += inc_x ;
iy += inc_y ; iy += inc_y ;
i++ ; i++ ;