scal benchmark: eliminate y, move init/timing out of loop
Removing y avoids cache effects: SCAL only operates on x, but when y is as large as the L1 cache, initializing y evicts the main array x from L1 before the timed call. Moving initialization and timing out of the loop makes the scal benchmark behave like the gemm benchmark, and gives higher accuracy for smaller test cases, since the per-iteration loop overhead is much smaller than the timing overhead.

Example: `OPENBLAS_LOOPS=10000 ./dscal.goto 1024 8192 1024` on AMD Zen2 (7532) with a 32k (4k doubles) L1 cache per core.

Before:

From : 1024 To : 8192 Step = 1024 Inc_x = 1 Inc_y = 1 Loops = 10000
SIZE Flops
1024 : 5627.08 MFlops 0.000000 sec
2048 : 5907.34 MFlops 0.000000 sec
3072 : 5553.30 MFlops 0.000001 sec
4096 : 5446.38 MFlops 0.000001 sec
5120 : 5504.61 MFlops 0.000001 sec
6144 : 5501.80 MFlops 0.000001 sec
7168 : 5547.43 MFlops 0.000001 sec
8192 : 5548.46 MFlops 0.000001 sec

After:

From : 1024 To : 8192 Step = 1024 Inc_x = 1 Inc_y = 1 Loops = 10000
SIZE Flops
1024 : 6310.28 MFlops 0.000000 sec
2048 : 6396.29 MFlops 0.000000 sec
3072 : 6439.14 MFlops 0.000000 sec
4096 : 6327.14 MFlops 0.000001 sec
5120 : 5628.24 MFlops 0.000001 sec
6144 : 5616.41 MFlops 0.000001 sec
7168 : 5553.13 MFlops 0.000001 sec
8192 : 5600.88 MFlops 0.000001 sec

The L1->L2 switchover point is now where it should be (just above 4096 doubles, the L1 capacity), and the MFlops figures for sizes that fit in L1 are more accurate.
This commit is contained in:
parent
e9a911fb9f
commit
bae45d94d1
|
@ -45,7 +45,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
|
||||
int main(int argc, char *argv[]){
|
||||
|
||||
FLOAT *x, *y;
|
||||
FLOAT *x;
|
||||
FLOAT alpha[2] = { 2.0, 2.0 };
|
||||
blasint m, i;
|
||||
blasint inc_x=1,inc_y=1;
|
||||
|
@ -74,10 +74,6 @@ int main(int argc, char *argv[]){
|
|||
fprintf(stderr,"Out of Memory!!\n");exit(1);
|
||||
}
|
||||
|
||||
if (( y = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_y) * COMPSIZE)) == NULL){
|
||||
fprintf(stderr,"Out of Memory!!\n");exit(1);
|
||||
}
|
||||
|
||||
#ifdef __linux
|
||||
srandom(getpid());
|
||||
#endif
|
||||
|
@ -91,30 +87,20 @@ int main(int argc, char *argv[]){
|
|||
|
||||
fprintf(stderr, " %6d : ", (int)m);
|
||||
|
||||
for(i = 0; i < m * COMPSIZE * abs(inc_x); i++){
|
||||
x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
|
||||
}
|
||||
|
||||
begin();
|
||||
for (l=0; l<loops; l++)
|
||||
{
|
||||
|
||||
for(i = 0; i < m * COMPSIZE * abs(inc_x); i++){
|
||||
x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
|
||||
}
|
||||
|
||||
for(i = 0; i < m * COMPSIZE * abs(inc_y); i++){
|
||||
y[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
|
||||
}
|
||||
begin();
|
||||
|
||||
SCAL (&m, alpha, x, &inc_x);
|
||||
}
|
||||
end();
|
||||
|
||||
end();
|
||||
time1 = getsec();
|
||||
|
||||
time1 = getsec();
|
||||
|
||||
timeg += time1;
|
||||
|
||||
}
|
||||
|
||||
timeg /= loops;
|
||||
timeg = time1 / loops;
|
||||
|
||||
#ifdef COMPLEX
|
||||
fprintf(stderr, " %10.2f MFlops %10.6f sec\n", 6. * (double)m / timeg * 1.e-6, timeg);
|
||||
|
|
Loading…
Reference in New Issue