Replace naive omatcopy_rt with 4x4 blocked implementation

as suggested by MigMuc in issue 2532
This commit is contained in:
Martin Kroeker
2021-02-22 21:35:42 +01:00
committed by GitHub
parent 86a5f98e4a
commit b1eed27a54

View File

@@ -1,5 +1,5 @@
/***************************************************************************
Copyright (c) 2013, The OpenBLAS Project
Copyright (c) 2021, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@@ -27,36 +27,208 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h"
/*****************************************************
* 2014/06/09 Saar
*
* Order rowMajor
* Trans
*
******************************************************/
/*
 * Out-of-place scaled transpose, row-major storage ("rt" kernel).
 *
 * For every 0 <= i < rows and 0 <= j < cols this stores
 *
 *     b[j * ldb + i] = alpha * a[i * lda + j]
 *
 * i.e. the rows x cols matrix in `a` (row stride lda) is written, scaled by
 * alpha, as a cols x rows matrix into `b` (row stride ldb).
 *
 * The work is tiled: rows of `a` are consumed in groups of 4, then 2, then 1,
 * and inside each group the columns are consumed in groups of 4, then 2,
 * then 1, so every element is visited exactly once.
 *
 * Returns 0 in all cases; nothing is written when rows <= 0 or cols <= 0.
 *
 * NOTE(review): the kernel reads `a` and writes `b` tile by tile, so it
 * presumably expects the two buffers not to overlap - confirm with callers.
 */
int CNAME(BLASLONG rows, BLASLONG cols, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG ldb)
{
    BLASLONG i, j;

    /* a_offset: first row of the current row group in `a`;
       a_offset1..4: cursors into the (up to) four source rows of the group. */
    FLOAT *a_offset, *a_offset1, *a_offset2, *a_offset3, *a_offset4;

    /* b_offset: first column of the current column group in `b`;
       b_offset1..4: cursors into four consecutive destination rows. */
    FLOAT *b_offset, *b_offset1, *b_offset2, *b_offset3, *b_offset4;

    if (rows <= 0) return 0;
    if (cols <= 0) return 0;

    a_offset = a;
    b_offset = b;

    /* ---- Row groups of four ------------------------------------------ */
    for (i = (rows >> 2); i > 0; i--) {
        a_offset1 = a_offset;
        a_offset2 = a_offset1 + lda;
        a_offset3 = a_offset2 + lda;
        a_offset4 = a_offset3 + lda;
        a_offset += 4 * lda;

        b_offset1 = b_offset;
        b_offset2 = b_offset1 + ldb;
        b_offset3 = b_offset2 + ldb;
        b_offset4 = b_offset3 + ldb;
        b_offset += 4;

        /* Full 4x4 tiles: source row k becomes destination column k. */
        for (j = (cols >> 2); j > 0; j--) {
            /* Column 1 of B <- row 1 of A */
            *(b_offset1 + 0) = *(a_offset1 + 0) * alpha;
            *(b_offset2 + 0) = *(a_offset1 + 1) * alpha;
            *(b_offset3 + 0) = *(a_offset1 + 2) * alpha;
            *(b_offset4 + 0) = *(a_offset1 + 3) * alpha;

            /* Column 2 of B <- row 2 of A */
            *(b_offset1 + 1) = *(a_offset2 + 0) * alpha;
            *(b_offset2 + 1) = *(a_offset2 + 1) * alpha;
            *(b_offset3 + 1) = *(a_offset2 + 2) * alpha;
            *(b_offset4 + 1) = *(a_offset2 + 3) * alpha;

            /* Column 3 of B <- row 3 of A */
            *(b_offset1 + 2) = *(a_offset3 + 0) * alpha;
            *(b_offset2 + 2) = *(a_offset3 + 1) * alpha;
            *(b_offset3 + 2) = *(a_offset3 + 2) * alpha;
            *(b_offset4 + 2) = *(a_offset3 + 3) * alpha;

            /* Column 4 of B <- row 4 of A */
            *(b_offset1 + 3) = *(a_offset4 + 0) * alpha;
            *(b_offset2 + 3) = *(a_offset4 + 1) * alpha;
            *(b_offset3 + 3) = *(a_offset4 + 2) * alpha;
            *(b_offset4 + 3) = *(a_offset4 + 3) * alpha;

            a_offset1 += 4;
            a_offset2 += 4;
            a_offset3 += 4;
            a_offset4 += 4;

            b_offset1 += ldb * 4;
            b_offset2 += ldb * 4;
            b_offset3 += ldb * 4;
            b_offset4 += ldb * 4;
        }

        /* Two leftover source columns -> a 2x4 tile of B. */
        if (cols & 2) {
            *(b_offset1 + 0) = *(a_offset1 + 0) * alpha;
            *(b_offset2 + 0) = *(a_offset1 + 1) * alpha;

            *(b_offset1 + 1) = *(a_offset2 + 0) * alpha;
            *(b_offset2 + 1) = *(a_offset2 + 1) * alpha;

            *(b_offset1 + 2) = *(a_offset3 + 0) * alpha;
            *(b_offset2 + 2) = *(a_offset3 + 1) * alpha;

            *(b_offset1 + 3) = *(a_offset4 + 0) * alpha;
            *(b_offset2 + 3) = *(a_offset4 + 1) * alpha;

            a_offset1 += 2;
            a_offset2 += 2;
            a_offset3 += 2;
            a_offset4 += 2;

            /* Only b_offset1 is used by the final single-column step. */
            b_offset1 += ldb * 2;
        }

        /* One leftover source column -> a 1x4 tile of B. */
        if (cols & 1) {
            *(b_offset1 + 0) = *(a_offset1 + 0) * alpha;
            *(b_offset1 + 1) = *(a_offset2 + 0) * alpha;
            *(b_offset1 + 2) = *(a_offset3 + 0) * alpha;
            *(b_offset1 + 3) = *(a_offset4 + 0) * alpha;
        }
    }

    /* ---- Remaining pair of rows -------------------------------------- */
    if (rows & 2) {
        a_offset1 = a_offset;
        a_offset2 = a_offset1 + lda;
        a_offset += 2 * lda;

        b_offset1 = b_offset;
        b_offset2 = b_offset1 + ldb;
        b_offset3 = b_offset2 + ldb;
        b_offset4 = b_offset3 + ldb;
        b_offset += 2;

        /* 2x4 source tiles -> 4x2 destination tiles. */
        for (j = (cols >> 2); j > 0; j--) {
            *(b_offset1 + 0) = *(a_offset1 + 0) * alpha;
            *(b_offset2 + 0) = *(a_offset1 + 1) * alpha;
            *(b_offset3 + 0) = *(a_offset1 + 2) * alpha;
            *(b_offset4 + 0) = *(a_offset1 + 3) * alpha;

            *(b_offset1 + 1) = *(a_offset2 + 0) * alpha;
            *(b_offset2 + 1) = *(a_offset2 + 1) * alpha;
            *(b_offset3 + 1) = *(a_offset2 + 2) * alpha;
            *(b_offset4 + 1) = *(a_offset2 + 3) * alpha;

            a_offset1 += 4;
            a_offset2 += 4;

            b_offset1 += ldb * 4;
            b_offset2 += ldb * 4;
            b_offset3 += ldb * 4;
            b_offset4 += ldb * 4;
        }

        if (cols & 2) {
            *(b_offset1 + 0) = *(a_offset1 + 0) * alpha;
            *(b_offset2 + 0) = *(a_offset1 + 1) * alpha;

            *(b_offset1 + 1) = *(a_offset2 + 0) * alpha;
            *(b_offset2 + 1) = *(a_offset2 + 1) * alpha;

            a_offset1 += 2;
            a_offset2 += 2;

            b_offset1 += ldb * 2;
        }

        if (cols & 1) {
            *(b_offset1 + 0) = *(a_offset1 + 0) * alpha;
            *(b_offset1 + 1) = *(a_offset2 + 0) * alpha;
        }
    }

    /* ---- Last single row --------------------------------------------- */
    if (rows & 1) {
        a_offset1 = a_offset;

        b_offset1 = b_offset;
        b_offset2 = b_offset1 + ldb;
        b_offset3 = b_offset2 + ldb;
        b_offset4 = b_offset3 + ldb;

        /* 1x4 source tiles -> 4x1 destination tiles. */
        for (j = (cols >> 2); j > 0; j--) {
            *(b_offset1 + 0) = *(a_offset1 + 0) * alpha;
            *(b_offset2 + 0) = *(a_offset1 + 1) * alpha;
            *(b_offset3 + 0) = *(a_offset1 + 2) * alpha;
            *(b_offset4 + 0) = *(a_offset1 + 3) * alpha;

            a_offset1 += 4;

            b_offset1 += ldb * 4;
            b_offset2 += ldb * 4;
            b_offset3 += ldb * 4;
            b_offset4 += ldb * 4;
        }

        if (cols & 2) {
            *(b_offset1 + 0) = *(a_offset1 + 0) * alpha;
            *(b_offset2 + 0) = *(a_offset1 + 1) * alpha;

            a_offset1 += 2;

            b_offset1 += ldb * 2;
        }

        if (cols & 1) {
            *(b_offset1 + 0) = *(a_offset1 + 0) * alpha;
        }
    }

    return 0;
}