Merge pull request #422 from wernsaar/develop

optimization of sandybridge cgemm-kernel
This commit is contained in:
Zhang Xianyi 2014-07-30 17:09:58 +08:00
commit 4f83217df6
2 changed files with 127 additions and 55 deletions

View File

@ -25,6 +25,32 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
**********************************************************************************/
/*********************************************************************
* 2014/07/29 Saar
* BLASTEST : OK
* CTEST : OK
* TEST : OK
*
* 2013/10/28 Saar
* Parameter:
* CGEMM_DEFAULT_UNROLL_N 2
* CGEMM_DEFAULT_UNROLL_M 8
* CGEMM_DEFAULT_P 768
* CGEMM_DEFAULT_Q 512
* A_PR1 512
* B_PR1 512
*
* 2014/07/29 Saar
* Performance at 6192x6192x6192:
* 1 thread: 49 GFLOPS (MKL: 52)
* 2 threads: 99 GFLOPS (MKL: 102)
* 3 threads: 148 GFLOPS (MKL: 150)
* 4 threads: 195 GFLOPS (MKL: 194)
* 8 threads: 354 GFLOPS (MKL: 317)
*
*
*********************************************************************/
#define ASSEMBLER
#include "common.h"
@ -192,22 +218,108 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
/***************************************************************************************************************************/
.macro KERNEL8x2_SUB .macro KERNEL8x2_1
vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0
vbroadcastss -8 * SIZE(BO, BI, SIZE), %ymm4
VFMADDPS_YR( %ymm8,%ymm4,%ymm0 )
vmovups -8 * SIZE(AO, %rax, SIZE), %ymm1
VFMADDPS_YR( %ymm12,%ymm4,%ymm1 )
vbroadcastss -7 * SIZE(BO, BI, SIZE), %ymm5
VFMADDPS_YI( %ymm9,%ymm5,%ymm0 ) prefetcht0 A_PR1(AO, %rax, SIZE)
VFMADDPS_YI( %ymm13,%ymm5,%ymm1 )
VFMADDPS_YR( %ymm8,%ymm4,%ymm0 )
vbroadcastss -6 * SIZE(BO, BI, SIZE), %ymm6
VFMADDPS_YR( %ymm10,%ymm6,%ymm0 ) VFMADDPS_YI( %ymm9,%ymm5,%ymm0 )
VFMADDPS_YR( %ymm14,%ymm6,%ymm1 )
vbroadcastss -5 * SIZE(BO, BI, SIZE), %ymm7
VFMADDPS_YR( %ymm12,%ymm4,%ymm1 )
VFMADDPS_YI( %ymm13,%ymm5,%ymm1 )
VFMADDPS_YR( %ymm10,%ymm6,%ymm0 )
vbroadcastss -4 * SIZE(BO, BI, SIZE), %ymm4
VFMADDPS_YI( %ymm11,%ymm7,%ymm0 )
vbroadcastss -3 * SIZE(BO, BI, SIZE), %ymm5
VFMADDPS_YR( %ymm14,%ymm6,%ymm1 )
vmovups 0 * SIZE(AO, %rax, SIZE), %ymm0
VFMADDPS_YI( %ymm15,%ymm7,%ymm1 )
vmovups 8 * SIZE(AO, %rax, SIZE), %ymm1
prefetcht0 A_PR1+64(AO, %rax, SIZE)
VFMADDPS_YR( %ymm8,%ymm4,%ymm0 )
vbroadcastss -2 * SIZE(BO, BI, SIZE), %ymm6
VFMADDPS_YI( %ymm9,%ymm5,%ymm0 )
vbroadcastss -1 * SIZE(BO, BI, SIZE), %ymm7
VFMADDPS_YR( %ymm12,%ymm4,%ymm1 )
VFMADDPS_YI( %ymm13,%ymm5,%ymm1 )
VFMADDPS_YR( %ymm10,%ymm6,%ymm0 )
vbroadcastss 0 * SIZE(BO, BI, SIZE), %ymm4
VFMADDPS_YI( %ymm11,%ymm7,%ymm0 )
vbroadcastss 1 * SIZE(BO, BI, SIZE), %ymm5
VFMADDPS_YR( %ymm14,%ymm6,%ymm1 )
vmovups 16 * SIZE(AO, %rax, SIZE), %ymm0
VFMADDPS_YI( %ymm15,%ymm7,%ymm1 )
vmovups 24 * SIZE(AO, %rax, SIZE), %ymm1
prefetcht0 A_PR1+128(AO, %rax, SIZE)
VFMADDPS_YR( %ymm8,%ymm4,%ymm0 )
vbroadcastss 2 * SIZE(BO, BI, SIZE), %ymm6
VFMADDPS_YI( %ymm9,%ymm5,%ymm0 )
vbroadcastss 3 * SIZE(BO, BI, SIZE), %ymm7
VFMADDPS_YR( %ymm12,%ymm4,%ymm1 )
VFMADDPS_YI( %ymm13,%ymm5,%ymm1 )
VFMADDPS_YR( %ymm10,%ymm6,%ymm0 )
vbroadcastss 4 * SIZE(BO, BI, SIZE), %ymm4
VFMADDPS_YI( %ymm11,%ymm7,%ymm0 )
vbroadcastss 5 * SIZE(BO, BI, SIZE), %ymm5
VFMADDPS_YR( %ymm14,%ymm6,%ymm1 )
vmovups 32 * SIZE(AO, %rax, SIZE), %ymm0
VFMADDPS_YI( %ymm15,%ymm7,%ymm1 )
vmovups 40 * SIZE(AO, %rax, SIZE), %ymm1
prefetcht0 A_PR1+192(AO, %rax, SIZE)
VFMADDPS_YR( %ymm8,%ymm4,%ymm0 )
vbroadcastss 6 * SIZE(BO, BI, SIZE), %ymm6
VFMADDPS_YI( %ymm9,%ymm5,%ymm0 )
vbroadcastss 7 * SIZE(BO, BI, SIZE), %ymm7
VFMADDPS_YR( %ymm12,%ymm4,%ymm1 )
VFMADDPS_YI( %ymm13,%ymm5,%ymm1 )
VFMADDPS_YR( %ymm10,%ymm6,%ymm0 )
VFMADDPS_YI( %ymm11,%ymm7,%ymm0 )
addq $ 16, BI
VFMADDPS_YR( %ymm14,%ymm6,%ymm1 )
VFMADDPS_YI( %ymm15,%ymm7,%ymm1 )
addq $ 64, %rax
.endm
.macro KERNEL8x2_SUB
vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0
vmovups -8 * SIZE(AO, %rax, SIZE), %ymm1
vbroadcastss -8 * SIZE(BO, BI, SIZE), %ymm4
vbroadcastss -7 * SIZE(BO, BI, SIZE), %ymm5
VFMADDPS_YR( %ymm8,%ymm4,%ymm0 )
vbroadcastss -6 * SIZE(BO, BI, SIZE), %ymm6
VFMADDPS_YI( %ymm9,%ymm5,%ymm0 )
vbroadcastss -5 * SIZE(BO, BI, SIZE), %ymm7
VFMADDPS_YR( %ymm12,%ymm4,%ymm1 )
VFMADDPS_YI( %ymm13,%ymm5,%ymm1 )
VFMADDPS_YR( %ymm10,%ymm6,%ymm0 )
VFMADDPS_YI( %ymm11,%ymm7,%ymm0 )
VFMADDPS_YR( %ymm14,%ymm6,%ymm1 )
VFMADDPS_YI( %ymm15,%ymm7,%ymm1 )
addq $ 4 , BI
addq $ 16, %rax
.endm
@ -984,47 +1096,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.L2_8_12:
prefetcht0 A_PR1(AO,%rax,SIZE)
prefetcht0 B_PR1(BO,BI,SIZE)
KERNEL8x2_SUB KERNEL8x2_1
prefetcht0 A_PR1(AO,%rax,SIZE)
KERNEL8x2_SUB
prefetcht0 A_PR1(AO,%rax,SIZE)
KERNEL8x2_SUB
prefetcht0 A_PR1(AO,%rax,SIZE)
KERNEL8x2_SUB
prefetcht0 A_PR1(AO,%rax,SIZE)
prefetcht0 B_PR1(BO,BI,SIZE)
KERNEL8x2_SUB KERNEL8x2_1
prefetcht0 A_PR1(AO,%rax,SIZE)
KERNEL8x2_SUB
prefetcht0 A_PR1(AO,%rax,SIZE)
KERNEL8x2_SUB
prefetcht0 A_PR1(AO,%rax,SIZE)
KERNEL8x2_SUB
je .L2_8_16
prefetcht0 A_PR1(AO,%rax,SIZE)
prefetcht0 B_PR1(BO,BI,SIZE)
KERNEL8x2_SUB KERNEL8x2_1
prefetcht0 A_PR1(AO,%rax,SIZE)
KERNEL8x2_SUB
prefetcht0 A_PR1(AO,%rax,SIZE)
KERNEL8x2_SUB
prefetcht0 A_PR1(AO,%rax,SIZE)
KERNEL8x2_SUB
prefetcht0 A_PR1(AO,%rax,SIZE)
prefetcht0 B_PR1(BO,BI,SIZE)
KERNEL8x2_SUB KERNEL8x2_1
prefetcht0 A_PR1(AO,%rax,SIZE)
KERNEL8x2_SUB
prefetcht0 A_PR1(AO,%rax,SIZE)
KERNEL8x2_SUB
prefetcht0 A_PR1(AO,%rax,SIZE)
KERNEL8x2_SUB
je .L2_8_16
@ -1152,7 +1236,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.L2_4_12:
prefetcht0 A_PR1(AO,%rax,SIZE)
prefetcht0 B_PR1(BO,BI,SIZE)
KERNEL4x2_SUB
KERNEL4x2_SUB
prefetcht0 A_PR1(AO,%rax,SIZE)
@ -1160,7 +1243,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
KERNEL4x2_SUB
prefetcht0 A_PR1(AO,%rax,SIZE)
prefetcht0 B_PR1(BO,BI,SIZE)
KERNEL4x2_SUB
KERNEL4x2_SUB
prefetcht0 A_PR1(AO,%rax,SIZE)
@ -1170,7 +1252,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
je .L2_4_16
prefetcht0 A_PR1(AO,%rax,SIZE)
prefetcht0 B_PR1(BO,BI,SIZE)
KERNEL4x2_SUB
KERNEL4x2_SUB
prefetcht0 A_PR1(AO,%rax,SIZE)
@ -1178,7 +1259,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
KERNEL4x2_SUB
prefetcht0 A_PR1(AO,%rax,SIZE)
prefetcht0 B_PR1(BO,BI,SIZE)
KERNEL4x2_SUB
KERNEL4x2_SUB
prefetcht0 A_PR1(AO,%rax,SIZE)
@ -1305,14 +1385,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.L2_4_22:
prefetcht0 A_PR1(AO,%rax,SIZE)
prefetcht0 B_PR1(BO,BI,SIZE)
KERNEL2x2_SUB
KERNEL2x2_SUB
KERNEL2x2_SUB
KERNEL2x2_SUB
prefetcht0 A_PR1(AO,%rax,SIZE)
prefetcht0 B_PR1(BO,BI,SIZE)
KERNEL2x2_SUB
KERNEL2x2_SUB
KERNEL2x2_SUB
@ -1321,14 +1399,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
je .L2_4_26
prefetcht0 A_PR1(AO,%rax,SIZE)
prefetcht0 B_PR1(BO,BI,SIZE)
KERNEL2x2_SUB
KERNEL2x2_SUB
KERNEL2x2_SUB
KERNEL2x2_SUB
prefetcht0 A_PR1(AO,%rax,SIZE)
prefetcht0 B_PR1(BO,BI,SIZE)
KERNEL2x2_SUB
KERNEL2x2_SUB
KERNEL2x2_SUB
@ -1507,13 +1583,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.L2_4_42:
prefetcht0 A_PR1(AO,%rax,SIZE)
prefetcht0 B_PR1(BO,BI,SIZE)
KERNEL1x2_SUB
KERNEL1x2_SUB
KERNEL1x2_SUB
KERNEL1x2_SUB
prefetcht0 B_PR1(BO,BI,SIZE)
KERNEL1x2_SUB
KERNEL1x2_SUB
KERNEL1x2_SUB
@ -1522,13 +1596,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
je .L2_4_46
prefetcht0 A_PR1(AO,%rax,SIZE)
prefetcht0 B_PR1(BO,BI,SIZE)
KERNEL1x2_SUB
KERNEL1x2_SUB
KERNEL1x2_SUB
KERNEL1x2_SUB
prefetcht0 B_PR1(BO,BI,SIZE)
KERNEL1x2_SUB
KERNEL1x2_SUB
KERNEL1x2_SUB

View File

@ -1134,9 +1134,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define QGEMM_DEFAULT_P 504
#define QGEMM_DEFAULT_R qgemm_r
#define CGEMM_DEFAULT_P 384 #define CGEMM_DEFAULT_P 768
//#define CGEMM_DEFAULT_R cgemm_r #define CGEMM_DEFAULT_R cgemm_r
#define CGEMM_DEFAULT_R 1024 //#define CGEMM_DEFAULT_R 1024
#define ZGEMM_DEFAULT_P 512
#define ZGEMM_DEFAULT_R zgemm_r
@ -1148,7 +1148,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define SGEMM_DEFAULT_Q 384
#define DGEMM_DEFAULT_Q 256
#define QGEMM_DEFAULT_Q 128
#define CGEMM_DEFAULT_Q 192 #define CGEMM_DEFAULT_Q 512
#define ZGEMM_DEFAULT_Q 192
#define XGEMM_DEFAULT_Q 128