small optimizations on dgemm_kernel for Piledriver

This commit is contained in:
wernsaar 2013-10-31 11:53:26 +01:00
parent e172b70ea2
commit 5118a7f4d1
3 changed files with 41 additions and 108 deletions

View File

@ -54,9 +54,10 @@ STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c DTRSMKERNEL_LT = dtrsm_kernel_LT_8x2_bulldozer.S
DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c DTRSMKERNEL_RN = dtrsm_kernel_RN_8x2_bulldozer.S
DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c

View File

@ -28,37 +28,38 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
/********************************************************************* /*********************************************************************
* *
* 2013/10/18 Saar * 2013/10/31 Saar
* BLASTEST : OK * BLASTEST : OK
* CTEST : OK * CTEST : OK
* TEST : OK * TEST : OK
* *
* *
* 2013/10/18 Saar * 2013/10/31 Saar
* *
* Parameter: * Parameter:
* UNROLL_M 8 * UNROLL_M 8
* UNROLL_N 2 * UNROLL_N 2
* DGEMM_P 384 * DGEMM_P 768
* DGEMM_Q 168 * DGEMM_Q 168
* DGEMM_R 12288 * DGEMM_R 12288
* A_PR1 256 * A_PR1 512
* B_PR1 256
* *
* Performance at m x n on AMD 8320 (ACML-Version: 5.3.1): * Performance at m x n on AMD 8320 (ACML-Version: 5.3.1):
* *
* 6144x6912 84.1 GFLOPS with 8 threads on 4 modules (ACML: 81.4 GFLOPS) * 4608x4608 83.9 GFLOPS with 8 threads on 4 modules (ACML: 78.4 GFLOPS)
* 6144x6912 81.2 GFLOPS with 4 threads on 4 modules (ACML: 81.3 GFLOPS) * 4608x4608 80.9 GFLOPS with 4 threads on 4 modules (ACML: 78.4 GFLOPS)
* 6144x6912 40.9 GFLOPS with 2 threads on 2 modules (ACML: 41.8 GFLOPS) * 4608x4608 41.3 GFLOPS with 2 threads on 2 modules (ACML: 40.9 GFLOPS)
* 6144x6912 20.5 GFLOPS with 1 threads on 1 modules (ACML: 21.0 GFLOPS) * 4608x4608 20.7 GFLOPS with 1 threads on 1 modules (ACML: 20.8 GFLOPS)
* *
* Performance at m x n on AMD 6380 (ACML-Version: 5.3.1): * Performance at m x n on AMD 6380 (ACML-Version: 5.3.1):
* *
* 12288x13824 244.5 GFLOPS with 32 threads on 16 modules (ACML: 120.3 GFLOPS) !strange thermal behavior * 13824x13824 234.5 GFLOPS with 32 threads on 16 modules (ACML: 88.5 GFLOPS) !strange thermal behavior
* 12288x13824 233.9 GFLOPS with 16 threads on 16 modules (ACML: 129.5 GFLOPS) !strange thermal behavior * 13824x13824 241.9 GFLOPS with 16 threads on 16 modules (ACML: 191.5 GFLOPS) !strange thermal behavior
* 12288x13824 138.1 GFLOPS with 8 threads on 8 modules (ACML: 106.5 GFLOPS) * 9216x9216 137.6 GFLOPS with 8 threads on 8 modules (ACML: 106.5 GFLOPS)
* 6144x6912 73.6 GFLOPS with 4 threads on 4 modules (ACML: 59.4 GFLOPS) * 4608x4608 75.7 GFLOPS with 4 threads on 4 modules (ACML: 56.3 GFLOPS)
* 6144x6912 36.8 GFLOPS with 2 threads on 2 modules (ACML: 34.9 GFLOPS) * 4608x4608 38.6 GFLOPS with 2 threads on 2 modules (ACML: 34.1 GFLOPS)
* 6144x6912 18.7 GFLOPS with 1 threads on 1 modules (ACML: 18.7 GFLOPS) * 4608x4608 19.6 GFLOPS with 1 threads on 1 modules (ACML: 18.3 GFLOPS)
* *
*********************************************************************/ *********************************************************************/
@ -168,9 +169,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define A_PR1 256 #define A_PR1 512
#define B_PR1 256 #define B_PR1 256
#define C_PR1 256 #define C_PR1 64
.macro INIT8x3 .macro INIT8x3
vxorpd %xmm4 , %xmm4 , %xmm4 vxorpd %xmm4 , %xmm4 , %xmm4
@ -454,9 +455,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.macro SAVE8x3 .macro SAVE8x3
vmovddup ALPHA, %xmm0 vmovddup ALPHA, %xmm0
prefetcht0 C_PR1(CO1)
prefetcht0 C_PR1(CO1,LDC)
prefetcht0 C_PR1(CO1,LDC,2)
vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 vfmaddpd (CO1),%xmm0, %xmm4,%xmm4
vfmaddpd (CO1, LDC),%xmm0, %xmm5,%xmm5 vfmaddpd (CO1, LDC),%xmm0, %xmm5,%xmm5
@ -489,6 +487,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
vmovups %xmm12, 4 * SIZE(CO1, LDC, 2) vmovups %xmm12, 4 * SIZE(CO1, LDC, 2)
vmovups %xmm15, 6 * SIZE(CO1, LDC, 2) vmovups %xmm15, 6 * SIZE(CO1, LDC, 2)
prefetcht0 C_PR1(CO1)
prefetcht0 C_PR1(CO1,LDC)
prefetcht0 C_PR1(CO1,LDC,2)
addq $8 * SIZE, CO1 # coffset += 8 addq $8 * SIZE, CO1 # coffset += 8
.endm .endm
@ -1284,6 +1286,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
cmpq $3, %rax cmpq $3, %rax
jl .L6_13 jl .L6_13
prefetcht0 B_PR1(BO)
prefetcht0 B_PR1+64(BO)
prefetcht0 B_PR1+128(BO)
KERNEL8x3_INIT KERNEL8x3_INIT
KERNEL8x3_M2 KERNEL8x3_M2
KERNEL8x3_M3 KERNEL8x3_M3
@ -1299,22 +1304,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.L6_12: .L6_12:
prefetcht0 B_PR1-24(BO)
prefetcht0 B_PR1+40(BO)
KERNEL8x3_M1 KERNEL8x3_M1
KERNEL8x3_M2 KERNEL8x3_M2
KERNEL8x3_M3 KERNEL8x3_M3
KERNEL8x3_M4 KERNEL8x3_M4
KERNEL8x3_M5 KERNEL8x3_M5
prefetcht0 B_PR1+104(BO)
KERNEL8x3_M6 KERNEL8x3_M6
KERNEL8x3_M7 KERNEL8x3_M7
KERNEL8x3_M8 KERNEL8x3_M8
dec %rax dec %rax
//je .L6_12_E
jne .L6_12 jne .L6_12
.L6_12_E: .L6_12_E:
prefetcht0 B_PR1(BO)
prefetcht0 B_PR1+64(BO)
KERNEL8x3_M1 KERNEL8x3_M1
KERNEL8x3_M2 KERNEL8x3_M2
KERNEL8x3_M3 KERNEL8x3_M3
@ -1432,31 +1440,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.L6_22: .L6_22:
//prefetcht0 B_PR1(BO,BI,8)
KERNEL4x3_1(xxx) KERNEL4x3_1(xxx)
KERNEL4x3_2(xxx) KERNEL4x3_2(xxx)
KERNEL4x3_3(xxx) KERNEL4x3_3(xxx)
//prefetcht0 B_PR1+64(BO,BI,8)
KERNEL4x3_4(xxx) KERNEL4x3_4(xxx)
KERNEL4x3_1(xxx) KERNEL4x3_1(xxx)
KERNEL4x3_2(xxx) KERNEL4x3_2(xxx)
//prefetcht0 B_PR1+32(BO,BI,8)
KERNEL4x3_3(xxx) KERNEL4x3_3(xxx)
KERNEL4x3_4(xxx) KERNEL4x3_4(xxx)
je .L6_26 je .L6_26
//prefetcht0 B_PR1(BO,BI,8)
KERNEL4x3_1(xxx) KERNEL4x3_1(xxx)
KERNEL4x3_2(xxx) KERNEL4x3_2(xxx)
KERNEL4x3_3(xxx) KERNEL4x3_3(xxx)
//prefetcht0 B_PR1+64(BO,BI,8)
KERNEL4x3_4(xxx) KERNEL4x3_4(xxx)
KERNEL4x3_1(xxx) KERNEL4x3_1(xxx)
KERNEL4x3_2(xxx) KERNEL4x3_2(xxx)
//prefetcht0 B_PR1+32(BO,BI,8)
KERNEL4x3_3(xxx) KERNEL4x3_3(xxx)
KERNEL4x3_4(xxx) KERNEL4x3_4(xxx)
@ -1548,31 +1550,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.L6_32: .L6_32:
//prefetcht0 B_PR1(BO,BI,8)
KERNEL2x3_1(xxx) KERNEL2x3_1(xxx)
KERNEL2x3_2(xxx) KERNEL2x3_2(xxx)
KERNEL2x3_3(xxx) KERNEL2x3_3(xxx)
//prefetcht0 B_PR1+64(BO,BI,8)
KERNEL2x3_4(xxx) KERNEL2x3_4(xxx)
KERNEL2x3_1(xxx) KERNEL2x3_1(xxx)
KERNEL2x3_2(xxx) KERNEL2x3_2(xxx)
//prefetcht0 B_PR1+32(BO,BI,8)
KERNEL2x3_3(xxx) KERNEL2x3_3(xxx)
KERNEL2x3_4(xxx) KERNEL2x3_4(xxx)
je .L6_36 je .L6_36
//prefetcht0 B_PR1(BO,BI,8)
KERNEL2x3_1(xxx) KERNEL2x3_1(xxx)
KERNEL2x3_2(xxx) KERNEL2x3_2(xxx)
KERNEL2x3_3(xxx) KERNEL2x3_3(xxx)
//prefetcht0 B_PR1+64(BO,BI,8)
KERNEL2x3_4(xxx) KERNEL2x3_4(xxx)
KERNEL2x3_1(xxx) KERNEL2x3_1(xxx)
KERNEL2x3_2(xxx) KERNEL2x3_2(xxx)
//prefetcht0 B_PR1+32(BO,BI,8)
KERNEL2x3_3(xxx) KERNEL2x3_3(xxx)
KERNEL2x3_4(xxx) KERNEL2x3_4(xxx)
@ -1651,31 +1647,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.L6_42: .L6_42:
//prefetcht0 B_PR1(BO,BI,8)
KERNEL1x3_1(xxx) KERNEL1x3_1(xxx)
KERNEL1x3_2(xxx) KERNEL1x3_2(xxx)
KERNEL1x3_3(xxx) KERNEL1x3_3(xxx)
//prefetcht0 B_PR1+64(BO,BI,8)
KERNEL1x3_4(xxx) KERNEL1x3_4(xxx)
KERNEL1x3_1(xxx) KERNEL1x3_1(xxx)
KERNEL1x3_2(xxx) KERNEL1x3_2(xxx)
//prefetcht0 B_PR1+32(BO,BI,8)
KERNEL1x3_3(xxx) KERNEL1x3_3(xxx)
KERNEL1x3_4(xxx) KERNEL1x3_4(xxx)
je .L6_46 je .L6_46
//prefetcht0 B_PR1(BO,BI,8)
KERNEL1x3_1(xxx) KERNEL1x3_1(xxx)
KERNEL1x3_2(xxx) KERNEL1x3_2(xxx)
KERNEL1x3_3(xxx) KERNEL1x3_3(xxx)
//prefetcht0 B_PR1+64(BO,BI,8)
KERNEL1x3_4(xxx) KERNEL1x3_4(xxx)
KERNEL1x3_1(xxx) KERNEL1x3_1(xxx)
KERNEL1x3_2(xxx) KERNEL1x3_2(xxx)
//prefetcht0 B_PR1+32(BO,BI,8)
KERNEL1x3_3(xxx) KERNEL1x3_3(xxx)
KERNEL1x3_4(xxx) KERNEL1x3_4(xxx)
@ -1753,6 +1743,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
cmpq $3, %rax cmpq $3, %rax
jl .L7_13 jl .L7_13
prefetcht0 B_PR1(BO)
prefetcht0 B_PR1+64(BO)
prefetcht0 B_PR1+128(BO)
KERNEL8x3_INIT KERNEL8x3_INIT
KERNEL8x3_M2 KERNEL8x3_M2
KERNEL8x3_M3 KERNEL8x3_M3
@ -1768,22 +1761,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.L7_12: .L7_12:
prefetcht0 B_PR1-24(BO)
prefetcht0 B_PR1+40(BO)
KERNEL8x3_M1 KERNEL8x3_M1
KERNEL8x3_M2 KERNEL8x3_M2
KERNEL8x3_M3 KERNEL8x3_M3
KERNEL8x3_M4 KERNEL8x3_M4
prefetcht0 B_PR1+104(BO)
KERNEL8x3_M5 KERNEL8x3_M5
KERNEL8x3_M6 KERNEL8x3_M6
KERNEL8x3_M7 KERNEL8x3_M7
KERNEL8x3_M8 KERNEL8x3_M8
dec %rax dec %rax
//je .L7_12_E
jne .L7_12 jne .L7_12
.L7_12_E: .L7_12_E:
prefetcht0 B_PR1(BO)
prefetcht0 B_PR1+64(BO)
KERNEL8x3_M1 KERNEL8x3_M1
KERNEL8x3_M2 KERNEL8x3_M2
KERNEL8x3_M3 KERNEL8x3_M3
@ -1904,31 +1900,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.L7_22: .L7_22:
//prefetcht0 B_PR1(BO,BI,8)
KERNEL4x3_1(xxx) KERNEL4x3_1(xxx)
KERNEL4x3_2(xxx) KERNEL4x3_2(xxx)
KERNEL4x3_3(xxx) KERNEL4x3_3(xxx)
//prefetcht0 B_PR1+64(BO,BI,8)
KERNEL4x3_4(xxx) KERNEL4x3_4(xxx)
KERNEL4x3_1(xxx) KERNEL4x3_1(xxx)
KERNEL4x3_2(xxx) KERNEL4x3_2(xxx)
//prefetcht0 B_PR1+32(BO,BI,8)
KERNEL4x3_3(xxx) KERNEL4x3_3(xxx)
KERNEL4x3_4(xxx) KERNEL4x3_4(xxx)
je .L7_26 je .L7_26
//prefetcht0 B_PR1(BO,BI,8)
KERNEL4x3_1(xxx) KERNEL4x3_1(xxx)
KERNEL4x3_2(xxx) KERNEL4x3_2(xxx)
KERNEL4x3_3(xxx) KERNEL4x3_3(xxx)
//prefetcht0 B_PR1+64(BO,BI,8)
KERNEL4x3_4(xxx) KERNEL4x3_4(xxx)
KERNEL4x3_1(xxx) KERNEL4x3_1(xxx)
KERNEL4x3_2(xxx) KERNEL4x3_2(xxx)
//prefetcht0 B_PR1+32(BO,BI,8)
KERNEL4x3_3(xxx) KERNEL4x3_3(xxx)
KERNEL4x3_4(xxx) KERNEL4x3_4(xxx)
@ -2019,31 +2009,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.L7_32: .L7_32:
//prefetcht0 B_PR1(BO,BI,8)
KERNEL2x3_1(xxx) KERNEL2x3_1(xxx)
KERNEL2x3_2(xxx) KERNEL2x3_2(xxx)
KERNEL2x3_3(xxx) KERNEL2x3_3(xxx)
//prefetcht0 B_PR1+64(BO,BI,8)
KERNEL2x3_4(xxx) KERNEL2x3_4(xxx)
KERNEL2x3_1(xxx) KERNEL2x3_1(xxx)
KERNEL2x3_2(xxx) KERNEL2x3_2(xxx)
//prefetcht0 B_PR1+32(BO,BI,8)
KERNEL2x3_3(xxx) KERNEL2x3_3(xxx)
KERNEL2x3_4(xxx) KERNEL2x3_4(xxx)
je .L7_36 je .L7_36
//prefetcht0 B_PR1(BO,BI,8)
KERNEL2x3_1(xxx) KERNEL2x3_1(xxx)
KERNEL2x3_2(xxx) KERNEL2x3_2(xxx)
KERNEL2x3_3(xxx) KERNEL2x3_3(xxx)
//prefetcht0 B_PR1+64(BO,BI,8)
KERNEL2x3_4(xxx) KERNEL2x3_4(xxx)
KERNEL2x3_1(xxx) KERNEL2x3_1(xxx)
KERNEL2x3_2(xxx) KERNEL2x3_2(xxx)
//prefetcht0 B_PR1+32(BO,BI,8)
KERNEL2x3_3(xxx) KERNEL2x3_3(xxx)
KERNEL2x3_4(xxx) KERNEL2x3_4(xxx)
@ -2127,31 +2111,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.L7_42: .L7_42:
//prefetcht0 B_PR1(BO,BI,8)
KERNEL1x3_1(xxx) KERNEL1x3_1(xxx)
KERNEL1x3_2(xxx) KERNEL1x3_2(xxx)
KERNEL1x3_3(xxx) KERNEL1x3_3(xxx)
//prefetcht0 B_PR1+64(BO,BI,8)
KERNEL1x3_4(xxx) KERNEL1x3_4(xxx)
KERNEL1x3_1(xxx) KERNEL1x3_1(xxx)
KERNEL1x3_2(xxx) KERNEL1x3_2(xxx)
//prefetcht0 B_PR1+32(BO,BI,8)
KERNEL1x3_3(xxx) KERNEL1x3_3(xxx)
KERNEL1x3_4(xxx) KERNEL1x3_4(xxx)
je .L7_46 je .L7_46
//prefetcht0 B_PR1(BO,BI,8)
KERNEL1x3_1(xxx) KERNEL1x3_1(xxx)
KERNEL1x3_2(xxx) KERNEL1x3_2(xxx)
KERNEL1x3_3(xxx) KERNEL1x3_3(xxx)
//prefetcht0 B_PR1+64(BO,BI,8)
KERNEL1x3_4(xxx) KERNEL1x3_4(xxx)
KERNEL1x3_1(xxx) KERNEL1x3_1(xxx)
KERNEL1x3_2(xxx) KERNEL1x3_2(xxx)
//prefetcht0 B_PR1+32(BO,BI,8)
KERNEL1x3_3(xxx) KERNEL1x3_3(xxx)
KERNEL1x3_4(xxx) KERNEL1x3_4(xxx)
@ -2277,13 +2255,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.L2_12: .L2_12:
//prefetcht0 B_PR1(BO,BI,8)
KERNEL8x2_1(xxx) KERNEL8x2_1(xxx)
KERNEL8x2_2(xxx) KERNEL8x2_2(xxx)
KERNEL8x2_3(xxx) KERNEL8x2_3(xxx)
KERNEL8x2_4(xxx) KERNEL8x2_4(xxx)
//prefetcht0 B_PR1(BO,BI,8)
KERNEL8x2_1(xxx) KERNEL8x2_1(xxx)
KERNEL8x2_2(xxx) KERNEL8x2_2(xxx)
KERNEL8x2_3(xxx) KERNEL8x2_3(xxx)
@ -2291,13 +2267,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
je .L2_16 je .L2_16
//prefetcht0 B_PR1(BO,BI,8)
KERNEL8x2_1(xxx) KERNEL8x2_1(xxx)
KERNEL8x2_2(xxx) KERNEL8x2_2(xxx)
KERNEL8x2_3(xxx) KERNEL8x2_3(xxx)
KERNEL8x2_4(xxx) KERNEL8x2_4(xxx)
//prefetcht0 B_PR1(BO,BI,8)
KERNEL8x2_1(xxx) KERNEL8x2_1(xxx)
KERNEL8x2_2(xxx) KERNEL8x2_2(xxx)
KERNEL8x2_3(xxx) KERNEL8x2_3(xxx)
@ -2399,13 +2373,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.L2_22: .L2_22:
//prefetcht0 B_PR1(BO,BI,8)
KERNEL4x2_1(xxx) KERNEL4x2_1(xxx)
KERNEL4x2_2(xxx) KERNEL4x2_2(xxx)
KERNEL4x2_3(xxx) KERNEL4x2_3(xxx)
KERNEL4x2_4(xxx) KERNEL4x2_4(xxx)
//prefetcht0 B_PR1(BO,BI,8)
KERNEL4x2_1(xxx) KERNEL4x2_1(xxx)
KERNEL4x2_2(xxx) KERNEL4x2_2(xxx)
KERNEL4x2_3(xxx) KERNEL4x2_3(xxx)
@ -2413,13 +2385,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
je .L2_26 je .L2_26
//prefetcht0 B_PR1(BO,BI,8)
KERNEL4x2_1(xxx) KERNEL4x2_1(xxx)
KERNEL4x2_2(xxx) KERNEL4x2_2(xxx)
KERNEL4x2_3(xxx) KERNEL4x2_3(xxx)
KERNEL4x2_4(xxx) KERNEL4x2_4(xxx)
//prefetcht0 B_PR1(BO,BI,8)
KERNEL4x2_1(xxx) KERNEL4x2_1(xxx)
KERNEL4x2_2(xxx) KERNEL4x2_2(xxx)
KERNEL4x2_3(xxx) KERNEL4x2_3(xxx)
@ -2503,13 +2473,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.L2_32: .L2_32:
//prefetcht0 B_PR1(BO,BI,8)
KERNEL2x2_1(xxx) KERNEL2x2_1(xxx)
KERNEL2x2_2(xxx) KERNEL2x2_2(xxx)
KERNEL2x2_3(xxx) KERNEL2x2_3(xxx)
KERNEL2x2_4(xxx) KERNEL2x2_4(xxx)
//prefetcht0 B_PR1(BO,BI,8)
KERNEL2x2_1(xxx) KERNEL2x2_1(xxx)
KERNEL2x2_2(xxx) KERNEL2x2_2(xxx)
KERNEL2x2_3(xxx) KERNEL2x2_3(xxx)
@ -2517,13 +2485,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
je .L2_36 je .L2_36
//prefetcht0 B_PR1(BO,BI,8)
KERNEL2x2_1(xxx) KERNEL2x2_1(xxx)
KERNEL2x2_2(xxx) KERNEL2x2_2(xxx)
KERNEL2x2_3(xxx) KERNEL2x2_3(xxx)
KERNEL2x2_4(xxx) KERNEL2x2_4(xxx)
//prefetcht0 B_PR1(BO,BI,8)
KERNEL2x2_1(xxx) KERNEL2x2_1(xxx)
KERNEL2x2_2(xxx) KERNEL2x2_2(xxx)
KERNEL2x2_3(xxx) KERNEL2x2_3(xxx)
@ -2600,13 +2566,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.L2_42: .L2_42:
//prefetcht0 B_PR1(BO,BI,8)
KERNEL1x2_1(xxx) KERNEL1x2_1(xxx)
KERNEL1x2_2(xxx) KERNEL1x2_2(xxx)
KERNEL1x2_3(xxx) KERNEL1x2_3(xxx)
KERNEL1x2_4(xxx) KERNEL1x2_4(xxx)
//prefetcht0 B_PR1(BO,BI,8)
KERNEL1x2_1(xxx) KERNEL1x2_1(xxx)
KERNEL1x2_2(xxx) KERNEL1x2_2(xxx)
KERNEL1x2_3(xxx) KERNEL1x2_3(xxx)
@ -2614,13 +2578,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
je .L2_46 je .L2_46
//prefetcht0 B_PR1(BO,BI,8)
KERNEL1x2_1(xxx) KERNEL1x2_1(xxx)
KERNEL1x2_2(xxx) KERNEL1x2_2(xxx)
KERNEL1x2_3(xxx) KERNEL1x2_3(xxx)
KERNEL1x2_4(xxx) KERNEL1x2_4(xxx)
//prefetcht0 B_PR1(BO,BI,8)
KERNEL1x2_1(xxx) KERNEL1x2_1(xxx)
KERNEL1x2_2(xxx) KERNEL1x2_2(xxx)
KERNEL1x2_3(xxx) KERNEL1x2_3(xxx)
@ -2743,7 +2705,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.L1_12: .L1_12:
//prefetcht0 B_PR1(BO,BI,8)
KERNEL8x1_1(xxx) KERNEL8x1_1(xxx)
KERNEL8x1_2(xxx) KERNEL8x1_2(xxx)
KERNEL8x1_3(xxx) KERNEL8x1_3(xxx)
@ -2756,7 +2717,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
je .L1_16 je .L1_16
//prefetcht0 B_PR1(BO,BI,8)
KERNEL8x1_1(xxx) KERNEL8x1_1(xxx)
KERNEL8x1_2(xxx) KERNEL8x1_2(xxx)
KERNEL8x1_3(xxx) KERNEL8x1_3(xxx)
@ -2851,7 +2811,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.L1_22: .L1_22:
//prefetcht0 B_PR1(BO,BI,8)
KERNEL4x1_1(xxx) KERNEL4x1_1(xxx)
KERNEL4x1_2(xxx) KERNEL4x1_2(xxx)
KERNEL4x1_3(xxx) KERNEL4x1_3(xxx)
@ -2864,7 +2823,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
je .L1_26 je .L1_26
//prefetcht0 B_PR1(BO,BI,8)
KERNEL4x1_1(xxx) KERNEL4x1_1(xxx)
KERNEL4x1_2(xxx) KERNEL4x1_2(xxx)
KERNEL4x1_3(xxx) KERNEL4x1_3(xxx)
@ -2946,7 +2904,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.L1_32: .L1_32:
//prefetcht0 B_PR1(BO,BI,8)
KERNEL2x1_1(xxx) KERNEL2x1_1(xxx)
KERNEL2x1_2(xxx) KERNEL2x1_2(xxx)
KERNEL2x1_3(xxx) KERNEL2x1_3(xxx)
@ -3036,7 +2993,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.L1_42: .L1_42:
//prefetcht0 B_PR1(BO,BI,8)
KERNEL1x1_1(xxx) KERNEL1x1_1(xxx)
KERNEL1x1_2(xxx) KERNEL1x1_2(xxx)
KERNEL1x1_3(xxx) KERNEL1x1_3(xxx)
@ -3049,7 +3005,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
je .L1_46 je .L1_46
//prefetcht0 B_PR1(BO,BI,8)
KERNEL1x1_1(xxx) KERNEL1x1_1(xxx)
KERNEL1x1_2(xxx) KERNEL1x1_2(xxx)
KERNEL1x1_3(xxx) KERNEL1x1_3(xxx)
@ -3317,13 +3272,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.L2_12: .L2_12:
//prefetcht0 B_PR1(BO,BI,8)
KERNEL8x2_1(xxx) KERNEL8x2_1(xxx)
KERNEL8x2_2(xxx) KERNEL8x2_2(xxx)
KERNEL8x2_3(xxx) KERNEL8x2_3(xxx)
KERNEL8x2_4(xxx) KERNEL8x2_4(xxx)
//prefetcht0 B_PR1(BO,BI,8)
KERNEL8x2_1(xxx) KERNEL8x2_1(xxx)
KERNEL8x2_2(xxx) KERNEL8x2_2(xxx)
KERNEL8x2_3(xxx) KERNEL8x2_3(xxx)
@ -3331,13 +3284,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
je .L2_16 je .L2_16
//prefetcht0 B_PR1(BO,BI,8)
KERNEL8x2_1(xxx) KERNEL8x2_1(xxx)
KERNEL8x2_2(xxx) KERNEL8x2_2(xxx)
KERNEL8x2_3(xxx) KERNEL8x2_3(xxx)
KERNEL8x2_4(xxx) KERNEL8x2_4(xxx)
//prefetcht0 B_PR1(BO,BI,8)
KERNEL8x2_1(xxx) KERNEL8x2_1(xxx)
KERNEL8x2_2(xxx) KERNEL8x2_2(xxx)
KERNEL8x2_3(xxx) KERNEL8x2_3(xxx)
@ -3502,13 +3453,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.L2_22: .L2_22:
//prefetcht0 B_PR1(BO,BI,8)
KERNEL4x2_1(xxx) KERNEL4x2_1(xxx)
KERNEL4x2_2(xxx) KERNEL4x2_2(xxx)
KERNEL4x2_3(xxx) KERNEL4x2_3(xxx)
KERNEL4x2_4(xxx) KERNEL4x2_4(xxx)
//prefetcht0 B_PR1(BO,BI,8)
KERNEL4x2_1(xxx) KERNEL4x2_1(xxx)
KERNEL4x2_2(xxx) KERNEL4x2_2(xxx)
KERNEL4x2_3(xxx) KERNEL4x2_3(xxx)
@ -3516,13 +3465,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
je .L2_26 je .L2_26
//prefetcht0 B_PR1(BO,BI,8)
KERNEL4x2_1(xxx) KERNEL4x2_1(xxx)
KERNEL4x2_2(xxx) KERNEL4x2_2(xxx)
KERNEL4x2_3(xxx) KERNEL4x2_3(xxx)
KERNEL4x2_4(xxx) KERNEL4x2_4(xxx)
//prefetcht0 B_PR1(BO,BI,8)
KERNEL4x2_1(xxx) KERNEL4x2_1(xxx)
KERNEL4x2_2(xxx) KERNEL4x2_2(xxx)
KERNEL4x2_3(xxx) KERNEL4x2_3(xxx)
@ -3667,13 +3614,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.L2_32: .L2_32:
//prefetcht0 B_PR1(BO,BI,8)
KERNEL2x2_1(xxx) KERNEL2x2_1(xxx)
KERNEL2x2_2(xxx) KERNEL2x2_2(xxx)
KERNEL2x2_3(xxx) KERNEL2x2_3(xxx)
KERNEL2x2_4(xxx) KERNEL2x2_4(xxx)
//prefetcht0 B_PR1(BO,BI,8)
KERNEL2x2_1(xxx) KERNEL2x2_1(xxx)
KERNEL2x2_2(xxx) KERNEL2x2_2(xxx)
KERNEL2x2_3(xxx) KERNEL2x2_3(xxx)
@ -3681,13 +3626,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
je .L2_36 je .L2_36
//prefetcht0 B_PR1(BO,BI,8)
KERNEL2x2_1(xxx) KERNEL2x2_1(xxx)
KERNEL2x2_2(xxx) KERNEL2x2_2(xxx)
KERNEL2x2_3(xxx) KERNEL2x2_3(xxx)
KERNEL2x2_4(xxx) KERNEL2x2_4(xxx)
//prefetcht0 B_PR1(BO,BI,8)
KERNEL2x2_1(xxx) KERNEL2x2_1(xxx)
KERNEL2x2_2(xxx) KERNEL2x2_2(xxx)
KERNEL2x2_3(xxx) KERNEL2x2_3(xxx)
@ -3818,13 +3761,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.L2_42: .L2_42:
//prefetcht0 B_PR1(BO,BI,8)
KERNEL1x2_1(xxx) KERNEL1x2_1(xxx)
KERNEL1x2_2(xxx) KERNEL1x2_2(xxx)
KERNEL1x2_3(xxx) KERNEL1x2_3(xxx)
KERNEL1x2_4(xxx) KERNEL1x2_4(xxx)
//prefetcht0 B_PR1(BO,BI,8)
KERNEL1x2_1(xxx) KERNEL1x2_1(xxx)
KERNEL1x2_2(xxx) KERNEL1x2_2(xxx)
KERNEL1x2_3(xxx) KERNEL1x2_3(xxx)
@ -3832,13 +3773,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
je .L2_46 je .L2_46
//prefetcht0 B_PR1(BO,BI,8)
KERNEL1x2_1(xxx) KERNEL1x2_1(xxx)
KERNEL1x2_2(xxx) KERNEL1x2_2(xxx)
KERNEL1x2_3(xxx) KERNEL1x2_3(xxx)
KERNEL1x2_4(xxx) KERNEL1x2_4(xxx)
//prefetcht0 B_PR1(BO,BI,8)
KERNEL1x2_1(xxx) KERNEL1x2_1(xxx)
KERNEL1x2_2(xxx) KERNEL1x2_2(xxx)
KERNEL1x2_3(xxx) KERNEL1x2_3(xxx)
@ -4023,7 +3962,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.L1_12: .L1_12:
//prefetcht0 B_PR1(BO,BI,8)
KERNEL8x1_1(xxx) KERNEL8x1_1(xxx)
KERNEL8x1_2(xxx) KERNEL8x1_2(xxx)
KERNEL8x1_3(xxx) KERNEL8x1_3(xxx)
@ -4036,7 +3974,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
je .L1_16 je .L1_16
//prefetcht0 B_PR1(BO,BI,8)
KERNEL8x1_1(xxx) KERNEL8x1_1(xxx)
KERNEL8x1_2(xxx) KERNEL8x1_2(xxx)
KERNEL8x1_3(xxx) KERNEL8x1_3(xxx)
@ -4186,7 +4123,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.L1_22: .L1_22:
//prefetcht0 B_PR1(BO,BI,8)
KERNEL4x1_1(xxx) KERNEL4x1_1(xxx)
KERNEL4x1_2(xxx) KERNEL4x1_2(xxx)
KERNEL4x1_3(xxx) KERNEL4x1_3(xxx)
@ -4199,7 +4135,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
je .L1_26 je .L1_26
//prefetcht0 B_PR1(BO,BI,8)
KERNEL4x1_1(xxx) KERNEL4x1_1(xxx)
KERNEL4x1_2(xxx) KERNEL4x1_2(xxx)
KERNEL4x1_3(xxx) KERNEL4x1_3(xxx)
@ -4335,7 +4270,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.L1_32: .L1_32:
//prefetcht0 B_PR1(BO,BI,8)
KERNEL2x1_1(xxx) KERNEL2x1_1(xxx)
KERNEL2x1_2(xxx) KERNEL2x1_2(xxx)
KERNEL2x1_3(xxx) KERNEL2x1_3(xxx)
@ -4476,7 +4410,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.L1_42: .L1_42:
//prefetcht0 B_PR1(BO,BI,8)
KERNEL1x1_1(xxx) KERNEL1x1_1(xxx)
KERNEL1x1_2(xxx) KERNEL1x1_2(xxx)
KERNEL1x1_3(xxx) KERNEL1x1_3(xxx)
@ -4489,7 +4422,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
je .L1_46 je .L1_46
//prefetcht0 B_PR1(BO,BI,8)
KERNEL1x1_1(xxx) KERNEL1x1_1(xxx)
KERNEL1x1_2(xxx) KERNEL1x1_2(xxx)
KERNEL1x1_3(xxx) KERNEL1x1_3(xxx)

View File

@ -347,7 +347,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#if defined(ARCH_X86_64) #if defined(ARCH_X86_64)
#define SGEMM_DEFAULT_P 768 #define SGEMM_DEFAULT_P 768
#define DGEMM_DEFAULT_P 384 #define DGEMM_DEFAULT_P 768
#define ZGEMM_DEFAULT_P 384 #define ZGEMM_DEFAULT_P 384
#define CGEMM_DEFAULT_P 768 #define CGEMM_DEFAULT_P 768
#else #else