Fixed bug in MOVQ redefinition and the ALIGN size problem.

This commit is contained in:
wangqian 2012-06-19 20:37:22 +08:00
parent 996dc6d1c8
commit 3ef96aa567
4 changed files with 304 additions and 299 deletions

View File

@ -146,6 +146,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define JMP jmp
#define NOP
#define XOR xorpd
#undef MOVQ
#define MOVQ movq
#define XOR_SY vxorps
@ -305,7 +306,7 @@ movq %r11, kk;
MOVQ bn,j;
SARQ $2,j; # Rn = 4
JLE .L0_loopE;
.align 32;
ALIGN_5;
.L0_bodyB:;
#if defined(TRMMKERNEL) && defined(LEFT)
MOVQ OFFSET, %rax;
@ -320,7 +321,7 @@ MOVQ ba,ptrba;
MOVQ bm,i;
SARQ $3,i; # Rm = 8
JLE .L1_loopE;
.align 32;
ALIGN_5;
.L1_bodyB:;
#if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA))
MOVQ bb,ptrbb;
@ -367,7 +368,7 @@ MOVQ %rax, kkk;
#endif
SARQ $2,k; # Unroll 4 times
JLE .L2_loopE;
.align 32;
ALIGN_5;
.L2_bodyB:;
# Computing kernel
@ -591,7 +592,7 @@ ADD2_SY yvec7, yvec8, yvec8;
.L2_bodyE:;
DECQ k;
JG .L2_bodyB;
.align 64;
ALIGN_5
.L2_loopE:;
#ifndef TRMMKERNEL
TEST $2, bk;
@ -599,7 +600,7 @@ TEST $2, bk;
TEST $2, kkk;
#endif
JLE .L3_loopE;
.align 64
ALIGN_5
.L3_loopB:
######### Unroll 1 ##################
PREFETCH0 PRESIZE*SIZE(ptrba)
@ -717,7 +718,7 @@ TEST $1, bk;
TEST $1, kkk;
#endif
JLE .L4_loopE;
.align 64
ALIGN_5
.L4_loopB:;
######### Unroll 1 ##################
PREFETCH0 PRESIZE*SIZE(ptrba)
@ -875,7 +876,7 @@ MOVQ C0, %rax;
OR ldc, %rax;
TEST $15, %rax;
JNE .L4_loopEx;
.align 32
ALIGN_5
EXTRA_SY $1,yvec15,xvec7;
EXTRA_SY $1,yvec14,xvec6;
EXTRA_SY $1,yvec13,xvec5;
@ -934,7 +935,7 @@ ADDQ $16*SIZE,C1;
DECQ i;
JG .L1_bodyB;
JMP .L1_loopE;
.align 32
ALIGN_5
.L4_loopEx:
EXTRA_SY $1, yvec15, xvec7;
#ifndef TRMMKERNEL
@ -1077,11 +1078,11 @@ ADDQ $16*SIZE, C0;
ADDQ $16*SIZE, C1;
DECQ i;
JG .L1_bodyB;
.align 32;
ALIGN_5;
.L1_loopE:;
TEST $4, bm;
JLE .L5_loopE;
.align 32
ALIGN_5
.L5_bodyB:
#if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA))
MOVQ bb,ptrbb;
@ -1113,7 +1114,7 @@ MOVQ %rax, kkk;
#endif
SARQ $2, k;
JLE .L8_loopE;
.align 32
ALIGN_5
.L8_bodyB:
#### Unroll times 1 ####
LD_SY 0*SIZE(ptrba), yvec0;
@ -1242,7 +1243,7 @@ ADDQ $32*SIZE, ptrba;
ADDQ $32*SIZE, ptrbb;
DECQ k;
JG .L8_bodyB;
.align 32
ALIGN_5
.L8_loopE:
#ifndef TRMMKERNEL
TEST $2, bk;
@ -1250,7 +1251,7 @@ TEST $2, bk;
TEST $2, kkk;
#endif
JLE .L9_loopE;
.align 32
ALIGN_5
.L9_bodyB:
#### Unroll times 1 ####
LD_SY 0*SIZE(ptrba), yvec0;
@ -1323,7 +1324,7 @@ TEST $1, bk;
TEST $1, kkk;
#endif
JLE .L10_loopE;
.align 32
ALIGN_5
.L10_bodyB:
#### Unroll times 1 ####
LD_SY 0*SIZE(ptrba), yvec0;
@ -1494,7 +1495,7 @@ ADDQ $8*SIZE, C1;
.L5_loopE:
TEST $2, bm;
JLE .L6_loopE;
.align 32
ALIGN_5
.L6_bodyB:
#if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA))
MOVQ bb,ptrbb;
@ -1527,7 +1528,7 @@ MOVQ %rax, kkk;
#endif
SARQ $2, k;
JLE .L11_loopE;
.align 32
ALIGN_5
.L11_bodyB:
LD_SX 0*SIZE(ptrba), xvec0; # ar1, ai1, ar2, ai2
EDUP_SX 0*SIZE(ptrbb), xvec2; # br1, br1, br2, br2
@ -1652,7 +1653,7 @@ ADDQ $16*SIZE, ptrba;
ADDQ $32*SIZE, ptrbb;
DECQ k;
JG .L11_bodyB;
.align 32
ALIGN_5
.L11_loopE:
#ifndef TRMMKERNEL
TEST $2, bk;
@ -1660,7 +1661,7 @@ TEST $2, bk;
TEST $2, kkk;
#endif
JLE .L12_loopE;
.align 32
ALIGN_5
.L12_bodyB:
LD_SX 0*SIZE(ptrba), xvec0; # ar1, ai1, ar2, ai2
EDUP_SX 0*SIZE(ptrbb), xvec2; # br1, br1, br2, br2
@ -1731,7 +1732,7 @@ TEST $1, bk;
TEST $1, kkk;
#endif
JLE .L13_loopE;
.align 32
ALIGN_5
.L13_bodyB:
LD_SX 0*SIZE(ptrba), xvec0; # ar1, ai1, ar2, ai2
EDUP_SX 0*SIZE(ptrbb), xvec2; # br1, br1, br2, br2
@ -1875,7 +1876,7 @@ ADDQ $4*SIZE, C1;
.L6_loopE:
TEST $1, bm;
JLE .L7_loopE;
.align 32
ALIGN_5
.L7_bodyB:
#if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA))
MOVQ bb,ptrbb;
@ -1905,7 +1906,7 @@ MOVQ %rax, kkk;
#endif
SARQ $2, k;
JLE .L14_loopE;
.align 32
ALIGN_5
.L14_bodyB:
BROAD_SX 0*SIZE(ptrba), xvec0;
LD_SX 0*SIZE(ptrbb), xvec2;
@ -1978,7 +1979,7 @@ ADDQ $8*SIZE, ptrba;
ADDQ $32*SIZE, ptrbb;
DECQ k;
JG .L14_bodyB;
.align 32
ALIGN_5
.L14_loopE:
#ifndef TRMMKERNEL
TEST $2, bk;
@ -1986,7 +1987,7 @@ TEST $2, bk;
TEST $2, kkk;
#endif
JLE .L15_loopE;
.align 32
ALIGN_5
.L15_bodyB:
BROAD_SX 0*SIZE(ptrba), xvec0;
LD_SX 0*SIZE(ptrbb), xvec2;
@ -2031,7 +2032,7 @@ TEST $1, bk;
TEST $1, kkk;
#endif
JLE .L16_loopE;
.align 32
ALIGN_5
.L16_bodyB:
BROAD_SX 0*SIZE(ptrba), xvec0;
LD_SX 0*SIZE(ptrbb), xvec2;
@ -2129,11 +2130,11 @@ LEAQ (C,ldc,4),C;
.L0_bodyE:;
DECQ j;
JG .L0_bodyB;
.align 32;
ALIGN_5;
.L0_loopE:;
TEST $2, bn;
JLE .L20_loopE;
.align 32
ALIGN_5
.L20_bodyB:
#if defined(TRMMKERNEL) && defined(LEFT)
MOVQ OFFSET, %rax;
@ -2145,7 +2146,7 @@ MOVQ ba, ptrba;
MOVQ bm, i;
SARQ $3, i;
JLE .L21_loopE;
.align 32
ALIGN_5
.L21_bodyB:
#if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA))
MOVQ bb,ptrbb;
@ -2181,7 +2182,7 @@ MOVQ %rax, kkk;
#endif
SARQ $2, k;
JLE .L211_loopE;
.align 32
ALIGN_5
.L211_bodyB:
EDUP_SX 0*SIZE(ptrbb), xvec4;
ODUP_SX 0*SIZE(ptrbb), xvec5;
@ -2430,7 +2431,7 @@ ADDQ $64*SIZE, ptrba;
ADDQ $16*SIZE, ptrbb;
DECQ k;
JG .L211_bodyB;
.align 32
ALIGN_5
.L211_loopE:
#ifndef TRMMKERNEL
TEST $2, bk;
@ -2438,7 +2439,7 @@ TEST $2, bk;
TEST $2, kkk;
#endif
JLE .L212_loopE;
.align 32
ALIGN_5
.L212_bodyB:
EDUP_SX 0*SIZE(ptrbb), xvec4;
ODUP_SX 0*SIZE(ptrbb), xvec5;
@ -2571,7 +2572,7 @@ TEST $1, bk;
TEST $1, kkk;
#endif
JLE .L213_loopE;
.align 32
ALIGN_5
.L213_bodyB:
EDUP_SX 0*SIZE(ptrbb), xvec4;
ODUP_SX 0*SIZE(ptrbb), xvec5;
@ -2825,11 +2826,11 @@ ADDQ $16*SIZE, C0;
ADDQ $16*SIZE, C1;
DECQ i;
JG .L21_bodyB;
.align 32
ALIGN_5
.L21_loopE:
TEST $4, bm;
JLE .L22_loopE;
.align 32
ALIGN_5
.L22_bodyB:
#if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA))
MOVQ bb,ptrbb;
@ -2862,7 +2863,7 @@ MOVQ %rax, kkk;
SARQ $2, k;
JLE .L221_loopE;
.align 32
ALIGN_5
.L221_bodyB:
EDUP_SX 0*SIZE(ptrbb), xvec4;
ODUP_SX 0*SIZE(ptrbb), xvec5;
@ -3002,7 +3003,7 @@ ADDQ $32*SIZE, ptrba;
ADDQ $16*SIZE, ptrbb;
DECQ k;
JG .L221_bodyB;
.align 32
ALIGN_5
.L221_loopE:
#ifndef TRMMKERNEL
TEST $2, bk;
@ -3010,7 +3011,7 @@ TEST $2, bk;
TEST $2, kkk;
#endif
JLE .L222_loopE;
.align 32
ALIGN_5
.L222_bodyB:
EDUP_SX 0*SIZE(ptrbb), xvec4;
ODUP_SX 0*SIZE(ptrbb), xvec5;
@ -3089,7 +3090,7 @@ TEST $1, bk;
TEST $1, kkk;
#endif
JLE .L223_loopE;
.align 32
ALIGN_5
.L223_bodyB:
EDUP_SX 0*SIZE(ptrbb), xvec4;
ODUP_SX 0*SIZE(ptrbb), xvec5;
@ -3237,7 +3238,7 @@ ADDQ $8*SIZE, C1;
.L22_loopE:
TEST $2, bm;
JLE .L23_loopE;
.align 32
ALIGN_5
.L23_bodyB:
#if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA))
MOVQ bb,ptrbb;
@ -3267,7 +3268,7 @@ MOVQ %rax, kkk;
#endif
SARQ $2, k;
JLE .L231_loopE;
.align 32
ALIGN_5
.L231_bodyB:
EDUP_SX 0*SIZE(ptrbb), xvec4;
ODUP_SX 0*SIZE(ptrbb), xvec5;
@ -3351,7 +3352,7 @@ ADDQ $16*SIZE, ptrba;
ADDQ $16*SIZE, ptrbb;
DECQ k;
JG .L231_bodyB;
.align 32
ALIGN_5
.L231_loopE:
#ifndef TRMMKERNEL
TEST $2, bk;
@ -3359,7 +3360,7 @@ TEST $2, bk;
TEST $2, kkk;
#endif
JLE .L232_loopE;
.align 32
ALIGN_5
.L232_bodyB:
EDUP_SX 0*SIZE(ptrbb), xvec4;
ODUP_SX 0*SIZE(ptrbb), xvec5;
@ -3409,7 +3410,7 @@ TEST $1, bk;
TEST $1, kkk;
#endif
JLE .L233_loopE;
.align 32
ALIGN_5
.L233_bodyB:
EDUP_SX 0*SIZE(ptrbb), xvec4;
ODUP_SX 0*SIZE(ptrbb), xvec5;
@ -3503,7 +3504,7 @@ ADDQ $4*SIZE, C1;
.L23_loopE:
TEST $1, bm;
JLE .L24_loopE;
.align 32
ALIGN_5
.L24_bodyB:
#if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA))
MOVQ bb,ptrbb;
@ -3532,7 +3533,7 @@ MOVQ %rax, kkk;
#endif
SARQ $2, k;
JLE .L241_loopE;
.align 32
ALIGN_5
.L241_bodyB:
BROAD_SX 0*SIZE(ptrba), xvec0;
LD_SX 0*SIZE(ptrbb), xvec2;
@ -3585,7 +3586,7 @@ TEST $2, bk;
TEST $2, kkk;
#endif
JLE .L242_loopE;
.align 32
ALIGN_5
.L242_bodyB:
BROAD_SX 0*SIZE(ptrba), xvec0;
LD_SX 0*SIZE(ptrbb), xvec2;
@ -3616,7 +3617,7 @@ TEST $1, bk;
TEST $1, kkk;
#endif
JLE .L243_loopE;
.align 32
ALIGN_5
.L243_bodyB:
BROAD_SX 0*SIZE(ptrba), xvec0;
LD_SX 0*SIZE(ptrbb), xvec2;
@ -3684,7 +3685,7 @@ LEAQ (C, ldc, 2), C;
.L20_loopE:
TEST $1, bn;
JLE .L30_loopE;
.align 32
ALIGN_5
.L30_bodyB:
#if defined(TRMMKERNEL) && defined(LEFT)
MOVQ OFFSET, %rax;
@ -3695,7 +3696,7 @@ MOVQ ba, ptrba;
MOVQ bm, i;
SARQ $3, i;
JLE .L31_loopE;
.align 32
ALIGN_5
.L31_bodyB:
MOVQ bb, ptrbb;
#if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA))
@ -3727,7 +3728,7 @@ MOVQ %rax, kkk;
#endif
SARQ $2, k;
JLE .L311_loopE;
.align 32
ALIGN_5
.L311_bodyB:
#### Unroll 1 ####
LD_SY 0*SIZE(ptrba), yvec0;
@ -3800,7 +3801,7 @@ ADDQ $64*SIZE, ptrba;
ADDQ $8*SIZE, ptrbb;
DECQ k;
JG .L311_bodyB;
.align 32
ALIGN_5
.L311_loopE:
#ifndef TRMMKERNEL
TEST $2, bk;
@ -3808,7 +3809,7 @@ TEST $2, bk;
TEST $2, kkk;
#endif
JLE .L312_loopE;
.align 32
ALIGN_5
.L312_bodyB:
#### Unroll 1 ####
LD_SY 0*SIZE(ptrba), yvec0;
@ -3853,7 +3854,7 @@ TEST $1, bk;
TEST $1, kkk;
#endif
JLE .L313_loopE;
.align 32
ALIGN_5
.L313_bodyB:
#### Unroll 1 ####
LD_SY 0*SIZE(ptrba), yvec0;
@ -3941,11 +3942,11 @@ ADDQ $8, kk;
ADDQ $16*SIZE, C0;
DECQ i;
JG .L31_bodyB;
.align 32
ALIGN_5
.L31_loopE:
TEST $4, bm;
JLE .L32_loopE;
.align 32
ALIGN_5
.L32_bodyB:
#if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA))
MOVQ bb,ptrbb;
@ -3974,7 +3975,7 @@ MOVQ %rax, kkk;
#endif
SARQ $2, k;
JLE .L321_loopE;
.align 32
ALIGN_5
.L321_bodyB:
#### Unroll 1 ####
LD_SY 0*SIZE(ptrba), yvec0;
@ -4023,7 +4024,7 @@ ADDQ $32*SIZE, ptrba;
ADDQ $8*SIZE, ptrbb;
DECQ k;
JG .L321_bodyB;
.align 32
ALIGN_5
.L321_loopE:
#ifndef TRMMKERNEL
TEST $2, bk;
@ -4031,7 +4032,7 @@ TEST $2, bk;
TEST $2, kkk;
#endif
JLE .L322_loopE;
.align 32
ALIGN_5
.L322_bodyB:
#### Unroll 1 ####
LD_SY 0*SIZE(ptrba), yvec0;
@ -4064,7 +4065,7 @@ TEST $1, bk;
TEST $1, kkk;
#endif
JLE .L323_loopE;
.align 32
ALIGN_5
.L323_bodyB:
#### Unroll 1 ####
LD_SY 0*SIZE(ptrba), yvec0;
@ -4128,7 +4129,7 @@ ADDQ $8*SIZE, C0;
.L32_loopE:
TEST $2, bm;
JLE .L33_loopE;
.align 32
ALIGN_5
.L33_bodyB:
#if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA))
MOVQ bb,ptrbb;
@ -4157,7 +4158,7 @@ MOVQ %rax, kkk;
#endif
SARQ $2, k;
JLE .L331_loopE;
.align 32
ALIGN_5
.L331_bodyB:
#### Unroll 1 ####
LD_SX 0*SIZE(ptrba), xvec0;
@ -4202,7 +4203,7 @@ ADDQ $16*SIZE, ptrba;
ADDQ $8*SIZE, ptrbb;
DECQ k;
JG .L331_bodyB;
.align 32
ALIGN_5
.L331_loopE:
#ifndef TRMMKERNEL
TEST $2, bk;
@ -4210,7 +4211,7 @@ TEST $2, bk;
TEST $2, kkk;
#endif
JLE .L332_loopE;
.align 32
ALIGN_5
.L332_bodyB:
#### Unroll 1 ####
LD_SX 0*SIZE(ptrba), xvec0;
@ -4241,7 +4242,7 @@ TEST $1, bk;
TEST $1, kkk;
#endif
JLE .L333_loopE;
.align 32
ALIGN_5
.L333_bodyB:
#### Unroll 1 ####
LD_SX 0*SIZE(ptrba), xvec0;
@ -4300,7 +4301,7 @@ ADDQ $4*SIZE, C0;
.L33_loopE:
TEST $1, bm;
JLE .L34_loopE;
.align 32
ALIGN_5
.L34_bodyB:
#if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA))
MOVQ bb,ptrbb;
@ -4329,7 +4330,7 @@ MOVQ %rax, kkk;
#endif
SARQ $2, k;
JLE .L341_loopE;
.align 32
ALIGN_5
.L341_bodyB:
LD_SX 0*SIZE(ptrba), xvec0;
LD_SX 0*SIZE(ptrbb), xvec2;
@ -4354,7 +4355,7 @@ ADDQ $8*SIZE, ptrba;
ADDQ $8*SIZE, ptrbb;
DECQ k;
JG .L341_bodyB;
.align 32
ALIGN_5
.L341_loopE:
#ifndef TRMMKERNEL
TEST $2, bk;
@ -4362,7 +4363,7 @@ TEST $2, bk;
TEST $2, kkk;
#endif
JLE .L342_loopE;
.align 32
ALIGN_5
.L342_bodyB:
LD_SX 0*SIZE(ptrba), xvec0;
LD_SX 0*SIZE(ptrbb), xvec2;
@ -4383,7 +4384,7 @@ TEST $1, bk;
TEST $1, kkk;
#endif
JLE .L343_loopE;
.align 32
ALIGN_5
.L343_bodyB:
XOR_SY yvec0, yvec0, yvec0;
XOR_SY yvec2, yvec2, yvec2;

View File

@ -140,6 +140,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define JNE jne
#define NOP
#define XOR xorpd
#undef MOVQ
#define MOVQ movq
#define XOR_SY vxorps
@ -265,7 +266,7 @@ movq %r11, kk
MOVQ bn,j;
SARQ $2,j; # Rn = 4
JLE .L0_loopE;
.align 32;
ALIGN_5;
.L0_bodyB:;
#if defined(TRMMKERNEL) && defined(LEFT)
MOVQ OFFSET, %rax;
@ -281,7 +282,7 @@ MOVQ ba,ptrba;
MOVQ bm,i;
SARQ $3,i; # Rm = 8
JLE .L1_loopE;
.align 32;
ALIGN_5;
.L1_bodyB:;
#if !defined(TRMMKERNEL)||(defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA))
MOVQ bb, ptrbb;
@ -328,7 +329,7 @@ MOVQ %rax, kkk;
#endif
SARQ $2,k;
JLE .L2_loopE;
.align 32;
ALIGN_5;
.L2_bodyB:;
# Computing kernel
@ -448,7 +449,7 @@ ADD_DY yvec8, yvec7, yvec8;
.L2_bodyE:;
DECQ k;
JG .L2_bodyB;
.align 64;
ALIGN_5
.L2_loopE:;
PREFETCH2 0*SIZE(prebb);
ADDQ $8*SIZE, prebb;
@ -459,7 +460,7 @@ MOVQ kkk, %rax;
TEST $2, %rax;
#endif
JLE .L3_loopE;
.align 64
ALIGN_5
.L3_bodyB:
#### Unroll times 1 ####
PREFETCH0 64*SIZE(ptrba)
@ -529,7 +530,7 @@ MOVQ kkk, %rax;
TEST $1, %rax;
#endif
JLE .L4_loopE;
.align 64
ALIGN_5
.L4_bodyB:;
#### Unroll times 1 ####
PREFETCH0 64*SIZE(ptrba)
@ -588,7 +589,7 @@ MOVQ C0, %rax;
OR ldc, %rax;
TEST $15, %rax;
JNE .L4_loopEx; # Unalign part write back
.align 32
ALIGN_5
#### Writing Back ####
EXTRA_DY $1,yvec15,xvec7;
EXTRA_DY $1,yvec14,xvec6;
@ -648,7 +649,7 @@ ADDQ $8*SIZE,C1;
DECQ i;
JG .L1_bodyB;
JMP .L1_loopE;
.align 32;
ALIGN_5;
.L4_loopEx:;
EXTRA_DY $1, yvec15, xvec7;
#ifndef TRMMKERNEL
@ -776,11 +777,11 @@ ADDQ $8*SIZE, C0;
ADDQ $8*SIZE, C1;
DECQ i;
JG .L1_bodyB;
.align 32
ALIGN_5
.L1_loopE:;
TEST $4, bm; # Rm = 4
JLE .L5_loopE;
.align 32
ALIGN_5
.L5_bodyB:;
#if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA))
MOVQ bb, ptrbb;
@ -816,7 +817,7 @@ MOVQ %rax, kkk;
#endif
SARQ $2, k;
JLE .L6_loopE;
.align 32;
ALIGN_5;
.L6_bodyB:;
# Computing kernel
@ -887,7 +888,7 @@ MUL_DY yvec1, yvec5, yvec7;
ADD_DY yvec9, yvec7, yvec9;
DECQ k;
JG .L6_bodyB;
.align 32
ALIGN_5
.L6_loopE:;
#ifndef TRMMKERNEL
TEST $2, bk;
@ -896,7 +897,7 @@ MOVQ kkk, %rax;
TEST $2, %rax;
#endif
JLE .L7_loopE;
.align 32
ALIGN_5
.L7_bodyB:;
#### Untoll time 1 ####
LD_DY 4*SIZE(ptrba), yvec1;
@ -940,7 +941,7 @@ MOVQ kkk, %rax;
TEST $1, %rax;
#endif
JLE .L8_loopE;
.align 32
ALIGN_5
.L8_bodyB:;
#### Untoll time 1 ####
MUL_DY yvec0, yvec2, yvec6;
@ -977,7 +978,7 @@ MOVQ C0, %rax;
OR ldc, %rax;
TEST $15, %rax;
JNE .L8_loopEx; # Unalign part write back
.align 32
ALIGN_5
#### Writing Back ####
EXTRA_DY $1,yvec15,xvec7;
EXTRA_DY $1,yvec13,xvec5;
@ -1014,7 +1015,7 @@ ADDQ $4, kk
ADDQ $4*SIZE, C0;
ADDQ $4*SIZE, C1;
JMP .L5_loopE;
.align 32
ALIGN_5
.L8_loopEx:;
EXTRA_DY $1,yvec15,xvec7;
EXTRA_DY $1,yvec13,xvec5;
@ -1080,7 +1081,7 @@ ADDQ $4*SIZE, C1;
.L5_loopE:;
TEST $2, bm;
JLE .L9_loopE;
.align 32
ALIGN_5
.L9_bodyB:;
#if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA))
MOVQ bb, ptrbb;
@ -1117,7 +1118,7 @@ MOVQ %rax, kkk;
#endif
SARQ $2, k;
JLE .L10_loopE;
.align 32;
ALIGN_5;
.L10_bodyB:;
# Computing kernel
@ -1192,7 +1193,7 @@ MUL_DX xvec1, xvec5;
ADD_DX xvec5, xvec9;
DECQ k;
JG .L10_bodyB;
.align 32
ALIGN_5
.L10_loopE:;
#ifndef TRMMKERNEL
TEST $2, bk
@ -1201,7 +1202,7 @@ MOVQ kkk, %rax;
TEST $2, %rax;
#endif
JLE .L11_loopE;
.align 32
ALIGN_5
.L11_bodyB:;
##### Unroll time 1 ####
LD_DX 4*SIZE(ptrbb), xvec6;
@ -1248,7 +1249,7 @@ MOVQ kkk, %rax;
TEST $1, %rax;
#endif
JLE .L12_loopE;
.align 32
ALIGN_5
.L12_bodyB:;
SHUF_DX $0x4e, xvec3, xvec5;
MUL_DX xvec0, xvec2;
@ -1285,7 +1286,7 @@ MOVQ C0, %rax;
OR ldc, %rax;
TEST $15, %rax;
JNE .L12_loopEx;
.align 32
ALIGN_5
#### Writing Back ####
#ifndef TRMMKERNEL
ADD_DX 0*SIZE(C0), xvec13;
@ -1310,7 +1311,7 @@ ADDQ $2, kk
ADDQ $2*SIZE, C0
ADDQ $2*SIZE, C1
JMP .L9_loopE;
.align 32
ALIGN_5
.L12_loopEx:
#ifndef TRMMKERNEL
LDL_DX 0*SIZE(C0), xvec14;
@ -1349,7 +1350,7 @@ ADDQ $2*SIZE, C1;
.L9_loopE:;
TEST $1, bm
JLE .L13_loopE;
.align 32
ALIGN_5
.L13_bodyB:;
#if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA))
MOVQ bb, ptrbb;
@ -1379,7 +1380,7 @@ MOVQ %rax, kkk;
#endif
SARQ $2, k;
JLE .L14_loopE;
.align 32
ALIGN_5
.L14_bodyB:;
BROAD_DY 0*SIZE(ptrba), yvec0;
LD_DY 0*SIZE(ptrbb), yvec2;
@ -1404,7 +1405,7 @@ ADDQ $4*SIZE, ptrba;
ADDQ $16*SIZE, ptrbb;
DECQ k;
JG .L14_bodyB;
.align 32
ALIGN_5
.L14_loopE:
#ifndef TRMMKERNEL
TEST $2, bk;
@ -1413,7 +1414,7 @@ MOVQ kkk, %rax;
TEST $2, %rax;
#endif
JLE .L15_loopE;
.align 32
ALIGN_5
.L15_bodyB:
BROAD_DY 0*SIZE(ptrba), yvec0;
LD_DY 0*SIZE(ptrbb), yvec2;
@ -1434,7 +1435,7 @@ MOVQ kkk, %rax;
TEST $1, %rax;
#endif
JLE .L16_loopE;
.align 32
ALIGN_5
.L16_bodyB:;
BROAD_DY 0*SIZE(ptrba), yvec0;
LD_DY 0*SIZE(ptrbb), yvec2;
@ -1485,11 +1486,11 @@ LEAQ (C,ldc,4),C;
.L0_bodyE:;
DECQ j;
JG .L0_bodyB;
.align 32;
ALIGN_5;
.L0_loopE:;
TEST $2, bn;
JLE .L20_loopE;
.align 32;
ALIGN_5;
.L20_loopB:;
#if defined(TRMMKERNEL) && defined(LEFT)
MOVQ OFFSET, %rax;
@ -1501,7 +1502,7 @@ MOVQ ba, ptrba;
MOVQ bm, i;
SARQ $3, i; # Rm = 8
JLE .L21_loopE;
.align 32;
ALIGN_5;
.L21_bodyB:;
#if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA))
MOVQ bb, ptrbb;
@ -1538,7 +1539,7 @@ MOVQ %rax, kkk;
#endif
SARQ $2, k;
JLE .L211_loopE;
.align 32;
ALIGN_5;
.L211_bodyB:
# Computing kernel
#### Unroll time 1 ####
@ -1692,7 +1693,7 @@ MUL_DX xvec3, xvec7;
ADD_DX xvec7, xvec8;
DECQ k;
JG .L211_bodyB;
.align 32
ALIGN_5
.L211_loopE:
#ifndef TRMMKERNEL
TEST $2, bk;
@ -1701,7 +1702,7 @@ MOVQ kkk, %rax;
TEST $2, %rax;
#endif
JLE .L212_loopE;
.align 32;
ALIGN_5;
.L212_bodyB:
# Computing kernel
#### Unroll time 1 ####
@ -1788,7 +1789,7 @@ MOVQ kkk, %rax;
TEST $1, %rax;
#endif
JLE .L213_loopE;
.align 32
ALIGN_5
.L213_bodyB:
#### Unroll time 1 ####
LD_DX 0*SIZE(ptrba), xvec0;
@ -1858,7 +1859,7 @@ MOVQ C0, %rax;
OR ldc, %rax;
TEST $15, %rax;
JNE .L213_loopEx;
.align 32
ALIGN_5
#### Writing Back ####
#ifndef TRMMKERNEL
ADD_DX 0*SIZE(C0), xvec11;
@ -1893,7 +1894,7 @@ ADDQ $8*SIZE, C1;
DECQ i;
JG .L21_bodyB;
JMP .L21_loopE;
.align 32
ALIGN_5
.L213_loopEx:;
#ifndef TRMMKERNEL
LDL_DX 0*SIZE(C0), xvec0;
@ -1956,7 +1957,7 @@ JG .L21_bodyB;
.L21_loopE:;
TEST $4, bm; # Rm = 4
JLE .L22_loopE;
.align 32;
ALIGN_5;
.L22_bodyB:;
#if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA))
MOVQ bb, ptrbb;
@ -1989,7 +1990,7 @@ MOVQ %rax, kkk;
#endif
SARQ $2, k;
JLE .L221_loopE;
.align 32
ALIGN_5
.L221_bodyB:;
# Computing kernel
#### Unroll time 1 ####
@ -2071,7 +2072,7 @@ MUL_DX xvec1, xvec5;
ADD_DX xvec5, xvec10;
DECQ k;
JG .L221_bodyB;
.align 32
ALIGN_5
.L221_loopE:;
#ifndef TRMMKERNEL
TEST $2, bk;
@ -2080,7 +2081,7 @@ MOVQ kkk, %rax;
TEST $2, %rax;
#endif
JLE .L222_loopE;
.align 32
ALIGN_5
.L222_bodyB:
#### Unroll time 1 ####
LD_DX 0*SIZE(ptrba), xvec0;
@ -2129,7 +2130,7 @@ MOVQ kkk, %rax;
TEST $1, %rax;
#endif
JLE .L223_loopE;
.align 32
ALIGN_5
.L223_bodyB:
#### Unroll time 1 ####
LD_DX 0*SIZE(ptrba), xvec0;
@ -2171,7 +2172,7 @@ MOVQ C0, %rax;
OR ldc, %rax;
TEST $15, %rax;
JNE .L223_loopEx;
.align 32
ALIGN_5
#### Writing Back ####
#ifndef TRMMKERNEL
ADD_DX 0*SIZE(C0), xvec11;
@ -2196,7 +2197,7 @@ ADDQ $4, kk
ADDQ $4*SIZE, C0;
ADDQ $4*SIZE, C1;
JMP .L22_loopE;
.align 32
ALIGN_5
.L223_loopEx:;
#ifndef TRMMKERNEL
LDL_DX 0*SIZE(C0), xvec0;
@ -2237,7 +2238,7 @@ ADDQ $4*SIZE, C1;
.L22_loopE:;
TEST $2, bm; # Rm = 2
JLE .L23_loopE;
.align 32;
ALIGN_5;
.L23_bodyB:
#if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA))
MOVQ bb, ptrbb;
@ -2267,7 +2268,7 @@ MOVQ %rax, kkk;
#endif
SARQ $2, k;
JLE .L231_loopE;
.align 32
ALIGN_5
.L231_bodyB:
# Computing kernel
#### Unroll time 1 ####
@ -2309,7 +2310,7 @@ ADD_DX xvec5, xvec11;
ADDQ $8*SIZE, ptrbb;
DECQ k;
JG .L231_bodyB;
.align 32
ALIGN_5
.L231_loopE:
#ifndef TRMMKERNEL
TEST $2, bk;
@ -2318,7 +2319,7 @@ MOVQ kkk, %rax;
TEST $2, %rax;
#endif
JLE .L232_loopE;
.align 32
ALIGN_5
.L232_bodyB:
#### Unroll time 1 ####
LD_DX 0*SIZE(ptrba), xvec0;
@ -2347,7 +2348,7 @@ MOVQ kkk, %rax;
TEST $1, %rax;
#endif
JLE .L233_loopE;
.align 32
ALIGN_5
.L233_bodyB:
#### Unroll time 1 ####
LD_DX 0*SIZE(ptrba), xvec0;
@ -2373,7 +2374,7 @@ MOVQ C0, %rax;
OR ldc, %rax;
TEST $15, %rax;
JNE .L233_loopEx;
.align 32
ALIGN_5
#### Writing Back ####
#ifndef TRMMKERNEL
ADD_DX 0*SIZE(C0), xvec11;
@ -2394,7 +2395,7 @@ ADDQ $2, kk;
ADDQ $2*SIZE, C0;
ADDQ $2*SIZE, C1;
JMP .L23_loopE;
.align 32
ALIGN_5
.L233_loopEx:;
#ifndef TRMMKERNEL
LDL_DX 0*SIZE(C0), xvec0;
@ -2425,7 +2426,7 @@ ADDQ $2*SIZE, C1;
.L23_loopE:
TEST $1, bm; # Rm = 1
JLE .L24_loopE;
.align 32;
ALIGN_5;
.L24_bodyB:
#if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA))
MOVQ bb, ptrbb;
@ -2454,7 +2455,7 @@ MOVQ %rax, kkk;
#endif
SARQ $2, k;
JLE .L241_loopE;
.align 32
ALIGN_5
.L241_bodyB:
BROAD_DX 0*SIZE(ptrba), xvec0;
LD_DX 0*SIZE(ptrbb), xvec2;
@ -2479,7 +2480,7 @@ ADDQ $4*SIZE, ptrba;
ADDQ $8*SIZE, ptrbb;
DECQ k;
JG .L241_bodyB;
.align 32
ALIGN_5
.L241_loopE:
#ifndef TRMMKERNEL
TEST $2, bk;
@ -2488,7 +2489,7 @@ MOVQ kkk, %rax;
TEST $2, %rax;
#endif
JLE .L242_loopE;
.align 32
ALIGN_5
.L242_bodyB:
BROAD_DX 0*SIZE(ptrba), xvec0;
LD_DX 0*SIZE(ptrbb), xvec2;
@ -2509,7 +2510,7 @@ MOVQ kkk, %rax;
TEST $1, %rax;
#endif
JLE .L243_loopE;
.align 32
ALIGN_5
.L243_bodyB:
BROAD_DX 0*SIZE(ptrba), xvec0;
LD_DX 0*SIZE(ptrbb), xvec2;
@ -2551,7 +2552,7 @@ LEAQ (C, ldc, 2), C;
.L20_loopE:;
TEST $1, bn; # Rn = 1
JLE .L30_loopE;
.align 32
ALIGN_5
.L30_bodyB:
#if defined(TRMMKERNEL)&&defined(LEFT)
MOVQ OFFSET, %rax;
@ -2562,7 +2563,7 @@ MOVQ ba, ptrba;
MOVQ bm, i;
SARQ $3, i;
JLE .L31_loopE;
.align 32
ALIGN_5
.L31_bodyB:
#if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA))
MOVQ bb, ptrbb;
@ -2593,7 +2594,7 @@ MOVQ %rax, kkk;
#endif
SARQ $2, k;
JLE .L311_loopE;
.align 32
ALIGN_5
.L311_bodyB:
#### Unroll time 1 ####
LD_DY 0*SIZE(ptrba), yvec0;
@ -2634,7 +2635,7 @@ ADD_DY yvec4, yvec14, yvec14;
ADDQ $4*SIZE, ptrbb;
DECQ k;
JG .L311_bodyB;
.align 32
ALIGN_5
.L311_loopE:
#ifndef TRMMKERNEL
TEST $2, bk;
@ -2643,7 +2644,7 @@ MOVQ kkk, %rax;
TEST $2, %rax;
#endif
JLE .L312_loopE;
.align 32
ALIGN_5
.L312_bodyB:
#### Unroll time 1 ####
LD_DY 0*SIZE(ptrba), yvec0;
@ -2673,7 +2674,7 @@ MOVQ kkk, %rax;
TEST $1, %rax;
#endif
JLE .L313_loopE;
.align 32
ALIGN_5
.L313_bodyB:
#### Unroll time 1 ####
LD_DY 0*SIZE(ptrba), yvec0;
@ -2696,7 +2697,7 @@ MOVQ C0, %rax;
OR ldc, %rax;
TEST $15, %rax;
JNE .L313_loopEx;
.align 32
ALIGN_5
#### Writing Back ####
EXTRA_DY $1, yvec15, xvec13;
EXTRA_DY $1, yvec14, xvec12;
@ -2724,7 +2725,7 @@ ADDQ $8*SIZE, C0;
DECQ i;
JG .L31_bodyB;
JMP .L31_loopE;
.align 32
ALIGN_5
.L313_loopEx:
EXTRA_DY $1, yvec15, xvec13;
EXTRA_DY $1, yvec14, xvec12;
@ -2766,7 +2767,7 @@ JG .L31_bodyB;
.L31_loopE:
TEST $4, bm
JLE .L32_loopE;
.align 32
ALIGN_5
.L32_bodyB:
#if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA))
MOVQ bb, ptrbb;
@ -2796,7 +2797,7 @@ MOVQ %rax, kkk
#endif
SARQ $2, k;
JLE .L321_loopE;
.align 32
ALIGN_5
.L321_bodyB:
LD_DY 0*SIZE(ptrba), yvec0;
BROAD_DY 0*SIZE(ptrbb), yvec1;
@ -2821,7 +2822,7 @@ ADDQ $16*SIZE, ptrba;
ADDQ $4*SIZE, ptrbb;
DECQ k;
JG .L321_bodyB;
.align 32
ALIGN_5
.L321_loopE:
#ifndef TRMMKERNEL
TEST $2, bk;
@ -2830,7 +2831,7 @@ MOVQ kkk, %rax;
TEST $2, %rax;
#endif
JLE .L322_loopE;
.align 32
ALIGN_5
.L322_bodyB:
LD_DY 0*SIZE(ptrba), yvec0;
BROAD_DY 0*SIZE(ptrbb), yvec1;
@ -2852,7 +2853,7 @@ MOVQ kkk, %rax;
TEST $1, %rax;
#endif
JLE .L323_loopE;
.align 32
ALIGN_5
.L323_bodyB:
LD_DY 0*SIZE(ptrba), yvec0;
BROAD_DY 0*SIZE(ptrbb), yvec1;
@ -2870,7 +2871,7 @@ MOVQ C0, %rax;
OR ldc, %rax;
TEST $15, %rax;
JNE .L323_loopEx;
.align 32
ALIGN_5
#### Writing Back ####
EXTRA_DY $1, yvec15, xvec14;
#ifndef TRMMKERNEL
@ -2891,7 +2892,7 @@ ADDQ $4, kk
#endif
ADDQ $4*SIZE, C0;
JMP .L32_loopE;
.align 32
ALIGN_5
.L323_loopEx:
#### Writing Back ####
EXTRA_DY $1, yvec15, xvec14;
@ -2921,7 +2922,7 @@ ADDQ $4*SIZE, C0;
.L32_loopE:
TEST $2, bm
JLE .L33_loopE;
.align 32
ALIGN_5
.L33_bodyB:
#if !defined(TRMMKERNEL) || (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
MOVQ bb, ptrbb;
@ -2951,7 +2952,7 @@ MOVQ %rax, kkk;
#endif
SARQ $2, k;
JLE .L331_loopE;
.align 32
ALIGN_5
.L331_bodyB:
LD_DX 0*SIZE(ptrba), xvec0;
BROAD_DX 0*SIZE(ptrbb), xvec2;
@ -2976,7 +2977,7 @@ ADDQ $8*SIZE, ptrba;
ADDQ $4*SIZE, ptrbb;
DECQ k;
JG .L331_bodyB;
.align 32
ALIGN_5
.L331_loopE:
#ifndef TRMMKERNEL
TEST $2,bk;
@ -2985,7 +2986,7 @@ MOVQ kkk, %rax;
TEST $2, %rax
#endif
JLE .L332_loopE;
.align 32
ALIGN_5
.L332_bodyB:
LD_DX 0*SIZE(ptrba), xvec0;
BROAD_DX 0*SIZE(ptrbb), xvec2;
@ -3006,7 +3007,7 @@ MOVQ kkk, %rax;
TEST $1, %rax;
#endif
JLE .L333_loopE;
.align 32
ALIGN_5
.L333_bodyB:
LD_DX 0*SIZE(ptrba), xvec0;
BROAD_DX 0*SIZE(ptrbb), xvec2;
@ -3039,7 +3040,7 @@ ADDQ $2*SIZE, C0;
.L33_loopE:
TEST $1, bm
JLE .L34_loopE;
.align 32
ALIGN_5
.L34_bodyB:
#if !defined(TRMMKERNEL) || (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
MOVQ bb, ptrbb;
@ -3068,7 +3069,7 @@ MOVQ %rax, kkk;
#endif
SARQ $2, k;
JLE .L341_loopE;
.align 32
ALIGN_5
.L341_bodyB:
movsd 0*SIZE(ptrba), xvec0;
movsd 0*SIZE(ptrbb), xvec1;
@ -3093,7 +3094,7 @@ addq $4*SIZE, ptrba;
addq $4*SIZE, ptrbb;
decq k;
JG .L341_bodyB;
.align 32
ALIGN_5
.L341_loopE:
#ifndef TRMMKERNEL
TEST $2, bk;
@ -3102,7 +3103,7 @@ MOVQ kkk, %rax;
TEST $2, %rax;
#endif
JLE .L342_loopE;
.align 32
ALIGN_5
.L342_bodyB:
movsd 0*SIZE(ptrba), xvec0;
movsd 0*SIZE(ptrbb), xvec1;
@ -3124,7 +3125,7 @@ MOVQ kkk, %rax;
TEST $1, %rax;
#endif
JLE .L343_loopE;
.align 32
ALIGN_5
.L343_bodyB:
movsd 0*SIZE(ptrba), xvec0;
movsd 0*SIZE(ptrbb), xvec1;

View File

@ -142,6 +142,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define JMP jmp
#define NOP
#define XOR xorpd
#undef MOVQ
#define MOVQ movq
#define XOR_SY vxorps
@ -273,7 +274,7 @@ movq %r11, kk
MOVQ bn,j;
SARQ $3,j;
JLE .L0_loopE;
.align 16;
ALIGN_4;
.L0_bodyB:;
#if defined(TRMMKERNEL) && defined(LEFT)
MOVQ OFFSET, %rax;
@ -289,7 +290,7 @@ MOVQ ba,ptrba;
MOVQ bm,i;
SARQ $3,i;
JLE .L1_loopE;
.align 16;
ALIGN_4;
.L1_bodyB:;
#if !defined(TRMMKERNEL)||(defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA))
MOVQ bb, ptrbb;
@ -342,7 +343,7 @@ MOVQ %rax, kkk;
#endif
SARQ $2,k;
JLE .L2_loopE;
.align 16;
ALIGN_4;
.L2_bodyB:;
# Computing kernel
@ -472,7 +473,7 @@ ADD_SY yvec8, yvec7, yvec8;
.L2_bodyE:;
DECQ k;
JG .L2_bodyB;
.align 64;
ALIGN_4
.L2_loopE:;
#ifndef TRMMKERNEL
TEST $2, bk;
@ -480,7 +481,7 @@ TEST $2, bk;
TEST $2, kkk;
#endif
JLE .L3_loopE;
.align 64
ALIGN_4
.L3_loobB:
#### Unroll times 1 ####
MUL_SY yvec0, yvec2, yvec6;
@ -550,7 +551,7 @@ TEST $1, bk;
TEST $1, kkk;
#endif
JLE .L4_loopE;
.align 64
ALIGN_4
.L4_loopB:;
#### Unroll times 1 ####
MUL_SY yvec0, yvec2, yvec6;
@ -609,7 +610,7 @@ MOVQ C0, %rax;
OR ldc, %rax;
TEST $15, %rax;
JNE .L4_loopEx;
.align 16
ALIGN_4
LEAQ (ldc,ldc,2),%rax;
EXTRA_SY $1,yvec15,xvec7;
EXTRA_SY $1,yvec14,xvec6;
@ -669,7 +670,7 @@ ADDQ $8*SIZE,C1;
DECQ i;
JG .L1_bodyB;
JMP .L1_loopE;
.align 16;
ALIGN_4;
.L4_loopEx:
LEAQ (ldc,ldc,2),%rax;
EXTRA_SY $1, yvec15, xvec7;
@ -813,11 +814,11 @@ ADDQ $8*SIZE, C0;
ADDQ $8*SIZE, C1;
DECQ i;
JG .L1_bodyB;
.align 16
ALIGN_4
.L1_loopE:;
TEST $4, bm;
JLE .L5_loopE;
.align 16
ALIGN_4
.L5_bodyB:
#if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA))
MOVQ bb, ptrbb;
@ -857,7 +858,7 @@ MOVQ %rax, kkk;
#endif
SARQ $2, k;
JLE .L8_loopE;
.align 16
ALIGN_4
.L8_bodyB:
#### Unroll time 1 ####
@ -983,7 +984,7 @@ MUL_SX xvec1, xvec5;
ADD_SX xvec5, xvec8;
DECQ k;
JG .L8_bodyB;
.align 16
ALIGN_4
.L8_loopE:
#ifndef TRMMKERNEL
TEST $2, bk;
@ -991,7 +992,7 @@ TEST $2, bk;
TEST $2, kkk;
#endif
JLE .L9_loopE;
.align 16
ALIGN_4
.L9_bodyB:
#### Unroll time 1 ####
SHUF_SX $0x4e, xvec2, xvec4;
@ -1062,7 +1063,7 @@ TEST $1, bk;
TEST $1, kkk;
#endif
JLE .L10_loopE;
.align 16
ALIGN_4
.L10_bodyB:
#### Unroll time 1 ####
SHUF_SX $0x4e, xvec2, xvec4;
@ -1122,7 +1123,7 @@ MOVQ C0, %rax;
OR ldc, %rax;
TEST $15, %rax;
JNE .L10_loopEx;
.align 16
ALIGN_4
LEAQ (ldc,ldc,2),%rax;
#ifndef TRMMKERNEL
ADD_SX 0*SIZE(C0), xvec15;
@ -1155,7 +1156,7 @@ ADDQ $4, kk
ADDQ $4*SIZE, C0;
ADDQ $4*SIZE, C1;
JMP .L5_loopE;
.align 16
ALIGN_4
.L10_loopEx:
LEAQ (ldc,ldc,2),%rax;
#ifndef TRMMKERNEL
@ -1215,7 +1216,7 @@ ADDQ $4*SIZE, C1;
.L5_loopE:
TEST $2, bm;
JLE .L6_loopE;
.align 16
ALIGN_4
.L6_bodyB:
#if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA))
MOVQ bb, ptrbb;
@ -1249,7 +1250,7 @@ MOVQ %rax, kkk;
#endif
SARQ $2, k;
JLE .L11_loopE;
.align 16
ALIGN_4
.L11_bodyB:
#### Computing kernel
LD_SX 0*SIZE(ptrba), xvec0; # a1, a2, a3, a4
@ -1318,7 +1319,7 @@ ADDQ $8*SIZE, ptrba;
ADDQ $32*SIZE, ptrbb;
DECQ k;
JG .L11_bodyB;
.align 16
ALIGN_4
.L11_loopE:
#ifndef TRMMKERNEL
TEST $2, bk;
@ -1326,7 +1327,7 @@ TEST $2, bk;
TEST $2, kkk;
#endif
JLE .L12_loopE;
.align 16
ALIGN_4
.L12_bodyB:
LD_SX 0*SIZE(ptrba), xvec0; # a1, a2, a3, a4
SHUF_SX $0x44, xvec0, xvec1; # a1, a2, a1, a2
@ -1368,7 +1369,7 @@ TEST $1, bk;
TEST $1, kkk;
#endif
JLE .L13_loopE;
.align 16
ALIGN_4
.L13_bodyB:
LD_SX 0*SIZE(ptrba), xvec0; # a1, a2, a3, a4
SHUF_SX $0x44, xvec0, xvec1; # a1, a2, a1, a2
@ -1433,7 +1434,7 @@ ADDQ $2*SIZE, C1;
.L6_loopE:
TEST $1, bm;
JLE .L7_loopE;
.align 16
ALIGN_4
.L7_bodyB:
#if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA))
MOVQ bb, ptrbb;
@ -1465,7 +1466,7 @@ MOVQ %rax, kkk;
#endif
SARQ $2, k;
JLE .L14_loopE;
.align 16
ALIGN_4
.L14_bodyB:
BROAD_SX 0*SIZE(ptrba), xvec0;
LD_SX 0*SIZE(ptrbb), xvec2;
@ -1503,7 +1504,7 @@ ADDQ $4*SIZE, ptrba;
ADDQ $32*SIZE, ptrbb;
DECQ k;
JG .L14_bodyB;
.align 16
ALIGN_4
.L14_loopE:
#ifndef TRMMKERNEL
TEST $2, bk;
@ -1511,7 +1512,7 @@ TEST $2, bk;
TEST $2, kkk;
#endif
JLE .L15_loopE;
.align 16
ALIGN_4
.L15_bodyB:
BROAD_SX 0*SIZE(ptrba), xvec0;
LD_SX 0*SIZE(ptrbb), xvec2;
@ -1538,7 +1539,7 @@ TEST $1, bk;
TEST $1, kkk;
#endif
JLE .L16_loopE;
.align 16
ALIGN_4
.L16_bodyB:
BROAD_SX 0*SIZE(ptrba), xvec0;
LD_SX 0*SIZE(ptrbb), xvec2;
@ -1611,11 +1612,11 @@ LEAQ (C,ldc,8),C;
.L0_bodyE:;
DECQ j;
JG .L0_bodyB;
.align 16;
ALIGN_4;
.L0_loopE:;
TEST $4, bn; # Rn = 4
JLE .L20_loopE;
.align 16;
ALIGN_4;
.L20_bodyB:
#if defined(TRMMKERNEL) && defined(LEFT)
MOVQ OFFSET, %rax;
@ -1628,7 +1629,7 @@ MOVQ ba, ptrba;
MOVQ bm, i;
SARQ $3, i;
JLE .L21_loopE;
.align 16
ALIGN_4
.L21_bodyB:
#if !defined(TRMMKERNEL)||(defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA))
MOVQ bb, ptrbb;
@ -1668,7 +1669,7 @@ MOVQ %rax, kkk;
#endif
SARQ $2,k;
JLE .L211_loopE;
.align 16
ALIGN_4
.L211_bodyB:
#### Unroll time 1 ####
ODUP_SX 0*SIZE(ptrbb), xvec3;
@ -1800,7 +1801,7 @@ ADD_SX xvec7, xvec8;
LD_SX 4*SIZE(ptrba), xvec1;
DECQ k;
JG .L211_bodyB;
.align 16
ALIGN_4
.L211_loopE:
#ifndef TRMMKERNEL
TEST $2, bk
@ -1808,7 +1809,7 @@ TEST $2, bk
TEST $2, kkk;
#endif
JLE .L212_loopE;
.align 16
ALIGN_4
.L212_bodyB:
#### Unroll time 1 ####
ODUP_SX 0*SIZE(ptrbb), xvec3;
@ -1882,7 +1883,7 @@ TEST $1, bk;
TEST $1, kkk;
#endif
JLE .L213_loopE;
.align 16
ALIGN_4
.L213_bodyB:
ODUP_SX 0*SIZE(ptrbb), xvec3;
SHUF_SX $0x4e, xvec2, xvec4;
@ -1982,11 +1983,11 @@ ADDQ $8*SIZE, C0;
ADDQ $8*SIZE, C1;
DECQ i;
JG .L21_bodyB;
.align 16
ALIGN_4
.L21_loopE:
TEST $4, bm;
JLE .L22_loopE;
.align 16
ALIGN_4
.L22_bodyB:
#if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA))
MOVQ bb, ptrbb;
@ -2019,7 +2020,7 @@ MOVQ %rax, kkk;
#endif
SARQ $2, k;
JLE .L221_loopE;
.align 16
ALIGN_4
.L221_bodyB:
LD_SX 0*SIZE(ptrba), xvec0;
EDUP_SX 0*SIZE(ptrbb), xvec2;
@ -2089,7 +2090,7 @@ ADDQ $16*SIZE, ptrbb;
DECQ k;
JG .L221_bodyB;
.align 16
ALIGN_4
.L221_loopE:
#ifndef TRMMKERNEL
TEST $2, bk;
@ -2097,7 +2098,7 @@ TEST $2, bk;
TEST $2, kkk;
#endif
JLE .L222_loopE;
.align 16
ALIGN_4
.L222_bodyB:
LD_SX 0*SIZE(ptrba), xvec0;
EDUP_SX 0*SIZE(ptrbb), xvec2;
@ -2139,7 +2140,7 @@ TEST $1, bk;
TEST $1, kkk;
#endif
JLE .L223_loopE;
.align 16
ALIGN_4
.L223_bodyB:
LD_SX 0*SIZE(ptrba), xvec0;
EDUP_SX 0*SIZE(ptrbb), xvec2;
@ -2203,7 +2204,7 @@ ADDQ $4*SIZE, C1;
.L22_loopE:
TEST $2, bm;
JLE .L23_loopE;
.align 16
ALIGN_4
.L23_bodyB:
#if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA))
MOVQ bb, ptrbb;
@ -2234,7 +2235,7 @@ MOVQ %rax, kkk;
#endif
SARQ $2, k;
JLE .L231_loopE;
.align 16
ALIGN_4
.L231_bodyB:
LD_SX 0*SIZE(ptrba), xvec0;
EDUP_SX 0*SIZE(ptrbb), xvec4;
@ -2274,7 +2275,7 @@ ADDQ $8*SIZE, ptrba;
ADDQ $16*SIZE, ptrbb;
DECQ k;
JG .L231_bodyB;
.align 16
ALIGN_4
.L231_loopE:
#ifndef TRMMKERNEL
TEST $2, bk;
@ -2282,7 +2283,7 @@ TEST $2, bk;
TEST $2, kkk;
#endif
JLE .L232_loopE;
.align 16
ALIGN_4
.L232_bodyB:
LD_SX 0*SIZE(ptrba), xvec0;
EDUP_SX 0*SIZE(ptrbb), xvec4;
@ -2310,7 +2311,7 @@ TEST $1, bk;
TEST $1, kkk;
#endif
JLE .L233_loopE;
.align 16
ALIGN_4
.L233_bodyB:
LD_SX 0*SIZE(ptrba), xvec0;
EDUP_SX 0*SIZE(ptrbb), xvec4;
@ -2356,7 +2357,7 @@ ADDQ $2*SIZE, C1;
.L23_loopE:
TEST $1, bm;
JLE .L24_loopE;
.align 16
ALIGN_4
.L24_bodyB:
#if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA))
MOVQ bb, ptrbb;
@ -2386,7 +2387,7 @@ MOVQ %rax, kkk;
#endif
SARQ $2, k;
JLE .L241_loopE;
.align 16
ALIGN_4
.L241_bodyB:
BROAD_SX 0*SIZE(ptrba), xvec0;
LD_SX 0*SIZE(ptrbb), xvec1;
@ -2419,7 +2420,7 @@ TEST $2, bk;
TEST $2, kkk;
#endif
JLE .L242_loopE;
.align 16
ALIGN_4
.L242_bodyB:
BROAD_SX 0*SIZE(ptrba), xvec0;
LD_SX 0*SIZE(ptrbb), xvec1;
@ -2440,7 +2441,7 @@ TEST $1, bk;
TEST $1, kkk;
#endif
JLE .L243_loopE;
.align 16;
ALIGN_4;
.L243_bodyB:
BROAD_SX 0*SIZE(ptrba), xvec0;
LD_SX 0*SIZE(ptrbb), xvec1;
@ -2491,7 +2492,7 @@ LEAQ (C, ldc, 4), C;
.L20_loopE:
TEST $2, bn;
JLE .L30_loopE;
.align 16
ALIGN_4
.L30_bodyB:
#if defined(TRMMKERNEL) && defined(LEFT)
MOVQ OFFSET, %rax;
@ -2503,7 +2504,7 @@ MOVQ ba, ptrba;
MOVQ bm, i;
SARQ $3, i;
JLE .L31_loopE;
.align 16
ALIGN_4
.L31_bodyB:
#if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA))
MOVQ bb, ptrbb;
@ -2536,7 +2537,7 @@ MOVQ %rax, kkk;
#endif
SARQ $2, k;
JLE .L311_loopE;
.align 16
ALIGN_4
.L311_bodyB:
LD_SX 0*SIZE(ptrbb), xvec2;
SHUF_SX $0x50, xvec2, xvec3;
@ -2612,7 +2613,7 @@ ADDQ $32*SIZE, ptrba;
ADDQ $8*SIZE, ptrbb;
DECQ k;
JG .L311_bodyB;
.align 16
ALIGN_4
.L311_loopE:
#ifndef TRMMKERNEL
TEST $2, bk;
@ -2620,7 +2621,7 @@ TEST $2, bk;
TEST $2, kkk;
#endif
JLE .L312_loopE;
.align 16
ALIGN_4
.L312_bodyB:
LD_SX 0*SIZE(ptrbb), xvec2;
SHUF_SX $0x50, xvec2, xvec3;
@ -2666,7 +2667,7 @@ TEST $1, bk;
TEST $1, kkk;
#endif
JLE .L313_loopE;
.align 16
ALIGN_4
.L313_bodyB:
LD_SX 0*SIZE(ptrbb), xvec2;
SHUF_SX $0x50, xvec2, xvec3;
@ -2731,11 +2732,11 @@ ADDQ $8*SIZE, C0;
ADDQ $8*SIZE, C1;
DECQ i;
JG .L31_bodyB;
.align 16
ALIGN_4
.L31_loopE:
TEST $4, bm;
JLE .L32_loopE;
.align 16
ALIGN_4
.L32_bodyB:
#if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA))
MOVQ bb, ptrbb;
@ -2766,7 +2767,7 @@ MOVQ %rax, kkk;
#endif
SARQ $2, k;
JLE .L321_loopE;
.align 16
ALIGN_4
.L321_bodyB:
LD_SX 0*SIZE(ptrba), xvec0;
LD_SX 0*SIZE(ptrbb), xvec2;
@ -2806,7 +2807,7 @@ ADDQ $16*SIZE, ptrba;
ADDQ $8*SIZE, ptrbb;
DECQ k;
JG .L321_bodyB;
.align 16
ALIGN_4
.L321_loopE:
#ifndef TRMMKERNEL
TEST $2, bk;
@ -2814,7 +2815,7 @@ TEST $2, bk;
TEST $2, kkk;
#endif
JLE .L322_loopE;
.align 16
ALIGN_4
.L322_bodyB:
LD_SX 0*SIZE(ptrba), xvec0;
LD_SX 0*SIZE(ptrbb), xvec2;
@ -2842,7 +2843,7 @@ TEST $1, bk;
TEST $1, kkk;
#endif
JLE .L323_loopE;
.align 16
ALIGN_4
.L323_bodyB:
LD_SX 0*SIZE(ptrba), xvec0;
LD_SX 0*SIZE(ptrbb), xvec2;
@ -2887,7 +2888,7 @@ ADDQ $4*SIZE, C1;
.L32_loopE:
TEST $2, bm;
JLE .L33_loopE;
.align 16
ALIGN_4
.L33_bodyB:
#if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA))
MOVQ bb, ptrbb;
@ -2920,7 +2921,7 @@ MOVQ %rax, kkk;
#endif
SARQ $2, k;
JLE .L331_loopE;
.align 16
ALIGN_4
.L331_bodyB:
LD_SX 0*SIZE(ptrba), xvec0; # a0, a1, a2, a3
EDUP_SX 0*SIZE(ptrbb), xvec2; # b0, b0, b2, b2
@ -2943,7 +2944,7 @@ ADDQ $8*SIZE, ptrba;
ADDQ $8*SIZE, ptrbb;
DECQ k;
JG .L331_bodyB;
.align 16
ALIGN_4
.L331_loopE:
#ifndef TRMMKERNEL
TEST $2, bk;
@ -2951,7 +2952,7 @@ TEST $2, bk;
TEST $2, kkk;
#endif
JLE .L332_loopE;
.align 16
ALIGN_4
.L332_bodyB:
LD_SX 0*SIZE(ptrba), xvec0; # a0, a1, a2, a3
EDUP_SX 0*SIZE(ptrbb), xvec2; # b0, b0, b2, b2
@ -2972,7 +2973,7 @@ TEST $1, bk;
TEST $1, kkk;
#endif
JLE .L333_loopE;
.align 16
ALIGN_4
.L333_bodyB:
movss 0*SIZE(ptrba), xvec0;
movss 1*SIZE(ptrba), xvec1;
@ -3031,7 +3032,7 @@ ADDQ $2*SIZE, C1;
.L33_loopE:
TEST $1, bm;
JLE .L34_loopE;
.align 16
ALIGN_4
.L34_bodyB:
#if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA))
MOVQ bb, ptrbb;
@ -3062,7 +3063,7 @@ MOVQ %rax, kkk;
#endif
SARQ $2, k;
JLE .L341_loopE;
.align 16
ALIGN_4
.L341_bodyB:
movss 0*SIZE(ptrba), xvec0;
movss 0*SIZE(ptrbb), xvec1;
@ -3104,7 +3105,7 @@ addq $4*SIZE, ptrba;
addq $8*SIZE, ptrbb;
decq k;
jg .L341_bodyB;
.align 16
ALIGN_4
.L341_loopE:
#ifndef TRMMKERNEL
TEST $2, bk;
@ -3112,7 +3113,7 @@ TEST $2, bk;
TEST $2, kkk;
#endif
JLE .L342_loopE;
.align 16
ALIGN_4
.L342_bodyB:
movss 0*SIZE(ptrba), xvec0;
movss 0*SIZE(ptrbb), xvec1;
@ -3140,7 +3141,7 @@ TEST $1, bk;
TEST $1, kkk;
#endif
JLE .L343_loopE;
.align 16
ALIGN_4
.L343_bodyB:
movss 0*SIZE(ptrba), xvec0;
movss 0*SIZE(ptrbb), xvec1;
@ -3189,7 +3190,7 @@ LEAQ (C, ldc, 2), C;
.L30_loopE:
TEST $1, bn;
JLE .L40_loopE;
.align 16
ALIGN_4
.L40_bodyB:
#if defined(TRMMKERNEL)&&defined(LEFT)
MOVQ OFFSET, %rax;
@ -3200,7 +3201,7 @@ MOVQ ba, ptrba;
MOVQ bm, i;
SARQ $3, i;
JLE .L41_loopE;
.align 16
ALIGN_4
.L41_bodyB:
#if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA))
MOVQ bb, ptrbb;
@ -3230,7 +3231,7 @@ MOVQ %rax, kkk;
#endif
SARQ $2, k;
JLE .L411_loopE;
.align 16
ALIGN_4
.L411_bodyB:
LD_SY 0*SIZE(ptrba), yvec0;
BROAD_SY 0*SIZE(ptrbb), yvec1;
@ -3256,7 +3257,7 @@ ADDQ $32*SIZE, ptrba;
ADDQ $4*SIZE, ptrbb;
DECQ k;
JG .L411_bodyB;
.align 16
ALIGN_4
.L411_loopE:
#ifndef TRMMKERNEL
TEST $2, bk;
@ -3264,7 +3265,7 @@ TEST $2, bk;
TEST $2, kkk;
#endif
JLE .L412_loopE;
.align 16
ALIGN_4
.L412_bodyB:
LD_SY 0*SIZE(ptrba), yvec0;
BROAD_SY 0*SIZE(ptrbb), yvec1;
@ -3285,7 +3286,7 @@ TEST $1, bk;
TEST $1, kkk;
#endif
JLE .L413_loopE;
.align 16
ALIGN_4
.L413_bodyB:
LD_SY 0*SIZE(ptrba), yvec0;
BROAD_SY 0*SIZE(ptrbb), yvec1;
@ -3329,11 +3330,11 @@ ADDQ $8, kk;
ADDQ $8*SIZE, C0;
DECQ i;
JG .L41_bodyB;
.align 16
ALIGN_4
.L41_loopE:
TEST $4, bm;
JLE .L42_loopE;
.align 16
ALIGN_4
.L42_bodyB:
#if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA))
MOVQ bb, ptrbb;
@ -3362,7 +3363,7 @@ MOVQ %rax, kkk
#endif
SARQ $2, k;
JLE .L421_loopE;
.align 16
ALIGN_4
.L421_bodyB:
LD_SX 0*SIZE(ptrba), xvec0;
BROAD_SX 0*SIZE(ptrbb), xvec1;
@ -3387,7 +3388,7 @@ ADDQ $16*SIZE, ptrba;
ADDQ $4*SIZE, ptrbb;
DECQ k;
JG .L421_bodyB;
.align 16
ALIGN_4
.L421_loopE:
#ifndef TRMMKERNEL
TEST $2, bk;
@ -3395,7 +3396,7 @@ TEST $2, bk;
TEST $2, kkk;
#endif
JLE .L422_loopE;
.align 16
ALIGN_4
.L422_bodyB:
LD_SX 0*SIZE(ptrba), xvec0;
BROAD_SX 0*SIZE(ptrbb), xvec1;
@ -3416,7 +3417,7 @@ TEST $1, bk;
TEST $1, kkk;
#endif
JLE .L423_loopE;
.align 16
ALIGN_4
.L423_bodyB:
LD_SX 0*SIZE(ptrba), xvec0;
BROAD_SX 0*SIZE(ptrbb), xvec1;
@ -3451,7 +3452,7 @@ ADDQ $4*SIZE, C0;
.L42_loopE:
TEST $2, bm;
JLE .L43_loopE;
.align 16
ALIGN_4
.L43_bodyB:
#if !defined(TRMMKERNEL) || (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
MOVQ bb, ptrbb;
@ -3481,7 +3482,7 @@ MOVQ %rax, kkk;
#endif
SARQ $2, k;
JLE .L431_loopE;
.align 16
ALIGN_4
.L431_bodyB:
movss 0*SIZE(ptrba), xvec0;
movss 1*SIZE(ptrba), xvec1;
@ -3518,7 +3519,7 @@ addq $8*SIZE, ptrba;
addq $4*SIZE, ptrbb;
decq k;
JG .L431_bodyB;
.align 16
ALIGN_4
.L431_loopE:
#ifndef TRMMKERNEL
TEST $2, bk;
@ -3526,7 +3527,7 @@ TEST $2, bk;
TEST $2, kkk;
#endif
JLE .L432_loopE;
.align 16
ALIGN_4
.L432_bodyB:
movss 0*SIZE(ptrba), xvec0;
movss 1*SIZE(ptrba), xvec1;
@ -3553,7 +3554,7 @@ TEST $1, bk;
TEST $1, kkk;
#endif
JLE .L433_loopE;
.align 16
ALIGN_4
.L433_bodyB:
movss 0*SIZE(ptrba), xvec0;
movss 1*SIZE(ptrba), xvec1;
@ -3592,7 +3593,7 @@ addq $2*SIZE, C0;
.L43_loopE:
TEST $1, bm;
JLE .L44_loopE;
.align 16
ALIGN_4
.L44_bodyB:
#if !defined(TRMMKERNEL) || (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
MOVQ bb, ptrbb;
@ -3621,7 +3622,7 @@ MOVQ %rax, kkk;
#endif
SARQ $2, k;
JLE .L441_loopE;
.align 16
ALIGN_4
.L441_bodyB:
movss 0*SIZE(ptrba), xvec0;
movss 0*SIZE(ptrbb), xvec1;
@ -3646,7 +3647,7 @@ addq $4*SIZE, ptrba;
addq $4*SIZE, ptrbb;
decq k;
JG .L441_bodyB;
.align 16
ALIGN_4
.L441_loopE:
#ifndef TRMMKERNEL
TEST $2, bk;
@ -3654,7 +3655,7 @@ TEST $2, bk;
TEST $2, kkk;
#endif
JLE .L442_loopE;
.align 16
ALIGN_4
.L442_bodyB:
movss 0*SIZE(ptrba), xvec0;
movss 0*SIZE(ptrbb), xvec1;
@ -3675,7 +3676,7 @@ TEST $1, bk;
TEST $1, kkk;
#endif
JLE .L443_loopE;
.align 16
ALIGN_4
.L443_bodyB:
movss 0*SIZE(ptrba), xvec0;
movss 0*SIZE(ptrbb), xvec1;

View File

@ -145,6 +145,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define JMP jmp
#define NOP
#define XOR xorpd
#undef MOVQ
#define MOVQ movq
#define XOR_SY vxorps
#define XOR_DY vxorpd
@ -297,7 +299,7 @@ movq %r11, kk;
MOVQ bn,j;
SARQ $2,j; # Rn = 4
JLE .L0_loopE;
.align 32;
ALIGN_5;
.L0_bodyB:;
#if defined(TRMMKERNEL) && defined(LEFT)
MOVQ OFFSET, %rax;
@ -312,7 +314,7 @@ MOVQ ba,ptrba;
MOVQ bm,i;
SARQ $2,i; # Rm = 4
JLE .L1_loopE;
.align 32;
ALIGN_5;
.L1_bodyB:;
#if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA))
MOVQ bb,ptrbb;
@ -361,7 +363,7 @@ MOVQ %rax, kkk;
#endif
SARQ $2,k; # Unroll 4 times
JLE .L2_loopE;
.align 32;
ALIGN_5;
.L2_bodyB:;
#### Computing kernel ####
@ -584,7 +586,7 @@ ADD2_DY yvec6, yvec12, yvec12;
ADD2_DY yvec7, yvec8, yvec8;
DECQ k;
JG .L2_bodyB;
.align 64;
ALIGN_5
.L2_loopE:;
#ifndef TRMMKERNEL
TEST $2, bk;
@ -592,7 +594,7 @@ TEST $2, bk;
TEST $2, kkk;
#endif
JLE .L3_loopE;
.align 64
ALIGN_5
.L3_bodyB:
#### Unroll time 1 ####
LD_DY 4*SIZE(ptrba), yvec1;
@ -710,7 +712,7 @@ TEST $1, bk;
TEST $1, kkk;
#endif
JLE .L4_loopE;
.align 64
ALIGN_5
.L4_loopB:;
#### Unroll time 1 ####
PREFETCH0 PRESIZE*SIZE(ptrba);
@ -852,7 +854,7 @@ MOVQ C0, %rax;
OR ldc, %rax;
TEST $15, %rax;
JNE .L4_loopEx;
.align 32
ALIGN_5
#### Store Back ####
EXTRA_DY $1,yvec15,xvec7;
EXTRA_DY $1,yvec14,xvec6;
@ -912,7 +914,7 @@ ADDQ $8*SIZE,C1;
DECQ i;
JG .L1_bodyB;
JMP .L1_loopE;
.align 32
ALIGN_5
.L4_loopEx:
EXTRA_DY $1, yvec15, xvec7;
EXTRA_DY $1, yvec14, xvec6;
@ -1024,11 +1026,11 @@ ADDQ $8*SIZE, C0;
ADDQ $8*SIZE, C1;
DECQ i;
JG .L1_bodyB;
.align 32;
ALIGN_5;
.L1_loopE:;
TEST $2, bm;
JLE .L5_loopE;
.align 32
ALIGN_5
.L5_bodyB:
#if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA))
MOVQ bb,ptrbb;
@ -1060,7 +1062,7 @@ MOVQ %rax, kkk;
#endif
SARQ $2, k;
JLE .L7_loopE;
.align 32
ALIGN_5
.L7_bodyB:
#### Compute kernel ####
#### Unroll times 1 ####
@ -1194,7 +1196,7 @@ ADD2_DY yvec7, yvec12, yvec12;
ADDQ $32*SIZE, ptrbb;
DECQ k;
JG .L7_bodyB;
.align 32
ALIGN_5
.L7_loopE:
#ifndef TRMMKERNEL
TEST $2, bk;
@ -1202,7 +1204,7 @@ TEST $2, bk;
TEST $2, kkk;
#endif
JLE .L8_loopE;
.align 32
ALIGN_5
.L8_bodyB:
#### Unroll times 1 ####
LD_DY 0*SIZE(ptrba), yvec0;
@ -1276,7 +1278,7 @@ TEST $1, bk;
TEST $1, kkk;
#endif
JLE .L9_loopE;
.align 32
ALIGN_5
.L9_bodyB:
#### Unroll times 1 ####
LD_DY 0*SIZE(ptrba), yvec0;
@ -1364,7 +1366,7 @@ MOVQ C0, %rax;
OR ldc, %rax;
TEST $15, %rax;
JNE .L9_loopEx;
.align 32
ALIGN_5
#### Writing back ####
EXTRA_DY $1, yvec15, xvec7;
EXTRA_DY $1, yvec14, xvec6;
@ -1401,7 +1403,7 @@ ADDQ $2, kk;
ADDQ $4*SIZE, C0;
ADDQ $4*SIZE, C1;
JMP .L5_loopE;
.align 32
ALIGN_5
.L9_loopEx:
EXTRA_DY $1, yvec15, xvec7;
EXTRA_DY $1, yvec14, xvec6;
@ -1466,7 +1468,7 @@ ADDQ $4*SIZE, C1;
.L5_loopE:
TEST $1, bm;
JLE .L6_loopE;
.align 32
ALIGN_5
.L6_bodyB:
#if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA))
MOVQ bb,ptrbb;
@ -1496,7 +1498,7 @@ MOVQ %rax, kkk;
#endif
SARQ $2, k;
JLE .L10_loopE;
.align 32
ALIGN_5
.L10_bodyB:
LD_DY 0*SIZE(ptrba), yvec0; #### A1r A1i A2r A2i
EDUP_DY 0*SIZE(ptrbb), yvec2;
@ -1570,7 +1572,7 @@ ADDQ $8*SIZE, ptrba;
ADDQ $32*SIZE, ptrbb;
DECQ k;
JG .L10_bodyB;
.align 32
ALIGN_5
.L10_loopE:
#ifndef TRMMKERNEL
TEST $2, bk;
@ -1578,7 +1580,7 @@ TEST $2, bk;
TEST $2, kkk;
#endif
JLE .L11_loopE;
.align 32
ALIGN_5
.L11_bodyB:
LD_DY 0*SIZE(ptrba), yvec0; #### A1r A1i A2r A2i
EDUP_DY 0*SIZE(ptrbb), yvec2;
@ -1624,7 +1626,7 @@ TEST $1, bk;
TEST $1, kkk;
#endif
JLE .L12_loopE;
.align 32
ALIGN_5
.L12_bodyB:
LD_DY 0*SIZE(ptrba), yvec0; #### A1r A1i A2r A2i
EDUP_DY 0*SIZE(ptrbb), yvec2;
@ -1722,11 +1724,11 @@ LEAQ (C,ldc,4),C;
.L0_bodyE:;
DECQ j;
JG .L0_bodyB;
.align 32;
ALIGN_5;
.L0_loopE:;
TEST $2, bn;
JLE .L20_loopE;
.align 32
ALIGN_5
.L20_bodyB:
#if defined(TRMMKERNEL) && defined(LEFT)
MOVQ OFFSET, %rax;
@ -1738,7 +1740,7 @@ MOVQ ba, ptrba;
MOVQ bm, i;
SARQ $2, i;
JLE .L21_loopE;
.align 32
ALIGN_5
.L21_bodyB:
#if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA))
MOVQ bb,ptrbb;
@ -1770,7 +1772,7 @@ MOVQ %rax, kkk;
#endif
SARQ $2, k;
JLE .L211_loopE;
.align 32
ALIGN_5
.L211_bodyB:
#### Unroll time 1 ####
EDUP_DY 0*SIZE(ptrbb), yvec2;
@ -1891,7 +1893,7 @@ ADD2_DY yvec7, yvec12, yvec12;
ADDQ $32*SIZE, ptrba;
DECQ k;
JG .L211_bodyB;
.align 32
ALIGN_5
.L211_loopE:
#ifndef TRMMKERNEL
TEST $2, bk;
@ -1899,7 +1901,7 @@ TEST $2, bk;
TEST $2, kkk;
#endif
JLE .L212_loopE;
.align 32
ALIGN_5
.L212_bodyB:
#### Unroll time 1 ####
EDUP_DY 0*SIZE(ptrbb), yvec2;
@ -1969,7 +1971,7 @@ TEST $1, bk;
TEST $1, kkk;
#endif
JLE .L213_loopE;
.align 32
ALIGN_5
.L213_bodyB:
#### Unroll time 1 ####
EDUP_DY 0*SIZE(ptrbb), yvec2;
@ -2058,7 +2060,7 @@ MOVQ C0, %rax;
OR ldc, %rax;
TEST $15, %rax;
JNE .L213_loopEx;
.align 32
ALIGN_5
#### Writing back ####
#ifndef TRMMKERNEL
ADD_DX 0*SIZE(C0),xvec15;
@ -2093,7 +2095,7 @@ ADDQ $8*SIZE, C1;
DECQ i;
JG .L21_bodyB;
JMP .L21_loopE;
.align 32
ALIGN_5
.L213_loopEx:
#ifndef TRMMKERNEL
LDL_DX 0*SIZE(C0), xvec0;
@ -2153,11 +2155,11 @@ ADDQ $8*SIZE, C0;
ADDQ $8*SIZE, C1;
DECQ i;
JG .L21_bodyB;
.align 32
ALIGN_5
.L21_loopE:
TEST $2, bm;
JLE .L22_loopE;
.align 32
ALIGN_5
.L22_bodyB:
#if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA))
MOVQ bb,ptrbb;
@ -2187,7 +2189,7 @@ MOVQ %rax, kkk;
#endif
SARQ $2, k;
JLE .L221_loopE;
.align 32
ALIGN_5
.L221_bodyB:
#### Unroll time 1 ####
EDUP_DY 0*SIZE(ptrbb), yvec2;
@ -2268,7 +2270,7 @@ ADD2_DY yvec6, yvec13, yvec13;
ADDQ $16*SIZE, ptrba;
DECQ k;
JG .L221_bodyB;
.align 32
ALIGN_5
.L221_loopE:
#ifndef TRMMKERNEL
TEST $2, bk;
@ -2276,7 +2278,7 @@ TEST $2, bk;
TEST $2, kkk;
#endif
JLE .L222_loopE;
.align 32
ALIGN_5
.L222_bodyB:
#### Unroll time 1 ####
EDUP_DY 0*SIZE(ptrbb), yvec2;
@ -2325,7 +2327,7 @@ TEST $1, bk;
TEST $1, kkk;
#endif
JLE .L223_loopE;
.align 32
ALIGN_5
.L223_bodyB:
#### Unroll time 1 ####
EDUP_DY 0*SIZE(ptrbb), yvec2;
@ -2419,7 +2421,7 @@ ADDQ $4*SIZE, C1;
.L22_loopE:
TEST $1, bm;
JLE .L23_loopE;
.align 32
ALIGN_5
.L23_bodyB:
#if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA))
MOVQ bb,ptrbb;
@ -2448,7 +2450,7 @@ MOVQ %rax, kkk;
#endif
SARQ $2, k;
JLE .L231_loopE;
.align 32
ALIGN_5
.L231_bodyB:
LD_DY 0*SIZE(ptrba), yvec0; #### A1r A1i A2r A2i
EDUP_DY 0*SIZE(ptrbb), yvec2;
@ -2498,7 +2500,7 @@ ADDQ $8*SIZE, ptrba;
ADDQ $16*SIZE, ptrbb;
DECQ k;
JG .L231_bodyB;
.align 32
ALIGN_5
.L231_loopE:
#ifndef TRMMKERNEL
TEST $2, bk;
@ -2506,7 +2508,7 @@ TEST $2, bk;
TEST $2, kkk;
#endif
JLE .L232_loopE;
.align 32
ALIGN_5
.L232_bodyB:
LD_DY 0*SIZE(ptrba), yvec0; #### A1r A1i A2r A2i
EDUP_DY 0*SIZE(ptrbb), yvec2;
@ -2540,7 +2542,7 @@ TEST $1, bk;
TEST $1, kkk;
#endif
JLE .L233_loopE;
.align 32
ALIGN_5
.L233_bodyB:
LD_DY 0*SIZE(ptrba), yvec0; #### A1r A1i A2r A2i
EDUP_DY 0*SIZE(ptrbb), yvec2;
@ -2614,7 +2616,7 @@ LEAQ (C, ldc, 2), C;
.L20_loopE:
TEST $1, bn;
JLE .L30_loopE;
.align 32
ALIGN_5
.L30_bodyB:
#if defined(TRMMKERNEL) && defined(LEFT)
MOVQ OFFSET, %rax;
@ -2625,7 +2627,7 @@ MOVQ C, C0;
MOVQ bm, i;
SARQ $2, i;
JLE .L31_loopE;
.align 32
ALIGN_5
.L31_bodyB:
#if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA))
MOVQ bb,ptrbb;
@ -2655,7 +2657,7 @@ MOVQ %rax, kkk;
#endif
SARQ $2, k;
JLE .L311_loopE;
.align 32
ALIGN_5
.L311_bodyB:
LD_DY 0*SIZE(ptrba), yvec0;
BROAD_DY 0*SIZE(ptrbb), yvec2;
@ -2732,7 +2734,7 @@ ADDQ $32*SIZE, ptrba;
ADDQ $8*SIZE, ptrbb;
DECQ k;
JG .L311_bodyB;
.align 32
ALIGN_5
.L311_loopE:
#ifndef TRMMKERNEL
TEST $2, bk;
@ -2740,7 +2742,7 @@ TEST $2, bk;
TEST $2, kkk;
#endif
JLE .L312_loopE;
.align 32
ALIGN_5
.L312_bodyB:
LD_DY 0*SIZE(ptrba), yvec0;
BROAD_DY 0*SIZE(ptrbb), yvec2;
@ -2787,7 +2789,7 @@ TEST $1, bk;
TEST $1, kkk;
#endif
JLE .L313_loopE;
.align 32
ALIGN_5
.L313_bodyB:
LD_DY 0*SIZE(ptrba), yvec0;
BROAD_DY 0*SIZE(ptrbb), yvec2;
@ -2877,11 +2879,11 @@ ADDQ $4, kk;
ADDQ $8*SIZE, C0;
DECQ i;
JG .L31_bodyB;
.align 32
ALIGN_5
.L31_loopE:
TEST $2, bm;
JLE .L32_loopE;
.align 32
ALIGN_5
.L32_bodyB:
#if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA))
MOVQ bb,ptrbb;
@ -2910,7 +2912,7 @@ MOVQ %rax, kkk;
#endif
SARQ $2, k;
JLE .L321_loopE;
.align 32
ALIGN_5
.L321_bodyB:
LD_DY 0*SIZE(ptrba), yvec0;
BROAD_DY 0*SIZE(ptrbb), yvec2;
@ -2951,7 +2953,7 @@ ADDQ $16*SIZE, ptrba;
ADDQ $8*SIZE, ptrbb;
DECQ k;
JG .L321_bodyB;
.align 32
ALIGN_5
.L321_loopE:
#ifndef TRMMKERNEL
TEST $2, bk;
@ -2959,7 +2961,7 @@ TEST $2, bk;
TEST $2, kkk;
#endif
JLE .L322_loopE;
.align 32
ALIGN_5
.L322_bodyB:
LD_DY 0*SIZE(ptrba), yvec0;
BROAD_DY 0*SIZE(ptrbb), yvec2;
@ -2988,7 +2990,7 @@ TEST $1, bk;
TEST $1, kkk;
#endif
JLE .L323_loopE;
.align 32
ALIGN_5
.L323_bodyB:
LD_DY 0*SIZE(ptrba), yvec0;
BROAD_DY 0*SIZE(ptrbb), yvec2;
@ -3049,7 +3051,7 @@ ADDQ $4*SIZE, C0;
.L32_loopE:
TEST $1, bm;
JLE .L33_loopE;
.align 32
ALIGN_5
.L33_bodyB:
#if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA))
MOVQ bb,ptrbb;
@ -3078,7 +3080,7 @@ MOVQ %rax, kkk;
#endif
SARQ $2, k;
JLE .L331_loopE;
.align 32
ALIGN_5
.L331_bodyB:
LD_DX 0*SIZE(ptrba), xvec0;
BROAD_DX 0*SIZE(ptrbb), xvec2;
@ -3123,7 +3125,7 @@ ADDQ $8*SIZE, ptrba;
ADDQ $8*SIZE, ptrbb;
DECQ k;
JG .L331_bodyB;
.align 32
ALIGN_5
.L331_loopE:
#ifndef TRMMKERNEL
TEST $2, bk;
@ -3131,7 +3133,7 @@ TEST $2, bk;
TEST $2, kkk;
#endif
JLE .L332_loopE;
.align 32
ALIGN_5
.L332_bodyB:
LD_DX 0*SIZE(ptrba), xvec0;
BROAD_DX 0*SIZE(ptrbb), xvec2;
@ -3162,7 +3164,7 @@ TEST $1, bk;
TEST $1, kkk;
#endif
JLE .L333_loopE;
.align 32
ALIGN_5
.L333_bodyB:
LD_DX 0*SIZE(ptrba), xvec0;
BROAD_DX 0*SIZE(ptrbb), xvec2;