Fixed the bug in how the daxpy kernel uses the Loongson3A gsLQC1 & gsSQC1 instructions. daxpy now produces correct results.

This commit is contained in:
Xianyi Zhang 2011-03-18 23:05:56 +00:00
parent 2b8643e0de
commit f405b5bcc5
1 changed file with 140 additions and 89 deletions

View File

@ -228,20 +228,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.L11: .L11:
//X & Y aligned //X & Y aligned
gsLQC1(X_BASE,A2,A1,0*SIZE) gsLQC1(X_BASE,A2,A1,0)
gsLQC1(X_BASE,A4,A3,2*SIZE) gsLQC1(X_BASE,A4,A3,1)
gsLQC1(X_BASE,A6,A5,4*SIZE) gsLQC1(X_BASE,A6,A5,2)
gsLQC1(X_BASE,A8,A7,6*SIZE) gsLQC1(X_BASE,A8,A7,3)
gsLQC1(X_BASE,A10,A9,8*SIZE) gsLQC1(X_BASE,A10,A9,4)
gsLQC1(X_BASE,A12,A11,10*SIZE) gsLQC1(X_BASE,A12,A11,5)
gsLQC1(X_BASE,A14,A13,12*SIZE) gsLQC1(X_BASE,A14,A13,6)
gsLQC1(X_BASE,A16,A15,14*SIZE) gsLQC1(X_BASE,A16,A15,7)
gsLQC1(Y_BASE,B2,B1,0*SIZE) gsLQC1(Y_BASE,B2,B1,0)
gsLQC1(Y_BASE,B4,B3,2*SIZE) gsLQC1(Y_BASE,B4,B3,1)
gsLQC1(Y_BASE,B6,B5,4*SIZE) gsLQC1(Y_BASE,B6,B5,2)
gsLQC1(Y_BASE,B8,B7,6*SIZE) gsLQC1(Y_BASE,B8,B7,3)
blez I, .L13 blez I, .L13
NOP NOP
@ -251,65 +251,65 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
MADD t1, b1, ALPHA, a1 MADD t1, b1, ALPHA, a1
MADD t2, b2, ALPHA, a2 MADD t2, b2, ALPHA, a2
gsSQC1(Y_BASE, T2, T1, 0*SIZE) gsSQC1(Y_BASE, T2, T1, 0)
gsLQC1(Y_BASE,B2,B1,8*SIZE) gsLQC1(Y_BASE,B2,B1,4)
MADD t3, b3, ALPHA, a3 MADD t3, b3, ALPHA, a3
MADD t4, b4, ALPHA, a4 MADD t4, b4, ALPHA, a4
gsSQC1(Y_BASE, T4, T3, 2*SIZE) gsSQC1(Y_BASE, T4, T3, 1)
gsLQC1(Y_BASE,B4,B3,10*SIZE) gsLQC1(Y_BASE,B4,B3,5)
PREFETCHD(PREFETCH_DISTANCE*SIZE(Y)) PREFETCHD(PREFETCH_DISTANCE*SIZE(Y))
PREFETCHD((PREFETCH_DISTANCE+4)*SIZE(Y)) PREFETCHD((PREFETCH_DISTANCE+4)*SIZE(Y))
MADD t1, b5, ALPHA, a5 MADD t1, b5, ALPHA, a5
MADD t2, b6, ALPHA, a6 MADD t2, b6, ALPHA, a6
gsSQC1(Y_BASE, T2, T1, 4*SIZE) gsSQC1(Y_BASE, T2, T1, 2)
gsLQC1(Y_BASE,B6,B5,12*SIZE) gsLQC1(Y_BASE,B6,B5,6)
MADD t3, b7, ALPHA, a7 MADD t3, b7, ALPHA, a7
MADD t4, b8, ALPHA, a8 MADD t4, b8, ALPHA, a8
gsSQC1(Y_BASE, T4, T3, 6*SIZE) gsSQC1(Y_BASE, T4, T3, 3)
gsLQC1(Y_BASE,B8,B7,14*SIZE) gsLQC1(Y_BASE,B8,B7, 7)
PREFETCHD((PREFETCH_DISTANCE+8)*SIZE(Y)) PREFETCHD((PREFETCH_DISTANCE+8)*SIZE(Y))
PREFETCHD((PREFETCH_DISTANCE+12)*SIZE(Y)) PREFETCHD((PREFETCH_DISTANCE+12)*SIZE(Y))
MADD t1, b1, ALPHA, a9 MADD t1, b1, ALPHA, a9
MADD t2, b2, ALPHA, a10 MADD t2, b2, ALPHA, a10
gsSQC1(Y_BASE, T2, T1, 8*SIZE) gsSQC1(Y_BASE, T2, T1, 4)
gsLQC1(Y_BASE,B2,B1,16*SIZE) gsLQC1(Y_BASE,B2,B1,8)
MADD t3, b3, ALPHA, a11 MADD t3, b3, ALPHA, a11
MADD t4, b4, ALPHA, a12 MADD t4, b4, ALPHA, a12
gsSQC1(Y_BASE, T4, T3, 10*SIZE) gsSQC1(Y_BASE, T4, T3, 5)
gsLQC1(Y_BASE,B4,B3,18*SIZE) gsLQC1(Y_BASE,B4,B3,9)
PREFETCHD(PREFETCH_DISTANCE*SIZE(X)) PREFETCHD(PREFETCH_DISTANCE*SIZE(X))
PREFETCHD((PREFETCH_DISTANCE+4)*SIZE(X)) PREFETCHD((PREFETCH_DISTANCE+4)*SIZE(X))
MADD t1, b5, ALPHA, a13 MADD t1, b5, ALPHA, a13
MADD t2, b6, ALPHA, a14 MADD t2, b6, ALPHA, a14
gsSQC1(Y_BASE, T2, T1, 12*SIZE) gsSQC1(Y_BASE, T2, T1, 6)
gsLQC1(Y_BASE,B6,B5,20*SIZE) gsLQC1(Y_BASE,B6,B5,10)
MADD t3, b7, ALPHA, a15 MADD t3, b7, ALPHA, a15
MADD t4, b8, ALPHA, a16 MADD t4, b8, ALPHA, a16
gsSQC1(Y_BASE, T4, T3, 14*SIZE) gsSQC1(Y_BASE, T4, T3, 7)
gsLQC1(Y_BASE,B8,B7,22*SIZE) gsLQC1(Y_BASE,B8,B7,11)
PREFETCHD((PREFETCH_DISTANCE+8)*SIZE(X)) PREFETCHD((PREFETCH_DISTANCE+8)*SIZE(X))
PREFETCHD((PREFETCH_DISTANCE+12)*SIZE(X)) PREFETCHD((PREFETCH_DISTANCE+12)*SIZE(X))
gsLQC1(X_BASE,A2,A1,16*SIZE) gsLQC1(X_BASE,A2,A1,8)
gsLQC1(X_BASE,A4,A3,18*SIZE) gsLQC1(X_BASE,A4,A3,9)
gsLQC1(X_BASE,A6,A5,20*SIZE) gsLQC1(X_BASE,A6,A5,10)
gsLQC1(X_BASE,A8,A7,22*SIZE) gsLQC1(X_BASE,A8,A7,11)
gsLQC1(X_BASE,A10,A9,24*SIZE) gsLQC1(X_BASE,A10,A9,12)
gsLQC1(X_BASE,A12,A11,26*SIZE) gsLQC1(X_BASE,A12,A11,13)
gsLQC1(X_BASE,A14,A13,28*SIZE) gsLQC1(X_BASE,A14,A13,14)
gsLQC1(X_BASE,A16,A15,30*SIZE) gsLQC1(X_BASE,A16,A15,15)
daddiu I, I, -1 daddiu I, I, -1
@ -324,44 +324,44 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
MADD t1, b1, ALPHA, a1 MADD t1, b1, ALPHA, a1
MADD t2, b2, ALPHA, a2 MADD t2, b2, ALPHA, a2
gsSQC1(Y_BASE, T2, T1, 0*SIZE) gsSQC1(Y_BASE, T2, T1, 0)
gsLQC1(Y_BASE,B2,B1,8*SIZE) gsLQC1(Y_BASE,B2,B1,4)
MADD t3, b3, ALPHA, a3 MADD t3, b3, ALPHA, a3
MADD t4, b4, ALPHA, a4 MADD t4, b4, ALPHA, a4
gsSQC1(Y_BASE, T4, T3, 2*SIZE) gsSQC1(Y_BASE, T4, T3, 1)
gsLQC1(Y_BASE,B4,B3,10*SIZE) gsLQC1(Y_BASE,B4,B3,5)
MADD t1, b5, ALPHA, a5 MADD t1, b5, ALPHA, a5
MADD t2, b6, ALPHA, a6 MADD t2, b6, ALPHA, a6
gsSQC1(Y_BASE, T2, T1, 4*SIZE) gsSQC1(Y_BASE, T2, T1, 2)
gsLQC1(Y_BASE,B6,B5,12*SIZE) gsLQC1(Y_BASE,B6,B5,6)
MADD t3, b7, ALPHA, a7 MADD t3, b7, ALPHA, a7
MADD t4, b8, ALPHA, a8 MADD t4, b8, ALPHA, a8
gsSQC1(Y_BASE, T4, T3, 6*SIZE) gsSQC1(Y_BASE, T4, T3, 3)
gsLQC1(Y_BASE,B8,B7,14*SIZE) gsLQC1(Y_BASE,B8,B7,7)
MADD t1, b1, ALPHA, a9 MADD t1, b1, ALPHA, a9
MADD t2, b2, ALPHA, a10 MADD t2, b2, ALPHA, a10
gsSQC1(Y_BASE, T2, T1, 8*SIZE) gsSQC1(Y_BASE, T2, T1, 4)
MADD t3, b3, ALPHA, a11 MADD t3, b3, ALPHA, a11
MADD t4, b4, ALPHA, a12 MADD t4, b4, ALPHA, a12
gsSQC1(Y_BASE, T4, T3, 10*SIZE) gsSQC1(Y_BASE, T4, T3, 5)
MADD t1, b5, ALPHA, a13 MADD t1, b5, ALPHA, a13
MADD t2, b6, ALPHA, a14 MADD t2, b6, ALPHA, a14
gsSQC1(Y_BASE, T2, T1, 12*SIZE) gsSQC1(Y_BASE, T2, T1, 6)
MADD t3, b7, ALPHA, a15 MADD t3, b7, ALPHA, a15
MADD t4, b8, ALPHA, a16 MADD t4, b8, ALPHA, a16
gsSQC1(Y_BASE, T4, T3, 14*SIZE) gsSQC1(Y_BASE, T4, T3, 7)
daddiu X, X, 16 * SIZE daddiu X, X, 16 * SIZE
@ -415,88 +415,90 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//loop unrolled by 16 //loop unrolled by 16
LD a1, 0 * SIZE(X) LD a1, 0 * SIZE(X)
gsLQC1(X_BASE,A3,A2,1*SIZE) daddiu X, X, SIZE
gsLQC1(X_BASE,A5,A4,3*SIZE) gsLQC1(X_BASE,A3,A2,0)
gsLQC1(X_BASE,A7,A6,5*SIZE) gsLQC1(X_BASE,A5,A4,1)
gsLQC1(X_BASE,A9,A8,7*SIZE) gsLQC1(X_BASE,A7,A6,2)
gsLQC1(X_BASE,A9,A8,3)
gsLQC1(X_BASE,A11,A10,8*SIZE) gsLQC1(X_BASE,A11,A10,4)
gsLQC1(X_BASE,A13,A12,11*SIZE) gsLQC1(X_BASE,A13,A12,5)
gsLQC1(X_BASE,A15,A14,13*SIZE) gsLQC1(X_BASE,A15,A14,6)
LD a16, 15 * SIZE(X) LD a16, 14 * SIZE(X)
gsLQC1(Y_BASE,B2,B1,0*SIZE)
gsLQC1(Y_BASE,B4,B3,2*SIZE)
gsLQC1(Y_BASE,B6,B5,4*SIZE)
gsLQC1(Y_BASE,B8,B7,6*SIZE)
blez I, .L13 gsLQC1(Y_BASE,B2,B1,0)
gsLQC1(Y_BASE,B4,B3,1)
gsLQC1(Y_BASE,B6,B5,2)
gsLQC1(Y_BASE,B8,B7,3)
blez I, .L32
NOP NOP
.align 5 .align 5
.L31: .L31:
MADD t1, b1, ALPHA, a1 MADD t1, b1, ALPHA, a1
MADD t2, b2, ALPHA, a2 MADD t2, b2, ALPHA, a2
gsSQC1(Y_BASE, T2, T1, 0*SIZE) gsSQC1(Y_BASE, T2, T1, 0)
gsLQC1(Y_BASE,B2,B1,8*SIZE) gsLQC1(Y_BASE,B2,B1,4)
MADD t3, b3, ALPHA, a3 MADD t3, b3, ALPHA, a3
MADD t4, b4, ALPHA, a4 MADD t4, b4, ALPHA, a4
gsSQC1(Y_BASE, T4, T3, 2*SIZE) gsSQC1(Y_BASE, T4, T3, 1)
gsLQC1(Y_BASE,B4,B3,10*SIZE) gsLQC1(Y_BASE,B4,B3,5)
PREFETCHD(PREFETCH_DISTANCE*SIZE(Y)) PREFETCHD(PREFETCH_DISTANCE*SIZE(Y))
PREFETCHD((PREFETCH_DISTANCE+4)*SIZE(Y)) PREFETCHD((PREFETCH_DISTANCE+4)*SIZE(Y))
MADD t1, b5, ALPHA, a5 MADD t1, b5, ALPHA, a5
MADD t2, b6, ALPHA, a6 MADD t2, b6, ALPHA, a6
gsSQC1(Y_BASE, T2, T1, 4*SIZE) gsSQC1(Y_BASE, T2, T1, 2)
gsLQC1(Y_BASE,B6,B5,12*SIZE) gsLQC1(Y_BASE,B6,B5,6)
MADD t3, b7, ALPHA, a7 MADD t3, b7, ALPHA, a7
MADD t4, b8, ALPHA, a8 MADD t4, b8, ALPHA, a8
gsSQC1(Y_BASE, T4, T3, 6*SIZE) gsSQC1(Y_BASE, T4, T3, 3)
gsLQC1(Y_BASE,B8,B7,14*SIZE) gsLQC1(Y_BASE,B8,B7,7)
PREFETCHD((PREFETCH_DISTANCE+8)*SIZE(Y)) PREFETCHD((PREFETCH_DISTANCE+8)*SIZE(Y))
PREFETCHD((PREFETCH_DISTANCE+12)*SIZE(Y)) PREFETCHD((PREFETCH_DISTANCE+12)*SIZE(Y))
MADD t1, b1, ALPHA, a9 MADD t1, b1, ALPHA, a9
MADD t2, b2, ALPHA, a10 MADD t2, b2, ALPHA, a10
gsSQC1(Y_BASE, T2, T1, 8*SIZE) gsSQC1(Y_BASE, T2, T1, 4)
gsLQC1(Y_BASE,B2,B1,16*SIZE) gsLQC1(Y_BASE,B2,B1,8)
MADD t3, b3, ALPHA, a11 MADD t3, b3, ALPHA, a11
MADD t4, b4, ALPHA, a12 MADD t4, b4, ALPHA, a12
gsSQC1(Y_BASE, T4, T3, 10*SIZE) gsSQC1(Y_BASE, T4, T3, 5)
gsLQC1(Y_BASE,B4,B3,18*SIZE) gsLQC1(Y_BASE,B4,B3,9)
PREFETCHD(PREFETCH_DISTANCE*SIZE(X)) PREFETCHD(PREFETCH_DISTANCE*SIZE(X))
PREFETCHD((PREFETCH_DISTANCE+4)*SIZE(X)) PREFETCHD((PREFETCH_DISTANCE+4)*SIZE(X))
MADD t1, b5, ALPHA, a13 MADD t1, b5, ALPHA, a13
MADD t2, b6, ALPHA, a14 MADD t2, b6, ALPHA, a14
gsSQC1(Y_BASE, T2, T1, 12*SIZE) gsSQC1(Y_BASE, T2, T1, 6)
gsLQC1(Y_BASE,B6,B5,20*SIZE) gsLQC1(Y_BASE,B6,B5,10)
MADD t3, b7, ALPHA, a15 MADD t3, b7, ALPHA, a15
MADD t4, b8, ALPHA, a16 MADD t4, b8, ALPHA, a16
gsSQC1(Y_BASE, T4, T3, 14*SIZE) gsSQC1(Y_BASE, T4, T3, 7)
gsLQC1(Y_BASE,B8,B7,22*SIZE) gsLQC1(Y_BASE,B8,B7,11)
PREFETCHD((PREFETCH_DISTANCE+8)*SIZE(X)) PREFETCHD((PREFETCH_DISTANCE+8)*SIZE(X))
PREFETCHD((PREFETCH_DISTANCE+12)*SIZE(X)) PREFETCHD((PREFETCH_DISTANCE+12)*SIZE(X))
LD a1, 16 * SIZE(X) LD a1, 15 * SIZE(X)
gsLQC1(X_BASE,A3,A2,17*SIZE) gsLQC1(X_BASE,A3,A2,8)
gsLQC1(X_BASE,A5,A4,19*SIZE) gsLQC1(X_BASE,A5,A4,9)
gsLQC1(X_BASE,A7,A6,21*SIZE) gsLQC1(X_BASE,A7,A6,10)
gsLQC1(X_BASE,A9,A8,23*SIZE) gsLQC1(X_BASE,A9,A8,11)
gsLQC1(X_BASE,A11,A10,25*SIZE) gsLQC1(X_BASE,A11,A10,12)
gsLQC1(X_BASE,A13,A12,27*SIZE) gsLQC1(X_BASE,A13,A12,13)
gsLQC1(X_BASE,A15,A14,29*SIZE) gsLQC1(X_BASE,A15,A14,14)
LD a16, 31 * SIZE(X) LD a16, 30 * SIZE(X)
daddiu I, I, -1 daddiu I, I, -1
daddiu Y, Y, 16 * SIZE daddiu Y, Y, 16 * SIZE
@ -504,8 +506,57 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
daddiu X, X, 16 * SIZE daddiu X, X, 16 * SIZE
bgtz I, .L31 bgtz I, .L31
//jump back to the remain loop process. .align 5
b .L13 //Loop end:
.L32:
MADD t1, b1, ALPHA, a1
MADD t2, b2, ALPHA, a2
gsSQC1(Y_BASE, T2, T1, 0)
gsLQC1(Y_BASE,B2,B1,4)
MADD t3, b3, ALPHA, a3
MADD t4, b4, ALPHA, a4
gsSQC1(Y_BASE, T4, T3, 1)
gsLQC1(Y_BASE,B4,B3,5)
MADD t1, b5, ALPHA, a5
MADD t2, b6, ALPHA, a6
gsSQC1(Y_BASE, T2, T1, 2)
gsLQC1(Y_BASE,B6,B5,6)
MADD t3, b7, ALPHA, a7
MADD t4, b8, ALPHA, a8
gsSQC1(Y_BASE, T4, T3, 3)
gsLQC1(Y_BASE,B8,B7,7)
MADD t1, b1, ALPHA, a9
MADD t2, b2, ALPHA, a10
gsSQC1(Y_BASE, T2, T1, 4)
MADD t3, b3, ALPHA, a11
MADD t4, b4, ALPHA, a12
gsSQC1(Y_BASE, T4, T3, 5)
MADD t1, b5, ALPHA, a13
MADD t2, b6, ALPHA, a14
gsSQC1(Y_BASE, T2, T1, 6)
MADD t3, b7, ALPHA, a15
MADD t4, b8, ALPHA, a16
gsSQC1(Y_BASE, T4, T3, 7)
daddiu X, X, 15 * SIZE
daddiu Y, Y, 16 * SIZE
//jump back to the code that processes the remaining elements.
b .L15
.align 5 .align 5
//INCX!=1 or INCY != 1 //INCX!=1 or INCY != 1