From a6b775188177ea5cae240aa6ff8e44748b4232dc Mon Sep 17 00:00:00 2001 From: Rohit Goswami Date: Tue, 30 Jul 2024 15:14:05 +0000 Subject: [PATCH 01/15] BUG: Allow tests to be run multiple times Without failures due to existing files --- test/cblat3_3m.f | 5 ++--- test/zblat3_3m.f | 4 ++-- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/test/cblat3_3m.f b/test/cblat3_3m.f index 19f7830be..b61fca53c 100644 --- a/test/cblat3_3m.f +++ b/test/cblat3_3m.f @@ -104,7 +104,7 @@ * READ( NIN, FMT = * )SUMMRY READ( NIN, FMT = * )NOUT - OPEN( NOUT, FILE = SUMMRY, STATUS = 'NEW' ) + OPEN( NOUT, FILE = SUMMRY, STATUS = 'UNKNOWN' ) NOUTC = NOUT * * Read name and unit number for snapshot output file and open file. @@ -113,7 +113,7 @@ READ( NIN, FMT = * )NTRA TRACE = NTRA.GE.0 IF( TRACE )THEN - OPEN( NTRA, FILE = SNAPS, STATUS = 'NEW' ) + OPEN( NTRA, FILE = SNAPS, STATUS = 'UNKNOWN' ) END IF * Read the flag that directs rewinding of the snapshot file. READ( NIN, FMT = * )REWI @@ -3439,4 +3439,3 @@ * End of XERBLA * END - diff --git a/test/zblat3_3m.f b/test/zblat3_3m.f index bac23aa54..86977d674 100644 --- a/test/zblat3_3m.f +++ b/test/zblat3_3m.f @@ -105,7 +105,7 @@ * READ( NIN, FMT = * )SUMMRY READ( NIN, FMT = * )NOUT - OPEN( NOUT, FILE = SUMMRY, STATUS = 'NEW' ) + OPEN( NOUT, FILE = SUMMRY, STATUS = 'UNKNOWN' ) NOUTC = NOUT * * Read name and unit number for snapshot output file and open file. @@ -114,7 +114,7 @@ READ( NIN, FMT = * )NTRA TRACE = NTRA.GE.0 IF( TRACE )THEN - OPEN( NTRA, FILE = SNAPS, STATUS = 'NEW' ) + OPEN( NTRA, FILE = SNAPS, STATUS = 'UNKNOWN' ) END IF * Read the flag that directs rewinding of the snapshot file. READ( NIN, FMT = * )REWI From 722e4ae07a8adff3be55c29a5452fb1851c724a8 Mon Sep 17 00:00:00 2001 From: Rohit Goswami Date: Tue, 30 Jul 2024 15:24:23 +0000 Subject: [PATCH 02/15] MAINT: Explicitly replace instead of unknown --- test/cblat3_3m.f | 4 ++-- test/zblat3_3m.f | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/test/cblat3_3m.f b/test/cblat3_3m.f index b61fca53c..6c7d7e169 100644 --- a/test/cblat3_3m.f +++ b/test/cblat3_3m.f @@ -104,7 +104,7 @@ * READ( NIN, FMT = * )SUMMRY READ( NIN, FMT = * )NOUT - OPEN( NOUT, FILE = SUMMRY, STATUS = 'UNKNOWN' ) + OPEN( NOUT, FILE = SUMMRY, STATUS = 'REPLACE' ) NOUTC = NOUT * * Read name and unit number for snapshot output file and open file. @@ -113,7 +113,7 @@ READ( NIN, FMT = * )NTRA TRACE = NTRA.GE.0 IF( TRACE )THEN - OPEN( NTRA, FILE = SNAPS, STATUS = 'UNKNOWN' ) + OPEN( NTRA, FILE = SNAPS, STATUS = 'REPLACE' ) END IF * Read the flag that directs rewinding of the snapshot file. READ( NIN, FMT = * )REWI diff --git a/test/zblat3_3m.f b/test/zblat3_3m.f index 86977d674..e65c72b48 100644 --- a/test/zblat3_3m.f +++ b/test/zblat3_3m.f @@ -105,7 +105,7 @@ * READ( NIN, FMT = * )SUMMRY READ( NIN, FMT = * )NOUT - OPEN( NOUT, FILE = SUMMRY, STATUS = 'UNKNOWN' ) + OPEN( NOUT, FILE = SUMMRY, STATUS = 'REPLACE' ) NOUTC = NOUT * * Read name and unit number for snapshot output file and open file. @@ -114,7 +114,7 @@ READ( NIN, FMT = * )NTRA TRACE = NTRA.GE.0 IF( TRACE )THEN - OPEN( NTRA, FILE = SNAPS, STATUS = 'UNKNOWN' ) + OPEN( NTRA, FILE = SNAPS, STATUS = 'REPLACE' ) END IF * Read the flag that directs rewinding of the snapshot file. 
READ( NIN, FMT = * )REWI From d9f368dfe6a9e96807d3860b96d9b30471583dc9 Mon Sep 17 00:00:00 2001 From: Rohit Goswami Date: Mon, 29 Jul 2024 03:51:21 +0000 Subject: [PATCH 03/15] TST: Signal abort for ctest failures correctly --- ctest/c_cblat1.f | 11 +++++++---- ctest/c_cblat2.f | 12 +++++++----- ctest/c_cblat3.f | 16 +++++++++------- ctest/c_cblat3_3m.f | 16 +++++++++------- ctest/c_dblat1.f | 15 +++++++++------ ctest/c_dblat2.f | 12 +++++++----- ctest/c_dblat3.f | 16 +++++++++------- ctest/c_sblat1.f | 17 ++++++++++------- ctest/c_sblat2.f | 12 +++++++----- ctest/c_sblat3.f | 16 +++++++++------- ctest/c_zblat1.f | 11 +++++++---- ctest/c_zblat2.f | 12 +++++++----- ctest/c_zblat3.f | 16 +++++++++------- ctest/c_zblat3_3m.f | 16 +++++++++------- 14 files changed, 115 insertions(+), 83 deletions(-) diff --git a/ctest/c_cblat1.f b/ctest/c_cblat1.f index cad7c7fa7..73ab485bb 100644 --- a/ctest/c_cblat1.f +++ b/ctest/c_cblat1.f @@ -38,9 +38,12 @@ CALL CHECK1(SFAC) END IF * -- Print - IF (PASS) WRITE (NOUT,99998) + IF (PASS) THEN + WRITE (NOUT,99998) + ELSE + CALL ABORT + END IF 20 CONTINUE - STOP * 99999 FORMAT (' Complex CBLAS Test Program Results',/1X) 99998 FORMAT (' ----- PASS -----') @@ -228,7 +231,7 @@ CALL ITEST1(ICAMAXTEST(N,CX,INCX),ITRUE3(NP1)) ELSE WRITE (NOUT,*) ' Shouldn''t be here in CHECK1' - STOP + CALL ABORT END IF * 40 CONTINUE @@ -512,7 +515,7 @@ CALL CTEST(LENY,CY,CT10Y(1,KN,KI),CSIZE3,1.0E0) ELSE WRITE (NOUT,*) ' Shouldn''t be here in CHECK2' - STOP + CALL ABORT END IF * 40 CONTINUE diff --git a/ctest/c_cblat2.f b/ctest/c_cblat2.f index 9252339d4..d48c10b7c 100644 --- a/ctest/c_cblat2.f +++ b/ctest/c_cblat2.f @@ -10,7 +10,7 @@ * 'CBLAT2.SNAP' NAME OF SNAPSHOT OUTPUT FILE * -1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0) * F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD. -* F LOGICAL FLAG, T TO STOP ON FAILURES. +* F LOGICAL FLAG, T TO CALL ABORT ON FAILURES. * T LOGICAL FLAG, T TO TEST ERROR EXITS. * 2 0 TO TEST COLUMN-MAJOR, 1 TO TEST ROW-MAJOR, 2 TO TEST BOTH * 16.0 THRESHOLD VALUE OF TEST RATIO @@ -243,7 +243,7 @@ $ GO TO 70 60 CONTINUE WRITE( NOUT, FMT = 9986 )SNAMET - STOP + CALL ABORT 70 LTEST( I ) = LTESTT GO TO 50 * @@ -283,7 +283,7 @@ SAME = LCE( YY, YT, N ) IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN WRITE( NOUT, FMT = 9985 )TRANS, SAME, ERR - STOP + CALL ABORT END IF TRANS = 'T' CALL CMVCH( TRANS, N, N, ONE, A, NMAX, X, -1, ZERO, Y, -1, YT, G, @@ -291,7 +291,7 @@ SAME = LCE( YY, YT, N ) IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN WRITE( NOUT, FMT = 9985 )TRANS, SAME, ERR - STOP + CALL ABORT END IF * * Test each subroutine in turn. @@ -418,7 +418,9 @@ IF( TRACE ) $ CLOSE ( NTRA ) CLOSE ( NOUT ) - STOP + IF( FATAL ) THEN + CALL ABORT + END IF * 10002 FORMAT( ' COLUMN-MAJOR AND ROW-MAJOR DATA LAYOUTS ARE TESTED' ) 10001 FORMAT( ' ROW-MAJOR DATA LAYOUT IS TESTED' ) diff --git a/ctest/c_cblat3.f b/ctest/c_cblat3.f index 74293ce53..5d289aafe 100644 --- a/ctest/c_cblat3.f +++ b/ctest/c_cblat3.f @@ -10,7 +10,7 @@ * 'CBLAT3.SNAP' NAME OF SNAPSHOT OUTPUT FILE * -1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0) * F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD. -* F LOGICAL FLAG, T TO STOP ON FAILURES. +* F LOGICAL FLAG, T TO CALL ABORT ON FAILURES. * T LOGICAL FLAG, T TO TEST ERROR EXITS. 
* 2 0 TO TEST COLUMN-MAJOR, 1 TO TEST ROW-MAJOR, 2 TO TEST BOTH * 16.0 THRESHOLD VALUE OF TEST RATIO @@ -194,7 +194,7 @@ $ GO TO 50 40 CONTINUE WRITE( NOUT, FMT = 9990 )SNAMET - STOP + CALL ABORT 50 LTEST( I ) = LTESTT GO TO 30 * @@ -237,7 +237,7 @@ SAME = LCE( CC, CT, N ) IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR - STOP + CALL ABORT END IF TRANSB = 'C' CALL CMMCH( TRANSA, TRANSB, N, 1, N, ONE, AB, NMAX, @@ -246,7 +246,7 @@ SAME = LCE( CC, CT, N ) IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR - STOP + CALL ABORT END IF DO 120 J = 1, N AB( J, NMAX + 1 ) = N - J + 1 @@ -264,7 +264,7 @@ SAME = LCE( CC, CT, N ) IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR - STOP + CALL ABORT END IF TRANSB = 'C' CALL CMMCH( TRANSA, TRANSB, N, 1, N, ONE, AB, NMAX, @@ -273,7 +273,7 @@ SAME = LCE( CC, CT, N ) IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR - STOP + CALL ABORT END IF * * Test each subroutine in turn. @@ -385,7 +385,9 @@ IF( TRACE ) $ CLOSE ( NTRA ) CLOSE ( NOUT ) - STOP + IF( FATAL ) THEN + CALL ABORT + END IF * 10002 FORMAT( ' COLUMN-MAJOR AND ROW-MAJOR DATA LAYOUTS ARE TESTED' ) 10001 FORMAT(' ROW-MAJOR DATA LAYOUT IS TESTED' ) diff --git a/ctest/c_cblat3_3m.f b/ctest/c_cblat3_3m.f index 9643ebc89..73fca5664 100644 --- a/ctest/c_cblat3_3m.f +++ b/ctest/c_cblat3_3m.f @@ -10,7 +10,7 @@ * 'CBLAT3.SNAP' NAME OF SNAPSHOT OUTPUT FILE * -1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0) * F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD. -* F LOGICAL FLAG, T TO STOP ON FAILURES. +* F LOGICAL FLAG, T TO CALL ABORT ON FAILURES. * T LOGICAL FLAG, T TO TEST ERROR EXITS. * 2 0 TO TEST COLUMN-MAJOR, 1 TO TEST ROW-MAJOR, 2 TO TEST BOTH * 16.0 THRESHOLD VALUE OF TEST RATIO @@ -194,7 +194,7 @@ $ GO TO 50 40 CONTINUE WRITE( NOUT, FMT = 9990 )SNAMET - STOP + CALL ABORT 50 LTEST( I ) = LTESTT GO TO 30 * @@ -237,7 +237,7 @@ SAME = LCE( CC, CT, N ) IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR - STOP + CALL ABORT END IF TRANSB = 'C' CALL CMMCH( TRANSA, TRANSB, N, 1, N, ONE, AB, NMAX, @@ -246,7 +246,7 @@ SAME = LCE( CC, CT, N ) IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR - STOP + CALL ABORT END IF DO 120 J = 1, N AB( J, NMAX + 1 ) = N - J + 1 @@ -264,7 +264,7 @@ SAME = LCE( CC, CT, N ) IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR - STOP + CALL ABORT END IF TRANSB = 'C' CALL CMMCH( TRANSA, TRANSB, N, 1, N, ONE, AB, NMAX, @@ -273,7 +273,7 @@ SAME = LCE( CC, CT, N ) IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR - STOP + CALL ABORT END IF * * Test each subroutine in turn. 
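*     [Editorial sketch, not part of the patch: why STOP -> CALL ABORT
*     matters here.  A bare Fortran STOP exits with status 0, so CTest
*     would record these drivers as passing even after printing a
*     failure; CALL ABORT (a GNU extension also honored by many other
*     Fortran compilers) raises SIGABRT and hence a nonzero exit
*     status.  Minimal form of the pattern, assuming a LOGICAL PASS
*     flag as in the BLAT drivers:
*         IF (.NOT.PASS) THEN
*            CALL ABORT
*         END IF
*     ]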
@@ -385,7 +385,9 @@ IF( TRACE ) $ CLOSE ( NTRA ) CLOSE ( NOUT ) - STOP + IF( FATAL ) THEN + CALL ABORT + END IF * 10002 FORMAT( ' COLUMN-MAJOR AND ROW-MAJOR DATA LAYOUTS ARE TESTED' ) 10001 FORMAT(' ROW-MAJOR DATA LAYOUT IS TESTED' ) diff --git a/ctest/c_dblat1.f b/ctest/c_dblat1.f index 0139ede63..99c8b5da4 100644 --- a/ctest/c_dblat1.f +++ b/ctest/c_dblat1.f @@ -44,9 +44,12 @@ CALL CHECK3(SFAC) END IF * -- Print - IF (PASS) WRITE (NOUT,99998) + IF (PASS) THEN + WRITE (NOUT,99998) + ELSE + CALL ABORT + END IF 20 CONTINUE - STOP * 99999 FORMAT (' Real CBLAS Test Program Results',/1X) 99998 FORMAT (' ----- PASS -----') @@ -136,7 +139,7 @@ CALL STEST1(SS,DS1(K),DS1(K),SFAC) ELSE WRITE (NOUT,*) ' Shouldn''t be here in CHECK0' - STOP + CALL ABORT END IF 20 CONTINUE 40 RETURN @@ -229,7 +232,7 @@ CALL ITEST1(IDAMAXTEST(N,SX,INCX),ITRUE2(NP1)) ELSE WRITE (NOUT,*) ' Shouldn''t be here in CHECK1' - STOP + CALL ABORT END IF 60 CONTINUE 80 CONTINUE @@ -384,7 +387,7 @@ CALL STEST(LENY,SY,STY,SSIZE2(1,1),1.0D0) ELSE WRITE (NOUT,*) ' Shouldn''t be here in CHECK2' - STOP + CALL ABORT END IF 100 CONTINUE 120 CONTINUE @@ -472,7 +475,7 @@ 70 CONTINUE ELSE WRITE (NOUT,*) ' Shouldn''t be here in CHECK3' - STOP + CALL ABORT END IF 40 CONTINUE 60 CONTINUE diff --git a/ctest/c_dblat2.f b/ctest/c_dblat2.f index 0c7801d77..01a21a716 100644 --- a/ctest/c_dblat2.f +++ b/ctest/c_dblat2.f @@ -10,7 +10,7 @@ * 'DBLAT2.SNAP' NAME OF SNAPSHOT OUTPUT FILE * -1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0) * F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD. -* F LOGICAL FLAG, T TO STOP ON FAILURES. +* F LOGICAL FLAG, T TO CALL ABORT ON FAILURES. * T LOGICAL FLAG, T TO TEST ERROR EXITS. * 2 0 TO TEST COLUMN-MAJOR, 1 TO TEST ROW-MAJOR, 2 TO TEST BOTH * 16.0 THRESHOLD VALUE OF TEST RATIO @@ -239,7 +239,7 @@ $ GO TO 70 60 CONTINUE WRITE( NOUT, FMT = 9986 )SNAMET - STOP + CALL ABORT 70 LTEST( I ) = LTESTT GO TO 50 * @@ -279,7 +279,7 @@ SAME = LDE( YY, YT, N ) IF( .NOT.SAME.OR.ERR.NE.ZERO )THEN WRITE( NOUT, FMT = 9985 )TRANS, SAME, ERR - STOP + CALL ABORT END IF TRANS = 'T' CALL DMVCH( TRANS, N, N, ONE, A, NMAX, X, -1, ZERO, Y, -1, YT, G, @@ -287,7 +287,7 @@ SAME = LDE( YY, YT, N ) IF( .NOT.SAME.OR.ERR.NE.ZERO )THEN WRITE( NOUT, FMT = 9985 )TRANS, SAME, ERR - STOP + CALL ABORT END IF * * Test each subroutine in turn. @@ -414,7 +414,9 @@ IF( TRACE ) $ CLOSE ( NTRA ) CLOSE ( NOUT ) - STOP + IF( FATAL ) THEN + CALL ABORT + END IF * 10002 FORMAT( ' COLUMN-MAJOR AND ROW-MAJOR DATA LAYOUTS ARE TESTED' ) 10001 FORMAT( ' ROW-MAJOR DATA LAYOUT IS TESTED' ) diff --git a/ctest/c_dblat3.f b/ctest/c_dblat3.f index 252fe3b71..00d16c296 100644 --- a/ctest/c_dblat3.f +++ b/ctest/c_dblat3.f @@ -10,7 +10,7 @@ * 'DBLAT3.SNAP' NAME OF SNAPSHOT OUTPUT FILE * -1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0) * F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD. -* F LOGICAL FLAG, T TO STOP ON FAILURES. +* F LOGICAL FLAG, T TO CALL ABORT ON FAILURES. * T LOGICAL FLAG, T TO TEST ERROR EXITS. 
* 2 0 TO TEST COLUMN-MAJOR, 1 TO TEST ROW-MAJOR, 2 TO TEST BOTH * 16.0 THRESHOLD VALUE OF TEST RATIO @@ -189,7 +189,7 @@ $ GO TO 50 40 CONTINUE WRITE( NOUT, FMT = 9990 )SNAMET - STOP + CALL ABORT 50 LTEST( I ) = LTESTT GO TO 30 * @@ -232,7 +232,7 @@ SAME = LDE( CC, CT, N ) IF( .NOT.SAME.OR.ERR.NE.ZERO )THEN WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR - STOP + CALL ABORT END IF TRANSB = 'T' CALL DMMCH( TRANSA, TRANSB, N, 1, N, ONE, AB, NMAX, @@ -241,7 +241,7 @@ SAME = LDE( CC, CT, N ) IF( .NOT.SAME.OR.ERR.NE.ZERO )THEN WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR - STOP + CALL ABORT END IF DO 120 J = 1, N AB( J, NMAX + 1 ) = N - J + 1 @@ -259,7 +259,7 @@ SAME = LDE( CC, CT, N ) IF( .NOT.SAME.OR.ERR.NE.ZERO )THEN WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR - STOP + CALL ABORT END IF TRANSB = 'T' CALL DMMCH( TRANSA, TRANSB, N, 1, N, ONE, AB, NMAX, @@ -268,7 +268,7 @@ SAME = LDE( CC, CT, N ) IF( .NOT.SAME.OR.ERR.NE.ZERO )THEN WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR - STOP + CALL ABORT END IF * * Test each subroutine in turn. @@ -379,7 +379,9 @@ IF( TRACE ) $ CLOSE ( NTRA ) CLOSE ( NOUT ) - STOP + IF( FATAL ) THEN + CALL ABORT + END IF * 10002 FORMAT( ' COLUMN-MAJOR AND ROW-MAJOR DATA LAYOUTS ARE TESTED' ) 10001 FORMAT( ' ROW-MAJOR DATA LAYOUT IS TESTED' ) diff --git a/ctest/c_sblat1.f b/ctest/c_sblat1.f index 66a5def89..b88c2b783 100644 --- a/ctest/c_sblat1.f +++ b/ctest/c_sblat1.f @@ -44,9 +44,12 @@ CALL CHECK3(SFAC) END IF * -- Print - IF (PASS) WRITE (NOUT,99998) + IF (PASS) THEN + WRITE (NOUT,99998) + ELSE + CALL ABORT + END IF 20 CONTINUE - STOP * 99999 FORMAT (' Real CBLAS Test Program Results',/1X) 99998 FORMAT (' ----- PASS -----') @@ -136,7 +139,7 @@ CALL STEST1(SS,DS1(K),DS1(K),SFAC) ELSE WRITE (NOUT,*) ' Shouldn''t be here in CHECK0' - STOP + CALL ABORT END IF 20 CONTINUE 40 RETURN @@ -229,7 +232,7 @@ CALL ITEST1(ISAMAXTEST(N,SX,INCX),ITRUE2(NP1)) ELSE WRITE (NOUT,*) ' Shouldn''t be here in CHECK1' - STOP + CALL ABORT END IF 60 CONTINUE 80 CONTINUE @@ -384,7 +387,7 @@ CALL STEST(LENY,SY,STY,SSIZE2(1,1),1.0E0) ELSE WRITE (NOUT,*) ' Shouldn''t be here in CHECK2' - STOP + CALL ABORT END IF 100 CONTINUE 120 CONTINUE @@ -479,7 +482,7 @@ 70 CONTINUE ELSE WRITE (NOUT,*) ' Shouldn''t be here in CHECK3' - STOP + CALL ABORT END IF 40 CONTINUE 60 CONTINUE @@ -759,4 +762,4 @@ END IF END IF RETURN - END \ No newline at end of file + END diff --git a/ctest/c_sblat2.f b/ctest/c_sblat2.f index 6386abe04..18d568d5d 100644 --- a/ctest/c_sblat2.f +++ b/ctest/c_sblat2.f @@ -10,7 +10,7 @@ * 'SBLAT2.SNAP' NAME OF SNAPSHOT OUTPUT FILE * -1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0) * F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD. -* F LOGICAL FLAG, T TO STOP ON FAILURES. +* F LOGICAL FLAG, T TO CALL ABORT ON FAILURES. * T LOGICAL FLAG, T TO TEST ERROR EXITS. * 2 0 TO TEST COLUMN-MAJOR, 1 TO TEST ROW-MAJOR, 2 TO TEST BOTH * 16.0 THRESHOLD VALUE OF TEST RATIO @@ -239,7 +239,7 @@ $ GO TO 70 60 CONTINUE WRITE( NOUT, FMT = 9986 )SNAMET - STOP + CALL ABORT 70 LTEST( I ) = LTESTT GO TO 50 * @@ -279,7 +279,7 @@ SAME = LSE( YY, YT, N ) IF( .NOT.SAME.OR.ERR.NE.ZERO )THEN WRITE( NOUT, FMT = 9985 )TRANS, SAME, ERR - STOP + CALL ABORT END IF TRANS = 'T' CALL SMVCH( TRANS, N, N, ONE, A, NMAX, X, -1, ZERO, Y, -1, YT, G, @@ -287,7 +287,7 @@ SAME = LSE( YY, YT, N ) IF( .NOT.SAME.OR.ERR.NE.ZERO )THEN WRITE( NOUT, FMT = 9985 )TRANS, SAME, ERR - STOP + CALL ABORT END IF * * Test each subroutine in turn. 
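*     [Editorial sketch, not part of the patch: the hunk below changes
*     the driver epilogue rather than an individual check.  The
*     BLAT2/BLAT3 drivers accumulate failures in the LOGICAL FATAL
*     flag, and the old epilogue ended with a bare STOP (exit status 0)
*     regardless of FATAL, hiding failures from CTest.  The new form,
*     assuming FATAL as set by the test loop, closes NOUT first so the
*     summary file is flushed, then aborts:
*         CLOSE ( NOUT )
*         IF( FATAL ) THEN
*            CALL ABORT
*         END IF
*     ]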
@@ -414,7 +414,9 @@ IF( TRACE ) $ CLOSE ( NTRA ) CLOSE ( NOUT ) - STOP + IF( FATAL ) THEN + CALL ABORT + END IF * 10002 FORMAT( ' COLUMN-MAJOR AND ROW-MAJOR DATA LAYOUTS ARE TESTED' ) 10001 FORMAT( ' ROW-MAJOR DATA LAYOUT IS TESTED' ) diff --git a/ctest/c_sblat3.f b/ctest/c_sblat3.f index 4cfc1c706..bbb58d04f 100644 --- a/ctest/c_sblat3.f +++ b/ctest/c_sblat3.f @@ -10,7 +10,7 @@ * 'SBLAT3.SNAP' NAME OF SNAPSHOT OUTPUT FILE * -1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0) * F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD. -* F LOGICAL FLAG, T TO STOP ON FAILURES. +* F LOGICAL FLAG, T TO CALL ABORT ON FAILURES. * T LOGICAL FLAG, T TO TEST ERROR EXITS. * 2 0 TO TEST COLUMN-MAJOR, 1 TO TEST ROW-MAJOR, 2 TO TEST BOTH * 16.0 THRESHOLD VALUE OF TEST RATIO @@ -188,7 +188,7 @@ $ GO TO 50 40 CONTINUE WRITE( NOUT, FMT = 9990 )SNAMET - STOP + CALL ABORT 50 LTEST( I ) = LTESTT GO TO 30 * @@ -231,7 +231,7 @@ SAME = LSE( CC, CT, N ) IF( .NOT.SAME.OR.ERR.NE.ZERO )THEN WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR - STOP + CALL ABORT END IF TRANSB = 'T' CALL SMMCH( TRANSA, TRANSB, N, 1, N, ONE, AB, NMAX, @@ -240,7 +240,7 @@ SAME = LSE( CC, CT, N ) IF( .NOT.SAME.OR.ERR.NE.ZERO )THEN WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR - STOP + CALL ABORT END IF DO 120 J = 1, N AB( J, NMAX + 1 ) = N - J + 1 @@ -258,7 +258,7 @@ SAME = LSE( CC, CT, N ) IF( .NOT.SAME.OR.ERR.NE.ZERO )THEN WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR - STOP + CALL ABORT END IF TRANSB = 'T' CALL SMMCH( TRANSA, TRANSB, N, 1, N, ONE, AB, NMAX, @@ -267,7 +267,7 @@ SAME = LSE( CC, CT, N ) IF( .NOT.SAME.OR.ERR.NE.ZERO )THEN WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR - STOP + CALL ABORT END IF * * Test each subroutine in turn. @@ -378,7 +378,9 @@ IF( TRACE ) $ CLOSE ( NTRA ) CLOSE ( NOUT ) - STOP + IF( FATAL ) THEN + CALL ABORT + END IF * 10002 FORMAT( ' COLUMN-MAJOR AND ROW-MAJOR DATA LAYOUTS ARE TESTED' ) 10001 FORMAT( ' ROW-MAJOR DATA LAYOUT IS TESTED' ) diff --git a/ctest/c_zblat1.f b/ctest/c_zblat1.f index cd0c8541d..43486433e 100644 --- a/ctest/c_zblat1.f +++ b/ctest/c_zblat1.f @@ -38,9 +38,12 @@ CALL CHECK1(SFAC) END IF * -- Print - IF (PASS) WRITE (NOUT,99998) + IF (PASS) THEN + WRITE (NOUT,99998) + ELSE + CALL ABORT + END IF 20 CONTINUE - STOP * 99999 FORMAT (' Complex CBLAS Test Program Results',/1X) 99998 FORMAT (' ----- PASS -----') @@ -228,7 +231,7 @@ CALL ITEST1(IZAMAXTEST(N,CX,INCX),ITRUE3(NP1)) ELSE WRITE (NOUT,*) ' Shouldn''t be here in CHECK1' - STOP + CALL ABORT END IF * 40 CONTINUE @@ -512,7 +515,7 @@ CALL CTEST(LENY,CY,CT10Y(1,KN,KI),CSIZE3,1.0D0) ELSE WRITE (NOUT,*) ' Shouldn''t be here in CHECK2' - STOP + CALL ABORT END IF * 40 CONTINUE diff --git a/ctest/c_zblat2.f b/ctest/c_zblat2.f index cc5c1bad1..daa1a603b 100644 --- a/ctest/c_zblat2.f +++ b/ctest/c_zblat2.f @@ -10,7 +10,7 @@ * 'CBLAT2.SNAP' NAME OF SNAPSHOT OUTPUT FILE * -1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0) * F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD. -* F LOGICAL FLAG, T TO STOP ON FAILURES. +* F LOGICAL FLAG, T TO CALL ABORT ON FAILURES. * T LOGICAL FLAG, T TO TEST ERROR EXITS. 
* 2 0 TO TEST COLUMN-MAJOR, 1 TO TEST ROW-MAJOR, 2 TO TEST BOTH * 16.0 THRESHOLD VALUE OF TEST RATIO @@ -243,7 +243,7 @@ $ GO TO 70 60 CONTINUE WRITE( NOUT, FMT = 9986 )SNAMET - STOP + CALL ABORT 70 LTEST( I ) = LTESTT GO TO 50 * @@ -283,7 +283,7 @@ SAME = LZE( YY, YT, N ) IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN WRITE( NOUT, FMT = 9985 )TRANS, SAME, ERR - STOP + CALL ABORT END IF TRANS = 'T' CALL ZMVCH( TRANS, N, N, ONE, A, NMAX, X, -1, ZERO, Y, -1, YT, G, @@ -291,7 +291,7 @@ SAME = LZE( YY, YT, N ) IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN WRITE( NOUT, FMT = 9985 )TRANS, SAME, ERR - STOP + CALL ABORT END IF * * Test each subroutine in turn. @@ -418,7 +418,9 @@ IF( TRACE ) $ CLOSE ( NTRA ) CLOSE ( NOUT ) - STOP + IF( FATAL ) THEN + CALL ABORT + END IF * 10002 FORMAT( ' COLUMN-MAJOR AND ROW-MAJOR DATA LAYOUTS ARE TESTED' ) 10001 FORMAT( ' ROW-MAJOR DATA LAYOUT IS TESTED' ) diff --git a/ctest/c_zblat3.f b/ctest/c_zblat3.f index cc109d651..83eb9e918 100644 --- a/ctest/c_zblat3.f +++ b/ctest/c_zblat3.f @@ -10,7 +10,7 @@ * 'CBLAT3.SNAP' NAME OF SNAPSHOT OUTPUT FILE * -1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0) * F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD. -* F LOGICAL FLAG, T TO STOP ON FAILURES. +* F LOGICAL FLAG, T TO CALL ABORT ON FAILURES. * T LOGICAL FLAG, T TO TEST ERROR EXITS. * 2 0 TO TEST COLUMN-MAJOR, 1 TO TEST ROW-MAJOR, 2 TO TEST BOTH * 16.0 THRESHOLD VALUE OF TEST RATIO @@ -195,7 +195,7 @@ $ GO TO 50 40 CONTINUE WRITE( NOUT, FMT = 9990 )SNAMET - STOP + CALL ABORT 50 LTEST( I ) = LTESTT GO TO 30 * @@ -238,7 +238,7 @@ SAME = LZE( CC, CT, N ) IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR - STOP + CALL ABORT END IF TRANSB = 'C' CALL ZMMCH( TRANSA, TRANSB, N, 1, N, ONE, AB, NMAX, @@ -247,7 +247,7 @@ SAME = LZE( CC, CT, N ) IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR - STOP + CALL ABORT END IF DO 120 J = 1, N AB( J, NMAX + 1 ) = N - J + 1 @@ -265,7 +265,7 @@ SAME = LZE( CC, CT, N ) IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR - STOP + CALL ABORT END IF TRANSB = 'C' CALL ZMMCH( TRANSA, TRANSB, N, 1, N, ONE, AB, NMAX, @@ -274,7 +274,7 @@ SAME = LZE( CC, CT, N ) IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR - STOP + CALL ABORT END IF * * Test each subroutine in turn. @@ -386,7 +386,9 @@ IF( TRACE ) $ CLOSE ( NTRA ) CLOSE ( NOUT ) - STOP + IF( FATAL ) THEN + CALL ABORT + END IF * 10002 FORMAT( ' COLUMN-MAJOR AND ROW-MAJOR DATA LAYOUTS ARE TESTED' ) 10001 FORMAT(' ROW-MAJOR DATA LAYOUT IS TESTED' ) diff --git a/ctest/c_zblat3_3m.f b/ctest/c_zblat3_3m.f index ead64da27..d0923439e 100644 --- a/ctest/c_zblat3_3m.f +++ b/ctest/c_zblat3_3m.f @@ -10,7 +10,7 @@ * 'CBLAT3.SNAP' NAME OF SNAPSHOT OUTPUT FILE * -1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0) * F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD. -* F LOGICAL FLAG, T TO STOP ON FAILURES. +* F LOGICAL FLAG, T TO CALL ABORT ON FAILURES. * T LOGICAL FLAG, T TO TEST ERROR EXITS. 
* 2 0 TO TEST COLUMN-MAJOR, 1 TO TEST ROW-MAJOR, 2 TO TEST BOTH * 16.0 THRESHOLD VALUE OF TEST RATIO @@ -195,7 +195,7 @@ $ GO TO 50 40 CONTINUE WRITE( NOUT, FMT = 9990 )SNAMET - STOP + CALL ABORT 50 LTEST( I ) = LTESTT GO TO 30 * @@ -238,7 +238,7 @@ SAME = LZE( CC, CT, N ) IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR - STOP + CALL ABORT END IF TRANSB = 'C' CALL ZMMCH( TRANSA, TRANSB, N, 1, N, ONE, AB, NMAX, @@ -247,7 +247,7 @@ SAME = LZE( CC, CT, N ) IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR - STOP + CALL ABORT END IF DO 120 J = 1, N AB( J, NMAX + 1 ) = N - J + 1 @@ -265,7 +265,7 @@ SAME = LZE( CC, CT, N ) IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR - STOP + CALL ABORT END IF TRANSB = 'C' CALL ZMMCH( TRANSA, TRANSB, N, 1, N, ONE, AB, NMAX, @@ -274,7 +274,7 @@ SAME = LZE( CC, CT, N ) IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR - STOP + CALL ABORT END IF * * Test each subroutine in turn. @@ -386,7 +386,9 @@ IF( TRACE ) $ CLOSE ( NTRA ) CLOSE ( NOUT ) - STOP + IF( FATAL ) THEN + CALL ABORT + END IF * 10002 FORMAT( ' COLUMN-MAJOR AND ROW-MAJOR DATA LAYOUTS ARE TESTED' ) 10001 FORMAT(' ROW-MAJOR DATA LAYOUT IS TESTED' ) From c9e92348a645842c47149ddd5ce3e418134470dc Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 6 Oct 2024 19:57:17 +0200 Subject: [PATCH 04/15] Handle inf/nan if dummy2 flag is set --- kernel/power/scal_ppc440.S | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/kernel/power/scal_ppc440.S b/kernel/power/scal_ppc440.S index d977b0b59..014e3989e 100644 --- a/kernel/power/scal_ppc440.S +++ b/kernel/power/scal_ppc440.S @@ -63,6 +63,8 @@ #endif #endif +#define FLAG r11 + #define FZERO f0 #define ALPHA f1 @@ -88,6 +90,10 @@ fcmpu cr0, FZERO, ALPHA bne- cr0, LL(A1I1) + lwz FLAG, FRAMESLOT(0)(SP) + cmpwi cr0, FLAG, 1 + beq- cr0, LL(A1I1) + srawi. 
r0, N, 4 mtspr CTR, r0 beq- cr0, LL(A0I1_Remain) From 8a1710dd0da445d76e6eaeb35b180d24efac0919 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 6 Oct 2024 20:03:32 +0200 Subject: [PATCH 05/15] don't apply switch_ratio to tail of loop --- driver/level3/level3_thread.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/driver/level3/level3_thread.c b/driver/level3/level3_thread.c index ddb39abd6..3d56c45a9 100644 --- a/driver/level3/level3_thread.c +++ b/driver/level3/level3_thread.c @@ -742,7 +742,7 @@ static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG num_parts = 0; while (n > 0){ width = blas_quickdivide(n + nthreads - num_parts - 1, nthreads - num_parts); - if (width < switch_ratio) { + if (width < switch_ratio && width > 1) { width = switch_ratio; } width = round_up(n, width, GEMM_PREFERED_SIZE); From 9783dd07ab1259cde1716a67d82c5752c3acf582 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 6 Oct 2024 22:43:11 +0200 Subject: [PATCH 06/15] Rename KERNEL.LOONGSONGENERIC to KERNEL.LA64_GENERIC --- .../loongarch64/{KERNEL.LOONGSONGENERIC => KERNEL.LA64_GENERIC} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename kernel/loongarch64/{KERNEL.LOONGSONGENERIC => KERNEL.LA64_GENERIC} (100%) diff --git a/kernel/loongarch64/KERNEL.LOONGSONGENERIC b/kernel/loongarch64/KERNEL.LA64_GENERIC similarity index 100% rename from kernel/loongarch64/KERNEL.LOONGSONGENERIC rename to kernel/loongarch64/KERNEL.LA64_GENERIC From 9c707dc6b9845e8df25a70e67f5a07ee1ca7332b Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 6 Oct 2024 22:46:03 +0200 Subject: [PATCH 07/15] Update dynamic arch list to new target scheme --- cmake/arch.cmake | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmake/arch.cmake b/cmake/arch.cmake index 0ff4f1df3..27ba6f872 100644 --- a/cmake/arch.cmake +++ b/cmake/arch.cmake @@ -95,7 +95,7 @@ if (DYNAMIC_ARCH) endif () if (LOONGARCH64) - set(DYNAMIC_CORE LOONGSONGENERIC LOONGSON2K1000 LOONGSON3R5) + set(DYNAMIC_CORE LA64_GENERIC LA264 LA464) endif () if (EXISTS ${PROJECT_SOURCE_DIR}/config_kernel.h) From b0346e72f48515d30683d69cada17bdabe4d80ee Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 6 Oct 2024 22:48:33 +0200 Subject: [PATCH 08/15] update names of loongarch64 targets for cross-compilation --- cmake/prebuild.cmake | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/cmake/prebuild.cmake b/cmake/prebuild.cmake index 785c275c7..53a78d782 100644 --- a/cmake/prebuild.cmake +++ b/cmake/prebuild.cmake @@ -1349,7 +1349,7 @@ endif () "#define DTB_DEFAULT_ENTRIES 128\n" "#define DTB_SIZE 4096\n" "#define L2_ASSOCIATIVE 4\n") - elseif ("${TCORE}" STREQUAL "LOONGSONGENERIC") + elseif ("${TCORE}" STREQUAL "LA64_GENERIC") file(APPEND ${TARGET_CONF_TEMP} "#define DTB_DEFAULT_ENTRIES 64\n") set(SGEMM_UNROLL_M 2) @@ -1364,7 +1364,7 @@ endif () set(CGEMM3M_UNROLL_N 8) set(ZGEMM3M_UNROLL_M 2) set(ZGEMM3M_UNROLL_N 8) - elseif ("${TCORE}" STREQUAL "LOONGSON2K1000") + elseif ("${TCORE}" STREQUAL "LA264") file(APPEND ${TARGET_CONF_TEMP} "#define DTB_DEFAULT_ENTRIES 64\n") set(HAVE_LSX 1) @@ -1380,7 +1380,7 @@ endif () set(CGEMM3M_UNROLL_N 8) set(ZGEMM3M_UNROLL_M 8) set(ZGEMM3M_UNROLL_N 4) - elseif ("${TCORE}" STREQUAL "LOONGSON3R5") + elseif ("${TCORE}" STREQUAL "LA464") file(APPEND ${TARGET_CONF_TEMP} "#define DTB_DEFAULT_ENTRIES 64\n") set(HAVE_LASX 1) From 2c3b87a082984731748ae47f604b44c751d3dd83 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 8 Oct 2024 23:07:42 +0200 Subject: [PATCH 09/15] 
Add preliminary cpu autodetection for Zen5/5c --- cpuid_x86.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/cpuid_x86.c b/cpuid_x86.c index 9b2b7a51e..4e13f1462 100644 --- a/cpuid_x86.c +++ b/cpuid_x86.c @@ -1689,6 +1689,7 @@ int get_cpuname(void){ return CPUTYPE_BARCELONA; } case 10: // Zen3/4 + case 11: // Zen5 #ifndef NO_AVX512 if(support_avx512_bf16()) return CPUTYPE_COOPERLAKE; @@ -2479,7 +2480,7 @@ int get_coretype(void){ } break; } - } else if (exfamily == 8 || exfamily == 10) { + } else if (exfamily == 8 || exfamily == 10 || exfamily == 11) { switch (model) { case 1: // AMD Ryzen From bee123e8e3cb990197640e24aeebb1b03213ce94 Mon Sep 17 00:00:00 2001 From: NickelWenzel <55748692+NickelWenzel@users.noreply.github.com> Date: Wed, 9 Oct 2024 16:36:40 +0200 Subject: [PATCH 10/15] fix: add missing NO_AFFINITY checks --- common_arm64.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/common_arm64.h b/common_arm64.h index d80b9e434..876a4aa6d 100644 --- a/common_arm64.h +++ b/common_arm64.h @@ -55,6 +55,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #ifndef ASSEMBLER +#ifndef NO_AFFINITY static __inline int WhereAmI(void){ uint64_t ret; __asm__ volatile ( @@ -67,6 +68,7 @@ static __inline int WhereAmI(void){ if ((int)ret <0) ret = 0; return (int)ret; } +#endif static __inline void blas_lock(volatile BLASULONG *address){ From 0b7fb5c7915ceb119a17bb762b28f22acbb04d0e Mon Sep 17 00:00:00 2001 From: Gordon Fossum Date: Wed, 9 Oct 2024 09:42:23 -0500 Subject: [PATCH 11/15] CGEMM & ZGEMM using C code. --- kernel/power/KERNEL.POWER10 | 24 +- kernel/power/cgemm_kernel_power10.c | 1154 +++++++++++++++++++++++++++ kernel/power/zgemm_kernel_power10.c | 761 ++++++++++++++++++ 3 files changed, 1931 insertions(+), 8 deletions(-) create mode 100644 kernel/power/cgemm_kernel_power10.c create mode 100644 kernel/power/zgemm_kernel_power10.c diff --git a/kernel/power/KERNEL.POWER10 b/kernel/power/KERNEL.POWER10 index c84cd91d2..4d17944ae 100644 --- a/kernel/power/KERNEL.POWER10 +++ b/kernel/power/KERNEL.POWER10 @@ -17,11 +17,15 @@ SBGEMMOTCOPYOBJ = sbgemm_otcopy$(TSUFFIX).$(SUFFIX) STRMMKERNEL = sgemm_kernel_power10.c DTRMMKERNEL = dgemm_kernel_power10.c ifeq ($(OSNAME), AIX) -CTRMMKERNEL = ctrmm_kernel_8x4_power8.S -ZTRMMKERNEL = ztrmm_kernel_8x2_power8.S +#CTRMMKERNEL = ctrmm_kernel_8x4_power8.S +#ZTRMMKERNEL = ztrmm_kernel_8x2_power8.S +CTRMMKERNEL = cgemm_kernel_power10.c +ZTRMMKERNEL = zgemm_kernel_power10.c else -CTRMMKERNEL = cgemm_kernel_power10.S -ZTRMMKERNEL = zgemm_kernel_power10.S +#CTRMMKERNEL = cgemm_kernel_power10.S +#ZTRMMKERNEL = zgemm_kernel_power10.S +CTRMMKERNEL = cgemm_kernel_power10.c +ZTRMMKERNEL = zgemm_kernel_power10.c endif SGEMMKERNEL = sgemm_kernel_power10.c @@ -65,9 +69,11 @@ DGEMM_SMALL_K_TN = dgemm_small_kernel_tn_power10.c DGEMM_SMALL_K_B0_TN = dgemm_small_kernel_tn_power10.c ifeq ($(OSNAME), AIX) -CGEMMKERNEL = cgemm_kernel_8x4_power8.S +#CGEMMKERNEL = cgemm_kernel_8x4_power8.S +CGEMMKERNEL = cgemm_kernel_power10.c else -CGEMMKERNEL = cgemm_kernel_power10.S +#CGEMMKERNEL = cgemm_kernel_power10.S +CGEMMKERNEL = cgemm_kernel_power10.c endif #CGEMMKERNEL = cgemm_kernel_8x4_power8.S CGEMMINCOPY = ../generic/zgemm_ncopy_8.c @@ -84,9 +90,11 @@ CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) ifeq ($(OSNAME), AIX) -ZGEMMKERNEL = zgemm_kernel_8x2_power8.S +#ZGEMMKERNEL = zgemm_kernel_8x2_power8.S +ZGEMMKERNEL = zgemm_kernel_power10.c else -ZGEMMKERNEL = zgemm_kernel_power10.S 
+#ZGEMMKERNEL = zgemm_kernel_power10.S +ZGEMMKERNEL = zgemm_kernel_power10.c endif ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c diff --git a/kernel/power/cgemm_kernel_power10.c b/kernel/power/cgemm_kernel_power10.c new file mode 100644 index 000000000..279c83aec --- /dev/null +++ b/kernel/power/cgemm_kernel_power10.c @@ -0,0 +1,1154 @@ +/********************************************************************************* +Copyright (c) 2020, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+**********************************************************************************/ +#include "common.h" +#include + +typedef __vector unsigned char vec_t; +typedef FLOAT v4sf_t __attribute__ ((vector_size (16))); +typedef FLOAT v2sf_t __attribute__ ((vector_size (8))); + +#define SET_ACC_ZERO() \ + __builtin_mma_xxsetaccz (&acc0); \ + __builtin_mma_xxsetaccz (&acc1); \ + __builtin_mma_xxsetaccz (&acc2); \ + __builtin_mma_xxsetaccz (&acc3); \ + __builtin_mma_xxsetaccz (&acc4); \ + __builtin_mma_xxsetaccz (&acc5); \ + __builtin_mma_xxsetaccz (&acc6); \ + __builtin_mma_xxsetaccz (&acc7); + +#if (defined(NN) || defined(NT) || defined(TN) || defined(TT)) +#define COMP_MUL(_real, _arbr, _aibi, _imag, _arbi, _aibr) { _real = _arbr - _aibi; _imag = _arbi + _aibr; } +#define COMP_MAC(_real, _arbr, _aibi, _imag, _arbi, _aibr) { _real += _arbr - _aibi; _imag += _arbi + _aibr; } +#endif + +#if (defined(NR) || defined(NC) || defined(TR) || defined(TC)) +#define COMP_MUL(_real, _arbr, _aibi, _imag, _arbi, _aibr) { _real = _arbr + _aibi; _imag = -_arbi + _aibr; } +#define COMP_MAC(_real, _arbr, _aibi, _imag, _arbi, _aibr) { _real += _arbr + _aibi; _imag += -_arbi + _aibr; } +#endif + +#if (defined(RN) || defined(RT) || defined(CN) || defined(CT)) +#define COMP_MUL(_real, _arbr, _aibi, _imag, _arbi, _aibr) { _real = _arbr + _aibi; _imag = _arbi - _aibr; } +#define COMP_MAC(_real, _arbr, _aibi, _imag, _arbi, _aibr) { _real += _arbr + _aibi; _imag += _arbi - _aibr; } +#endif + +#if (defined(RR) || defined(RC) || defined(CR) || defined(CC)) +#define COMP_MUL(_real, _arbr, _aibi, _imag, _arbi, _aibr) { _real = _arbr - _aibi; _imag = -_arbi - _aibr; } +#define COMP_MAC(_real, _arbr, _aibi, _imag, _arbi, _aibr) { _real += _arbr - _aibi; _imag += -_arbi - _aibr; } +#endif + +#if defined (TRMMKERNEL) +#define A_OP = +#else +#define A_OP += +#endif + +#define BUILTIN_MMA_DISASSEMBLE_ACC_8 \ + __builtin_mma_disassemble_acc ((void *)result, &acc0); \ + __builtin_mma_disassemble_acc ((void *)&result[ 4], &acc1); \ + __builtin_mma_disassemble_acc ((void *)&result[ 8], &acc2); \ + __builtin_mma_disassemble_acc ((void *)&result[12], &acc3); \ + __builtin_mma_disassemble_acc ((void *)&result[16], &acc4); \ + __builtin_mma_disassemble_acc ((void *)&result[20], &acc5); \ + __builtin_mma_disassemble_acc ((void *)&result[24], &acc6); \ + __builtin_mma_disassemble_acc ((void *)&result[28], &acc7); + +#define COMP_MUL_1 \ + COMP_MUL(tr[0], res[ 0], res[ 5], ti[0], res[ 1], res[ 4]) + +#define COMP_MAC_1(_offset) { \ + FLOAT *_ro = &res[_offset]; \ + COMP_MAC(tr[0], _ro[ 0], _ro[ 5], ti[0], _ro[ 1], _ro[ 4]) \ +} + +#define COMP_MUL_2A \ + COMP_MUL(tr[0], res[ 0], res[ 5], ti[0], res[ 1], res[ 4]) \ + COMP_MUL(tr[1], res[ 2], res[ 7], ti[1], res[ 3], res[ 6]) + +#define COMP_MAC_2A(_offset) { \ + FLOAT *_ro = &res[_offset]; \ + COMP_MAC(tr[0], _ro[ 0], _ro[ 5], ti[0], _ro[ 1], _ro[ 4]) \ + COMP_MAC(tr[1], _ro[ 2], _ro[ 7], ti[1], _ro[ 3], _ro[ 6]) \ +} + +#define COMP_MUL_2B \ + COMP_MUL(tr[0], res[ 0], res[ 5], ti[0], res[ 1], res[ 4]) \ + COMP_MUL(tr[1], res[ 8], res[13], ti[1], res[ 9], res[12]) + +#define COMP_MAC_2B(_offset) { \ + FLOAT *_ro = &res[_offset]; \ + COMP_MAC(tr[0], _ro[ 0], _ro[ 5], ti[0], _ro[ 1], _ro[ 4]) \ + COMP_MAC(tr[1], _ro[ 8], _ro[13], ti[1], _ro[ 9], _ro[12]) \ +} + +#define COMP_MUL_4A(_offset) { \ + FLOAT *_ro = &res[_offset]; \ + COMP_MUL(tr[0], _ro[ 0], _ro[ 5], ti[0], _ro[ 1], _ro[ 4]) \ + COMP_MUL(tr[1], _ro[ 8], _ro[13], ti[1], _ro[ 9], _ro[12]) \ + COMP_MUL(tr[2], _ro[16], _ro[21], ti[2], 
_ro[17], _ro[20]) \ + COMP_MUL(tr[3], _ro[24], _ro[29], ti[3], _ro[25], _ro[28]) \ +} + +#define COMP_MAC_4A(_offset) { \ + FLOAT *_ro = &res[_offset]; \ + COMP_MAC(tr[0], _ro[ 0], _ro[ 5], ti[0], _ro[ 1], _ro[ 4]) \ + COMP_MAC(tr[1], _ro[ 8], _ro[13], ti[1], _ro[ 9], _ro[12]) \ + COMP_MAC(tr[2], _ro[16], _ro[21], ti[2], _ro[17], _ro[20]) \ + COMP_MAC(tr[3], _ro[24], _ro[29], ti[3], _ro[25], _ro[28]) \ +} + +#define COMP_MUL_4B(_offset) { \ + FLOAT *_ro = &res[_offset]; \ + COMP_MUL(tr[0], _ro[ 0], _ro[ 5], ti[0], _ro[ 1], _ro[ 4]) \ + COMP_MUL(tr[1], _ro[ 8], _ro[13], ti[1], _ro[ 9], _ro[12]) \ + COMP_MUL(tr[2], _ro[ 2], _ro[ 7], ti[2], _ro[ 3], _ro[ 6]) \ + COMP_MUL(tr[3], _ro[10], _ro[15], ti[3], _ro[11], _ro[14]) \ +} + +#define COMP_MAC_4B(_offset) { \ + FLOAT *_ro = &res[_offset]; \ + COMP_MAC(tr[0], _ro[ 0], _ro[ 5], ti[0], _ro[ 1], _ro[ 4]) \ + COMP_MAC(tr[1], _ro[ 8], _ro[13], ti[1], _ro[ 9], _ro[12]) \ + COMP_MAC(tr[2], _ro[ 2], _ro[ 7], ti[2], _ro[ 3], _ro[ 6]) \ + COMP_MAC(tr[3], _ro[10], _ro[15], ti[3], _ro[11], _ro[14]) \ +} + + +#define SAVE_ACC_COMPLEX_11 \ + BUILTIN_MMA_DISASSEMBLE_ACC_8 \ + COMP_MUL_1 \ + COMP_MAC_1(16) \ + COMP_MAC_1(32) \ + COMP_MAC_1(48) \ + COMP_MAC_1(64) \ + COMP_MAC_1(80) \ + COMP_MAC_1(96) \ + COMP_MAC_1(112) \ + CO[0] A_OP tr[0] * alpha_r - ti[0] * alpha_i; \ + CO[1] A_OP ti[0] * alpha_r + tr[0] * alpha_i; + +#define SAVE_ACC_COMPLEX_12 \ + BUILTIN_MMA_DISASSEMBLE_ACC_8 \ + COMP_MUL_2A \ + COMP_MAC_2A(16) \ + COMP_MAC_2A(32) \ + COMP_MAC_2A(48) \ + COMP_MAC_2A(64) \ + COMP_MAC_2A(80) \ + COMP_MAC_2A(96) \ + COMP_MAC_2A(112) \ + CO[0] A_OP tr[0] * alpha_r - ti[0] * alpha_i; \ + CO[1] A_OP ti[0] * alpha_r + tr[0] * alpha_i; \ + CO[2*ldc+0] A_OP tr[1] * alpha_r - ti[1] * alpha_i; \ + CO[2*ldc+1] A_OP ti[1] * alpha_r + tr[1] * alpha_i; + +#define SAVE_ACC_COMPLEX_21_1 \ + BUILTIN_MMA_DISASSEMBLE_ACC_8 \ + COMP_MUL_2B \ + COMP_MAC_2B(16) \ + COMP_MAC_2B(32) \ + COMP_MAC_2B(48) \ + COMP_MAC_2B(64) \ + COMP_MAC_2B(80) \ + COMP_MAC_2B(96) \ + COMP_MAC_2B(112) \ + CO[0] A_OP tr[0] * alpha_r - ti[0] * alpha_i; \ + CO[1] A_OP ti[0] * alpha_r + tr[0] * alpha_i; \ + CO[2] A_OP tr[1] * alpha_r - ti[1] * alpha_i; \ + CO[3] A_OP ti[1] * alpha_r + tr[1] * alpha_i; + +#define SAVE_ACC_COMPLEX_21_2 \ + BUILTIN_MMA_DISASSEMBLE_ACC_8 \ + COMP_MUL_4A(0) \ + COMP_MAC_4A(32) \ + COMP_MAC_4A(64) \ + COMP_MAC_4A(96) \ + CO[0] A_OP tr[0] * alpha_r - ti[0] * alpha_i; \ + CO[1] A_OP ti[0] * alpha_r + tr[0] * alpha_i; \ + CO[2] A_OP tr[1] * alpha_r - ti[1] * alpha_i; \ + CO[3] A_OP ti[1] * alpha_r + tr[1] * alpha_i; \ + CO[4] A_OP tr[2] * alpha_r - ti[2] * alpha_i; \ + CO[5] A_OP ti[2] * alpha_r + tr[2] * alpha_i; \ + CO[6] A_OP tr[3] * alpha_r - ti[3] * alpha_i; \ + CO[7] A_OP ti[3] * alpha_r + tr[3] * alpha_i; + +#define SAVE_ACC_COMPLEX_21_4 \ + BUILTIN_MMA_DISASSEMBLE_ACC_8 \ + COMP_MUL_4A(0) \ + COMP_MAC_4A(64) \ + CO[ 0] A_OP tr[0] * alpha_r - ti[0] * alpha_i; \ + CO[ 1] A_OP ti[0] * alpha_r + tr[0] * alpha_i; \ + CO[ 2] A_OP tr[1] * alpha_r - ti[1] * alpha_i; \ + CO[ 3] A_OP ti[1] * alpha_r + tr[1] * alpha_i; \ + CO[ 4] A_OP tr[2] * alpha_r - ti[2] * alpha_i; \ + CO[ 5] A_OP ti[2] * alpha_r + tr[2] * alpha_i; \ + CO[ 6] A_OP tr[3] * alpha_r - ti[3] * alpha_i; \ + CO[ 7] A_OP ti[3] * alpha_r + tr[3] * alpha_i; \ + COMP_MUL_4A(32) \ + COMP_MAC_4A(96) \ + CO[ 8] A_OP tr[0] * alpha_r - ti[0] * alpha_i; \ + CO[ 9] A_OP ti[0] * alpha_r + tr[0] * alpha_i; \ + CO[10] A_OP tr[1] * alpha_r - ti[1] * alpha_i; \ + CO[11] A_OP ti[1] * alpha_r + tr[1] * alpha_i; \ + CO[12] A_OP tr[2] 
* alpha_r - ti[2] * alpha_i; \ + CO[13] A_OP ti[2] * alpha_r + tr[2] * alpha_i; \ + CO[14] A_OP tr[3] * alpha_r - ti[3] * alpha_i; \ + CO[15] A_OP ti[3] * alpha_r + tr[3] * alpha_i; + +#define SAVE_ACC_COMPLEX_22_4 \ + BUILTIN_MMA_DISASSEMBLE_ACC_8 \ + COMP_MUL_4B(0) \ + CO[ 0] A_OP tr[0] * alpha_r - ti[0] * alpha_i; \ + CO[ 1] A_OP ti[0] * alpha_r + tr[0] * alpha_i; \ + CO[ 2] A_OP tr[1] * alpha_r - ti[1] * alpha_i; \ + CO[ 3] A_OP ti[1] * alpha_r + tr[1] * alpha_i; \ + CO[2*ldc+ 0] A_OP tr[2] * alpha_r - ti[2] * alpha_i; \ + CO[2*ldc+ 1] A_OP ti[2] * alpha_r + tr[2] * alpha_i; \ + CO[2*ldc+ 2] A_OP tr[3] * alpha_r - ti[3] * alpha_i; \ + CO[2*ldc+ 3] A_OP ti[3] * alpha_r + tr[3] * alpha_i; \ + COMP_MUL_4B(16) \ + CO[ 4] A_OP tr[0] * alpha_r - ti[0] * alpha_i; \ + CO[ 5] A_OP ti[0] * alpha_r + tr[0] * alpha_i; \ + CO[ 6] A_OP tr[1] * alpha_r - ti[1] * alpha_i; \ + CO[ 7] A_OP ti[1] * alpha_r + tr[1] * alpha_i; \ + CO[2*ldc+ 4] A_OP tr[2] * alpha_r - ti[2] * alpha_i; \ + CO[2*ldc+ 5] A_OP ti[2] * alpha_r + tr[2] * alpha_i; \ + CO[2*ldc+ 6] A_OP tr[3] * alpha_r - ti[3] * alpha_i; \ + CO[2*ldc+ 7] A_OP ti[3] * alpha_r + tr[3] * alpha_i; \ + COMP_MUL_4B(32) \ + CO[ 8] A_OP tr[0] * alpha_r - ti[0] * alpha_i; \ + CO[ 9] A_OP ti[0] * alpha_r + tr[0] * alpha_i; \ + CO[10] A_OP tr[1] * alpha_r - ti[1] * alpha_i; \ + CO[11] A_OP ti[1] * alpha_r + tr[1] * alpha_i; \ + CO[2*ldc+ 8] A_OP tr[2] * alpha_r - ti[2] * alpha_i; \ + CO[2*ldc+ 9] A_OP ti[2] * alpha_r + tr[2] * alpha_i; \ + CO[2*ldc+10] A_OP tr[3] * alpha_r - ti[3] * alpha_i; \ + CO[2*ldc+11] A_OP ti[3] * alpha_r + tr[3] * alpha_i; \ + COMP_MUL_4B(48) \ + CO[12] A_OP tr[0] * alpha_r - ti[0] * alpha_i; \ + CO[13] A_OP ti[0] * alpha_r + tr[0] * alpha_i; \ + CO[14] A_OP tr[1] * alpha_r - ti[1] * alpha_i; \ + CO[15] A_OP ti[1] * alpha_r + tr[1] * alpha_i; \ + CO[2*ldc+12] A_OP tr[2] * alpha_r - ti[2] * alpha_i; \ + CO[2*ldc+13] A_OP ti[2] * alpha_r + tr[2] * alpha_i; \ + CO[2*ldc+14] A_OP tr[3] * alpha_r - ti[3] * alpha_i; \ + CO[2*ldc+15] A_OP ti[3] * alpha_r + tr[3] * alpha_i; + +#define SAVE_ACC_COMPLEX_22_2 \ + BUILTIN_MMA_DISASSEMBLE_ACC_8 \ + COMP_MUL_4B(0) \ + CO[0] A_OP tr[0] * alpha_r - ti[0] * alpha_i; \ + CO[1] A_OP ti[0] * alpha_r + tr[0] * alpha_i; \ + CO[2] A_OP tr[1] * alpha_r - ti[1] * alpha_i; \ + CO[3] A_OP ti[1] * alpha_r + tr[1] * alpha_i; \ + CO[2*ldc+0] A_OP tr[2] * alpha_r - ti[2] * alpha_i; \ + CO[2*ldc+1] A_OP ti[2] * alpha_r + tr[2] * alpha_i; \ + CO[2*ldc+2] A_OP tr[3] * alpha_r - ti[3] * alpha_i; \ + CO[2*ldc+3] A_OP ti[3] * alpha_r + tr[3] * alpha_i; \ + COMP_MUL_4B(16) \ + CO[4] A_OP tr[0] * alpha_r - ti[0] * alpha_i; \ + CO[5] A_OP ti[0] * alpha_r + tr[0] * alpha_i; \ + CO[6] A_OP tr[1] * alpha_r - ti[1] * alpha_i; \ + CO[7] A_OP ti[1] * alpha_r + tr[1] * alpha_i; \ + CO[2*ldc+4] A_OP tr[2] * alpha_r - ti[2] * alpha_i; \ + CO[2*ldc+5] A_OP ti[2] * alpha_r + tr[2] * alpha_i; \ + CO[2*ldc+6] A_OP tr[3] * alpha_r - ti[3] * alpha_i; \ + CO[2*ldc+7] A_OP ti[3] * alpha_r + tr[3] * alpha_i; + +#define SAVE_ACC_COMPLEX_22_1 \ + BUILTIN_MMA_DISASSEMBLE_ACC_8 \ + COMP_MUL_4B(0) \ + CO[0] A_OP tr[0] * alpha_r - ti[0] * alpha_i; \ + CO[1] A_OP ti[0] * alpha_r + tr[0] * alpha_i; \ + CO[2] A_OP tr[1] * alpha_r - ti[1] * alpha_i; \ + CO[3] A_OP ti[1] * alpha_r + tr[1] * alpha_i; \ + CO[2*ldc+0] A_OP tr[2] * alpha_r - ti[2] * alpha_i; \ + CO[2*ldc+1] A_OP ti[2] * alpha_r + tr[2] * alpha_i; \ + CO[2*ldc+2] A_OP tr[3] * alpha_r - ti[3] * alpha_i; \ + CO[2*ldc+3] A_OP ti[3] * alpha_r + tr[3] * alpha_i; + +#define SAVE_ACC_COMPLEX_24_ALL 
\ + __builtin_mma_disassemble_acc ((void *)result, &acc0); \ + __builtin_mma_disassemble_acc ((void *)(&result[4]), &acc4); \ + __builtin_mma_disassemble_acc ((void *)(&result[8]), &acc1); \ + __builtin_mma_disassemble_acc ((void *)(&result[12]), &acc5); \ + __builtin_mma_disassemble_acc ((void *)(&result[16]), &acc2); \ + __builtin_mma_disassemble_acc ((void *)(&result[20]), &acc6); \ + __builtin_mma_disassemble_acc ((void *)(&result[24]), &acc3); \ + __builtin_mma_disassemble_acc ((void *)(&result[28]), &acc7); \ + COMP_MUL(tr[ 0], res[ 0], res[ 5], ti[ 0], res[ 1], res[ 4]) \ + COMP_MUL(tr[ 1], res[ 8], res[ 13], ti[ 1], res[ 9], res[ 12]) \ + COMP_MUL(tr[ 2], res[ 2], res[ 7], ti[ 2], res[ 3], res[ 6]) \ + COMP_MUL(tr[ 3], res[ 10], res[ 15], ti[ 3], res[ 11], res[ 14]) \ + COMP_MUL(tr[ 4], res[ 16], res[ 21], ti[ 4], res[ 17], res[ 20]) \ + COMP_MUL(tr[ 5], res[ 24], res[ 29], ti[ 5], res[ 25], res[ 28]) \ + COMP_MUL(tr[ 6], res[ 18], res[ 23], ti[ 6], res[ 19], res[ 22]) \ + COMP_MUL(tr[ 7], res[ 26], res[ 31], ti[ 7], res[ 27], res[ 30]) \ + COMP_MUL(tr[ 8], res[ 32], res[ 37], ti[ 8], res[ 33], res[ 36]) \ + COMP_MUL(tr[ 9], res[ 40], res[ 45], ti[ 9], res[ 41], res[ 44]) \ + COMP_MUL(tr[10], res[ 34], res[ 39], ti[10], res[ 35], res[ 38]) \ + COMP_MUL(tr[11], res[ 42], res[ 47], ti[11], res[ 43], res[ 46]) \ + COMP_MUL(tr[12], res[ 48], res[ 53], ti[12], res[ 49], res[ 52]) \ + COMP_MUL(tr[13], res[ 56], res[ 61], ti[13], res[ 57], res[ 60]) \ + COMP_MUL(tr[14], res[ 50], res[ 55], ti[14], res[ 51], res[ 54]) \ + COMP_MUL(tr[15], res[ 58], res[ 63], ti[15], res[ 59], res[ 62]) \ + COMP_MUL(tr[16], res[ 64], res[ 69], ti[16], res[ 65], res[ 68]) \ + COMP_MUL(tr[17], res[ 72], res[ 77], ti[17], res[ 73], res[ 76]) \ + COMP_MUL(tr[18], res[ 66], res[ 71], ti[18], res[ 67], res[ 70]) \ + COMP_MUL(tr[19], res[ 74], res[ 79], ti[19], res[ 75], res[ 78]) \ + COMP_MUL(tr[20], res[ 80], res[ 85], ti[20], res[ 81], res[ 84]) \ + COMP_MUL(tr[21], res[ 88], res[ 93], ti[21], res[ 89], res[ 92]) \ + COMP_MUL(tr[22], res[ 82], res[ 87], ti[22], res[ 83], res[ 86]) \ + COMP_MUL(tr[23], res[ 90], res[ 95], ti[23], res[ 91], res[ 94]) \ + COMP_MUL(tr[24], res[ 96], res[101], ti[24], res[ 97], res[100]) \ + COMP_MUL(tr[25], res[104], res[109], ti[25], res[105], res[108]) \ + COMP_MUL(tr[26], res[ 98], res[103], ti[26], res[ 99], res[102]) \ + COMP_MUL(tr[27], res[106], res[111], ti[27], res[107], res[110]) \ + COMP_MUL(tr[28], res[112], res[117], ti[28], res[113], res[116]) \ + COMP_MUL(tr[29], res[120], res[125], ti[29], res[121], res[124]) \ + COMP_MUL(tr[30], res[114], res[119], ti[30], res[115], res[118]) \ + COMP_MUL(tr[31], res[122], res[127], ti[31], res[123], res[126]) \ + CO[ 0] A_OP tr[ 0] * alpha_r - ti[ 0] * alpha_i; \ + CO[ 1] A_OP ti[ 0] * alpha_r + tr[ 0] * alpha_i; \ + CO[ 2] A_OP tr[ 1] * alpha_r - ti[ 1] * alpha_i; \ + CO[ 3] A_OP ti[ 1] * alpha_r + tr[ 1] * alpha_i; \ + CO[2*ldc+ 0] A_OP tr[ 2] * alpha_r - ti[ 2] * alpha_i; \ + CO[2*ldc+ 1] A_OP ti[ 2] * alpha_r + tr[ 2] * alpha_i; \ + CO[2*ldc+ 2] A_OP tr[ 3] * alpha_r - ti[ 3] * alpha_i; \ + CO[2*ldc+ 3] A_OP ti[ 3] * alpha_r + tr[ 3] * alpha_i; \ + CO[4*ldc+ 0] A_OP tr[ 4] * alpha_r - ti[ 4] * alpha_i; \ + CO[4*ldc+ 1] A_OP ti[ 4] * alpha_r + tr[ 4] * alpha_i; \ + CO[4*ldc+ 2] A_OP tr[ 5] * alpha_r - ti[ 5] * alpha_i; \ + CO[4*ldc+ 3] A_OP ti[ 5] * alpha_r + tr[ 5] * alpha_i; \ + CO[6*ldc+ 0] A_OP tr[ 6] * alpha_r - ti[ 6] * alpha_i; \ + CO[6*ldc+ 1] A_OP ti[ 6] * alpha_r + tr[ 6] * alpha_i; \ + CO[6*ldc+ 2] A_OP tr[ 7] * 
alpha_r - ti[ 7] * alpha_i; \ + CO[6*ldc+ 3] A_OP ti[ 7] * alpha_r + tr[ 7] * alpha_i; \ + CO[ 4] A_OP tr[ 8] * alpha_r - ti[ 8] * alpha_i; \ + CO[ 5] A_OP ti[ 8] * alpha_r + tr[ 8] * alpha_i; \ + CO[ 6] A_OP tr[ 9] * alpha_r - ti[ 9] * alpha_i; \ + CO[ 7] A_OP ti[ 9] * alpha_r + tr[ 9] * alpha_i; \ + CO[2*ldc+ 4] A_OP tr[10] * alpha_r - ti[10] * alpha_i; \ + CO[2*ldc+ 5] A_OP ti[10] * alpha_r + tr[10] * alpha_i; \ + CO[2*ldc+ 6] A_OP tr[11] * alpha_r - ti[11] * alpha_i; \ + CO[2*ldc+ 7] A_OP ti[11] * alpha_r + tr[11] * alpha_i; \ + CO[4*ldc+ 4] A_OP tr[12] * alpha_r - ti[12] * alpha_i; \ + CO[4*ldc+ 5] A_OP ti[12] * alpha_r + tr[12] * alpha_i; \ + CO[4*ldc+ 6] A_OP tr[13] * alpha_r - ti[13] * alpha_i; \ + CO[4*ldc+ 7] A_OP ti[13] * alpha_r + tr[13] * alpha_i; \ + CO[6*ldc+ 4] A_OP tr[14] * alpha_r - ti[14] * alpha_i; \ + CO[6*ldc+ 5] A_OP ti[14] * alpha_r + tr[14] * alpha_i; \ + CO[6*ldc+ 6] A_OP tr[15] * alpha_r - ti[15] * alpha_i; \ + CO[6*ldc+ 7] A_OP ti[15] * alpha_r + tr[15] * alpha_i; \ + CO[ 8] A_OP tr[16] * alpha_r - ti[16] * alpha_i; \ + CO[ 9] A_OP ti[16] * alpha_r + tr[16] * alpha_i; \ + CO[ 10] A_OP tr[17] * alpha_r - ti[17] * alpha_i; \ + CO[ 11] A_OP ti[17] * alpha_r + tr[17] * alpha_i; \ + CO[2*ldc+ 8] A_OP tr[18] * alpha_r - ti[18] * alpha_i; \ + CO[2*ldc+ 9] A_OP ti[18] * alpha_r + tr[18] * alpha_i; \ + CO[2*ldc+10] A_OP tr[19] * alpha_r - ti[19] * alpha_i; \ + CO[2*ldc+11] A_OP ti[19] * alpha_r + tr[19] * alpha_i; \ + CO[4*ldc+ 8] A_OP tr[20] * alpha_r - ti[20] * alpha_i; \ + CO[4*ldc+ 9] A_OP ti[20] * alpha_r + tr[20] * alpha_i; \ + CO[4*ldc+10] A_OP tr[21] * alpha_r - ti[21] * alpha_i; \ + CO[4*ldc+11] A_OP ti[21] * alpha_r + tr[21] * alpha_i; \ + CO[6*ldc+ 8] A_OP tr[22] * alpha_r - ti[22] * alpha_i; \ + CO[6*ldc+ 9] A_OP ti[22] * alpha_r + tr[22] * alpha_i; \ + CO[6*ldc+10] A_OP tr[23] * alpha_r - ti[23] * alpha_i; \ + CO[6*ldc+11] A_OP ti[23] * alpha_r + tr[23] * alpha_i; \ + CO[ 12] A_OP tr[24] * alpha_r - ti[24] * alpha_i; \ + CO[ 13] A_OP ti[24] * alpha_r + tr[24] * alpha_i; \ + CO[ 14] A_OP tr[25] * alpha_r - ti[25] * alpha_i; \ + CO[ 15] A_OP ti[25] * alpha_r + tr[25] * alpha_i; \ + CO[2*ldc+12] A_OP tr[26] * alpha_r - ti[26] * alpha_i; \ + CO[2*ldc+13] A_OP ti[26] * alpha_r + tr[26] * alpha_i; \ + CO[2*ldc+14] A_OP tr[27] * alpha_r - ti[27] * alpha_i; \ + CO[2*ldc+15] A_OP ti[27] * alpha_r + tr[27] * alpha_i; \ + CO[4*ldc+12] A_OP tr[28] * alpha_r - ti[28] * alpha_i; \ + CO[4*ldc+13] A_OP ti[28] * alpha_r + tr[28] * alpha_i; \ + CO[4*ldc+14] A_OP tr[29] * alpha_r - ti[29] * alpha_i; \ + CO[4*ldc+15] A_OP ti[29] * alpha_r + tr[29] * alpha_i; \ + CO[6*ldc+12] A_OP tr[30] * alpha_r - ti[30] * alpha_i; \ + CO[6*ldc+13] A_OP ti[30] * alpha_r + tr[30] * alpha_i; \ + CO[6*ldc+14] A_OP tr[31] * alpha_r - ti[31] * alpha_i; \ + CO[6*ldc+15] A_OP ti[31] * alpha_r + tr[31] * alpha_i; + +#define SAVE_ACC_COMPLEX_24(ACC1, ACC2, CI) \ + __builtin_mma_disassemble_acc ((void *)result, ACC1); \ + __builtin_mma_disassemble_acc ((void *)(&result[4]), ACC2); \ + COMP_MUL(tr[0], res[0], res[5], ti[0], res[1], res[4]) \ + COMP_MUL(tr[1], res[8], res[13], ti[1], res[9], res[12]) \ + COMP_MUL(tr[2], res[2], res[7], ti[2], res[3], res[6]) \ + COMP_MUL(tr[3], res[10], res[15], ti[3], res[11], res[14]) \ + COMP_MUL(tr[4], res[16], res[21], ti[4], res[17], res[20]) \ + COMP_MUL(tr[5], res[24], res[29], ti[5], res[25], res[28]) \ + COMP_MUL(tr[6], res[18], res[23], ti[6], res[19], res[22]) \ + COMP_MUL(tr[7], res[26], res[31], ti[7], res[27], res[30]) \ + CO[CI+0] A_OP tr[0] * alpha_r - 
ti[0] * alpha_i; \ + CO[CI+1] A_OP ti[0] * alpha_r + tr[0] * alpha_i; \ + CO[CI+2] A_OP tr[1] * alpha_r - ti[1] * alpha_i; \ + CO[CI+3] A_OP ti[1] * alpha_r + tr[1] * alpha_i; \ + CO[CI+2*ldc+0] A_OP tr[2] * alpha_r - ti[2] * alpha_i; \ + CO[CI+2*ldc+1] A_OP ti[2] * alpha_r + tr[2] * alpha_i; \ + CO[CI+2*ldc+2] A_OP tr[3] * alpha_r - ti[3] * alpha_i; \ + CO[CI+2*ldc+3] A_OP ti[3] * alpha_r + tr[3] * alpha_i; \ + CO[CI+4*ldc+0] A_OP tr[4] * alpha_r - ti[4] * alpha_i; \ + CO[CI+4*ldc+1] A_OP ti[4] * alpha_r + tr[4] * alpha_i; \ + CO[CI+4*ldc+2] A_OP tr[5] * alpha_r - ti[5] * alpha_i; \ + CO[CI+4*ldc+3] A_OP ti[5] * alpha_r + tr[5] * alpha_i; \ + CO[CI+6*ldc+0] A_OP tr[6] * alpha_r - ti[6] * alpha_i; \ + CO[CI+6*ldc+1] A_OP ti[6] * alpha_r + tr[6] * alpha_i; \ + CO[CI+6*ldc+2] A_OP tr[7] * alpha_r - ti[7] * alpha_i; \ + CO[CI+6*ldc+3] A_OP ti[7] * alpha_r + tr[7] * alpha_i; + +#define SAVE_ACC_COMPLEX_14 \ + BUILTIN_MMA_DISASSEMBLE_ACC_8 \ + COMP_MUL(tr[0], res[ 0], res[ 5], ti[0], res[ 1], res[ 4]) \ + COMP_MUL(tr[1], res[ 2], res[ 7], ti[1], res[ 3], res[ 6]) \ + COMP_MUL(tr[2], res[ 16], res[ 21], ti[2], res[ 17], res[ 20]) \ + COMP_MUL(tr[3], res[ 18], res[ 23], ti[3], res[ 19], res[ 22]) \ + COMP_MAC(tr[0], res[ 32], res[ 37], ti[0], res[ 33], res[ 36]) \ + COMP_MAC(tr[1], res[ 34], res[ 39], ti[1], res[ 35], res[ 38]) \ + COMP_MAC(tr[2], res[ 48], res[ 53], ti[2], res[ 49], res[ 52]) \ + COMP_MAC(tr[3], res[ 50], res[ 55], ti[3], res[ 51], res[ 54]) \ + COMP_MAC(tr[0], res[ 64], res[ 69], ti[0], res[ 65], res[ 68]) \ + COMP_MAC(tr[1], res[ 66], res[ 71], ti[1], res[ 67], res[ 70]) \ + COMP_MAC(tr[2], res[ 80], res[ 85], ti[2], res[ 81], res[ 84]) \ + COMP_MAC(tr[3], res[ 82], res[ 87], ti[3], res[ 83], res[ 86]) \ + COMP_MAC(tr[0], res[ 96], res[101], ti[0], res[ 97], res[100]) \ + COMP_MAC(tr[1], res[ 98], res[103], ti[1], res[ 99], res[102]) \ + COMP_MAC(tr[2], res[112], res[117], ti[2], res[113], res[116]) \ + COMP_MAC(tr[3], res[114], res[119], ti[3], res[115], res[118]) \ + CO[0] A_OP tr[0] * alpha_r - ti[0] * alpha_i; \ + CO[1] A_OP ti[0] * alpha_r + tr[0] * alpha_i; \ + CO[2*ldc+0] A_OP tr[1] * alpha_r - ti[1] * alpha_i; \ + CO[2*ldc+1] A_OP ti[1] * alpha_r + tr[1] * alpha_i; \ + CO[4*ldc+0] A_OP tr[2] * alpha_r - ti[2] * alpha_i; \ + CO[4*ldc+1] A_OP ti[2] * alpha_r + tr[2] * alpha_i; \ + CO[6*ldc+0] A_OP tr[3] * alpha_r - ti[3] * alpha_i; \ + CO[6*ldc+1] A_OP ti[3] * alpha_r + tr[3] * alpha_i; + +#define PREFETCH1(x, y) asm volatile ("dcbt %0, %1" : : "r" (x), "b" (y) : "memory"); + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) +#define REFRESH_TEMP_BK(x, y) \ + temp = k - off; +#elif defined(LEFT) +#define REFRESH_TEMP_BK(x, y) \ + temp = off + x; +#else +#define REFRESH_TEMP_BK(x, y) \ + temp = off + y; +#endif +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) +#define REFRESH_POINTERS(x, y) \ + BO = B; \ + REFRESH_TEMP_BK(x, y) +#else +#define REFRESH_POINTERS(x, y) \ + AO += off * (2*x); \ + BO = B + off * (2*y); \ + REFRESH_TEMP_BK(x, y) +#endif + +#ifdef LEFT +#define REFRESH_OFF(x) \ + off += x; +#else +#define REFRESH_OFF(x) +#endif + +#ifdef LEFT +#define UPDATE_TEMP(x, y) \ + temp -= x; +#else +#define UPDATE_TEMP(x, y) \ + temp -= y; +#endif + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) +#define REFRESH_TMP_AFTER_SAVE(x, y) \ + temp = k - off; \ + UPDATE_TEMP(x, y) \ + AO += temp * (2*x); \ + BO += temp * (2*y); +#else +#define REFRESH_TMP_AFTER_SAVE(x, y) +#endif + 
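+/* Editorial sketch, not part of the patch: a worked trace of the TRMM
+ * bookkeeping macros above, assuming !LEFT && !TRANSA with an 8x4 tile
+ * (x=8, y=4), k=10, off=2.  REFRESH_POINTERS(8,4) sets BO = B and, via
+ * the else branch of REFRESH_TEMP_BK, temp = off + y = 6, so the tile
+ * accumulates 6 k-steps.  After the save, REFRESH_TMP_AFTER_SAVE(8,4)
+ * recomputes temp = k - off = 8, UPDATE_TEMP subtracts y giving
+ * temp = 4, and advances AO by 4*(2*8) = 64 and BO by 4*(2*4) = 32
+ * floats (the factor 2 covers the real/imaginary pairs); together with
+ * the 6*16 floats consumed in the k-loop this leaves AO at the start
+ * of the next panel (10*16 floats).  REFRESH_OFF(8) is a no-op in this
+ * configuration because LEFT is unset. */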
+#define REFRESH_AFTER_SAVE(x,y) \ + REFRESH_TMP_AFTER_SAVE(x, y) \ + REFRESH_OFF(x) +/************************************************************************************* +* GEMM Kernel +*************************************************************************************/ +int +CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r, FLOAT alpha_i, FLOAT * A, FLOAT * B, + FLOAT * C, BLASLONG ldc +#ifdef TRMMKERNEL + , BLASLONG offset +#endif + ) +{ + BLASLONG i1, i, l, temp; + FLOAT *AO, *BO, *CO; +#if defined(TRMMKERNEL) + BLASLONG off; +#endif +#if defined(TRMMKERNEL) && !defined(LEFT) + off = -offset; +#endif + + __vector_quad acc0, acc1, acc2, acc3, acc4, acc5, acc6, acc7; + + v4sf_t result[32]; + FLOAT *res, tr[64], ti[64]; + res = (FLOAT *) result; + + for (i1 = 0; i1 < (n >> 2); i1++) + { +#if defined(TRMMKERNEL) && defined(LEFT) + off = offset; +#endif + AO = A; + CO = C; + C += ldc << 3; + + for (i = 0; i < (m >> 3); i++) + { +#if defined(TRMMKERNEL) + REFRESH_POINTERS (8, 4); +#else + BO = B; + temp = k; +#endif + SET_ACC_ZERO() + for (l = 0; l < temp; ++l) + { + vec_t rowA1 = *(vec_t *) & AO[l<<4]; + vec_t rowB1 = *(vec_t *) & BO[l<<3]; + vec_t rowA2 = *(vec_t *) & AO[(l<<4)+4]; + vec_t rowB2 = *(vec_t *) & BO[(l<<3)+4]; + vec_t rowA3 = *(vec_t *) & AO[(l<<4)+8]; + vec_t rowA4 = *(vec_t *) & AO[(l<<4)+12]; + __builtin_mma_xvf32gerpp(&acc0, rowA1, rowB1); + __builtin_mma_xvf32gerpp(&acc1, rowA2, rowB1); + __builtin_mma_xvf32gerpp(&acc2, rowA3, rowB1); + __builtin_mma_xvf32gerpp(&acc3, rowA4, rowB1); + __builtin_mma_xvf32gerpp(&acc4, rowA1, rowB2); + __builtin_mma_xvf32gerpp(&acc5, rowA2, rowB2); + __builtin_mma_xvf32gerpp(&acc6, rowA3, rowB2); + __builtin_mma_xvf32gerpp(&acc7, rowA4, rowB2); + } + SAVE_ACC_COMPLEX_24_ALL + CO += 16; + AO += temp << 4; + BO += temp << 3; +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (8, 4) +#endif + } + if (m & 4) + { +#if defined(TRMMKERNEL) + REFRESH_POINTERS (4, 4); +#else + BO = B; + temp = k; +#endif + SET_ACC_ZERO() + for (l = 0; l < (temp & (~1)); l+=2) + { + vec_t rowA1 = *(vec_t *) & AO[l<<3]; + vec_t rowA2 = *(vec_t *) & AO[(l<<3)+4]; + vec_t rowA3 = *(vec_t *) & AO[(l<<3)+8]; + vec_t rowA4 = *(vec_t *) & AO[(l<<3)+12]; + vec_t rowB1 = *(vec_t *) & BO[l<<3]; + vec_t rowB2 = *(vec_t *) & BO[(l<<3)+4]; + vec_t rowB3 = *(vec_t *) & BO[(l<<3)+8]; + vec_t rowB4 = *(vec_t *) & BO[(l<<3)+12]; + __builtin_mma_xvf32gerpp(&acc0, rowA1, rowB1); + __builtin_mma_xvf32gerpp(&acc1, rowA2, rowB1); + __builtin_mma_xvf32gerpp(&acc2, rowA1, rowB2); + __builtin_mma_xvf32gerpp(&acc3, rowA2, rowB2); + __builtin_mma_xvf32gerpp(&acc0, rowA3, rowB3); + __builtin_mma_xvf32gerpp(&acc1, rowA4, rowB3); + __builtin_mma_xvf32gerpp(&acc2, rowA3, rowB4); + __builtin_mma_xvf32gerpp(&acc3, rowA4, rowB4); + } + for (l = (temp & (~1)); l < temp; ++l) + { + vec_t rowA1 = *(vec_t *) & AO[l<<3]; + vec_t rowA2 = *(vec_t *) & AO[(l<<3)+4]; + vec_t rowB1 = *(vec_t *) & BO[l<<3]; + vec_t rowB2 = *(vec_t *) & BO[(l<<3)+4]; + __builtin_mma_xvf32gerpp(&acc0, rowA1, rowB1); + __builtin_mma_xvf32gerpp(&acc1, rowA2, rowB1); + __builtin_mma_xvf32gerpp(&acc2, rowA1, rowB2); + __builtin_mma_xvf32gerpp(&acc3, rowA2, rowB2); + } + SAVE_ACC_COMPLEX_24(&acc0, &acc2, 0) + SAVE_ACC_COMPLEX_24(&acc1, &acc3, 4) + CO += 8; + AO += temp << 3; + BO += temp << 3; +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (4, 4) +#endif + } + if (m & 2) + { +#if defined(TRMMKERNEL) + REFRESH_POINTERS (2, 4); +#else + BO = B; + temp = k; +#endif + SET_ACC_ZERO() + for (l = 0; l < (temp & (~3)); l+=4) + { + vec_t 
rowA1 = *(vec_t *) & AO[l<<2]; + vec_t rowA2 = *(vec_t *) & AO[(l<<2)+4]; + vec_t rowA3 = *(vec_t *) & AO[(l<<2)+8]; + vec_t rowA4 = *(vec_t *) & AO[(l<<2)+12]; + vec_t rowB1 = *(vec_t *) & BO[l<<3]; + vec_t rowB2 = *(vec_t *) & BO[(l<<3)+4]; + vec_t rowB3 = *(vec_t *) & BO[(l<<3)+8]; + vec_t rowB4 = *(vec_t *) & BO[(l<<3)+12]; + vec_t rowB5 = *(vec_t *) & BO[(l<<3)+16]; + vec_t rowB6 = *(vec_t *) & BO[(l<<3)+20]; + vec_t rowB7 = *(vec_t *) & BO[(l<<3)+24]; + vec_t rowB8 = *(vec_t *) & BO[(l<<3)+28]; + __builtin_mma_xvf32gerpp(&acc0, rowA1, rowB1); + __builtin_mma_xvf32gerpp(&acc1, rowA1, rowB2); + __builtin_mma_xvf32gerpp(&acc0, rowA2, rowB3); + __builtin_mma_xvf32gerpp(&acc1, rowA2, rowB4); + __builtin_mma_xvf32gerpp(&acc0, rowA3, rowB5); + __builtin_mma_xvf32gerpp(&acc1, rowA3, rowB6); + __builtin_mma_xvf32gerpp(&acc0, rowA4, rowB7); + __builtin_mma_xvf32gerpp(&acc1, rowA4, rowB8); + } + for (l = (temp & (~3)); l < temp; ++l) + { + vec_t rowA1 = *(vec_t *) & AO[l<<2]; + vec_t rowB1 = *(vec_t *) & BO[l<<3]; + vec_t rowB2 = *(vec_t *) & BO[(l<<3)+4]; + __builtin_mma_xvf32gerpp(&acc0, rowA1, rowB1); + __builtin_mma_xvf32gerpp(&acc1, rowA1, rowB2); + } + SAVE_ACC_COMPLEX_24(&acc0, &acc1, 0) + CO += 4; + AO += temp << 2; + BO += temp << 3; +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (2, 4) +#endif + } + if (m & 1) + { +#if defined(TRMMKERNEL) + REFRESH_POINTERS (1, 4) +#else + BO = B; + temp = k; +#endif + SET_ACC_ZERO() + for (l = 0; l < (temp & (~3)); l+=4) + { + vec_t rowA1 = *(vec_t *) & AO[l<<1]; + vec_t rowA2 = *(vec_t *) & AO[(l<<1)+2]; + vec_t rowA3 = *(vec_t *) & AO[(l<<1)+4]; + vec_t rowA4 = *(vec_t *) & AO[(l<<1)+6]; + vec_t rowB1 = *(vec_t *) & BO[l<<3]; + vec_t rowB2 = *(vec_t *) & BO[(l<<3)+4]; + vec_t rowB3 = *(vec_t *) & BO[(l<<3)+8]; + vec_t rowB4 = *(vec_t *) & BO[(l<<3)+12]; + vec_t rowB5 = *(vec_t *) & BO[(l<<3)+16]; + vec_t rowB6 = *(vec_t *) & BO[(l<<3)+20]; + vec_t rowB7 = *(vec_t *) & BO[(l<<3)+24]; + vec_t rowB8 = *(vec_t *) & BO[(l<<3)+28]; + __builtin_mma_xvf32gerpp(&acc0, rowA1, rowB1); + __builtin_mma_xvf32gerpp(&acc1, rowA1, rowB2); + __builtin_mma_xvf32gerpp(&acc2, rowA2, rowB3); + __builtin_mma_xvf32gerpp(&acc3, rowA2, rowB4); + __builtin_mma_xvf32gerpp(&acc4, rowA3, rowB5); + __builtin_mma_xvf32gerpp(&acc5, rowA3, rowB6); + __builtin_mma_xvf32gerpp(&acc6, rowA4, rowB7); + __builtin_mma_xvf32gerpp(&acc7, rowA4, rowB8); + } + for (l = (temp & (~3)); l < temp; ++l) + { + vec_t rowA1 = *(vec_t *) & AO[l<<1]; + vec_t rowB1 = *(vec_t *) & BO[l<<3]; + vec_t rowB2 = *(vec_t *) & BO[(l<<3)+4]; + __builtin_mma_xvf32gerpp(&acc0, rowA1, rowB1); + __builtin_mma_xvf32gerpp(&acc1, rowA1, rowB2); + } + SAVE_ACC_COMPLEX_14 + CO += 2; + AO += temp << 1; + BO += temp << 3; +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (1, 4) +#endif + } +#if defined(TRMMKERNEL) && !defined(LEFT) + off += 4; // number of values in A +#endif + + B += k << 3; + } + + if (n & 2) + { +#if defined(TRMMKERNEL) && defined(LEFT) + off = offset; +#endif + AO = A; + CO = C; + C += ldc << 2; + + for (i = 0; i < (m >> 3); i++) + { +#if defined(TRMMKERNEL) + REFRESH_POINTERS (8, 2) +#else + BO = B; + temp = k; +#endif + SET_ACC_ZERO() + for (l = 0; l < (temp & (~1)); l+=2) + { + vec_t rowA1 = *(vec_t *) & AO[l<<4]; + vec_t rowA2 = *(vec_t *) & AO[(l<<4)+4]; + vec_t rowA3 = *(vec_t *) & AO[(l<<4)+8]; + vec_t rowA4 = *(vec_t *) & AO[(l<<4)+12]; + vec_t rowA5 = *(vec_t *) & AO[(l<<4)+16]; + vec_t rowA6 = *(vec_t *) & AO[(l<<4)+20]; + vec_t rowA7 = *(vec_t *) & AO[(l<<4)+24]; + vec_t rowA8 = *(vec_t *) & 
AO[(l<<4)+28]; + vec_t rowB1 = *(vec_t *) & BO[l<<2]; + vec_t rowB2 = *(vec_t *) & BO[(l<<2)+4]; + __builtin_mma_xvf32gerpp(&acc0, rowA1, rowB1); + __builtin_mma_xvf32gerpp(&acc1, rowA2, rowB1); + __builtin_mma_xvf32gerpp(&acc2, rowA3, rowB1); + __builtin_mma_xvf32gerpp(&acc3, rowA4, rowB1); + __builtin_mma_xvf32gerpp(&acc0, rowA5, rowB2); + __builtin_mma_xvf32gerpp(&acc1, rowA6, rowB2); + __builtin_mma_xvf32gerpp(&acc2, rowA7, rowB2); + __builtin_mma_xvf32gerpp(&acc3, rowA8, rowB2); + } + for (l = (temp & (~1)); l < temp; ++l) + { + vec_t rowA1 = *(vec_t *) & AO[l<<4]; + vec_t rowA2 = *(vec_t *) & AO[(l<<4)+4]; + vec_t rowA3 = *(vec_t *) & AO[(l<<4)+8]; + vec_t rowA4 = *(vec_t *) & AO[(l<<4)+12]; + vec_t rowB1 = *(vec_t *) & BO[l<<2]; + __builtin_mma_xvf32gerpp(&acc0, rowA1, rowB1); + __builtin_mma_xvf32gerpp(&acc1, rowA2, rowB1); + __builtin_mma_xvf32gerpp(&acc2, rowA3, rowB1); + __builtin_mma_xvf32gerpp(&acc3, rowA4, rowB1); + } + SAVE_ACC_COMPLEX_22_4 + AO += temp << 4; + BO += temp << 2; + CO += 16; +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (8, 2) +#endif + } + if (m & 4) + { +#if defined(TRMMKERNEL) + REFRESH_POINTERS (4, 2) +#else + BO = B; + temp = k; +#endif + SET_ACC_ZERO() + for (l = 0; l < (temp & (~3)); l+=4) + { + vec_t rowA1 = *(vec_t *) & AO[l<<3]; + vec_t rowA2 = *(vec_t *) & AO[(l<<3)+4]; + vec_t rowA3 = *(vec_t *) & AO[(l<<3)+8]; + vec_t rowA4 = *(vec_t *) & AO[(l<<3)+12]; + vec_t rowA5 = *(vec_t *) & AO[(l<<3)+16]; + vec_t rowA6 = *(vec_t *) & AO[(l<<3)+20]; + vec_t rowA7 = *(vec_t *) & AO[(l<<3)+24]; + vec_t rowA8 = *(vec_t *) & AO[(l<<3)+28]; + vec_t rowB1 = *(vec_t *) & BO[l<<2]; + vec_t rowB2 = *(vec_t *) & BO[(l<<2)+4]; + vec_t rowB3 = *(vec_t *) & BO[(l<<2)+8]; + vec_t rowB4 = *(vec_t *) & BO[(l<<2)+12]; + __builtin_mma_xvf32gerpp(&acc0, rowA1, rowB1); + __builtin_mma_xvf32gerpp(&acc1, rowA2, rowB1); + __builtin_mma_xvf32gerpp(&acc0, rowA3, rowB2); + __builtin_mma_xvf32gerpp(&acc1, rowA4, rowB2); + __builtin_mma_xvf32gerpp(&acc0, rowA5, rowB3); + __builtin_mma_xvf32gerpp(&acc1, rowA6, rowB3); + __builtin_mma_xvf32gerpp(&acc0, rowA7, rowB4); + __builtin_mma_xvf32gerpp(&acc1, rowA8, rowB4); + } + for (l = (temp & (~3)); l < temp; ++l) + { + vec_t rowA1 = *(vec_t *) & AO[l<<3]; + vec_t rowA2 = *(vec_t *) & AO[(l<<3)+4]; + vec_t rowB1 = *(vec_t *) & BO[l<<2]; + __builtin_mma_xvf32gerpp(&acc0, rowA1, rowB1); + __builtin_mma_xvf32gerpp(&acc1, rowA2, rowB1); + } + SAVE_ACC_COMPLEX_22_2 + AO += temp << 3; + BO += temp << 2; + CO += 8; +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (4, 2) +#endif + } if (m & 2) + { +#if defined(TRMMKERNEL) + REFRESH_POINTERS (2, 2) +#else + BO = B; + temp = k; +#endif + SET_ACC_ZERO() + for (l = 0; l < (temp & (~7)); l+=8) + { + vec_t rowA1 = *(vec_t *) & AO[l<<2]; + vec_t rowA2 = *(vec_t *) & AO[(l<<2)+4]; + vec_t rowA3 = *(vec_t *) & AO[(l<<2)+8]; + vec_t rowA4 = *(vec_t *) & AO[(l<<2)+12]; + vec_t rowA5 = *(vec_t *) & AO[(l<<2)+16]; + vec_t rowA6 = *(vec_t *) & AO[(l<<2)+20]; + vec_t rowA7 = *(vec_t *) & AO[(l<<2)+24]; + vec_t rowA8 = *(vec_t *) & AO[(l<<2)+28]; + vec_t rowB1 = *(vec_t *) & BO[l<<2]; + vec_t rowB2 = *(vec_t *) & BO[(l<<2)+4]; + vec_t rowB3 = *(vec_t *) & BO[(l<<2)+8]; + vec_t rowB4 = *(vec_t *) & BO[(l<<2)+12]; + vec_t rowB5 = *(vec_t *) & BO[(l<<2)+16]; + vec_t rowB6 = *(vec_t *) & BO[(l<<2)+20]; + vec_t rowB7 = *(vec_t *) & BO[(l<<2)+24]; + vec_t rowB8 = *(vec_t *) & BO[(l<<2)+28]; + __builtin_mma_xvf32gerpp(&acc0, rowA1, rowB1); + __builtin_mma_xvf32gerpp(&acc0, rowA2, rowB2); + __builtin_mma_xvf32gerpp(&acc0, 
rowA3, rowB3); + __builtin_mma_xvf32gerpp(&acc0, rowA4, rowB4); + __builtin_mma_xvf32gerpp(&acc0, rowA5, rowB5); + __builtin_mma_xvf32gerpp(&acc0, rowA6, rowB6); + __builtin_mma_xvf32gerpp(&acc0, rowA7, rowB7); + __builtin_mma_xvf32gerpp(&acc0, rowA8, rowB8); + } + for (l = (temp & (~7)); l < temp; ++l) + { + vec_t rowA1 = *(vec_t *) & AO[l<<2]; + vec_t rowB1 = *(vec_t *) & BO[l<<2]; + __builtin_mma_xvf32gerpp(&acc0, rowA1, rowB1); + } + SAVE_ACC_COMPLEX_22_1 + AO += temp << 2; + BO += temp << 2; + CO += 4; +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (2, 2) +#endif + } + if (m & 1) + { +#if defined(TRMMKERNEL) + REFRESH_POINTERS (1, 2) +#else + BO = B; + temp = k; +#endif + // RIP OUT MMA STUFF! + SET_ACC_ZERO() + for (l = 0; l < (temp & (~7)); l+=8) + { + vec_t rowA1 = *(vec_t *) & AO[l<<1]; + vec_t rowA2 = *(vec_t *) & AO[(l<<1)+2]; + vec_t rowA3 = *(vec_t *) & AO[(l<<1)+4]; + vec_t rowA4 = *(vec_t *) & AO[(l<<1)+6]; + vec_t rowA5 = *(vec_t *) & AO[(l<<1)+8]; + vec_t rowA6 = *(vec_t *) & AO[(l<<1)+10]; + vec_t rowA7 = *(vec_t *) & AO[(l<<1)+12]; + vec_t rowA8 = *(vec_t *) & AO[(l<<1)+14]; + vec_t rowB1 = *(vec_t *) & BO[l<<2]; + vec_t rowB2 = *(vec_t *) & BO[(l<<2)+4]; + vec_t rowB3 = *(vec_t *) & BO[(l<<2)+8]; + vec_t rowB4 = *(vec_t *) & BO[(l<<2)+12]; + vec_t rowB5 = *(vec_t *) & BO[(l<<2)+16]; + vec_t rowB6 = *(vec_t *) & BO[(l<<2)+20]; + vec_t rowB7 = *(vec_t *) & BO[(l<<2)+24]; + vec_t rowB8 = *(vec_t *) & BO[(l<<2)+28]; + __builtin_mma_xvf32gerpp(&acc0, rowA1, rowB1); + __builtin_mma_xvf32gerpp(&acc1, rowA2, rowB2); + __builtin_mma_xvf32gerpp(&acc2, rowA3, rowB3); + __builtin_mma_xvf32gerpp(&acc3, rowA4, rowB4); + __builtin_mma_xvf32gerpp(&acc4, rowA5, rowB5); + __builtin_mma_xvf32gerpp(&acc5, rowA6, rowB6); + __builtin_mma_xvf32gerpp(&acc6, rowA7, rowB7); + __builtin_mma_xvf32gerpp(&acc7, rowA8, rowB8); + } + for (l = (temp & (~7)); l < temp; ++l) + { + vec_t rowA1 = *(vec_t *) & AO[l<<1]; + vec_t rowB1 = *(vec_t *) & BO[l<<2]; + __builtin_mma_xvf32gerpp(&acc0, rowA1, rowB1); + } + SAVE_ACC_COMPLEX_12 + AO += temp<<1; + BO += temp<<2; + CO += 2; +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (1, 2) +#endif + } +#if defined(TRMMKERNEL) && !defined(LEFT) + off += 2; // number of values in A +#endif + B += k << 2; + } + + if (n & 1) + { +#if defined(TRMMKERNEL) && defined(LEFT) + off = offset; +#endif + AO = A; + CO = C; + C += ldc << 1; + + for (i = 0; i < (m >> 3); i++) + { +#if defined(TRMMKERNEL) + REFRESH_POINTERS (8, 1) +#else + BO = B; + temp = k; +#endif + SET_ACC_ZERO() + for (l = 0; l < (temp & (~1)); l+=2) + { + vec_t rowA1 = *(vec_t *) & AO[l<<4]; + vec_t rowA2 = *(vec_t *) & AO[(l<<4)+4]; + vec_t rowA3 = *(vec_t *) & AO[(l<<4)+8]; + vec_t rowA4 = *(vec_t *) & AO[(l<<4)+12]; + vec_t rowA5 = *(vec_t *) & AO[(l<<4)+16]; + vec_t rowA6 = *(vec_t *) & AO[(l<<4)+20]; + vec_t rowA7 = *(vec_t *) & AO[(l<<4)+24]; + vec_t rowA8 = *(vec_t *) & AO[(l<<4)+28]; + vec_t rowB1 = *(vec_t *) & BO[l<<1]; + vec_t rowB2 = *(vec_t *) & BO[(l<<1)+2]; + __builtin_mma_xvf32gerpp(&acc0, rowA1, rowB1); + __builtin_mma_xvf32gerpp(&acc1, rowA2, rowB1); + __builtin_mma_xvf32gerpp(&acc2, rowA3, rowB1); + __builtin_mma_xvf32gerpp(&acc3, rowA4, rowB1); + __builtin_mma_xvf32gerpp(&acc4, rowA5, rowB2); + __builtin_mma_xvf32gerpp(&acc5, rowA6, rowB2); + __builtin_mma_xvf32gerpp(&acc6, rowA7, rowB2); + __builtin_mma_xvf32gerpp(&acc7, rowA8, rowB2); + } + for (l = (temp & (~1)); l < temp; ++l) + { + vec_t rowA1 = *(vec_t *) & AO[l<<4]; + vec_t rowA2 = *(vec_t *) & AO[(l<<4)+4]; + vec_t rowA3 = *(vec_t *) & 
AO[(l<<4)+8]; + vec_t rowA4 = *(vec_t *) & AO[(l<<4)+12]; + vec_t rowB1 = *(vec_t *) & BO[l<<1]; + __builtin_mma_xvf32gerpp(&acc0, rowA1, rowB1); + __builtin_mma_xvf32gerpp(&acc1, rowA2, rowB1); + __builtin_mma_xvf32gerpp(&acc2, rowA3, rowB1); + __builtin_mma_xvf32gerpp(&acc3, rowA4, rowB1); + } + SAVE_ACC_COMPLEX_21_4 + AO += temp << 4; + BO += temp << 1; + CO += 16; +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (8, 1) +#endif + } + if (m & 4) + { +#if defined(TRMMKERNEL) + REFRESH_POINTERS (4, 1) +#else + BO = B; + temp = k; +#endif + SET_ACC_ZERO() + for (l = 0; l < (temp & (~3)); l+=4) + { + vec_t rowA1 = *(vec_t *) & AO[l<<3]; + vec_t rowA2 = *(vec_t *) & AO[(l<<3)+4]; + vec_t rowA3 = *(vec_t *) & AO[(l<<3)+8]; + vec_t rowA4 = *(vec_t *) & AO[(l<<3)+12]; + vec_t rowA5 = *(vec_t *) & AO[(l<<3)+16]; + vec_t rowA6 = *(vec_t *) & AO[(l<<3)+20]; + vec_t rowA7 = *(vec_t *) & AO[(l<<3)+24]; + vec_t rowA8 = *(vec_t *) & AO[(l<<3)+28]; + vec_t rowB1 = *(vec_t *) & BO[l<<1]; + vec_t rowB2 = *(vec_t *) & BO[(l<<1)+2]; + vec_t rowB3 = *(vec_t *) & BO[(l<<1)+4]; + vec_t rowB4 = *(vec_t *) & BO[(l<<1)+6]; + __builtin_mma_xvf32gerpp(&acc0, rowA1, rowB1); + __builtin_mma_xvf32gerpp(&acc1, rowA2, rowB1); + __builtin_mma_xvf32gerpp(&acc2, rowA3, rowB2); + __builtin_mma_xvf32gerpp(&acc3, rowA4, rowB2); + __builtin_mma_xvf32gerpp(&acc4, rowA5, rowB3); + __builtin_mma_xvf32gerpp(&acc5, rowA6, rowB3); + __builtin_mma_xvf32gerpp(&acc6, rowA7, rowB4); + __builtin_mma_xvf32gerpp(&acc7, rowA8, rowB4); + } + for (l = (temp & (~3)); l < temp; ++l) + { + vec_t rowA1 = *(vec_t *) & AO[l<<3]; + vec_t rowA2 = *(vec_t *) & AO[(l<<3)+4]; + vec_t rowB1 = *(vec_t *) & BO[l<<1]; + __builtin_mma_xvf32gerpp(&acc0, rowA1, rowB1); + __builtin_mma_xvf32gerpp(&acc1, rowA2, rowB1); + } + SAVE_ACC_COMPLEX_21_2 + AO += temp << 3; + BO += temp << 1; + CO += 8; +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (4, 1) +#endif + } + if (m & 2) + { +#if defined(TRMMKERNEL) + REFRESH_POINTERS (2, 1) +#else + BO = B; + temp = k; +#endif + // RIP OUT MMA STUFF! 
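+      // 2x1 tail of the n==1 columns: the k loop is unrolled by 8 with
+      // one MMA accumulator per step; SAVE_ACC_COMPLEX_21_1 then folds
+      // acc0..acc7 into the two complex C entries, combining the packed
+      // ar*br/ai*bi and ar*bi/ai*br partial products via the
+      // COMP_MUL/COMP_MAC sign helpers defined at the top of this file.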
+ SET_ACC_ZERO() + for (l = 0; l < (temp & (~7)); l+=8) + { + vec_t rowA1 = *(vec_t *) & AO[l<<2]; + vec_t rowA2 = *(vec_t *) & AO[(l<<2)+4]; + vec_t rowA3 = *(vec_t *) & AO[(l<<2)+8]; + vec_t rowA4 = *(vec_t *) & AO[(l<<2)+12]; + vec_t rowA5 = *(vec_t *) & AO[(l<<2)+16]; + vec_t rowA6 = *(vec_t *) & AO[(l<<2)+20]; + vec_t rowA7 = *(vec_t *) & AO[(l<<2)+24]; + vec_t rowA8 = *(vec_t *) & AO[(l<<2)+28]; + vec_t rowB1 = *(vec_t *) & BO[l<<1]; + vec_t rowB2 = *(vec_t *) & BO[(l<<1)+2]; + vec_t rowB3 = *(vec_t *) & BO[(l<<1)+4]; + vec_t rowB4 = *(vec_t *) & BO[(l<<1)+6]; + vec_t rowB5 = *(vec_t *) & BO[(l<<1)+8]; + vec_t rowB6 = *(vec_t *) & BO[(l<<1)+10]; + vec_t rowB7 = *(vec_t *) & BO[(l<<1)+12]; + vec_t rowB8 = *(vec_t *) & BO[(l<<1)+14]; + __builtin_mma_xvf32gerpp(&acc0, rowA1, rowB1); + __builtin_mma_xvf32gerpp(&acc1, rowA2, rowB2); + __builtin_mma_xvf32gerpp(&acc2, rowA3, rowB3); + __builtin_mma_xvf32gerpp(&acc3, rowA4, rowB4); + __builtin_mma_xvf32gerpp(&acc4, rowA5, rowB5); + __builtin_mma_xvf32gerpp(&acc5, rowA6, rowB6); + __builtin_mma_xvf32gerpp(&acc6, rowA7, rowB7); + __builtin_mma_xvf32gerpp(&acc7, rowA8, rowB8); + } + for (l = (temp & (~7)); l < temp; ++l) + { + vec_t rowA1 = *(vec_t *) & AO[l<<2]; + vec_t rowB1 = *(vec_t *) & BO[l<<1]; + __builtin_mma_xvf32gerpp(&acc0, rowA1, rowB1); + } + SAVE_ACC_COMPLEX_21_1 + AO += temp << 2; + BO += temp << 1; + CO += 4; +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (2, 1) +#endif + } + if (m & 1) + { +#if defined(TRMMKERNEL) + REFRESH_POINTERS (1, 1) +#else + BO = B; + temp = k; +#endif + // RIP OUT MMA STUFF! + SET_ACC_ZERO() + for (l = 0; l < (temp & (~7)); l+=8) + { + vec_t rowA1 = *(vec_t *) & AO[l<<1]; + vec_t rowA2 = *(vec_t *) & AO[(l<<1)+2]; + vec_t rowA3 = *(vec_t *) & AO[(l<<1)+4]; + vec_t rowA4 = *(vec_t *) & AO[(l<<1)+6]; + vec_t rowA5 = *(vec_t *) & AO[(l<<1)+8]; + vec_t rowA6 = *(vec_t *) & AO[(l<<1)+10]; + vec_t rowA7 = *(vec_t *) & AO[(l<<1)+12]; + vec_t rowA8 = *(vec_t *) & AO[(l<<1)+14]; + vec_t rowB1 = *(vec_t *) & BO[l<<1]; + vec_t rowB2 = *(vec_t *) & BO[(l<<1)+2]; + vec_t rowB3 = *(vec_t *) & BO[(l<<1)+4]; + vec_t rowB4 = *(vec_t *) & BO[(l<<1)+6]; + vec_t rowB5 = *(vec_t *) & BO[(l<<1)+8]; + vec_t rowB6 = *(vec_t *) & BO[(l<<1)+10]; + vec_t rowB7 = *(vec_t *) & BO[(l<<1)+12]; + vec_t rowB8 = *(vec_t *) & BO[(l<<1)+14]; + __builtin_mma_xvf32gerpp(&acc0, rowA1, rowB1); + __builtin_mma_xvf32gerpp(&acc1, rowA2, rowB2); + __builtin_mma_xvf32gerpp(&acc2, rowA3, rowB3); + __builtin_mma_xvf32gerpp(&acc3, rowA4, rowB4); + __builtin_mma_xvf32gerpp(&acc4, rowA5, rowB5); + __builtin_mma_xvf32gerpp(&acc5, rowA6, rowB6); + __builtin_mma_xvf32gerpp(&acc6, rowA7, rowB7); + __builtin_mma_xvf32gerpp(&acc7, rowA8, rowB8); + } + for (l = (temp & (~7)); l < temp; ++l) + { + vec_t rowA1 = *(vec_t *) & AO[l<<1]; + vec_t rowB1 = *(vec_t *) & BO[l<<1]; + __builtin_mma_xvf32gerpp(&acc0, rowA1, rowB1); + } + SAVE_ACC_COMPLEX_11 + AO += temp<<1; + BO += temp<<1; + CO += 2; +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (1, 1) +#endif + } +#if defined(TRMMKERNEL) && !defined(LEFT) + off += 1; // number of values in A +#endif + B += k << 1; + } + return 0; +} diff --git a/kernel/power/zgemm_kernel_power10.c b/kernel/power/zgemm_kernel_power10.c new file mode 100644 index 000000000..e4e609067 --- /dev/null +++ b/kernel/power/zgemm_kernel_power10.c @@ -0,0 +1,761 @@ +/********************************************************************************* +Copyright (c) 2020, The OpenBLAS Project +All rights reserved. 
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************************/
+#include "common.h"
+#include <altivec.h>
+
+typedef __vector unsigned char vec_t;
+typedef FLOAT v4sf_t __attribute__ ((vector_size (16)));
+
+#define SET_ACC_ZERO() \
+  __builtin_mma_xxsetaccz (&acc0); \
+  __builtin_mma_xxsetaccz (&acc1); \
+  __builtin_mma_xxsetaccz (&acc2); \
+  __builtin_mma_xxsetaccz (&acc3); \
+  __builtin_mma_xxsetaccz (&acc4); \
+  __builtin_mma_xxsetaccz (&acc5); \
+  __builtin_mma_xxsetaccz (&acc6); \
+  __builtin_mma_xxsetaccz (&acc7);
+
+#if (defined(NN) || defined(NT) || defined(TN) || defined(TT))
+#define COMP_MUL(_real, _arbr, _aibi, _imag, _arbi, _aibr) { _real = _arbr - _aibi; _imag = _arbi + _aibr; }
+#define COMP_MAC(_real, _arbr, _aibi, _imag, _arbi, _aibr) { _real += _arbr - _aibi; _imag += _arbi + _aibr; }
+#endif
+
+#if (defined(NR) || defined(NC) || defined(TR) || defined(TC))
+#define COMP_MUL(_real, _arbr, _aibi, _imag, _arbi, _aibr) { _real = _arbr + _aibi; _imag = -_arbi + _aibr; }
+#define COMP_MAC(_real, _arbr, _aibi, _imag, _arbi, _aibr) { _real += _arbr + _aibi; _imag += -_arbi + _aibr; }
+#endif
+
+#if (defined(RN) || defined(RT) || defined(CN) || defined(CT))
+#define COMP_MUL(_real, _arbr, _aibi, _imag, _arbi, _aibr) { _real = _arbr + _aibi; _imag = _arbi - _aibr; }
+#define COMP_MAC(_real, _arbr, _aibi, _imag, _arbi, _aibr) { _real += _arbr + _aibi; _imag += _arbi - _aibr; }
+#endif
+
+#if (defined(RR) || defined(RC) || defined(CR) || defined(CC))
+#define COMP_MUL(_real, _arbr, _aibi, _imag, _arbi, _aibr) { _real = _arbr - _aibi; _imag = -_arbi - _aibr; }
+#define COMP_MAC(_real, _arbr, _aibi, _imag, _arbi, _aibr) { _real += _arbr - _aibi; _imag += -_arbi - _aibr; }
+#endif
+
+#if defined(TRMMKERNEL)
+#define A_OP =
+#else
+#define A_OP +=
+#endif
+
+#define BUILTIN_MMA_DISASSEMBLE_ACC_8 \
+  __builtin_mma_disassemble_acc ((void *)result, &acc0); \
+  __builtin_mma_disassemble_acc ((void *)&result[4], &acc1); \
+  __builtin_mma_disassemble_acc ((void *)&result[8], &acc2); \
+  __builtin_mma_disassemble_acc ((void *)&result[12], &acc3); \
+ 
__builtin_mma_disassemble_acc ((void *)&result[16], &acc4); \ + __builtin_mma_disassemble_acc ((void *)&result[20], &acc5); \ + __builtin_mma_disassemble_acc ((void *)&result[24], &acc6); \ + __builtin_mma_disassemble_acc ((void *)&result[28], &acc7); + +#define SAVE_ACC_COMPLEX_11 \ + BUILTIN_MMA_DISASSEMBLE_ACC_8 \ + COMP_MUL(tr[0], res[ 0], res[ 3], ti[0], res[ 1], res[ 2]) \ + COMP_MAC(tr[0], res[ 8], res[11], ti[0], res[ 9], res[10]) \ + COMP_MAC(tr[0], res[16], res[19], ti[0], res[17], res[18]) \ + COMP_MAC(tr[0], res[24], res[27], ti[0], res[25], res[26]) \ + COMP_MAC(tr[0], res[32], res[35], ti[0], res[33], res[34]) \ + COMP_MAC(tr[0], res[40], res[43], ti[0], res[41], res[42]) \ + COMP_MAC(tr[0], res[48], res[51], ti[0], res[49], res[50]) \ + COMP_MAC(tr[0], res[56], res[59], ti[0], res[57], res[58]) \ + CO[0] A_OP tr[0] * alpha_r - ti[0] * alpha_i; \ + CO[1] A_OP ti[0] * alpha_r + tr[0] * alpha_i; + +#define SAVE_ACC_COMPLEX_12 \ + BUILTIN_MMA_DISASSEMBLE_ACC_8 \ + COMP_MUL(tr[0], res[ 0], res[ 3], ti[0], res[ 1], res[ 2]) \ + COMP_MUL(tr[1], res[ 8], res[11], ti[1], res[ 9], res[10]) \ + COMP_MAC(tr[0], res[16], res[19], ti[0], res[17], res[18]) \ + COMP_MAC(tr[1], res[24], res[27], ti[1], res[25], res[26]) \ + COMP_MAC(tr[0], res[32], res[35], ti[0], res[33], res[34]) \ + COMP_MAC(tr[1], res[40], res[43], ti[1], res[41], res[42]) \ + COMP_MAC(tr[0], res[48], res[51], ti[0], res[49], res[50]) \ + COMP_MAC(tr[1], res[56], res[59], ti[1], res[57], res[58]) \ + CO[0] A_OP tr[0] * alpha_r - ti[0] * alpha_i; \ + CO[1] A_OP ti[0] * alpha_r + tr[0] * alpha_i; \ + CO[2*ldc+0] A_OP tr[1] * alpha_r - ti[1] * alpha_i; \ + CO[2*ldc+1] A_OP ti[1] * alpha_r + tr[1] * alpha_i; + +#define SAVE_ACC_COMPLEX_21_1 \ + BUILTIN_MMA_DISASSEMBLE_ACC_8 \ + COMP_MUL(tr[0], res[ 0], res[ 3], ti[0], res[ 1], res[ 2]) \ + COMP_MUL(tr[1], res[ 4], res[ 7], ti[1], res[ 5], res[ 6]) \ + COMP_MAC(tr[0], res[ 8], res[11], ti[0], res[ 9], res[10]) \ + COMP_MAC(tr[1], res[12], res[15], ti[1], res[13], res[14]) \ + COMP_MAC(tr[0], res[16], res[19], ti[0], res[17], res[18]) \ + COMP_MAC(tr[1], res[20], res[23], ti[1], res[21], res[22]) \ + COMP_MAC(tr[0], res[24], res[27], ti[0], res[25], res[26]) \ + COMP_MAC(tr[1], res[28], res[31], ti[1], res[29], res[30]) \ + COMP_MAC(tr[0], res[32], res[35], ti[0], res[33], res[34]) \ + COMP_MAC(tr[1], res[36], res[39], ti[1], res[37], res[38]) \ + COMP_MAC(tr[0], res[40], res[43], ti[0], res[41], res[42]) \ + COMP_MAC(tr[1], res[44], res[47], ti[1], res[45], res[46]) \ + COMP_MAC(tr[0], res[48], res[51], ti[0], res[49], res[50]) \ + COMP_MAC(tr[1], res[52], res[55], ti[1], res[53], res[54]) \ + COMP_MAC(tr[0], res[56], res[59], ti[0], res[57], res[58]) \ + COMP_MAC(tr[1], res[60], res[63], ti[1], res[61], res[62]) \ + CO[0] A_OP tr[0] * alpha_r - ti[0] * alpha_i; \ + CO[1] A_OP ti[0] * alpha_r + tr[0] * alpha_i; \ + CO[2] A_OP tr[1] * alpha_r - ti[1] * alpha_i; \ + CO[3] A_OP ti[1] * alpha_r + tr[1] * alpha_i; + +#define SAVE_ACC_COMPLEX_21_2 \ + BUILTIN_MMA_DISASSEMBLE_ACC_8 \ + COMP_MUL(tr[0], res[ 0], res[ 3], ti[0], res[ 1], res[ 2]) \ + COMP_MUL(tr[1], res[ 4], res[ 7], ti[1], res[ 5], res[ 6]) \ + COMP_MUL(tr[2], res[ 8], res[11], ti[2], res[ 9], res[10]) \ + COMP_MUL(tr[3], res[12], res[15], ti[3], res[13], res[14]) \ + COMP_MAC(tr[0], res[16], res[19], ti[0], res[17], res[18]) \ + COMP_MAC(tr[1], res[20], res[23], ti[1], res[21], res[22]) \ + COMP_MAC(tr[2], res[24], res[27], ti[2], res[25], res[26]) \ + COMP_MAC(tr[3], res[28], res[31], ti[3], res[29], res[30]) \ + 
COMP_MAC(tr[0], res[32], res[35], ti[0], res[33], res[34]) \ + COMP_MAC(tr[1], res[36], res[39], ti[1], res[37], res[38]) \ + COMP_MAC(tr[2], res[40], res[43], ti[2], res[41], res[42]) \ + COMP_MAC(tr[3], res[44], res[47], ti[3], res[45], res[46]) \ + COMP_MAC(tr[0], res[48], res[51], ti[0], res[49], res[50]) \ + COMP_MAC(tr[1], res[52], res[55], ti[1], res[53], res[54]) \ + COMP_MAC(tr[2], res[56], res[59], ti[2], res[57], res[58]) \ + COMP_MAC(tr[3], res[60], res[63], ti[3], res[61], res[62]) \ + CO[0] A_OP tr[0] * alpha_r - ti[0] * alpha_i; \ + CO[1] A_OP ti[0] * alpha_r + tr[0] * alpha_i; \ + CO[2] A_OP tr[1] * alpha_r - ti[1] * alpha_i; \ + CO[3] A_OP ti[1] * alpha_r + tr[1] * alpha_i; \ + CO[4] A_OP tr[2] * alpha_r - ti[2] * alpha_i; \ + CO[5] A_OP ti[2] * alpha_r + tr[2] * alpha_i; \ + CO[6] A_OP tr[3] * alpha_r - ti[3] * alpha_i; \ + CO[7] A_OP ti[3] * alpha_r + tr[3] * alpha_i; + +#define SAVE_ACC_COMPLEX_21_4 \ + BUILTIN_MMA_DISASSEMBLE_ACC_8 \ + COMP_MUL(tr[0], res[ 0], res[ 3], ti[0], res[ 1], res[ 2]) \ + COMP_MUL(tr[1], res[ 4], res[ 7], ti[1], res[ 5], res[ 6]) \ + COMP_MUL(tr[2], res[ 8], res[11], ti[2], res[ 9], res[10]) \ + COMP_MUL(tr[3], res[12], res[15], ti[3], res[13], res[14]) \ + COMP_MUL(tr[4], res[16], res[19], ti[4], res[17], res[18]) \ + COMP_MUL(tr[5], res[20], res[23], ti[5], res[21], res[22]) \ + COMP_MUL(tr[6], res[24], res[27], ti[6], res[25], res[26]) \ + COMP_MUL(tr[7], res[28], res[31], ti[7], res[29], res[30]) \ + COMP_MAC(tr[0], res[32], res[35], ti[0], res[33], res[34]) \ + COMP_MAC(tr[1], res[36], res[39], ti[1], res[37], res[38]) \ + COMP_MAC(tr[2], res[40], res[43], ti[2], res[41], res[42]) \ + COMP_MAC(tr[3], res[44], res[47], ti[3], res[45], res[46]) \ + COMP_MAC(tr[4], res[48], res[51], ti[4], res[49], res[50]) \ + COMP_MAC(tr[5], res[52], res[55], ti[5], res[53], res[54]) \ + COMP_MAC(tr[6], res[56], res[59], ti[6], res[57], res[58]) \ + COMP_MAC(tr[7], res[60], res[63], ti[7], res[61], res[62]) \ + CO[ 0] A_OP tr[0] * alpha_r - ti[0] * alpha_i; \ + CO[ 1] A_OP ti[0] * alpha_r + tr[0] * alpha_i; \ + CO[ 2] A_OP tr[1] * alpha_r - ti[1] * alpha_i; \ + CO[ 3] A_OP ti[1] * alpha_r + tr[1] * alpha_i; \ + CO[ 4] A_OP tr[2] * alpha_r - ti[2] * alpha_i; \ + CO[ 5] A_OP ti[2] * alpha_r + tr[2] * alpha_i; \ + CO[ 6] A_OP tr[3] * alpha_r - ti[3] * alpha_i; \ + CO[ 7] A_OP ti[3] * alpha_r + tr[3] * alpha_i; \ + CO[ 8] A_OP tr[4] * alpha_r - ti[4] * alpha_i; \ + CO[ 9] A_OP ti[4] * alpha_r + tr[4] * alpha_i; \ + CO[10] A_OP tr[5] * alpha_r - ti[5] * alpha_i; \ + CO[11] A_OP ti[5] * alpha_r + tr[5] * alpha_i; \ + CO[12] A_OP tr[6] * alpha_r - ti[6] * alpha_i; \ + CO[13] A_OP ti[6] * alpha_r + tr[6] * alpha_i; \ + CO[14] A_OP tr[7] * alpha_r - ti[7] * alpha_i; \ + CO[15] A_OP ti[7] * alpha_r + tr[7] * alpha_i; + +#define SAVE_ACC_COMPLEX_22_1 \ + __builtin_mma_disassemble_acc ((void *)result, &acc0); \ + __builtin_mma_disassemble_acc ((void *)(&result[4]), &acc1); \ + COMP_MUL(tr[0], res[0], res[3], ti[0], res[1], res[2]) \ + COMP_MUL(tr[1], res[4], res[7], ti[1], res[5], res[6]) \ + COMP_MUL(tr[2], res[8], res[11], ti[2], res[9], res[10]) \ + COMP_MUL(tr[3], res[12], res[15], ti[3], res[13], res[14] ) \ + CO[0] A_OP tr[0] * alpha_r - ti[0] * alpha_i; \ + CO[1] A_OP ti[0] * alpha_r + tr[0] * alpha_i; \ + CO[2] A_OP tr[1] * alpha_r - ti[1] * alpha_i; \ + CO[3] A_OP ti[1] * alpha_r + tr[1] * alpha_i; \ + CO[2*ldc+0] A_OP tr[2] * alpha_r - ti[2] * alpha_i; \ + CO[2*ldc+1] A_OP ti[2] * alpha_r + tr[2] * alpha_i; \ + CO[2*ldc+2] A_OP tr[3] * alpha_r - ti[3] * 
alpha_i; \ + CO[2*ldc+3] A_OP ti[3] * alpha_r + tr[3] * alpha_i; + +#define SAVE_ACC_COMPLEX_22_2(ACC1, ACC2, CI) \ + __builtin_mma_disassemble_acc ((void *)result, ACC1); \ + __builtin_mma_disassemble_acc ((void *)(&result[4]), ACC2); \ + COMP_MUL(tr[0], res[0], res[3], ti[0], res[1], res[2]) \ + COMP_MUL(tr[1], res[4], res[7], ti[1], res[5], res[6]) \ + COMP_MUL(tr[2], res[8], res[11], ti[2], res[9], res[10]) \ + COMP_MUL(tr[3], res[12], res[15], ti[3], res[13], res[14]) \ + CO[CI+0] A_OP tr[0] * alpha_r - ti[0] * alpha_i; \ + CO[CI+1] A_OP ti[0] * alpha_r + tr[0] * alpha_i; \ + CO[CI+2] A_OP tr[1] * alpha_r - ti[1] * alpha_i; \ + CO[CI+3] A_OP ti[1] * alpha_r + tr[1] * alpha_i; \ + CO[2*ldc+CI+0] A_OP tr[2] * alpha_r - ti[2] * alpha_i; \ + CO[2*ldc+CI+1] A_OP ti[2] * alpha_r + tr[2] * alpha_i; \ + CO[2*ldc+CI+2] A_OP tr[3] * alpha_r - ti[3] * alpha_i; \ + CO[2*ldc+CI+3] A_OP ti[3] * alpha_r + tr[3] * alpha_i; + +#define PREFETCH1(x, y) asm volatile ("dcbt %0, %1" : : "r" (x), "b" (y) : "memory"); + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) +#define REFRESH_TEMP_BK(x, y) \ + temp = k - off; +#elif defined(LEFT) +#define REFRESH_TEMP_BK(x, y) \ + temp = off + x; +#else +#define REFRESH_TEMP_BK(x, y) \ + temp = off + y; +#endif +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) +#define REFRESH_POINTERS(x, y) \ + BO = B; \ + REFRESH_TEMP_BK(x, y) +#else +#define REFRESH_POINTERS(x, y) \ + AO += off * (2*x); \ + BO = B + off * (2*y); \ + REFRESH_TEMP_BK(x, y) +#endif + +#ifdef LEFT +#define REFRESH_OFF(x) \ + off += x; +#else +#define REFRESH_OFF(x) +#endif + +#ifdef LEFT +#define UPDATE_TEMP(x, y) \ + temp -= x; +#else +#define UPDATE_TEMP(x, y) \ + temp -= y; +#endif + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) +#define REFRESH_TMP_AFTER_SAVE(x, y) \ + temp = k - off; \ + UPDATE_TEMP(x, y) \ + AO += temp * (2*x); \ + BO += temp * (2*y); +#else +#define REFRESH_TMP_AFTER_SAVE(x, y) +#endif + +#define REFRESH_AFTER_SAVE(x,y) \ + REFRESH_TMP_AFTER_SAVE(x, y) \ + REFRESH_OFF(x) +/************************************************************************************* +* GEMM Kernel +*************************************************************************************/ +int +CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r, FLOAT alpha_i, FLOAT * A, FLOAT * B, + FLOAT * C, BLASLONG ldc +#ifdef TRMMKERNEL + , BLASLONG offset +#endif + ) +{ + BLASLONG i1, i, l, temp; + FLOAT *AO, *BO, *CO; +#if defined(TRMMKERNEL) + BLASLONG off; +#endif +#if defined(TRMMKERNEL) && !defined(LEFT) + off = -offset; +#endif + __vector_quad acc0, acc1, acc2, acc3, acc4, acc5, acc6, acc7; + + v4sf_t result[32]; + FLOAT *res, tr[16], ti[16]; + res = (FLOAT *) result; + + for (i1 = 0; i1 < (n >> 1); i1++) + { +#if defined(TRMMKERNEL) && defined(LEFT) + off = offset; +#endif + AO = A; + CO = C; + C += ldc<<2; + for (i = 0; i < (m >> 3); i++) + { +#if defined(TRMMKERNEL) + REFRESH_POINTERS (8, 2) +#else + BO = B; + temp = k; +#endif + SET_ACC_ZERO() + for (l = 0; l < temp; ++l) + { + __vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<4])); + __vector_pair rowA2 = *((__vector_pair *)((void *)&AO[(l<<4)+4])); + __vector_pair rowA3 = *((__vector_pair *)((void *)&AO[(l<<4)+8])); + __vector_pair rowA4 = *((__vector_pair *)((void *)&AO[(l<<4)+12])); + vec_t rowB1 = *(vec_t *) & BO[l<<2]; + vec_t rowB2 = *(vec_t *) & BO[(l<<2)+2]; + __builtin_mma_xvf64gerpp(&acc0, rowA1, rowB1); + __builtin_mma_xvf64gerpp(&acc1, 
rowA2, rowB1); + __builtin_mma_xvf64gerpp(&acc2, rowA3, rowB1); + __builtin_mma_xvf64gerpp(&acc3, rowA4, rowB1); + __builtin_mma_xvf64gerpp(&acc4, rowA1, rowB2); + __builtin_mma_xvf64gerpp(&acc5, rowA2, rowB2); + __builtin_mma_xvf64gerpp(&acc6, rowA3, rowB2); + __builtin_mma_xvf64gerpp(&acc7, rowA4, rowB2); + } + __builtin_mma_disassemble_acc ((void *)result, &acc0); + __builtin_mma_disassemble_acc ((void *)(&result[ 4]), &acc1); + __builtin_mma_disassemble_acc ((void *)(&result[ 8]), &acc2); + __builtin_mma_disassemble_acc ((void *)(&result[12]), &acc3); + __builtin_mma_disassemble_acc ((void *)(&result[16]), &acc4); + __builtin_mma_disassemble_acc ((void *)(&result[20]), &acc5); + __builtin_mma_disassemble_acc ((void *)(&result[24]), &acc6); + __builtin_mma_disassemble_acc ((void *)(&result[28]), &acc7); + COMP_MUL(tr[ 0], res[ 0], res[ 3], ti[ 0], res[ 1], res[ 2]) + COMP_MUL(tr[ 1], res[ 4], res[ 7], ti[ 1], res[ 5], res[ 6]) + COMP_MUL(tr[ 2], res[ 8], res[11], ti[ 2], res[ 9], res[10]) + COMP_MUL(tr[ 3], res[12], res[15], ti[ 3], res[13], res[14]) + COMP_MUL(tr[ 4], res[16], res[19], ti[ 4], res[17], res[18]) + COMP_MUL(tr[ 5], res[20], res[23], ti[ 5], res[21], res[22]) + COMP_MUL(tr[ 6], res[24], res[27], ti[ 6], res[25], res[26]) + COMP_MUL(tr[ 7], res[28], res[31], ti[ 7], res[29], res[30]) + COMP_MUL(tr[ 8], res[32], res[35], ti[ 8], res[33], res[34]) + COMP_MUL(tr[ 9], res[36], res[39], ti[ 9], res[37], res[38]) + COMP_MUL(tr[10], res[40], res[43], ti[10], res[41], res[42]) + COMP_MUL(tr[11], res[44], res[47], ti[11], res[45], res[46]) + COMP_MUL(tr[12], res[48], res[51], ti[12], res[49], res[50]) + COMP_MUL(tr[13], res[52], res[55], ti[13], res[53], res[54]) + COMP_MUL(tr[14], res[56], res[59], ti[14], res[57], res[58]) + COMP_MUL(tr[15], res[60], res[63], ti[15], res[61], res[62]) + CO[ 0] A_OP tr[0] * alpha_r - ti[0] * alpha_i; + CO[ 1] A_OP ti[0] * alpha_r + tr[0] * alpha_i; + CO[ 2] A_OP tr[1] * alpha_r - ti[1] * alpha_i; + CO[ 3] A_OP ti[1] * alpha_r + tr[1] * alpha_i; + CO[ 4] A_OP tr[2] * alpha_r - ti[2] * alpha_i; + CO[ 5] A_OP ti[2] * alpha_r + tr[2] * alpha_i; + CO[ 6] A_OP tr[3] * alpha_r - ti[3] * alpha_i; + CO[ 7] A_OP ti[3] * alpha_r + tr[3] * alpha_i; + CO[ 8] A_OP tr[4] * alpha_r - ti[4] * alpha_i; + CO[ 9] A_OP ti[4] * alpha_r + tr[4] * alpha_i; + CO[10] A_OP tr[5] * alpha_r - ti[5] * alpha_i; + CO[11] A_OP ti[5] * alpha_r + tr[5] * alpha_i; + CO[12] A_OP tr[6] * alpha_r - ti[6] * alpha_i; + CO[13] A_OP ti[6] * alpha_r + tr[6] * alpha_i; + CO[14] A_OP tr[7] * alpha_r - ti[7] * alpha_i; + CO[15] A_OP ti[7] * alpha_r + tr[7] * alpha_i; + CO[2*ldc+ 0] A_OP tr[ 8] * alpha_r - ti[ 8] * alpha_i; + CO[2*ldc+ 1] A_OP ti[ 8] * alpha_r + tr[ 8] * alpha_i; + CO[2*ldc+ 2] A_OP tr[ 9] * alpha_r - ti[ 9] * alpha_i; + CO[2*ldc+ 3] A_OP ti[ 9] * alpha_r + tr[ 9] * alpha_i; + CO[2*ldc+ 4] A_OP tr[10] * alpha_r - ti[10] * alpha_i; + CO[2*ldc+ 5] A_OP ti[10] * alpha_r + tr[10] * alpha_i; + CO[2*ldc+ 6] A_OP tr[11] * alpha_r - ti[11] * alpha_i; + CO[2*ldc+ 7] A_OP ti[11] * alpha_r + tr[11] * alpha_i; + CO[2*ldc+ 8] A_OP tr[12] * alpha_r - ti[12] * alpha_i; + CO[2*ldc+ 9] A_OP ti[12] * alpha_r + tr[12] * alpha_i; + CO[2*ldc+10] A_OP tr[13] * alpha_r - ti[13] * alpha_i; + CO[2*ldc+11] A_OP ti[13] * alpha_r + tr[13] * alpha_i; + CO[2*ldc+12] A_OP tr[14] * alpha_r - ti[14] * alpha_i; + CO[2*ldc+13] A_OP ti[14] * alpha_r + tr[14] * alpha_i; + CO[2*ldc+14] A_OP tr[15] * alpha_r - ti[15] * alpha_i; + CO[2*ldc+15] A_OP ti[15] * alpha_r + tr[15] * alpha_i; + + AO += temp << 4; + BO += 
temp << 2; + CO += 16; +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (8, 2) +#endif + } + if (m & 4) + { +#if defined(TRMMKERNEL) + REFRESH_POINTERS (4, 2) +#else + BO = B; + temp = k; +#endif + SET_ACC_ZERO() + for (l = 0; l < (temp & (~1)); l+=2) + { + __vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<3])); + __vector_pair rowA2 = *((__vector_pair *)((void *)&AO[(l<<3)+4])); + __vector_pair rowA3 = *((__vector_pair *)((void *)&AO[(l<<3)+8])); + __vector_pair rowA4 = *((__vector_pair *)((void *)&AO[(l<<3)+12])); + vec_t rowB1 = *(vec_t *) & BO[l<<2]; + vec_t rowB2 = *(vec_t *) & BO[(l<<2)+2]; + vec_t rowB3 = *(vec_t *) & BO[(l<<2)+4]; + vec_t rowB4 = *(vec_t *) & BO[(l<<2)+6]; + __builtin_mma_xvf64gerpp(&acc0, rowA1, rowB1); + __builtin_mma_xvf64gerpp(&acc1, rowA2, rowB1); + __builtin_mma_xvf64gerpp(&acc2, rowA1, rowB2); + __builtin_mma_xvf64gerpp(&acc3, rowA2, rowB2); + __builtin_mma_xvf64gerpp(&acc0, rowA3, rowB3); + __builtin_mma_xvf64gerpp(&acc1, rowA4, rowB3); + __builtin_mma_xvf64gerpp(&acc2, rowA3, rowB4); + __builtin_mma_xvf64gerpp(&acc3, rowA4, rowB4); + } + for (l = (temp & (~1)); l < temp; ++l) + { + __vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<3])); + __vector_pair rowA2 = *((__vector_pair *)((void *)&AO[(l<<3)+4])); + vec_t rowB1 = *(vec_t *) & BO[l<<2]; + vec_t rowB2 = *(vec_t *) & BO[(l<<2)+2]; + __builtin_mma_xvf64gerpp(&acc0, rowA1, rowB1); + __builtin_mma_xvf64gerpp(&acc1, rowA2, rowB1); + __builtin_mma_xvf64gerpp(&acc2, rowA1, rowB2); + __builtin_mma_xvf64gerpp(&acc3, rowA2, rowB2); + } + SAVE_ACC_COMPLEX_22_2(&acc0, &acc2, 0) + SAVE_ACC_COMPLEX_22_2(&acc1, &acc3, 4) + AO += temp << 3; + BO += temp << 2; + CO += 8; +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (4, 2) +#endif + } + if (m & 2) + { +#if defined(TRMMKERNEL) + REFRESH_POINTERS (2, 2) +#else + BO = B; + temp = k; +#endif + SET_ACC_ZERO() + for (l = 0; l < (temp & (~3)); l+=4) + { + __vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<2])); + __vector_pair rowA2 = *((__vector_pair *)((void *)&AO[(l<<2)+4])); + __vector_pair rowA3 = *((__vector_pair *)((void *)&AO[(l<<2)+8])); + __vector_pair rowA4 = *((__vector_pair *)((void *)&AO[(l<<2)+12])); + vec_t rowB1 = *(vec_t *) & BO[l<<2]; + vec_t rowB2 = *(vec_t *) & BO[(l<<2)+2]; + vec_t rowB3 = *(vec_t *) & BO[(l<<2)+4]; + vec_t rowB4 = *(vec_t *) & BO[(l<<2)+6]; + vec_t rowB5 = *(vec_t *) & BO[(l<<2)+8]; + vec_t rowB6 = *(vec_t *) & BO[(l<<2)+10]; + vec_t rowB7 = *(vec_t *) & BO[(l<<2)+12]; + vec_t rowB8 = *(vec_t *) & BO[(l<<2)+14]; + __builtin_mma_xvf64gerpp(&acc0, rowA1, rowB1); + __builtin_mma_xvf64gerpp(&acc1, rowA1, rowB2); + __builtin_mma_xvf64gerpp(&acc0, rowA2, rowB3); + __builtin_mma_xvf64gerpp(&acc1, rowA2, rowB4); + __builtin_mma_xvf64gerpp(&acc0, rowA3, rowB5); + __builtin_mma_xvf64gerpp(&acc1, rowA3, rowB6); + __builtin_mma_xvf64gerpp(&acc0, rowA4, rowB7); + __builtin_mma_xvf64gerpp(&acc1, rowA4, rowB8); + } + for (l = (temp & (~3)); l < temp; ++l) + { + __vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<2])); + vec_t rowB1 = *(vec_t *) & BO[l<<2]; + vec_t rowB2 = *(vec_t *) & BO[(l<<2)+2]; + __builtin_mma_xvf64gerpp(&acc0, rowA1, rowB1); + __builtin_mma_xvf64gerpp(&acc1, rowA1, rowB2); + } + SAVE_ACC_COMPLEX_22_1 + AO += temp << 2; + BO += temp << 2; + CO += 4; +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (2, 2) +#endif + } + if (m & 1) + { +#if defined(TRMMKERNEL) + REFRESH_POINTERS (1, 2) +#else + BO = B; + temp = k; +#endif + // RIP OUT MMA STUFF! 
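+      // 1x2 tail: each __vector_pair load below carries one complex
+      // double of A; k is unrolled by 4, accumulating column 0 in acc0
+      // and column 1 in acc1, and SAVE_ACC_COMPLEX_12 writes one complex
+      // C entry per column (the remaining accumulators stay zero).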
+ SET_ACC_ZERO() + for (l = 0; l < (temp & (~3)); l+=4) + { + __vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<1])); + __vector_pair rowA2 = *((__vector_pair *)((void *)&AO[(l<<1)+2])); + __vector_pair rowA3 = *((__vector_pair *)((void *)&AO[(l<<1)+4])); + __vector_pair rowA4 = *((__vector_pair *)((void *)&AO[(l<<1)+6])); + vec_t rowB1 = *(vec_t *) & BO[l<<2]; + vec_t rowB2 = *(vec_t *) & BO[(l<<2)+2]; + vec_t rowB3 = *(vec_t *) & BO[(l<<2)+4]; + vec_t rowB4 = *(vec_t *) & BO[(l<<2)+6]; + vec_t rowB5 = *(vec_t *) & BO[(l<<2)+8]; + vec_t rowB6 = *(vec_t *) & BO[(l<<2)+10]; + vec_t rowB7 = *(vec_t *) & BO[(l<<2)+12]; + vec_t rowB8 = *(vec_t *) & BO[(l<<2)+14]; + __builtin_mma_xvf64gerpp(&acc0, rowA1, rowB1); + __builtin_mma_xvf64gerpp(&acc1, rowA1, rowB2); + __builtin_mma_xvf64gerpp(&acc0, rowA2, rowB3); + __builtin_mma_xvf64gerpp(&acc1, rowA2, rowB4); + __builtin_mma_xvf64gerpp(&acc0, rowA3, rowB5); + __builtin_mma_xvf64gerpp(&acc1, rowA3, rowB6); + __builtin_mma_xvf64gerpp(&acc0, rowA4, rowB7); + __builtin_mma_xvf64gerpp(&acc1, rowA4, rowB8); + } + for (l = (temp & (~3)); l < temp; ++l) + { + __vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<1])); + vec_t rowB1 = *(vec_t *) & BO[l<<2]; + vec_t rowB2 = *(vec_t *) & BO[(l<<2)+2]; + __builtin_mma_xvf64gerpp(&acc0, rowA1, rowB1); + __builtin_mma_xvf64gerpp(&acc1, rowA1, rowB2); + } + SAVE_ACC_COMPLEX_12 + AO += temp << 1; + BO += temp << 2; + CO += 2; +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (1, 2) +#endif + } +#if defined(TRMMKERNEL) && !defined(LEFT) + off += 2; // number of values in A +#endif + B += k << 2; + } + if (n & 1) + { +#if defined(TRMMKERNEL) && defined(LEFT) + off = offset; +#endif + AO = A; + CO = C; + C += ldc<<1; + for (i = 0; i < (m >> 3); i++) + { +#if defined(TRMMKERNEL) + REFRESH_POINTERS (8, 1) +#else + BO = B; + temp = k; +#endif + SET_ACC_ZERO() + for (l = 0; l < (temp & (~1)); l+=2) + { + __vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<4])); + __vector_pair rowA2 = *((__vector_pair *)((void *)&AO[(l<<4)+4])); + __vector_pair rowA3 = *((__vector_pair *)((void *)&AO[(l<<4)+8])); + __vector_pair rowA4 = *((__vector_pair *)((void *)&AO[(l<<4)+12])); + __vector_pair rowA5 = *((__vector_pair *)((void *)&AO[(l<<4)+16])); + __vector_pair rowA6 = *((__vector_pair *)((void *)&AO[(l<<4)+20])); + __vector_pair rowA7 = *((__vector_pair *)((void *)&AO[(l<<4)+24])); + __vector_pair rowA8 = *((__vector_pair *)((void *)&AO[(l<<4)+28])); + vec_t rowB1 = *(vec_t *) & BO[l<<1]; + vec_t rowB2 = *(vec_t *) & BO[(l<<1)+2]; + __builtin_mma_xvf64gerpp(&acc0, rowA1, rowB1); + __builtin_mma_xvf64gerpp(&acc1, rowA2, rowB1); + __builtin_mma_xvf64gerpp(&acc2, rowA3, rowB1); + __builtin_mma_xvf64gerpp(&acc3, rowA4, rowB1); + __builtin_mma_xvf64gerpp(&acc0, rowA5, rowB2); + __builtin_mma_xvf64gerpp(&acc1, rowA6, rowB2); + __builtin_mma_xvf64gerpp(&acc2, rowA7, rowB2); + __builtin_mma_xvf64gerpp(&acc3, rowA8, rowB2); + } + for (l = (temp & (~1)); l < temp; ++l) + { + __vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<4])); + __vector_pair rowA2 = *((__vector_pair *)((void *)&AO[(l<<4)+4])); + __vector_pair rowA3 = *((__vector_pair *)((void *)&AO[(l<<4)+8])); + __vector_pair rowA4 = *((__vector_pair *)((void *)&AO[(l<<4)+12])); + vec_t rowB1 = *(vec_t *) & BO[l<<1]; + __builtin_mma_xvf64gerpp(&acc0, rowA1, rowB1); + __builtin_mma_xvf64gerpp(&acc1, rowA2, rowB1); + __builtin_mma_xvf64gerpp(&acc2, rowA3, rowB1); + __builtin_mma_xvf64gerpp(&acc3, rowA4, rowB1); + } + SAVE_ACC_COMPLEX_21_4 + + AO += temp << 4; + BO += 
temp << 1; + CO += 16; +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (8, 1) +#endif + } + if (m & 4) + { +#if defined(TRMMKERNEL) + REFRESH_POINTERS (4, 1) +#else + BO = B; + temp = k; +#endif + SET_ACC_ZERO() + for (l = 0; l < (temp & (~3)); l+=4) + { + __vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<3])); + __vector_pair rowA2 = *((__vector_pair *)((void *)&AO[(l<<3)+4])); + __vector_pair rowA3 = *((__vector_pair *)((void *)&AO[(l<<3)+8])); + __vector_pair rowA4 = *((__vector_pair *)((void *)&AO[(l<<3)+12])); + __vector_pair rowA5 = *((__vector_pair *)((void *)&AO[(l<<3)+16])); + __vector_pair rowA6 = *((__vector_pair *)((void *)&AO[(l<<3)+20])); + __vector_pair rowA7 = *((__vector_pair *)((void *)&AO[(l<<3)+24])); + __vector_pair rowA8 = *((__vector_pair *)((void *)&AO[(l<<3)+28])); + vec_t rowB1 = *(vec_t *) & BO[l<<1]; + vec_t rowB2 = *(vec_t *) & BO[(l<<1)+2]; + vec_t rowB3 = *(vec_t *) & BO[(l<<1)+4]; + vec_t rowB4 = *(vec_t *) & BO[(l<<1)+6]; + __builtin_mma_xvf64gerpp(&acc0, rowA1, rowB1); + __builtin_mma_xvf64gerpp(&acc1, rowA2, rowB1); + __builtin_mma_xvf64gerpp(&acc2, rowA3, rowB2); + __builtin_mma_xvf64gerpp(&acc3, rowA4, rowB2); + __builtin_mma_xvf64gerpp(&acc4, rowA5, rowB3); + __builtin_mma_xvf64gerpp(&acc5, rowA6, rowB3); + __builtin_mma_xvf64gerpp(&acc6, rowA7, rowB4); + __builtin_mma_xvf64gerpp(&acc7, rowA8, rowB4); + } + for (l = (temp & (~3)); l < temp; ++l) + { + __vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<3])); + __vector_pair rowA2 = *((__vector_pair *)((void *)&AO[(l<<3)+4])); + vec_t rowB1 = *(vec_t *) & BO[l<<1]; + __builtin_mma_xvf64gerpp(&acc0, rowA1, rowB1); + __builtin_mma_xvf64gerpp(&acc1, rowA2, rowB1); + } + SAVE_ACC_COMPLEX_21_2 + AO += temp << 3; + BO += temp << 1; + CO += 8; +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (4, 1) +#endif + } if (m & 2) + { +#if defined(TRMMKERNEL) + REFRESH_POINTERS (2, 1) +#else + BO = B; + temp = k; +#endif + SET_ACC_ZERO() + for (l = 0; l < (temp & (~7)); l+=8) + { + __vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<2])); + __vector_pair rowA2 = *((__vector_pair *)((void *)&AO[(l<<2)+4])); + __vector_pair rowA3 = *((__vector_pair *)((void *)&AO[(l<<2)+8])); + __vector_pair rowA4 = *((__vector_pair *)((void *)&AO[(l<<2)+12])); + __vector_pair rowA5 = *((__vector_pair *)((void *)&AO[(l<<2)+16])); + __vector_pair rowA6 = *((__vector_pair *)((void *)&AO[(l<<2)+20])); + __vector_pair rowA7 = *((__vector_pair *)((void *)&AO[(l<<2)+24])); + __vector_pair rowA8 = *((__vector_pair *)((void *)&AO[(l<<2)+28])); + vec_t rowB1 = *(vec_t *) & BO[l<<1]; + vec_t rowB2 = *(vec_t *) & BO[(l<<1)+2]; + vec_t rowB3 = *(vec_t *) & BO[(l<<1)+4]; + vec_t rowB4 = *(vec_t *) & BO[(l<<1)+6]; + vec_t rowB5 = *(vec_t *) & BO[(l<<1)+8]; + vec_t rowB6 = *(vec_t *) & BO[(l<<1)+10]; + vec_t rowB7 = *(vec_t *) & BO[(l<<1)+12]; + vec_t rowB8 = *(vec_t *) & BO[(l<<1)+14]; + __builtin_mma_xvf64gerpp(&acc0, rowA1, rowB1); + __builtin_mma_xvf64gerpp(&acc1, rowA2, rowB2); + __builtin_mma_xvf64gerpp(&acc2, rowA3, rowB3); + __builtin_mma_xvf64gerpp(&acc3, rowA4, rowB4); + __builtin_mma_xvf64gerpp(&acc4, rowA5, rowB5); + __builtin_mma_xvf64gerpp(&acc5, rowA6, rowB6); + __builtin_mma_xvf64gerpp(&acc6, rowA7, rowB7); + __builtin_mma_xvf64gerpp(&acc7, rowA8, rowB8); + } + for (l = (temp & (~7)); l < temp; ++l) + { + __vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<2])); + vec_t rowB1 = *(vec_t *) & BO[l<<1]; + __builtin_mma_xvf64gerpp(&acc0, rowA1, rowB1); + } + SAVE_ACC_COMPLEX_21_1 + AO += temp << 2; + BO += temp << 1; + CO += 
4; +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (2, 1) +#endif + } + if (m & 1) + { +#if defined(TRMMKERNEL) + REFRESH_POINTERS (1, 1) +#else + BO = B; + temp = k; +#endif + // RIP OUT MMA STUFF! + SET_ACC_ZERO() + for (l = 0; l < (temp & (~7)); l+=8) + { + __vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<1])); + __vector_pair rowA2 = *((__vector_pair *)((void *)&AO[(l<<1)+2])); + __vector_pair rowA3 = *((__vector_pair *)((void *)&AO[(l<<1)+4])); + __vector_pair rowA4 = *((__vector_pair *)((void *)&AO[(l<<1)+6])); + __vector_pair rowA5 = *((__vector_pair *)((void *)&AO[(l<<1)+8])); + __vector_pair rowA6 = *((__vector_pair *)((void *)&AO[(l<<1)+10])); + __vector_pair rowA7 = *((__vector_pair *)((void *)&AO[(l<<1)+12])); + __vector_pair rowA8 = *((__vector_pair *)((void *)&AO[(l<<1)+14])); + vec_t rowB1 = *(vec_t *) & BO[l<<1]; + vec_t rowB2 = *(vec_t *) & BO[(l<<1)+2]; + vec_t rowB3 = *(vec_t *) & BO[(l<<1)+4]; + vec_t rowB4 = *(vec_t *) & BO[(l<<1)+6]; + vec_t rowB5 = *(vec_t *) & BO[(l<<1)+8]; + vec_t rowB6 = *(vec_t *) & BO[(l<<1)+10]; + vec_t rowB7 = *(vec_t *) & BO[(l<<1)+12]; + vec_t rowB8 = *(vec_t *) & BO[(l<<1)+14]; + __builtin_mma_xvf64gerpp(&acc0, rowA1, rowB1); + __builtin_mma_xvf64gerpp(&acc1, rowA2, rowB2); + __builtin_mma_xvf64gerpp(&acc2, rowA3, rowB3); + __builtin_mma_xvf64gerpp(&acc3, rowA4, rowB4); + __builtin_mma_xvf64gerpp(&acc4, rowA5, rowB5); + __builtin_mma_xvf64gerpp(&acc5, rowA6, rowB6); + __builtin_mma_xvf64gerpp(&acc6, rowA7, rowB7); + __builtin_mma_xvf64gerpp(&acc7, rowA8, rowB8); + } + for (l = (temp & (~7)); l < temp; ++l) + { + __vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<1])); + vec_t rowB1 = *(vec_t *) & BO[l<<1]; + __builtin_mma_xvf64gerpp(&acc0, rowA1, rowB1); + } + SAVE_ACC_COMPLEX_11 + AO += temp << 1; + BO += temp << 1; + CO += 2; +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (1, 1) +#endif + } +#if defined(TRMMKERNEL) && !defined(LEFT) + off += 1; // number of values in A +#endif + B += k << 1; + } + return 0; +} From 9762464718183fe3d73e42c83801596111a0df77 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 9 Oct 2024 18:06:39 +0200 Subject: [PATCH 12/15] Fix CBLAS interface filling in the wrong triangle for Row-Major --- interface/gemmt.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/interface/gemmt.c b/interface/gemmt.c index cae00877e..01747af41 100644 --- a/interface/gemmt.c +++ b/interface/gemmt.c @@ -319,8 +319,8 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, lda = LDB; ldb = LDA; - if (Uplo == CblasUpper) uplo = 0; - if (Uplo == CblasLower) uplo = 1; + if (Uplo == CblasUpper) uplo = 1; + if (Uplo == CblasLower) uplo = 0; if (TransB == CblasNoTrans) transa = 0; From b89fb9632f73b9e01ec09631763bfb9b5cb65b51 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 10 Oct 2024 10:19:11 +0200 Subject: [PATCH 13/15] Update Android NDK install path for M1/armv7 crossbuild --- .cirrus.yml | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) diff --git a/.cirrus.yml b/.cirrus.yml index 17e4eb7e8..c253c7555 100644 --- a/.cirrus.yml +++ b/.cirrus.yml @@ -94,16 +94,8 @@ task: name: AppleM1/LLVM armv7-androidndk xbuild compile_script: - brew install --cask android-ndk - - export #PATH=/opt/homebrew/opt/llvm/bin:$PATH - - export #LDFLAGS="-L/opt/homebrew/opt/llvm/lib" - - export #CPPFLAGS="-I/opt/homebrew/opt/llvm/include" - - export ANDROID_NDK_HOME="/opt/homebrew/share/android-ndk" - - ls /opt/homebrew - - ls -l /System/Volumes/Data/opt/homebrew/Caskroom/android-ndk - - find 
/opt/homebrew -name "armv7a-linux-androideabi*-ranlib" - - #export CC=/Applications/Xcode-13.4.1.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang - - #export CFLAGS="-O2 -unwindlib=none -Wno-macro-redefined -isysroot /Applications/Xcode-13.4.1.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS16.0.sdk -arch arm64 -miphoneos-version-min=10.0" - - export CC=/System/Volumes/Data/opt/homebrew/Caskroom/android-ndk/27/AndroidNDK*.app/Contents/NDK/toolchains/llvm/prebuilt/darwin-x86_64/bin/armv7a-linux-androideabi23-clang + - export ANDROID_NDK_HOME="/opt/homebrew/share/android-ndk"" + - export CC=/opt/homebrew/share/android-ndk/toolchains/llvm/prebuilt/darwin-x86_64/bin/armv7a-linux-androideabi23-clang - make TARGET=ARMV7 ARM_SOFTFP_ABI=1 NUM_THREADS=32 HOSTCC=clang NOFORTRAN=1 RANLIB="ls -l" always: config_artifacts: From e4bc5e4718ef355e711b002750964eb48e4d43f2 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 10 Oct 2024 11:02:56 +0200 Subject: [PATCH 14/15] remove stray quote --- .cirrus.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.cirrus.yml b/.cirrus.yml index c253c7555..112afe352 100644 --- a/.cirrus.yml +++ b/.cirrus.yml @@ -94,7 +94,7 @@ task: name: AppleM1/LLVM armv7-androidndk xbuild compile_script: - brew install --cask android-ndk - - export ANDROID_NDK_HOME="/opt/homebrew/share/android-ndk"" + - export ANDROID_NDK_HOME="/opt/homebrew/share/android-ndk" - export CC=/opt/homebrew/share/android-ndk/toolchains/llvm/prebuilt/darwin-x86_64/bin/armv7a-linux-androideabi23-clang - make TARGET=ARMV7 ARM_SOFTFP_ABI=1 NUM_THREADS=32 HOSTCC=clang NOFORTRAN=1 RANLIB="ls -l" always: From 550bc77832282e4f03c5cfc1b7e7bbb089bc6c26 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 10 Oct 2024 20:39:29 +0200 Subject: [PATCH 15/15] Fix expectation values for CblasRowMajor order --- utest/test_extensions/test_cgemmt.c | 22 ++++++++++++++++++++++ utest/test_extensions/test_dgemmt.c | 15 +++++++++++++++ utest/test_extensions/test_sgemmt.c | 15 +++++++++++++++ utest/test_extensions/test_zgemmt.c | 23 ++++++++++++++++++++++- 4 files changed, 74 insertions(+), 1 deletion(-) diff --git a/utest/test_extensions/test_cgemmt.c b/utest/test_extensions/test_cgemmt.c index dfeb06ff6..92590b1e8 100644 --- a/utest/test_extensions/test_cgemmt.c +++ b/utest/test_extensions/test_cgemmt.c @@ -81,6 +81,28 @@ static void cgemmt_trusted(char api, enum CBLAS_ORDER order, char uplo, char tra ldc *= 2; +#ifndef NO_CBLAS + if (order == CblasRowMajor) { + if (uplo == 'U' || uplo == CblasUpper) + { + for (i = 0; i < m; i++) + for (j = i * 2; j < m * 2; j+=2){ + data_cgemmt.c_verify[i * ldc + j] = + data_cgemmt.c_gemm[i * ldc + j]; + data_cgemmt.c_verify[i * ldc + j + 1] = + data_cgemmt.c_gemm[i * ldc + j + 1]; + } + } else { + for (i = 0; i < m; i++) + for (j = 0; j <= i * 2; j+=2){ + data_cgemmt.c_verify[i * ldc + j] = + data_cgemmt.c_gemm[i * ldc + j]; + data_cgemmt.c_verify[i * ldc + j + 1] = + data_cgemmt.c_gemm[i * ldc + j + 1]; + } + } + } else +#endif if (uplo == 'L' || uplo == CblasLower) { for (i = 0; i < m; i++) diff --git a/utest/test_extensions/test_dgemmt.c b/utest/test_extensions/test_dgemmt.c index fd8f5f666..9ce8859cc 100644 --- a/utest/test_extensions/test_dgemmt.c +++ b/utest/test_extensions/test_dgemmt.c @@ -77,6 +77,21 @@ static void dgemmt_trusted(char api, enum CBLAS_ORDER order, char uplo, char tra else cblas_dgemm(order, transa, transb, m, m, k, alpha, data_dgemmt.a_test, lda, data_dgemmt.b_test, ldb, beta, 
data_dgemmt.c_gemm, ldc); + + if (order == CblasRowMajor) { + if (uplo == 'U' || uplo == CblasUpper) + { + for (i = 0; i < m; i++) + for (j = i; j < m; j++) + data_dgemmt.c_verify[i * ldc + j] = + data_dgemmt.c_gemm[i * ldc + j]; + } else { + for (i = 0; i < m; i++) + for (j = 0; j <= i; j++) + data_dgemmt.c_verify[i * ldc + j] = + data_dgemmt.c_gemm[i * ldc + j]; + } + }else #endif if (uplo == 'L' || uplo == CblasLower) diff --git a/utest/test_extensions/test_sgemmt.c b/utest/test_extensions/test_sgemmt.c index 177ce0d73..f61fcf5fc 100644 --- a/utest/test_extensions/test_sgemmt.c +++ b/utest/test_extensions/test_sgemmt.c @@ -77,6 +77,21 @@ static void sgemmt_trusted(char api, enum CBLAS_ORDER order, char uplo, char tra else cblas_sgemm(order, transa, transb, m, m, k, alpha, data_sgemmt.a_test, lda, data_sgemmt.b_test, ldb, beta, data_sgemmt.c_gemm, ldc); + if (order == CblasRowMajor) { + if (uplo == 'U' || uplo == CblasUpper) + { + for (i = 0; i < m; i++) + for (j = i; j < m; j++) + data_sgemmt.c_verify[i * ldc + j] = + data_sgemmt.c_gemm[i * ldc + j]; + } else { + for (i = 0; i < m; i++) + for (j = 0; j <= i; j++) + data_sgemmt.c_verify[i * ldc + j] = + data_sgemmt.c_gemm[i * ldc + j]; + } + + } else #endif if (uplo == 'L' || uplo == CblasLower) diff --git a/utest/test_extensions/test_zgemmt.c b/utest/test_extensions/test_zgemmt.c index 34b8b6186..dcd70e9e3 100644 --- a/utest/test_extensions/test_zgemmt.c +++ b/utest/test_extensions/test_zgemmt.c @@ -80,7 +80,28 @@ static void zgemmt_trusted(char api, enum CBLAS_ORDER order, char uplo, char tra #endif ldc *= 2; - +#ifndef NO_CBLAS + if (order == CblasRowMajor) { + if (uplo == 'U' || uplo == CblasUpper) + { + for (i = 0; i < m; i++) + for (j = i * 2; j < m * 2; j+=2){ + data_zgemmt.c_verify[i * ldc + j] = + data_zgemmt.c_gemm[i * ldc + j]; + data_zgemmt.c_verify[i * ldc + j + 1] = + data_zgemmt.c_gemm[i * ldc + j + 1]; + } + } else { + for (i = 0; i < m; i++) + for (j = 0; j <= i * 2; j+=2){ + data_zgemmt.c_verify[i * ldc + j] = + data_zgemmt.c_gemm[i * ldc + j]; + data_zgemmt.c_verify[i * ldc + j + 1] = + data_zgemmt.c_gemm[i * ldc + j + 1]; + } + } + }else +#endif if (uplo == 'L' || uplo == CblasLower) { for (i = 0; i < m; i++)