Merge remote-tracking branch 'origin/develop' into vectorizeBF16GEMV

This commit is contained in:
Chip Kerchner 2024-10-11 11:10:20 -05:00
commit c8f53b85ce
32 changed files with 2142 additions and 114 deletions

View File

@ -94,16 +94,8 @@ task:
name: AppleM1/LLVM armv7-androidndk xbuild
compile_script:
- brew install --cask android-ndk
- export #PATH=/opt/homebrew/opt/llvm/bin:$PATH
- export #LDFLAGS="-L/opt/homebrew/opt/llvm/lib"
- export #CPPFLAGS="-I/opt/homebrew/opt/llvm/include"
- export ANDROID_NDK_HOME="/opt/homebrew/share/android-ndk"
- ls /opt/homebrew
- ls -l /System/Volumes/Data/opt/homebrew/Caskroom/android-ndk
- find /opt/homebrew -name "armv7a-linux-androideabi*-ranlib"
- #export CC=/Applications/Xcode-13.4.1.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang
- #export CFLAGS="-O2 -unwindlib=none -Wno-macro-redefined -isysroot /Applications/Xcode-13.4.1.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS16.0.sdk -arch arm64 -miphoneos-version-min=10.0"
- export CC=/System/Volumes/Data/opt/homebrew/Caskroom/android-ndk/27/AndroidNDK*.app/Contents/NDK/toolchains/llvm/prebuilt/darwin-x86_64/bin/armv7a-linux-androideabi23-clang
- export CC=/opt/homebrew/share/android-ndk/toolchains/llvm/prebuilt/darwin-x86_64/bin/armv7a-linux-androideabi23-clang
- make TARGET=ARMV7 ARM_SOFTFP_ABI=1 NUM_THREADS=32 HOSTCC=clang NOFORTRAN=1 RANLIB="ls -l"
always:
config_artifacts:

View File

@ -95,7 +95,7 @@ if (DYNAMIC_ARCH)
endif ()
if (LOONGARCH64)
set(DYNAMIC_CORE LOONGSONGENERIC LOONGSON2K1000 LOONGSON3R5)
set(DYNAMIC_CORE LA64_GENERIC LA264 LA464)
endif ()
if (EXISTS ${PROJECT_SOURCE_DIR}/config_kernel.h)

View File

@ -1349,7 +1349,7 @@ endif ()
"#define DTB_DEFAULT_ENTRIES 128\n"
"#define DTB_SIZE 4096\n"
"#define L2_ASSOCIATIVE 4\n")
elseif ("${TCORE}" STREQUAL "LOONGSONGENERIC")
elseif ("${TCORE}" STREQUAL "LA64_GENERIC")
file(APPEND ${TARGET_CONF_TEMP}
"#define DTB_DEFAULT_ENTRIES 64\n")
set(SGEMM_UNROLL_M 2)
@ -1364,7 +1364,7 @@ endif ()
set(CGEMM3M_UNROLL_N 8)
set(ZGEMM3M_UNROLL_M 2)
set(ZGEMM3M_UNROLL_N 8)
elseif ("${TCORE}" STREQUAL "LOONGSON2K1000")
elseif ("${TCORE}" STREQUAL "LA264")
file(APPEND ${TARGET_CONF_TEMP}
"#define DTB_DEFAULT_ENTRIES 64\n")
set(HAVE_LSX 1)
@ -1380,7 +1380,7 @@ endif ()
set(CGEMM3M_UNROLL_N 8)
set(ZGEMM3M_UNROLL_M 8)
set(ZGEMM3M_UNROLL_N 4)
elseif ("${TCORE}" STREQUAL "LOONGSON3R5")
elseif ("${TCORE}" STREQUAL "LA464")
file(APPEND ${TARGET_CONF_TEMP}
"#define DTB_DEFAULT_ENTRIES 64\n")
set(HAVE_LASX 1)

View File

@ -55,6 +55,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#ifndef ASSEMBLER
#ifndef NO_AFFINITY
static __inline int WhereAmI(void){
uint64_t ret;
__asm__ volatile (
@ -67,6 +68,7 @@ static __inline int WhereAmI(void){
if ((int)ret <0) ret = 0;
return (int)ret;
}
#endif
static __inline void blas_lock(volatile BLASULONG *address){

View File

@ -1689,6 +1689,7 @@ int get_cpuname(void){
return CPUTYPE_BARCELONA;
}
case 10: // Zen3/4
case 11: // Zen5
#ifndef NO_AVX512
if(support_avx512_bf16())
return CPUTYPE_COOPERLAKE;
@ -2479,7 +2480,7 @@ int get_coretype(void){
}
break;
}
} else if (exfamily == 8 || exfamily == 10) {
} else if (exfamily == 8 || exfamily == 10 || exfamily == 11) {
switch (model) {
case 1:
// AMD Ryzen

View File

@ -38,9 +38,12 @@
CALL CHECK1(SFAC)
END IF
* -- Print
IF (PASS) WRITE (NOUT,99998)
IF (PASS) THEN
WRITE (NOUT,99998)
ELSE
CALL ABORT
END IF
20 CONTINUE
STOP
*
99999 FORMAT (' Complex CBLAS Test Program Results',/1X)
99998 FORMAT (' ----- PASS -----')
@ -228,7 +231,7 @@
CALL ITEST1(ICAMAXTEST(N,CX,INCX),ITRUE3(NP1))
ELSE
WRITE (NOUT,*) ' Shouldn''t be here in CHECK1'
STOP
CALL ABORT
END IF
*
40 CONTINUE
@ -512,7 +515,7 @@
CALL CTEST(LENY,CY,CT10Y(1,KN,KI),CSIZE3,1.0E0)
ELSE
WRITE (NOUT,*) ' Shouldn''t be here in CHECK2'
STOP
CALL ABORT
END IF
*
40 CONTINUE

View File

@ -10,7 +10,7 @@
* 'CBLAT2.SNAP' NAME OF SNAPSHOT OUTPUT FILE
* -1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0)
* F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD.
* F LOGICAL FLAG, T TO STOP ON FAILURES.
* F LOGICAL FLAG, T TO CALL ABORT ON FAILURES.
* T LOGICAL FLAG, T TO TEST ERROR EXITS.
* 2 0 TO TEST COLUMN-MAJOR, 1 TO TEST ROW-MAJOR, 2 TO TEST BOTH
* 16.0 THRESHOLD VALUE OF TEST RATIO
@ -243,7 +243,7 @@
$ GO TO 70
60 CONTINUE
WRITE( NOUT, FMT = 9986 )SNAMET
STOP
CALL ABORT
70 LTEST( I ) = LTESTT
GO TO 50
*
@ -283,7 +283,7 @@
SAME = LCE( YY, YT, N )
IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN
WRITE( NOUT, FMT = 9985 )TRANS, SAME, ERR
STOP
CALL ABORT
END IF
TRANS = 'T'
CALL CMVCH( TRANS, N, N, ONE, A, NMAX, X, -1, ZERO, Y, -1, YT, G,
@ -291,7 +291,7 @@
SAME = LCE( YY, YT, N )
IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN
WRITE( NOUT, FMT = 9985 )TRANS, SAME, ERR
STOP
CALL ABORT
END IF
*
* Test each subroutine in turn.
@ -418,7 +418,9 @@
IF( TRACE )
$ CLOSE ( NTRA )
CLOSE ( NOUT )
STOP
IF( FATAL ) THEN
CALL ABORT
END IF
*
10002 FORMAT( ' COLUMN-MAJOR AND ROW-MAJOR DATA LAYOUTS ARE TESTED' )
10001 FORMAT( ' ROW-MAJOR DATA LAYOUT IS TESTED' )

View File

@ -10,7 +10,7 @@
* 'CBLAT3.SNAP' NAME OF SNAPSHOT OUTPUT FILE
* -1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0)
* F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD.
* F LOGICAL FLAG, T TO STOP ON FAILURES.
* F LOGICAL FLAG, T TO CALL ABORT ON FAILURES.
* T LOGICAL FLAG, T TO TEST ERROR EXITS.
* 2 0 TO TEST COLUMN-MAJOR, 1 TO TEST ROW-MAJOR, 2 TO TEST BOTH
* 16.0 THRESHOLD VALUE OF TEST RATIO
@ -194,7 +194,7 @@
$ GO TO 50
40 CONTINUE
WRITE( NOUT, FMT = 9990 )SNAMET
STOP
CALL ABORT
50 LTEST( I ) = LTESTT
GO TO 30
*
@ -237,7 +237,7 @@
SAME = LCE( CC, CT, N )
IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN
WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR
STOP
CALL ABORT
END IF
TRANSB = 'C'
CALL CMMCH( TRANSA, TRANSB, N, 1, N, ONE, AB, NMAX,
@ -246,7 +246,7 @@
SAME = LCE( CC, CT, N )
IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN
WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR
STOP
CALL ABORT
END IF
DO 120 J = 1, N
AB( J, NMAX + 1 ) = N - J + 1
@ -264,7 +264,7 @@
SAME = LCE( CC, CT, N )
IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN
WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR
STOP
CALL ABORT
END IF
TRANSB = 'C'
CALL CMMCH( TRANSA, TRANSB, N, 1, N, ONE, AB, NMAX,
@ -273,7 +273,7 @@
SAME = LCE( CC, CT, N )
IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN
WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR
STOP
CALL ABORT
END IF
*
* Test each subroutine in turn.
@ -385,7 +385,9 @@
IF( TRACE )
$ CLOSE ( NTRA )
CLOSE ( NOUT )
STOP
IF( FATAL ) THEN
CALL ABORT
END IF
*
10002 FORMAT( ' COLUMN-MAJOR AND ROW-MAJOR DATA LAYOUTS ARE TESTED' )
10001 FORMAT(' ROW-MAJOR DATA LAYOUT IS TESTED' )

View File

@ -10,7 +10,7 @@
* 'CBLAT3.SNAP' NAME OF SNAPSHOT OUTPUT FILE
* -1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0)
* F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD.
* F LOGICAL FLAG, T TO STOP ON FAILURES.
* F LOGICAL FLAG, T TO CALL ABORT ON FAILURES.
* T LOGICAL FLAG, T TO TEST ERROR EXITS.
* 2 0 TO TEST COLUMN-MAJOR, 1 TO TEST ROW-MAJOR, 2 TO TEST BOTH
* 16.0 THRESHOLD VALUE OF TEST RATIO
@ -194,7 +194,7 @@
$ GO TO 50
40 CONTINUE
WRITE( NOUT, FMT = 9990 )SNAMET
STOP
CALL ABORT
50 LTEST( I ) = LTESTT
GO TO 30
*
@ -237,7 +237,7 @@
SAME = LCE( CC, CT, N )
IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN
WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR
STOP
CALL ABORT
END IF
TRANSB = 'C'
CALL CMMCH( TRANSA, TRANSB, N, 1, N, ONE, AB, NMAX,
@ -246,7 +246,7 @@
SAME = LCE( CC, CT, N )
IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN
WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR
STOP
CALL ABORT
END IF
DO 120 J = 1, N
AB( J, NMAX + 1 ) = N - J + 1
@ -264,7 +264,7 @@
SAME = LCE( CC, CT, N )
IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN
WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR
STOP
CALL ABORT
END IF
TRANSB = 'C'
CALL CMMCH( TRANSA, TRANSB, N, 1, N, ONE, AB, NMAX,
@ -273,7 +273,7 @@
SAME = LCE( CC, CT, N )
IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN
WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR
STOP
CALL ABORT
END IF
*
* Test each subroutine in turn.
@ -385,7 +385,9 @@
IF( TRACE )
$ CLOSE ( NTRA )
CLOSE ( NOUT )
STOP
IF( FATAL ) THEN
CALL ABORT
END IF
*
10002 FORMAT( ' COLUMN-MAJOR AND ROW-MAJOR DATA LAYOUTS ARE TESTED' )
10001 FORMAT(' ROW-MAJOR DATA LAYOUT IS TESTED' )

View File

@ -44,9 +44,12 @@
CALL CHECK3(SFAC)
END IF
* -- Print
IF (PASS) WRITE (NOUT,99998)
IF (PASS) THEN
WRITE (NOUT,99998)
ELSE
CALL ABORT
END IF
20 CONTINUE
STOP
*
99999 FORMAT (' Real CBLAS Test Program Results',/1X)
99998 FORMAT (' ----- PASS -----')
@ -136,7 +139,7 @@
CALL STEST1(SS,DS1(K),DS1(K),SFAC)
ELSE
WRITE (NOUT,*) ' Shouldn''t be here in CHECK0'
STOP
CALL ABORT
END IF
20 CONTINUE
40 RETURN
@ -229,7 +232,7 @@
CALL ITEST1(IDAMAXTEST(N,SX,INCX),ITRUE2(NP1))
ELSE
WRITE (NOUT,*) ' Shouldn''t be here in CHECK1'
STOP
CALL ABORT
END IF
60 CONTINUE
80 CONTINUE
@ -384,7 +387,7 @@
CALL STEST(LENY,SY,STY,SSIZE2(1,1),1.0D0)
ELSE
WRITE (NOUT,*) ' Shouldn''t be here in CHECK2'
STOP
CALL ABORT
END IF
100 CONTINUE
120 CONTINUE
@ -472,7 +475,7 @@
70 CONTINUE
ELSE
WRITE (NOUT,*) ' Shouldn''t be here in CHECK3'
STOP
CALL ABORT
END IF
40 CONTINUE
60 CONTINUE

View File

@ -10,7 +10,7 @@
* 'DBLAT2.SNAP' NAME OF SNAPSHOT OUTPUT FILE
* -1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0)
* F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD.
* F LOGICAL FLAG, T TO STOP ON FAILURES.
* F LOGICAL FLAG, T TO CALL ABORT ON FAILURES.
* T LOGICAL FLAG, T TO TEST ERROR EXITS.
* 2 0 TO TEST COLUMN-MAJOR, 1 TO TEST ROW-MAJOR, 2 TO TEST BOTH
* 16.0 THRESHOLD VALUE OF TEST RATIO
@ -239,7 +239,7 @@
$ GO TO 70
60 CONTINUE
WRITE( NOUT, FMT = 9986 )SNAMET
STOP
CALL ABORT
70 LTEST( I ) = LTESTT
GO TO 50
*
@ -279,7 +279,7 @@
SAME = LDE( YY, YT, N )
IF( .NOT.SAME.OR.ERR.NE.ZERO )THEN
WRITE( NOUT, FMT = 9985 )TRANS, SAME, ERR
STOP
CALL ABORT
END IF
TRANS = 'T'
CALL DMVCH( TRANS, N, N, ONE, A, NMAX, X, -1, ZERO, Y, -1, YT, G,
@ -287,7 +287,7 @@
SAME = LDE( YY, YT, N )
IF( .NOT.SAME.OR.ERR.NE.ZERO )THEN
WRITE( NOUT, FMT = 9985 )TRANS, SAME, ERR
STOP
CALL ABORT
END IF
*
* Test each subroutine in turn.
@ -414,7 +414,9 @@
IF( TRACE )
$ CLOSE ( NTRA )
CLOSE ( NOUT )
STOP
IF( FATAL ) THEN
CALL ABORT
END IF
*
10002 FORMAT( ' COLUMN-MAJOR AND ROW-MAJOR DATA LAYOUTS ARE TESTED' )
10001 FORMAT( ' ROW-MAJOR DATA LAYOUT IS TESTED' )

View File

@ -10,7 +10,7 @@
* 'DBLAT3.SNAP' NAME OF SNAPSHOT OUTPUT FILE
* -1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0)
* F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD.
* F LOGICAL FLAG, T TO STOP ON FAILURES.
* F LOGICAL FLAG, T TO CALL ABORT ON FAILURES.
* T LOGICAL FLAG, T TO TEST ERROR EXITS.
* 2 0 TO TEST COLUMN-MAJOR, 1 TO TEST ROW-MAJOR, 2 TO TEST BOTH
* 16.0 THRESHOLD VALUE OF TEST RATIO
@ -189,7 +189,7 @@
$ GO TO 50
40 CONTINUE
WRITE( NOUT, FMT = 9990 )SNAMET
STOP
CALL ABORT
50 LTEST( I ) = LTESTT
GO TO 30
*
@ -232,7 +232,7 @@
SAME = LDE( CC, CT, N )
IF( .NOT.SAME.OR.ERR.NE.ZERO )THEN
WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR
STOP
CALL ABORT
END IF
TRANSB = 'T'
CALL DMMCH( TRANSA, TRANSB, N, 1, N, ONE, AB, NMAX,
@ -241,7 +241,7 @@
SAME = LDE( CC, CT, N )
IF( .NOT.SAME.OR.ERR.NE.ZERO )THEN
WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR
STOP
CALL ABORT
END IF
DO 120 J = 1, N
AB( J, NMAX + 1 ) = N - J + 1
@ -259,7 +259,7 @@
SAME = LDE( CC, CT, N )
IF( .NOT.SAME.OR.ERR.NE.ZERO )THEN
WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR
STOP
CALL ABORT
END IF
TRANSB = 'T'
CALL DMMCH( TRANSA, TRANSB, N, 1, N, ONE, AB, NMAX,
@ -268,7 +268,7 @@
SAME = LDE( CC, CT, N )
IF( .NOT.SAME.OR.ERR.NE.ZERO )THEN
WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR
STOP
CALL ABORT
END IF
*
* Test each subroutine in turn.
@ -379,7 +379,9 @@
IF( TRACE )
$ CLOSE ( NTRA )
CLOSE ( NOUT )
STOP
IF( FATAL ) THEN
CALL ABORT
END IF
*
10002 FORMAT( ' COLUMN-MAJOR AND ROW-MAJOR DATA LAYOUTS ARE TESTED' )
10001 FORMAT( ' ROW-MAJOR DATA LAYOUT IS TESTED' )

View File

@ -44,9 +44,12 @@
CALL CHECK3(SFAC)
END IF
* -- Print
IF (PASS) WRITE (NOUT,99998)
IF (PASS) THEN
WRITE (NOUT,99998)
ELSE
CALL ABORT
END IF
20 CONTINUE
STOP
*
99999 FORMAT (' Real CBLAS Test Program Results',/1X)
99998 FORMAT (' ----- PASS -----')
@ -136,7 +139,7 @@
CALL STEST1(SS,DS1(K),DS1(K),SFAC)
ELSE
WRITE (NOUT,*) ' Shouldn''t be here in CHECK0'
STOP
CALL ABORT
END IF
20 CONTINUE
40 RETURN
@ -229,7 +232,7 @@
CALL ITEST1(ISAMAXTEST(N,SX,INCX),ITRUE2(NP1))
ELSE
WRITE (NOUT,*) ' Shouldn''t be here in CHECK1'
STOP
CALL ABORT
END IF
60 CONTINUE
80 CONTINUE
@ -384,7 +387,7 @@
CALL STEST(LENY,SY,STY,SSIZE2(1,1),1.0E0)
ELSE
WRITE (NOUT,*) ' Shouldn''t be here in CHECK2'
STOP
CALL ABORT
END IF
100 CONTINUE
120 CONTINUE
@ -479,7 +482,7 @@
70 CONTINUE
ELSE
WRITE (NOUT,*) ' Shouldn''t be here in CHECK3'
STOP
CALL ABORT
END IF
40 CONTINUE
60 CONTINUE
@ -759,4 +762,4 @@
END IF
END IF
RETURN
END
END

View File

@ -10,7 +10,7 @@
* 'SBLAT2.SNAP' NAME OF SNAPSHOT OUTPUT FILE
* -1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0)
* F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD.
* F LOGICAL FLAG, T TO STOP ON FAILURES.
* F LOGICAL FLAG, T TO CALL ABORT ON FAILURES.
* T LOGICAL FLAG, T TO TEST ERROR EXITS.
* 2 0 TO TEST COLUMN-MAJOR, 1 TO TEST ROW-MAJOR, 2 TO TEST BOTH
* 16.0 THRESHOLD VALUE OF TEST RATIO
@ -239,7 +239,7 @@
$ GO TO 70
60 CONTINUE
WRITE( NOUT, FMT = 9986 )SNAMET
STOP
CALL ABORT
70 LTEST( I ) = LTESTT
GO TO 50
*
@ -279,7 +279,7 @@
SAME = LSE( YY, YT, N )
IF( .NOT.SAME.OR.ERR.NE.ZERO )THEN
WRITE( NOUT, FMT = 9985 )TRANS, SAME, ERR
STOP
CALL ABORT
END IF
TRANS = 'T'
CALL SMVCH( TRANS, N, N, ONE, A, NMAX, X, -1, ZERO, Y, -1, YT, G,
@ -287,7 +287,7 @@
SAME = LSE( YY, YT, N )
IF( .NOT.SAME.OR.ERR.NE.ZERO )THEN
WRITE( NOUT, FMT = 9985 )TRANS, SAME, ERR
STOP
CALL ABORT
END IF
*
* Test each subroutine in turn.
@ -414,7 +414,9 @@
IF( TRACE )
$ CLOSE ( NTRA )
CLOSE ( NOUT )
STOP
IF( FATAL ) THEN
CALL ABORT
END IF
*
10002 FORMAT( ' COLUMN-MAJOR AND ROW-MAJOR DATA LAYOUTS ARE TESTED' )
10001 FORMAT( ' ROW-MAJOR DATA LAYOUT IS TESTED' )

View File

@ -10,7 +10,7 @@
* 'SBLAT3.SNAP' NAME OF SNAPSHOT OUTPUT FILE
* -1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0)
* F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD.
* F LOGICAL FLAG, T TO STOP ON FAILURES.
* F LOGICAL FLAG, T TO CALL ABORT ON FAILURES.
* T LOGICAL FLAG, T TO TEST ERROR EXITS.
* 2 0 TO TEST COLUMN-MAJOR, 1 TO TEST ROW-MAJOR, 2 TO TEST BOTH
* 16.0 THRESHOLD VALUE OF TEST RATIO
@ -188,7 +188,7 @@
$ GO TO 50
40 CONTINUE
WRITE( NOUT, FMT = 9990 )SNAMET
STOP
CALL ABORT
50 LTEST( I ) = LTESTT
GO TO 30
*
@ -231,7 +231,7 @@
SAME = LSE( CC, CT, N )
IF( .NOT.SAME.OR.ERR.NE.ZERO )THEN
WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR
STOP
CALL ABORT
END IF
TRANSB = 'T'
CALL SMMCH( TRANSA, TRANSB, N, 1, N, ONE, AB, NMAX,
@ -240,7 +240,7 @@
SAME = LSE( CC, CT, N )
IF( .NOT.SAME.OR.ERR.NE.ZERO )THEN
WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR
STOP
CALL ABORT
END IF
DO 120 J = 1, N
AB( J, NMAX + 1 ) = N - J + 1
@ -258,7 +258,7 @@
SAME = LSE( CC, CT, N )
IF( .NOT.SAME.OR.ERR.NE.ZERO )THEN
WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR
STOP
CALL ABORT
END IF
TRANSB = 'T'
CALL SMMCH( TRANSA, TRANSB, N, 1, N, ONE, AB, NMAX,
@ -267,7 +267,7 @@
SAME = LSE( CC, CT, N )
IF( .NOT.SAME.OR.ERR.NE.ZERO )THEN
WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR
STOP
CALL ABORT
END IF
*
* Test each subroutine in turn.
@ -378,7 +378,9 @@
IF( TRACE )
$ CLOSE ( NTRA )
CLOSE ( NOUT )
STOP
IF( FATAL ) THEN
CALL ABORT
END IF
*
10002 FORMAT( ' COLUMN-MAJOR AND ROW-MAJOR DATA LAYOUTS ARE TESTED' )
10001 FORMAT( ' ROW-MAJOR DATA LAYOUT IS TESTED' )

View File

@ -38,9 +38,12 @@
CALL CHECK1(SFAC)
END IF
* -- Print
IF (PASS) WRITE (NOUT,99998)
IF (PASS) THEN
WRITE (NOUT,99998)
ELSE
CALL ABORT
END IF
20 CONTINUE
STOP
*
99999 FORMAT (' Complex CBLAS Test Program Results',/1X)
99998 FORMAT (' ----- PASS -----')
@ -228,7 +231,7 @@
CALL ITEST1(IZAMAXTEST(N,CX,INCX),ITRUE3(NP1))
ELSE
WRITE (NOUT,*) ' Shouldn''t be here in CHECK1'
STOP
CALL ABORT
END IF
*
40 CONTINUE
@ -512,7 +515,7 @@
CALL CTEST(LENY,CY,CT10Y(1,KN,KI),CSIZE3,1.0D0)
ELSE
WRITE (NOUT,*) ' Shouldn''t be here in CHECK2'
STOP
CALL ABORT
END IF
*
40 CONTINUE

View File

@ -10,7 +10,7 @@
* 'CBLAT2.SNAP' NAME OF SNAPSHOT OUTPUT FILE
* -1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0)
* F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD.
* F LOGICAL FLAG, T TO STOP ON FAILURES.
* F LOGICAL FLAG, T TO CALL ABORT ON FAILURES.
* T LOGICAL FLAG, T TO TEST ERROR EXITS.
* 2 0 TO TEST COLUMN-MAJOR, 1 TO TEST ROW-MAJOR, 2 TO TEST BOTH
* 16.0 THRESHOLD VALUE OF TEST RATIO
@ -243,7 +243,7 @@
$ GO TO 70
60 CONTINUE
WRITE( NOUT, FMT = 9986 )SNAMET
STOP
CALL ABORT
70 LTEST( I ) = LTESTT
GO TO 50
*
@ -283,7 +283,7 @@
SAME = LZE( YY, YT, N )
IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN
WRITE( NOUT, FMT = 9985 )TRANS, SAME, ERR
STOP
CALL ABORT
END IF
TRANS = 'T'
CALL ZMVCH( TRANS, N, N, ONE, A, NMAX, X, -1, ZERO, Y, -1, YT, G,
@ -291,7 +291,7 @@
SAME = LZE( YY, YT, N )
IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN
WRITE( NOUT, FMT = 9985 )TRANS, SAME, ERR
STOP
CALL ABORT
END IF
*
* Test each subroutine in turn.
@ -418,7 +418,9 @@
IF( TRACE )
$ CLOSE ( NTRA )
CLOSE ( NOUT )
STOP
IF( FATAL ) THEN
CALL ABORT
END IF
*
10002 FORMAT( ' COLUMN-MAJOR AND ROW-MAJOR DATA LAYOUTS ARE TESTED' )
10001 FORMAT( ' ROW-MAJOR DATA LAYOUT IS TESTED' )

View File

@ -10,7 +10,7 @@
* 'CBLAT3.SNAP' NAME OF SNAPSHOT OUTPUT FILE
* -1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0)
* F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD.
* F LOGICAL FLAG, T TO STOP ON FAILURES.
* F LOGICAL FLAG, T TO CALL ABORT ON FAILURES.
* T LOGICAL FLAG, T TO TEST ERROR EXITS.
* 2 0 TO TEST COLUMN-MAJOR, 1 TO TEST ROW-MAJOR, 2 TO TEST BOTH
* 16.0 THRESHOLD VALUE OF TEST RATIO
@ -195,7 +195,7 @@
$ GO TO 50
40 CONTINUE
WRITE( NOUT, FMT = 9990 )SNAMET
STOP
CALL ABORT
50 LTEST( I ) = LTESTT
GO TO 30
*
@ -238,7 +238,7 @@
SAME = LZE( CC, CT, N )
IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN
WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR
STOP
CALL ABORT
END IF
TRANSB = 'C'
CALL ZMMCH( TRANSA, TRANSB, N, 1, N, ONE, AB, NMAX,
@ -247,7 +247,7 @@
SAME = LZE( CC, CT, N )
IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN
WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR
STOP
CALL ABORT
END IF
DO 120 J = 1, N
AB( J, NMAX + 1 ) = N - J + 1
@ -265,7 +265,7 @@
SAME = LZE( CC, CT, N )
IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN
WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR
STOP
CALL ABORT
END IF
TRANSB = 'C'
CALL ZMMCH( TRANSA, TRANSB, N, 1, N, ONE, AB, NMAX,
@ -274,7 +274,7 @@
SAME = LZE( CC, CT, N )
IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN
WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR
STOP
CALL ABORT
END IF
*
* Test each subroutine in turn.
@ -386,7 +386,9 @@
IF( TRACE )
$ CLOSE ( NTRA )
CLOSE ( NOUT )
STOP
IF( FATAL ) THEN
CALL ABORT
END IF
*
10002 FORMAT( ' COLUMN-MAJOR AND ROW-MAJOR DATA LAYOUTS ARE TESTED' )
10001 FORMAT(' ROW-MAJOR DATA LAYOUT IS TESTED' )

View File

@ -10,7 +10,7 @@
* 'CBLAT3.SNAP' NAME OF SNAPSHOT OUTPUT FILE
* -1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0)
* F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD.
* F LOGICAL FLAG, T TO STOP ON FAILURES.
* F LOGICAL FLAG, T TO CALL ABORT ON FAILURES.
* T LOGICAL FLAG, T TO TEST ERROR EXITS.
* 2 0 TO TEST COLUMN-MAJOR, 1 TO TEST ROW-MAJOR, 2 TO TEST BOTH
* 16.0 THRESHOLD VALUE OF TEST RATIO
@ -195,7 +195,7 @@
$ GO TO 50
40 CONTINUE
WRITE( NOUT, FMT = 9990 )SNAMET
STOP
CALL ABORT
50 LTEST( I ) = LTESTT
GO TO 30
*
@ -238,7 +238,7 @@
SAME = LZE( CC, CT, N )
IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN
WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR
STOP
CALL ABORT
END IF
TRANSB = 'C'
CALL ZMMCH( TRANSA, TRANSB, N, 1, N, ONE, AB, NMAX,
@ -247,7 +247,7 @@
SAME = LZE( CC, CT, N )
IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN
WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR
STOP
CALL ABORT
END IF
DO 120 J = 1, N
AB( J, NMAX + 1 ) = N - J + 1
@ -265,7 +265,7 @@
SAME = LZE( CC, CT, N )
IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN
WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR
STOP
CALL ABORT
END IF
TRANSB = 'C'
CALL ZMMCH( TRANSA, TRANSB, N, 1, N, ONE, AB, NMAX,
@ -274,7 +274,7 @@
SAME = LZE( CC, CT, N )
IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN
WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR
STOP
CALL ABORT
END IF
*
* Test each subroutine in turn.
@ -386,7 +386,9 @@
IF( TRACE )
$ CLOSE ( NTRA )
CLOSE ( NOUT )
STOP
IF( FATAL ) THEN
CALL ABORT
END IF
*
10002 FORMAT( ' COLUMN-MAJOR AND ROW-MAJOR DATA LAYOUTS ARE TESTED' )
10001 FORMAT(' ROW-MAJOR DATA LAYOUT IS TESTED' )

View File

@ -742,7 +742,7 @@ static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG
num_parts = 0;
while (n > 0){
width = blas_quickdivide(n + nthreads - num_parts - 1, nthreads - num_parts);
if (width < switch_ratio) {
if (width < switch_ratio && width > 1) {
width = switch_ratio;
}
width = round_up(n, width, GEMM_PREFERED_SIZE);

View File

@ -319,8 +319,8 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,
lda = LDB;
ldb = LDA;
if (Uplo == CblasUpper) uplo = 0;
if (Uplo == CblasLower) uplo = 1;
if (Uplo == CblasUpper) uplo = 1;
if (Uplo == CblasLower) uplo = 0;
if (TransB == CblasNoTrans)
transa = 0;

View File

@ -17,11 +17,15 @@ SBGEMMOTCOPYOBJ = sbgemm_otcopy$(TSUFFIX).$(SUFFIX)
STRMMKERNEL = sgemm_kernel_power10.c
DTRMMKERNEL = dgemm_kernel_power10.c
ifeq ($(OSNAME), AIX)
CTRMMKERNEL = ctrmm_kernel_8x4_power8.S
ZTRMMKERNEL = ztrmm_kernel_8x2_power8.S
#CTRMMKERNEL = ctrmm_kernel_8x4_power8.S
#ZTRMMKERNEL = ztrmm_kernel_8x2_power8.S
CTRMMKERNEL = cgemm_kernel_power10.c
ZTRMMKERNEL = zgemm_kernel_power10.c
else
CTRMMKERNEL = cgemm_kernel_power10.S
ZTRMMKERNEL = zgemm_kernel_power10.S
#CTRMMKERNEL = cgemm_kernel_power10.S
#ZTRMMKERNEL = zgemm_kernel_power10.S
CTRMMKERNEL = cgemm_kernel_power10.c
ZTRMMKERNEL = zgemm_kernel_power10.c
endif
SGEMMKERNEL = sgemm_kernel_power10.c
@ -65,9 +69,11 @@ DGEMM_SMALL_K_TN = dgemm_small_kernel_tn_power10.c
DGEMM_SMALL_K_B0_TN = dgemm_small_kernel_tn_power10.c
ifeq ($(OSNAME), AIX)
CGEMMKERNEL = cgemm_kernel_8x4_power8.S
#CGEMMKERNEL = cgemm_kernel_8x4_power8.S
CGEMMKERNEL = cgemm_kernel_power10.c
else
CGEMMKERNEL = cgemm_kernel_power10.S
#CGEMMKERNEL = cgemm_kernel_power10.S
CGEMMKERNEL = cgemm_kernel_power10.c
endif
#CGEMMKERNEL = cgemm_kernel_8x4_power8.S
CGEMMINCOPY = ../generic/zgemm_ncopy_8.c
@ -84,9 +90,11 @@ CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX)
CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX)
ifeq ($(OSNAME), AIX)
ZGEMMKERNEL = zgemm_kernel_8x2_power8.S
#ZGEMMKERNEL = zgemm_kernel_8x2_power8.S
ZGEMMKERNEL = zgemm_kernel_power10.c
else
ZGEMMKERNEL = zgemm_kernel_power10.S
#ZGEMMKERNEL = zgemm_kernel_power10.S
ZGEMMKERNEL = zgemm_kernel_power10.c
endif
ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c
ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c

File diff suppressed because it is too large Load Diff

View File

@ -63,6 +63,8 @@
#endif
#endif
#define FLAG r11
#define FZERO f0
#define ALPHA f1
@ -88,6 +90,10 @@
fcmpu cr0, FZERO, ALPHA
bne- cr0, LL(A1I1)
lwz FLAG, FRAMESLOT(0)(SP)
cmpwi cr0, FLAG, 1
beq- cr0, LL(A1I1)
srawi. r0, N, 4
mtspr CTR, r0
beq- cr0, LL(A0I1_Remain)

View File

@ -0,0 +1,761 @@
/*********************************************************************************
Copyright (c) 2020, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
**********************************************************************************/
#include "common.h"
#include <altivec.h>
/* Raw AltiVec byte vector type. */
typedef __vector unsigned char vec_t;
/* 16-byte vector of FLOAT elements; the lane count therefore depends on
   sizeof(FLOAT) (despite the "4sf" in the name, it is only four lanes when
   FLOAT is a 4-byte type). */
typedef FLOAT v4sf_t __attribute__ ((vector_size (16)));
/* Clear the eight MMA accumulators acc0..acc7 with xxsetaccz.
   The accumulators are not parameters: acc0..acc7 must be declared in the
   scope where the macro is expanded. The expansion already ends with ';'. */
#define SET_ACC_ZERO() \
__builtin_mma_xxsetaccz (&acc0); \
__builtin_mma_xxsetaccz (&acc1); \
__builtin_mma_xxsetaccz (&acc2); \
__builtin_mma_xxsetaccz (&acc3); \
__builtin_mma_xxsetaccz (&acc4); \
__builtin_mma_xxsetaccz (&acc5); \
__builtin_mma_xxsetaccz (&acc6); \
__builtin_mma_xxsetaccz (&acc7);
/* COMP_MUL / COMP_MAC combine four partial products into the real and
   imaginary parts of one complex product.
     COMP_MUL assigns (_real = ..., _imag = ...),
     COMP_MAC accumulates (_real += ..., _imag += ...).
   Arguments, judging by their names (assumption - confirm against callers):
     _arbr = a.real*b.real, _aibi = a.imag*b.imag,
     _arbi = a.real*b.imag, _aibr = a.imag*b.real.
   The sign pattern is chosen by the transpose/conjugate build macro; under
   the naming assumption above the four groups correspond to
   a*b, a*conj(b), conj(a)*b and conj(a)*conj(b) respectively.
   Each macro expands to a brace block with no trailing ';', and its
   arguments are not parenthesized, so pass simple lvalues/values only. */
/* Plain product: real = arbr - aibi, imag = arbi + aibr. */
#if (defined(NN) || defined(NT) || defined(TN) || defined(TT))
#define COMP_MUL(_real, _arbr, _aibi, _imag, _arbi, _aibr) { _real = _arbr - _aibi; _imag = _arbi + _aibr; }
#define COMP_MAC(_real, _arbr, _aibi, _imag, _arbi, _aibr) { _real += _arbr - _aibi; _imag += _arbi + _aibr; }
#endif
/* real = arbr + aibi, imag = -arbi + aibr. */
#if (defined(NR) || defined(NC) || defined(TR) || defined(TC))
#define COMP_MUL(_real, _arbr, _aibi, _imag, _arbi, _aibr) { _real = _arbr + _aibi; _imag = -_arbi + _aibr; }
#define COMP_MAC(_real, _arbr, _aibi, _imag, _arbi, _aibr) { _real += _arbr + _aibi; _imag += -_arbi + _aibr; }
#endif
/* real = arbr + aibi, imag = arbi - aibr. */
#if (defined(RN) || defined(RT) || defined(CN) || defined(CT))
#define COMP_MUL(_real, _arbr, _aibi, _imag, _arbi, _aibr) { _real = _arbr + _aibi; _imag = _arbi - _aibr; }
#define COMP_MAC(_real, _arbr, _aibi, _imag, _arbi, _aibr) { _real += _arbr + _aibi; _imag += _arbi - _aibr; }
#endif
/* real = arbr - aibi, imag = -arbi - aibr. */
#if (defined(RR) || defined(RC) || defined(CR) || defined(CC))
#define COMP_MUL(_real, _arbr, _aibi, _imag, _arbi, _aibr) { _real = _arbr - _aibi; _imag = -_arbi - _aibr; }
#define COMP_MAC(_real, _arbr, _aibi, _imag, _arbi, _aibr) { _real += _arbr - _aibi; _imag += -_arbi - _aibr; }
#endif
/* Assignment operator used by the SAVE_ACC_COMPLEX_* macros when storing
   into CO: plain '=' (overwrite) when built as a TRMM kernel, '+='
   (accumulate into the existing value) otherwise. */
#if defined(TRMMKERNEL)
#define A_OP =
#else
#define A_OP +=
#endif
/* Disassemble all eight accumulators acc0..acc7 into the `result` array,
   four consecutive `result` entries per accumulator (acc0 -> result[0..3],
   acc1 -> result[4..7], ..., acc7 -> result[28..31]).
   `result` and acc0..acc7 must be in scope at the expansion site.
   The expansion already ends with ';'. */
#define BUILTIN_MMA_DISASSEMBLE_ACC_8 \
__builtin_mma_disassemble_acc ((void *)result, &acc0); \
__builtin_mma_disassemble_acc ((void *)&result[4], &acc1); \
__builtin_mma_disassemble_acc ((void *)&result[8], &acc2); \
__builtin_mma_disassemble_acc ((void *)&result[12], &acc3); \
__builtin_mma_disassemble_acc ((void *)&result[16], &acc4); \
__builtin_mma_disassemble_acc ((void *)&result[20], &acc5); \
__builtin_mma_disassemble_acc ((void *)&result[24], &acc6); \
__builtin_mma_disassemble_acc ((void *)&result[28], &acc7);
/* Reduce all eight accumulators into ONE complex value and store it.
   After disassembling acc0..acc7, the first four `res` entries of each
   8-entry group (offsets 0, 8, ..., 56) are combined with COMP_MUL/COMP_MAC
   into tr[0]/ti[0]; the result is then multiplied by the complex scalar
   (alpha_r, alpha_i) and stored to CO[0] (real) and CO[1] (imaginary)
   using A_OP.
   Requires result, res, tr, ti, CO, alpha_r, alpha_i and acc0..acc7 in
   scope. NOTE(review): `res` is presumably a FLOAT view of `result` with
   8 elements per accumulator - confirm at the expansion site. */
#define SAVE_ACC_COMPLEX_11 \
BUILTIN_MMA_DISASSEMBLE_ACC_8 \
COMP_MUL(tr[0], res[ 0], res[ 3], ti[0], res[ 1], res[ 2]) \
COMP_MAC(tr[0], res[ 8], res[11], ti[0], res[ 9], res[10]) \
COMP_MAC(tr[0], res[16], res[19], ti[0], res[17], res[18]) \
COMP_MAC(tr[0], res[24], res[27], ti[0], res[25], res[26]) \
COMP_MAC(tr[0], res[32], res[35], ti[0], res[33], res[34]) \
COMP_MAC(tr[0], res[40], res[43], ti[0], res[41], res[42]) \
COMP_MAC(tr[0], res[48], res[51], ti[0], res[49], res[50]) \
COMP_MAC(tr[0], res[56], res[59], ti[0], res[57], res[58]) \
CO[0] A_OP tr[0] * alpha_r - ti[0] * alpha_i; \
CO[1] A_OP ti[0] * alpha_r + tr[0] * alpha_i;
/* Reduce the eight accumulators into TWO complex values and store them in
   two different columns of C.
   The `res` groups at offsets 0, 16, 32, 48 feed tr[0]/ti[0]; the groups at
   offsets 8, 24, 40, 56 feed tr[1]/ti[1]. Each result is scaled by
   (alpha_r, alpha_i); value 0 goes to CO[0..1] and value 1 to
   CO[2*ldc+0..1], both via A_OP.
   Requires result, res, tr, ti, CO, ldc, alpha_r, alpha_i and acc0..acc7
   in scope. */
#define SAVE_ACC_COMPLEX_12 \
BUILTIN_MMA_DISASSEMBLE_ACC_8 \
COMP_MUL(tr[0], res[ 0], res[ 3], ti[0], res[ 1], res[ 2]) \
COMP_MUL(tr[1], res[ 8], res[11], ti[1], res[ 9], res[10]) \
COMP_MAC(tr[0], res[16], res[19], ti[0], res[17], res[18]) \
COMP_MAC(tr[1], res[24], res[27], ti[1], res[25], res[26]) \
COMP_MAC(tr[0], res[32], res[35], ti[0], res[33], res[34]) \
COMP_MAC(tr[1], res[40], res[43], ti[1], res[41], res[42]) \
COMP_MAC(tr[0], res[48], res[51], ti[0], res[49], res[50]) \
COMP_MAC(tr[1], res[56], res[59], ti[1], res[57], res[58]) \
CO[0] A_OP tr[0] * alpha_r - ti[0] * alpha_i; \
CO[1] A_OP ti[0] * alpha_r + tr[0] * alpha_i; \
CO[2*ldc+0] A_OP tr[1] * alpha_r - ti[1] * alpha_i; \
CO[2*ldc+1] A_OP ti[1] * alpha_r + tr[1] * alpha_i;
/* Reduce the eight accumulators into TWO complex values stored contiguously.
   Within every 8-entry `res` group, entries +0..+3 feed tr[0]/ti[0] and
   entries +4..+7 feed tr[1]/ti[1]; all eight groups are summed. The two
   results are scaled by (alpha_r, alpha_i) and stored to CO[0..3] via A_OP
   (real/imag interleaved).
   Requires result, res, tr, ti, CO, alpha_r, alpha_i and acc0..acc7 in
   scope. */
#define SAVE_ACC_COMPLEX_21_1 \
BUILTIN_MMA_DISASSEMBLE_ACC_8 \
COMP_MUL(tr[0], res[ 0], res[ 3], ti[0], res[ 1], res[ 2]) \
COMP_MUL(tr[1], res[ 4], res[ 7], ti[1], res[ 5], res[ 6]) \
COMP_MAC(tr[0], res[ 8], res[11], ti[0], res[ 9], res[10]) \
COMP_MAC(tr[1], res[12], res[15], ti[1], res[13], res[14]) \
COMP_MAC(tr[0], res[16], res[19], ti[0], res[17], res[18]) \
COMP_MAC(tr[1], res[20], res[23], ti[1], res[21], res[22]) \
COMP_MAC(tr[0], res[24], res[27], ti[0], res[25], res[26]) \
COMP_MAC(tr[1], res[28], res[31], ti[1], res[29], res[30]) \
COMP_MAC(tr[0], res[32], res[35], ti[0], res[33], res[34]) \
COMP_MAC(tr[1], res[36], res[39], ti[1], res[37], res[38]) \
COMP_MAC(tr[0], res[40], res[43], ti[0], res[41], res[42]) \
COMP_MAC(tr[1], res[44], res[47], ti[1], res[45], res[46]) \
COMP_MAC(tr[0], res[48], res[51], ti[0], res[49], res[50]) \
COMP_MAC(tr[1], res[52], res[55], ti[1], res[53], res[54]) \
COMP_MAC(tr[0], res[56], res[59], ti[0], res[57], res[58]) \
COMP_MAC(tr[1], res[60], res[63], ti[1], res[61], res[62]) \
CO[0] A_OP tr[0] * alpha_r - ti[0] * alpha_i; \
CO[1] A_OP ti[0] * alpha_r + tr[0] * alpha_i; \
CO[2] A_OP tr[1] * alpha_r - ti[1] * alpha_i; \
CO[3] A_OP ti[1] * alpha_r + tr[1] * alpha_i;
/* Reduce the eight accumulators into FOUR complex values stored contiguously.
   `res` is consumed in 16-entry strides: entries 0..15 initialise
   tr[0..3]/ti[0..3] (four entries per value) and the blocks starting at
   16, 32 and 48 are accumulated onto the same four values. The results are
   scaled by (alpha_r, alpha_i) and stored to CO[0..7] via A_OP
   (real/imag interleaved).
   Requires result, res, tr, ti, CO, alpha_r, alpha_i and acc0..acc7 in
   scope. */
#define SAVE_ACC_COMPLEX_21_2 \
BUILTIN_MMA_DISASSEMBLE_ACC_8 \
COMP_MUL(tr[0], res[ 0], res[ 3], ti[0], res[ 1], res[ 2]) \
COMP_MUL(tr[1], res[ 4], res[ 7], ti[1], res[ 5], res[ 6]) \
COMP_MUL(tr[2], res[ 8], res[11], ti[2], res[ 9], res[10]) \
COMP_MUL(tr[3], res[12], res[15], ti[3], res[13], res[14]) \
COMP_MAC(tr[0], res[16], res[19], ti[0], res[17], res[18]) \
COMP_MAC(tr[1], res[20], res[23], ti[1], res[21], res[22]) \
COMP_MAC(tr[2], res[24], res[27], ti[2], res[25], res[26]) \
COMP_MAC(tr[3], res[28], res[31], ti[3], res[29], res[30]) \
COMP_MAC(tr[0], res[32], res[35], ti[0], res[33], res[34]) \
COMP_MAC(tr[1], res[36], res[39], ti[1], res[37], res[38]) \
COMP_MAC(tr[2], res[40], res[43], ti[2], res[41], res[42]) \
COMP_MAC(tr[3], res[44], res[47], ti[3], res[45], res[46]) \
COMP_MAC(tr[0], res[48], res[51], ti[0], res[49], res[50]) \
COMP_MAC(tr[1], res[52], res[55], ti[1], res[53], res[54]) \
COMP_MAC(tr[2], res[56], res[59], ti[2], res[57], res[58]) \
COMP_MAC(tr[3], res[60], res[63], ti[3], res[61], res[62]) \
CO[0] A_OP tr[0] * alpha_r - ti[0] * alpha_i; \
CO[1] A_OP ti[0] * alpha_r + tr[0] * alpha_i; \
CO[2] A_OP tr[1] * alpha_r - ti[1] * alpha_i; \
CO[3] A_OP ti[1] * alpha_r + tr[1] * alpha_i; \
CO[4] A_OP tr[2] * alpha_r - ti[2] * alpha_i; \
CO[5] A_OP ti[2] * alpha_r + tr[2] * alpha_i; \
CO[6] A_OP tr[3] * alpha_r - ti[3] * alpha_i; \
CO[7] A_OP ti[3] * alpha_r + tr[3] * alpha_i;
/* Reduce the eight accumulators into EIGHT complex values stored
   contiguously.
   `res` entries 0..31 initialise tr[0..7]/ti[0..7] (four entries per
   value); entries 32..63 are accumulated onto the same eight values. The
   results are scaled by (alpha_r, alpha_i) and stored to CO[0..15] via A_OP
   (real/imag interleaved).
   Requires result, res, tr, ti, CO, alpha_r, alpha_i and acc0..acc7 in
   scope. */
#define SAVE_ACC_COMPLEX_21_4 \
BUILTIN_MMA_DISASSEMBLE_ACC_8 \
COMP_MUL(tr[0], res[ 0], res[ 3], ti[0], res[ 1], res[ 2]) \
COMP_MUL(tr[1], res[ 4], res[ 7], ti[1], res[ 5], res[ 6]) \
COMP_MUL(tr[2], res[ 8], res[11], ti[2], res[ 9], res[10]) \
COMP_MUL(tr[3], res[12], res[15], ti[3], res[13], res[14]) \
COMP_MUL(tr[4], res[16], res[19], ti[4], res[17], res[18]) \
COMP_MUL(tr[5], res[20], res[23], ti[5], res[21], res[22]) \
COMP_MUL(tr[6], res[24], res[27], ti[6], res[25], res[26]) \
COMP_MUL(tr[7], res[28], res[31], ti[7], res[29], res[30]) \
COMP_MAC(tr[0], res[32], res[35], ti[0], res[33], res[34]) \
COMP_MAC(tr[1], res[36], res[39], ti[1], res[37], res[38]) \
COMP_MAC(tr[2], res[40], res[43], ti[2], res[41], res[42]) \
COMP_MAC(tr[3], res[44], res[47], ti[3], res[45], res[46]) \
COMP_MAC(tr[4], res[48], res[51], ti[4], res[49], res[50]) \
COMP_MAC(tr[5], res[52], res[55], ti[5], res[53], res[54]) \
COMP_MAC(tr[6], res[56], res[59], ti[6], res[57], res[58]) \
COMP_MAC(tr[7], res[60], res[63], ti[7], res[61], res[62]) \
CO[ 0] A_OP tr[0] * alpha_r - ti[0] * alpha_i; \
CO[ 1] A_OP ti[0] * alpha_r + tr[0] * alpha_i; \
CO[ 2] A_OP tr[1] * alpha_r - ti[1] * alpha_i; \
CO[ 3] A_OP ti[1] * alpha_r + tr[1] * alpha_i; \
CO[ 4] A_OP tr[2] * alpha_r - ti[2] * alpha_i; \
CO[ 5] A_OP ti[2] * alpha_r + tr[2] * alpha_i; \
CO[ 6] A_OP tr[3] * alpha_r - ti[3] * alpha_i; \
CO[ 7] A_OP ti[3] * alpha_r + tr[3] * alpha_i; \
CO[ 8] A_OP tr[4] * alpha_r - ti[4] * alpha_i; \
CO[ 9] A_OP ti[4] * alpha_r + tr[4] * alpha_i; \
CO[10] A_OP tr[5] * alpha_r - ti[5] * alpha_i; \
CO[11] A_OP ti[5] * alpha_r + tr[5] * alpha_i; \
CO[12] A_OP tr[6] * alpha_r - ti[6] * alpha_i; \
CO[13] A_OP ti[6] * alpha_r + tr[6] * alpha_i; \
CO[14] A_OP tr[7] * alpha_r - ti[7] * alpha_i; \
CO[15] A_OP ti[7] * alpha_r + tr[7] * alpha_i;
/*
 * SAVE_ACC_COMPLEX_22_1: write back a 2-row x 2-column complex tile held in
 * acc0 (first column) and acc1 (second column).
 *
 * This is the fixed-argument form of SAVE_ACC_COMPLEX_22_2: acc0 is stored
 * at CO[0..3], acc1 at CO[2*ldc+0 .. 2*ldc+3].
 */
#define SAVE_ACC_COMPLEX_22_1 \
  SAVE_ACC_COMPLEX_22_2(&acc0, &acc1, 0)
/*
 * SAVE_ACC_COMPLEX_22_2(ACC1, ACC2, CI)
 *
 * Write back a 2-row x 2-column complex tile.
 *
 *   ACC1  pointer to the accumulator for the first column; it is
 *         disassembled into result[0..3].
 *   ACC2  pointer to the accumulator for the second column; it is
 *         disassembled into result[4..7].
 *   CI    offset, in FLOATs, added to every CO index.
 *
 * The 16 values in res[] are combined by COMP_MUL (defined elsewhere) into
 * tr[0..3] / ti[0..3].  Each pair is then scaled by (alpha_r, alpha_i) and
 * stored through A_OP: tr/ti[0..1] at CO[CI ..], tr/ti[2..3] at
 * CO[2*ldc+CI ..].
 *
 * Every parameter is parenthesised so an expression argument (for example
 * `base + 4` or a conditional) keeps its meaning inside the index arithmetic.
 */
#define SAVE_ACC_COMPLEX_22_2(ACC1, ACC2, CI) \
  __builtin_mma_disassemble_acc ((void *)result, (ACC1)); \
  __builtin_mma_disassemble_acc ((void *)(&result[4]), (ACC2)); \
  COMP_MUL(tr[0], res[0], res[3], ti[0], res[1], res[2]) \
  COMP_MUL(tr[1], res[4], res[7], ti[1], res[5], res[6]) \
  COMP_MUL(tr[2], res[8], res[11], ti[2], res[9], res[10]) \
  COMP_MUL(tr[3], res[12], res[15], ti[3], res[13], res[14]) \
  CO[(CI)+0] A_OP tr[0] * alpha_r - ti[0] * alpha_i; \
  CO[(CI)+1] A_OP ti[0] * alpha_r + tr[0] * alpha_i; \
  CO[(CI)+2] A_OP tr[1] * alpha_r - ti[1] * alpha_i; \
  CO[(CI)+3] A_OP ti[1] * alpha_r + tr[1] * alpha_i; \
  CO[2*ldc+(CI)+0] A_OP tr[2] * alpha_r - ti[2] * alpha_i; \
  CO[2*ldc+(CI)+1] A_OP ti[2] * alpha_r + tr[2] * alpha_i; \
  CO[2*ldc+(CI)+2] A_OP tr[3] * alpha_r - ti[3] * alpha_i; \
  CO[2*ldc+(CI)+3] A_OP ti[3] * alpha_r + tr[3] * alpha_i;
/*
 * PREFETCH1(x, y): issue a `dcbt` (data cache block touch) hint for the
 * address x + y.
 *
 * `dcbt RA,RB` forms its effective address as (RA|0)+(RB): encoding r0 in
 * the RA slot means the constant 0, not the contents of r0.  The first
 * operand therefore uses the "b" constraint (base register, never r0) and
 * the second uses plain "r".  Addition is commutative, so the address is
 * still x + y.
 *
 * The "memory" clobber keeps the compiler from moving memory accesses across
 * the hint.  The trailing semicolon is part of the macro, as before.
 */
#define PREFETCH1(x, y) asm volatile ("dcbt %0, %1" : : "b" (x), "r" (y) : "memory");
/*
 * REFRESH_TEMP_BK(x, y): set `temp`, the k-loop trip count for the current
 * TRMM tile.  x is the tile's row count and y its column count, as passed by
 * the kernel below.
 *
 *   LEFT without TRANSA, or TRANSA without LEFT : temp = k - off
 *   LEFT with TRANSA                            : temp = off + x
 *   neither LEFT nor TRANSA                     : temp = off + y
 *
 * Arguments are parenthesised so an expression argument is added as a whole.
 * Uses `temp`, `k` and `off` from the expanding scope.
 */
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
#define REFRESH_TEMP_BK(x, y) \
  temp = k - off;
#elif defined(LEFT)
#define REFRESH_TEMP_BK(x, y) \
  temp = off + (x);
#else
#define REFRESH_TEMP_BK(x, y) \
  temp = off + (y);
#endif
/*
 * REFRESH_POINTERS(x, y): position AO / BO and compute `temp` at the start
 * of a TRMM tile of x rows by y columns.
 *
 *   LEFT with TRANSA, or neither: BO restarts at B; AO is left untouched.
 *   otherwise: AO advances by off * 2*x FLOATs and BO is set to
 *              B + off * 2*y FLOATs (the factor 2 is the re/im pair).
 *
 * Both variants finish with REFRESH_TEMP_BK(x, y).  x and y are parenthesised
 * so that an expression argument is scaled as a whole.
 */
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
#define REFRESH_POINTERS(x, y) \
  BO = B; \
  REFRESH_TEMP_BK(x, y)
#else
#define REFRESH_POINTERS(x, y) \
  AO += off * (2*(x)); \
  BO = B + off * (2*(y)); \
  REFRESH_TEMP_BK(x, y)
#endif
/*
 * REFRESH_OFF(x): after a tile of x rows has been stored, advance `off` by x
 * when LEFT is defined.  Without LEFT it expands to nothing.
 */
#ifndef LEFT
#define REFRESH_OFF(x)
#else
#define REFRESH_OFF(x) \
  off = off + (x);
#endif
/*
 * UPDATE_TEMP(x, y): subtract the tile extent from `temp`: the row count x
 * when LEFT is defined, the column count y otherwise.
 */
#if defined(LEFT)
#define UPDATE_TEMP(x, y) \
  temp = temp - (x);
#else
#define UPDATE_TEMP(x, y) \
  temp = temp - (y);
#endif
/*
 * REFRESH_TMP_AFTER_SAVE(x, y): after a TRMM tile of x rows by y columns has
 * been stored, skip the part of the A and B panels the k loop did not visit.
 *
 *   LEFT with TRANSA, or neither: temp = k - off, reduced by UPDATE_TEMP;
 *     AO then advances by temp * 2*x FLOATs and BO by temp * 2*y FLOATs.
 *   otherwise: expands to nothing.
 *
 * x and y are parenthesised so that an expression argument is scaled as a
 * whole.
 */
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
#define REFRESH_TMP_AFTER_SAVE(x, y) \
  temp = k - off; \
  UPDATE_TEMP(x, y) \
  AO += temp * (2*(x)); \
  BO += temp * (2*(y));
#else
#define REFRESH_TMP_AFTER_SAVE(x, y)
#endif
/*
 * REFRESH_AFTER_SAVE(x, y): TRMM bookkeeping run once per stored tile.
 * Expands to REFRESH_TMP_AFTER_SAVE(x, y) followed by REFRESH_OFF(x).
 */
#define REFRESH_AFTER_SAVE(x,y) REFRESH_TMP_AFTER_SAVE(x, y) REFRESH_OFF(x)
/*************************************************************************************
* GEMM Kernel
*
* Complex GEMM / TRMM micro-kernel built on the __builtin_mma_* accumulator
* builtins.
*
* C is walked two columns at a time (n >> 1 passes) followed by one leftover
* column (n & 1).  Inside each column panel the rows are handled in tiles of
* 8 (m >> 3 passes), then 4, 2 and 1 (m & 4, m & 2, m & 1).  For every tile
* the k loop accumulates outer products of A and B into acc0..acc7; the
* accumulators are then disassembled, reduced to one (tr, ti) pair per output
* element, multiplied by (alpha_r, alpha_i) and stored through A_OP
* (defined elsewhere in this file).
*
* Parameters
*   m, n, k           loop extents: rows of the tile grid, columns, and the
*                     length of the accumulation loop.
*   alpha_r, alpha_i  real and imaginary part of the scaling factor.
*   A, B              input panels, read through AO / BO.  The indexing below
*                     uses 2*rows FLOATs of A and 2*cols FLOATs of B per k
*                     step.  NOTE(review): presumably the packed layout
*                     produced by the matching copy routines - confirm.
*   C, ldc            output and its leading dimension.  One column spans
*                     2*ldc FLOATs (see CO[2*ldc+...] and C += ldc<<2).
*   offset            TRMMKERNEL builds only; seeds `off`.
*
* Always returns 0.
*************************************************************************************/
int
CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r, FLOAT alpha_i, FLOAT * A, FLOAT * B,
FLOAT * C, BLASLONG ldc
#ifdef TRMMKERNEL
, BLASLONG offset
#endif
)
{
/* i1: column-pair counter, i: 8-row tile counter, l: k-loop index,
 * temp: k-loop trip count for the current tile. */
BLASLONG i1, i, l, temp;
/* Cursors into A, B and C for the tile being computed. */
FLOAT *AO, *BO, *CO;
#if defined(TRMMKERNEL)
BLASLONG off;
#endif
#if defined(TRMMKERNEL) && !defined(LEFT)
off = -offset;
#endif
__vector_quad acc0, acc1, acc2, acc3, acc4, acc5, acc6, acc7;
/* Scratch space the accumulators are disassembled into.  `res` views the
 * same storage as FLOATs (indices up to 63 are used below); tr / ti hold the
 * real and imaginary sum of each output element before alpha is applied. */
v4sf_t result[32];
FLOAT *res, tr[16], ti[16];
res = (FLOAT *) result;
/* ---- column panels of width 2 ---- */
for (i1 = 0; i1 < (n >> 1); i1++)
{
#if defined(TRMMKERNEL) && defined(LEFT)
off = offset;
#endif
AO = A;
CO = C;
/* Step C past the two columns handled in this pass (2 * 2*ldc FLOATs). */
C += ldc<<2;
/* 8 x 2 tiles: 16 FLOATs of A and 4 FLOATs of B per k step. */
for (i = 0; i < (m >> 3); i++)
{
#if defined(TRMMKERNEL)
REFRESH_POINTERS (8, 2)
#else
BO = B;
temp = k;
#endif
SET_ACC_ZERO()
for (l = 0; l < temp; ++l)
{
__vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<4]));
__vector_pair rowA2 = *((__vector_pair *)((void *)&AO[(l<<4)+4]));
__vector_pair rowA3 = *((__vector_pair *)((void *)&AO[(l<<4)+8]));
__vector_pair rowA4 = *((__vector_pair *)((void *)&AO[(l<<4)+12]));
vec_t rowB1 = *(vec_t *) & BO[l<<2];
vec_t rowB2 = *(vec_t *) & BO[(l<<2)+2];
/* acc0..acc3 pair the four A slices with the first B column,
 * acc4..acc7 pair them with the second B column. */
__builtin_mma_xvf64gerpp(&acc0, rowA1, rowB1);
__builtin_mma_xvf64gerpp(&acc1, rowA2, rowB1);
__builtin_mma_xvf64gerpp(&acc2, rowA3, rowB1);
__builtin_mma_xvf64gerpp(&acc3, rowA4, rowB1);
__builtin_mma_xvf64gerpp(&acc4, rowA1, rowB2);
__builtin_mma_xvf64gerpp(&acc5, rowA2, rowB2);
__builtin_mma_xvf64gerpp(&acc6, rowA3, rowB2);
__builtin_mma_xvf64gerpp(&acc7, rowA4, rowB2);
}
/* Write-back is spelled out inline for this tile size: dump all eight
 * accumulators into result[0..31] ... */
__builtin_mma_disassemble_acc ((void *)result, &acc0);
__builtin_mma_disassemble_acc ((void *)(&result[ 4]), &acc1);
__builtin_mma_disassemble_acc ((void *)(&result[ 8]), &acc2);
__builtin_mma_disassemble_acc ((void *)(&result[12]), &acc3);
__builtin_mma_disassemble_acc ((void *)(&result[16]), &acc4);
__builtin_mma_disassemble_acc ((void *)(&result[20]), &acc5);
__builtin_mma_disassemble_acc ((void *)(&result[24]), &acc6);
__builtin_mma_disassemble_acc ((void *)(&result[28]), &acc7);
/* ... reduce each group of four values to one (tr, ti) pair:
 * tr/ti[0..7] come from acc0..acc3, tr/ti[8..15] from acc4..acc7 ... */
COMP_MUL(tr[ 0], res[ 0], res[ 3], ti[ 0], res[ 1], res[ 2])
COMP_MUL(tr[ 1], res[ 4], res[ 7], ti[ 1], res[ 5], res[ 6])
COMP_MUL(tr[ 2], res[ 8], res[11], ti[ 2], res[ 9], res[10])
COMP_MUL(tr[ 3], res[12], res[15], ti[ 3], res[13], res[14])
COMP_MUL(tr[ 4], res[16], res[19], ti[ 4], res[17], res[18])
COMP_MUL(tr[ 5], res[20], res[23], ti[ 5], res[21], res[22])
COMP_MUL(tr[ 6], res[24], res[27], ti[ 6], res[25], res[26])
COMP_MUL(tr[ 7], res[28], res[31], ti[ 7], res[29], res[30])
COMP_MUL(tr[ 8], res[32], res[35], ti[ 8], res[33], res[34])
COMP_MUL(tr[ 9], res[36], res[39], ti[ 9], res[37], res[38])
COMP_MUL(tr[10], res[40], res[43], ti[10], res[41], res[42])
COMP_MUL(tr[11], res[44], res[47], ti[11], res[45], res[46])
COMP_MUL(tr[12], res[48], res[51], ti[12], res[49], res[50])
COMP_MUL(tr[13], res[52], res[55], ti[13], res[53], res[54])
COMP_MUL(tr[14], res[56], res[59], ti[14], res[57], res[58])
COMP_MUL(tr[15], res[60], res[63], ti[15], res[61], res[62])
/* ... and store (tr + i*ti) * (alpha_r + i*alpha_i): first column at
 * CO[0..15], second column at CO[2*ldc .. 2*ldc+15]. */
CO[ 0] A_OP tr[0] * alpha_r - ti[0] * alpha_i;
CO[ 1] A_OP ti[0] * alpha_r + tr[0] * alpha_i;
CO[ 2] A_OP tr[1] * alpha_r - ti[1] * alpha_i;
CO[ 3] A_OP ti[1] * alpha_r + tr[1] * alpha_i;
CO[ 4] A_OP tr[2] * alpha_r - ti[2] * alpha_i;
CO[ 5] A_OP ti[2] * alpha_r + tr[2] * alpha_i;
CO[ 6] A_OP tr[3] * alpha_r - ti[3] * alpha_i;
CO[ 7] A_OP ti[3] * alpha_r + tr[3] * alpha_i;
CO[ 8] A_OP tr[4] * alpha_r - ti[4] * alpha_i;
CO[ 9] A_OP ti[4] * alpha_r + tr[4] * alpha_i;
CO[10] A_OP tr[5] * alpha_r - ti[5] * alpha_i;
CO[11] A_OP ti[5] * alpha_r + tr[5] * alpha_i;
CO[12] A_OP tr[6] * alpha_r - ti[6] * alpha_i;
CO[13] A_OP ti[6] * alpha_r + tr[6] * alpha_i;
CO[14] A_OP tr[7] * alpha_r - ti[7] * alpha_i;
CO[15] A_OP ti[7] * alpha_r + tr[7] * alpha_i;
CO[2*ldc+ 0] A_OP tr[ 8] * alpha_r - ti[ 8] * alpha_i;
CO[2*ldc+ 1] A_OP ti[ 8] * alpha_r + tr[ 8] * alpha_i;
CO[2*ldc+ 2] A_OP tr[ 9] * alpha_r - ti[ 9] * alpha_i;
CO[2*ldc+ 3] A_OP ti[ 9] * alpha_r + tr[ 9] * alpha_i;
CO[2*ldc+ 4] A_OP tr[10] * alpha_r - ti[10] * alpha_i;
CO[2*ldc+ 5] A_OP ti[10] * alpha_r + tr[10] * alpha_i;
CO[2*ldc+ 6] A_OP tr[11] * alpha_r - ti[11] * alpha_i;
CO[2*ldc+ 7] A_OP ti[11] * alpha_r + tr[11] * alpha_i;
CO[2*ldc+ 8] A_OP tr[12] * alpha_r - ti[12] * alpha_i;
CO[2*ldc+ 9] A_OP ti[12] * alpha_r + tr[12] * alpha_i;
CO[2*ldc+10] A_OP tr[13] * alpha_r - ti[13] * alpha_i;
CO[2*ldc+11] A_OP ti[13] * alpha_r + tr[13] * alpha_i;
CO[2*ldc+12] A_OP tr[14] * alpha_r - ti[14] * alpha_i;
CO[2*ldc+13] A_OP ti[14] * alpha_r + tr[14] * alpha_i;
CO[2*ldc+14] A_OP tr[15] * alpha_r - ti[15] * alpha_i;
CO[2*ldc+15] A_OP ti[15] * alpha_r + tr[15] * alpha_i;
/* Advance past the part of the panels consumed by this tile. */
AO += temp << 4;
BO += temp << 2;
CO += 16;
#if defined(TRMMKERNEL)
REFRESH_AFTER_SAVE (8, 2)
#endif
}
/* 4 x 2 tile: 8 FLOATs of A and 4 FLOATs of B per k step. */
if (m & 4)
{
#if defined(TRMMKERNEL)
REFRESH_POINTERS (4, 2)
#else
BO = B;
temp = k;
#endif
SET_ACC_ZERO()
/* k loop unrolled by two: rowA1/rowA2 + rowB1/rowB2 belong to step l,
 * rowA3/rowA4 + rowB3/rowB4 to step l+1. */
for (l = 0; l < (temp & (~1)); l+=2)
{
__vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<3]));
__vector_pair rowA2 = *((__vector_pair *)((void *)&AO[(l<<3)+4]));
__vector_pair rowA3 = *((__vector_pair *)((void *)&AO[(l<<3)+8]));
__vector_pair rowA4 = *((__vector_pair *)((void *)&AO[(l<<3)+12]));
vec_t rowB1 = *(vec_t *) & BO[l<<2];
vec_t rowB2 = *(vec_t *) & BO[(l<<2)+2];
vec_t rowB3 = *(vec_t *) & BO[(l<<2)+4];
vec_t rowB4 = *(vec_t *) & BO[(l<<2)+6];
__builtin_mma_xvf64gerpp(&acc0, rowA1, rowB1);
__builtin_mma_xvf64gerpp(&acc1, rowA2, rowB1);
__builtin_mma_xvf64gerpp(&acc2, rowA1, rowB2);
__builtin_mma_xvf64gerpp(&acc3, rowA2, rowB2);
__builtin_mma_xvf64gerpp(&acc0, rowA3, rowB3);
__builtin_mma_xvf64gerpp(&acc1, rowA4, rowB3);
__builtin_mma_xvf64gerpp(&acc2, rowA3, rowB4);
__builtin_mma_xvf64gerpp(&acc3, rowA4, rowB4);
}
/* Remaining odd k step, if any. */
for (l = (temp & (~1)); l < temp; ++l)
{
__vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<3]));
__vector_pair rowA2 = *((__vector_pair *)((void *)&AO[(l<<3)+4]));
vec_t rowB1 = *(vec_t *) & BO[l<<2];
vec_t rowB2 = *(vec_t *) & BO[(l<<2)+2];
__builtin_mma_xvf64gerpp(&acc0, rowA1, rowB1);
__builtin_mma_xvf64gerpp(&acc1, rowA2, rowB1);
__builtin_mma_xvf64gerpp(&acc2, rowA1, rowB2);
__builtin_mma_xvf64gerpp(&acc3, rowA2, rowB2);
}
/* acc0/acc2 cover rows 0-1 of both columns (CO offset 0),
 * acc1/acc3 cover rows 2-3 (CO offset 4). */
SAVE_ACC_COMPLEX_22_2(&acc0, &acc2, 0)
SAVE_ACC_COMPLEX_22_2(&acc1, &acc3, 4)
AO += temp << 3;
BO += temp << 2;
CO += 8;
#if defined(TRMMKERNEL)
REFRESH_AFTER_SAVE (4, 2)
#endif
}
/* 2 x 2 tile: 4 FLOATs of A and 4 FLOATs of B per k step. */
if (m & 2)
{
#if defined(TRMMKERNEL)
REFRESH_POINTERS (2, 2)
#else
BO = B;
temp = k;
#endif
SET_ACC_ZERO()
/* k loop unrolled by four; acc0 collects the first B column, acc1 the
 * second. */
for (l = 0; l < (temp & (~3)); l+=4)
{
__vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<2]));
__vector_pair rowA2 = *((__vector_pair *)((void *)&AO[(l<<2)+4]));
__vector_pair rowA3 = *((__vector_pair *)((void *)&AO[(l<<2)+8]));
__vector_pair rowA4 = *((__vector_pair *)((void *)&AO[(l<<2)+12]));
vec_t rowB1 = *(vec_t *) & BO[l<<2];
vec_t rowB2 = *(vec_t *) & BO[(l<<2)+2];
vec_t rowB3 = *(vec_t *) & BO[(l<<2)+4];
vec_t rowB4 = *(vec_t *) & BO[(l<<2)+6];
vec_t rowB5 = *(vec_t *) & BO[(l<<2)+8];
vec_t rowB6 = *(vec_t *) & BO[(l<<2)+10];
vec_t rowB7 = *(vec_t *) & BO[(l<<2)+12];
vec_t rowB8 = *(vec_t *) & BO[(l<<2)+14];
__builtin_mma_xvf64gerpp(&acc0, rowA1, rowB1);
__builtin_mma_xvf64gerpp(&acc1, rowA1, rowB2);
__builtin_mma_xvf64gerpp(&acc0, rowA2, rowB3);
__builtin_mma_xvf64gerpp(&acc1, rowA2, rowB4);
__builtin_mma_xvf64gerpp(&acc0, rowA3, rowB5);
__builtin_mma_xvf64gerpp(&acc1, rowA3, rowB6);
__builtin_mma_xvf64gerpp(&acc0, rowA4, rowB7);
__builtin_mma_xvf64gerpp(&acc1, rowA4, rowB8);
}
/* Up to three leftover k steps. */
for (l = (temp & (~3)); l < temp; ++l)
{
__vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<2]));
vec_t rowB1 = *(vec_t *) & BO[l<<2];
vec_t rowB2 = *(vec_t *) & BO[(l<<2)+2];
__builtin_mma_xvf64gerpp(&acc0, rowA1, rowB1);
__builtin_mma_xvf64gerpp(&acc1, rowA1, rowB2);
}
SAVE_ACC_COMPLEX_22_1
AO += temp << 2;
BO += temp << 2;
CO += 4;
#if defined(TRMMKERNEL)
REFRESH_AFTER_SAVE (2, 2)
#endif
}
/* 1 x 2 tile: 2 FLOATs of A and 4 FLOATs of B per k step. */
if (m & 1)
{
#if defined(TRMMKERNEL)
REFRESH_POINTERS (1, 2)
#else
BO = B;
temp = k;
#endif
// RIP OUT MMA STUFF!
/* NOTE(review): A advances by 2 FLOATs per k step here, while the wider
 * tiles above space their __vector_pair loads 4 FLOATs apart.  Consecutive
 * loads therefore appear to overlap, and the load for the last k step
 * appears to read 2 FLOATs beyond this tile's A data - confirm the buffer
 * is padded, or replace these loads as the comment above asks. */
SET_ACC_ZERO()
for (l = 0; l < (temp & (~3)); l+=4)
{
__vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<1]));
__vector_pair rowA2 = *((__vector_pair *)((void *)&AO[(l<<1)+2]));
__vector_pair rowA3 = *((__vector_pair *)((void *)&AO[(l<<1)+4]));
__vector_pair rowA4 = *((__vector_pair *)((void *)&AO[(l<<1)+6]));
vec_t rowB1 = *(vec_t *) & BO[l<<2];
vec_t rowB2 = *(vec_t *) & BO[(l<<2)+2];
vec_t rowB3 = *(vec_t *) & BO[(l<<2)+4];
vec_t rowB4 = *(vec_t *) & BO[(l<<2)+6];
vec_t rowB5 = *(vec_t *) & BO[(l<<2)+8];
vec_t rowB6 = *(vec_t *) & BO[(l<<2)+10];
vec_t rowB7 = *(vec_t *) & BO[(l<<2)+12];
vec_t rowB8 = *(vec_t *) & BO[(l<<2)+14];
__builtin_mma_xvf64gerpp(&acc0, rowA1, rowB1);
__builtin_mma_xvf64gerpp(&acc1, rowA1, rowB2);
__builtin_mma_xvf64gerpp(&acc0, rowA2, rowB3);
__builtin_mma_xvf64gerpp(&acc1, rowA2, rowB4);
__builtin_mma_xvf64gerpp(&acc0, rowA3, rowB5);
__builtin_mma_xvf64gerpp(&acc1, rowA3, rowB6);
__builtin_mma_xvf64gerpp(&acc0, rowA4, rowB7);
__builtin_mma_xvf64gerpp(&acc1, rowA4, rowB8);
}
for (l = (temp & (~3)); l < temp; ++l)
{
__vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<1]));
vec_t rowB1 = *(vec_t *) & BO[l<<2];
vec_t rowB2 = *(vec_t *) & BO[(l<<2)+2];
__builtin_mma_xvf64gerpp(&acc0, rowA1, rowB1);
__builtin_mma_xvf64gerpp(&acc1, rowA1, rowB2);
}
/* SAVE_ACC_COMPLEX_12 is defined earlier in the file. */
SAVE_ACC_COMPLEX_12
AO += temp << 1;
BO += temp << 2;
CO += 2;
#if defined(TRMMKERNEL)
REFRESH_AFTER_SAVE (1, 2)
#endif
}
#if defined(TRMMKERNEL) && !defined(LEFT)
off += 2; // number of values in A
#endif
/* Next pair of B columns: 2 columns * k steps * 2 FLOATs. */
B += k << 2;
}
/* ---- leftover single column ---- */
if (n & 1)
{
#if defined(TRMMKERNEL) && defined(LEFT)
off = offset;
#endif
AO = A;
CO = C;
/* Step C past this one column (2*ldc FLOATs). */
C += ldc<<1;
/* 8 x 1 tiles: 16 FLOATs of A and 2 FLOATs of B per k step. */
for (i = 0; i < (m >> 3); i++)
{
#if defined(TRMMKERNEL)
REFRESH_POINTERS (8, 1)
#else
BO = B;
temp = k;
#endif
SET_ACC_ZERO()
/* k loop unrolled by two; both steps accumulate into acc0..acc3. */
for (l = 0; l < (temp & (~1)); l+=2)
{
__vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<4]));
__vector_pair rowA2 = *((__vector_pair *)((void *)&AO[(l<<4)+4]));
__vector_pair rowA3 = *((__vector_pair *)((void *)&AO[(l<<4)+8]));
__vector_pair rowA4 = *((__vector_pair *)((void *)&AO[(l<<4)+12]));
__vector_pair rowA5 = *((__vector_pair *)((void *)&AO[(l<<4)+16]));
__vector_pair rowA6 = *((__vector_pair *)((void *)&AO[(l<<4)+20]));
__vector_pair rowA7 = *((__vector_pair *)((void *)&AO[(l<<4)+24]));
__vector_pair rowA8 = *((__vector_pair *)((void *)&AO[(l<<4)+28]));
vec_t rowB1 = *(vec_t *) & BO[l<<1];
vec_t rowB2 = *(vec_t *) & BO[(l<<1)+2];
__builtin_mma_xvf64gerpp(&acc0, rowA1, rowB1);
__builtin_mma_xvf64gerpp(&acc1, rowA2, rowB1);
__builtin_mma_xvf64gerpp(&acc2, rowA3, rowB1);
__builtin_mma_xvf64gerpp(&acc3, rowA4, rowB1);
__builtin_mma_xvf64gerpp(&acc0, rowA5, rowB2);
__builtin_mma_xvf64gerpp(&acc1, rowA6, rowB2);
__builtin_mma_xvf64gerpp(&acc2, rowA7, rowB2);
__builtin_mma_xvf64gerpp(&acc3, rowA8, rowB2);
}
for (l = (temp & (~1)); l < temp; ++l)
{
__vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<4]));
__vector_pair rowA2 = *((__vector_pair *)((void *)&AO[(l<<4)+4]));
__vector_pair rowA3 = *((__vector_pair *)((void *)&AO[(l<<4)+8]));
__vector_pair rowA4 = *((__vector_pair *)((void *)&AO[(l<<4)+12]));
vec_t rowB1 = *(vec_t *) & BO[l<<1];
__builtin_mma_xvf64gerpp(&acc0, rowA1, rowB1);
__builtin_mma_xvf64gerpp(&acc1, rowA2, rowB1);
__builtin_mma_xvf64gerpp(&acc2, rowA3, rowB1);
__builtin_mma_xvf64gerpp(&acc3, rowA4, rowB1);
}
/* The save macro also folds in the values disassembled from acc4..acc7,
 * which this tile never accumulates into after SET_ACC_ZERO(). */
SAVE_ACC_COMPLEX_21_4
AO += temp << 4;
BO += temp << 1;
CO += 16;
#if defined(TRMMKERNEL)
REFRESH_AFTER_SAVE (8, 1)
#endif
}
/* 4 x 1 tile: 8 FLOATs of A and 2 FLOATs of B per k step. */
if (m & 4)
{
#if defined(TRMMKERNEL)
REFRESH_POINTERS (4, 1)
#else
BO = B;
temp = k;
#endif
SET_ACC_ZERO()
/* k loop unrolled by four; each step gets its own accumulator pair
 * (acc0/1, acc2/3, acc4/5, acc6/7).  The sum across pairs is left to
 * SAVE_ACC_COMPLEX_21_2, whose definition starts earlier in the file. */
for (l = 0; l < (temp & (~3)); l+=4)
{
__vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<3]));
__vector_pair rowA2 = *((__vector_pair *)((void *)&AO[(l<<3)+4]));
__vector_pair rowA3 = *((__vector_pair *)((void *)&AO[(l<<3)+8]));
__vector_pair rowA4 = *((__vector_pair *)((void *)&AO[(l<<3)+12]));
__vector_pair rowA5 = *((__vector_pair *)((void *)&AO[(l<<3)+16]));
__vector_pair rowA6 = *((__vector_pair *)((void *)&AO[(l<<3)+20]));
__vector_pair rowA7 = *((__vector_pair *)((void *)&AO[(l<<3)+24]));
__vector_pair rowA8 = *((__vector_pair *)((void *)&AO[(l<<3)+28]));
vec_t rowB1 = *(vec_t *) & BO[l<<1];
vec_t rowB2 = *(vec_t *) & BO[(l<<1)+2];
vec_t rowB3 = *(vec_t *) & BO[(l<<1)+4];
vec_t rowB4 = *(vec_t *) & BO[(l<<1)+6];
__builtin_mma_xvf64gerpp(&acc0, rowA1, rowB1);
__builtin_mma_xvf64gerpp(&acc1, rowA2, rowB1);
__builtin_mma_xvf64gerpp(&acc2, rowA3, rowB2);
__builtin_mma_xvf64gerpp(&acc3, rowA4, rowB2);
__builtin_mma_xvf64gerpp(&acc4, rowA5, rowB3);
__builtin_mma_xvf64gerpp(&acc5, rowA6, rowB3);
__builtin_mma_xvf64gerpp(&acc6, rowA7, rowB4);
__builtin_mma_xvf64gerpp(&acc7, rowA8, rowB4);
}
for (l = (temp & (~3)); l < temp; ++l)
{
__vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<3]));
__vector_pair rowA2 = *((__vector_pair *)((void *)&AO[(l<<3)+4]));
vec_t rowB1 = *(vec_t *) & BO[l<<1];
__builtin_mma_xvf64gerpp(&acc0, rowA1, rowB1);
__builtin_mma_xvf64gerpp(&acc1, rowA2, rowB1);
}
SAVE_ACC_COMPLEX_21_2
AO += temp << 3;
BO += temp << 1;
CO += 8;
#if defined(TRMMKERNEL)
REFRESH_AFTER_SAVE (4, 1)
#endif
} if (m & 2)
{
/* 2 x 1 tile: 4 FLOATs of A and 2 FLOATs of B per k step. */
#if defined(TRMMKERNEL)
REFRESH_POINTERS (2, 1)
#else
BO = B;
temp = k;
#endif
SET_ACC_ZERO()
/* k loop unrolled by eight, one accumulator per step; the leftover loop
 * below accumulates into acc0 only. */
for (l = 0; l < (temp & (~7)); l+=8)
{
__vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<2]));
__vector_pair rowA2 = *((__vector_pair *)((void *)&AO[(l<<2)+4]));
__vector_pair rowA3 = *((__vector_pair *)((void *)&AO[(l<<2)+8]));
__vector_pair rowA4 = *((__vector_pair *)((void *)&AO[(l<<2)+12]));
__vector_pair rowA5 = *((__vector_pair *)((void *)&AO[(l<<2)+16]));
__vector_pair rowA6 = *((__vector_pair *)((void *)&AO[(l<<2)+20]));
__vector_pair rowA7 = *((__vector_pair *)((void *)&AO[(l<<2)+24]));
__vector_pair rowA8 = *((__vector_pair *)((void *)&AO[(l<<2)+28]));
vec_t rowB1 = *(vec_t *) & BO[l<<1];
vec_t rowB2 = *(vec_t *) & BO[(l<<1)+2];
vec_t rowB3 = *(vec_t *) & BO[(l<<1)+4];
vec_t rowB4 = *(vec_t *) & BO[(l<<1)+6];
vec_t rowB5 = *(vec_t *) & BO[(l<<1)+8];
vec_t rowB6 = *(vec_t *) & BO[(l<<1)+10];
vec_t rowB7 = *(vec_t *) & BO[(l<<1)+12];
vec_t rowB8 = *(vec_t *) & BO[(l<<1)+14];
__builtin_mma_xvf64gerpp(&acc0, rowA1, rowB1);
__builtin_mma_xvf64gerpp(&acc1, rowA2, rowB2);
__builtin_mma_xvf64gerpp(&acc2, rowA3, rowB3);
__builtin_mma_xvf64gerpp(&acc3, rowA4, rowB4);
__builtin_mma_xvf64gerpp(&acc4, rowA5, rowB5);
__builtin_mma_xvf64gerpp(&acc5, rowA6, rowB6);
__builtin_mma_xvf64gerpp(&acc6, rowA7, rowB7);
__builtin_mma_xvf64gerpp(&acc7, rowA8, rowB8);
}
for (l = (temp & (~7)); l < temp; ++l)
{
__vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<2]));
vec_t rowB1 = *(vec_t *) & BO[l<<1];
__builtin_mma_xvf64gerpp(&acc0, rowA1, rowB1);
}
/* SAVE_ACC_COMPLEX_21_1 is defined earlier in the file. */
SAVE_ACC_COMPLEX_21_1
AO += temp << 2;
BO += temp << 1;
CO += 4;
#if defined(TRMMKERNEL)
REFRESH_AFTER_SAVE (2, 1)
#endif
}
/* 1 x 1 tile: 2 FLOATs of A and 2 FLOATs of B per k step. */
if (m & 1)
{
#if defined(TRMMKERNEL)
REFRESH_POINTERS (1, 1)
#else
BO = B;
temp = k;
#endif
// RIP OUT MMA STUFF!
/* NOTE(review): same concern as the 1 x 2 tile - the __vector_pair loads
 * are spaced 2 FLOATs apart, so the load for the last k step appears to
 * read beyond this tile's A data.  Confirm before relying on it. */
SET_ACC_ZERO()
for (l = 0; l < (temp & (~7)); l+=8)
{
__vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<1]));
__vector_pair rowA2 = *((__vector_pair *)((void *)&AO[(l<<1)+2]));
__vector_pair rowA3 = *((__vector_pair *)((void *)&AO[(l<<1)+4]));
__vector_pair rowA4 = *((__vector_pair *)((void *)&AO[(l<<1)+6]));
__vector_pair rowA5 = *((__vector_pair *)((void *)&AO[(l<<1)+8]));
__vector_pair rowA6 = *((__vector_pair *)((void *)&AO[(l<<1)+10]));
__vector_pair rowA7 = *((__vector_pair *)((void *)&AO[(l<<1)+12]));
__vector_pair rowA8 = *((__vector_pair *)((void *)&AO[(l<<1)+14]));
vec_t rowB1 = *(vec_t *) & BO[l<<1];
vec_t rowB2 = *(vec_t *) & BO[(l<<1)+2];
vec_t rowB3 = *(vec_t *) & BO[(l<<1)+4];
vec_t rowB4 = *(vec_t *) & BO[(l<<1)+6];
vec_t rowB5 = *(vec_t *) & BO[(l<<1)+8];
vec_t rowB6 = *(vec_t *) & BO[(l<<1)+10];
vec_t rowB7 = *(vec_t *) & BO[(l<<1)+12];
vec_t rowB8 = *(vec_t *) & BO[(l<<1)+14];
__builtin_mma_xvf64gerpp(&acc0, rowA1, rowB1);
__builtin_mma_xvf64gerpp(&acc1, rowA2, rowB2);
__builtin_mma_xvf64gerpp(&acc2, rowA3, rowB3);
__builtin_mma_xvf64gerpp(&acc3, rowA4, rowB4);
__builtin_mma_xvf64gerpp(&acc4, rowA5, rowB5);
__builtin_mma_xvf64gerpp(&acc5, rowA6, rowB6);
__builtin_mma_xvf64gerpp(&acc6, rowA7, rowB7);
__builtin_mma_xvf64gerpp(&acc7, rowA8, rowB8);
}
for (l = (temp & (~7)); l < temp; ++l)
{
__vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<1]));
vec_t rowB1 = *(vec_t *) & BO[l<<1];
__builtin_mma_xvf64gerpp(&acc0, rowA1, rowB1);
}
/* SAVE_ACC_COMPLEX_11 is defined earlier in the file. */
SAVE_ACC_COMPLEX_11
AO += temp << 1;
BO += temp << 1;
CO += 2;
#if defined(TRMMKERNEL)
REFRESH_AFTER_SAVE (1, 1)
#endif
}
#if defined(TRMMKERNEL) && !defined(LEFT)
off += 1; // number of values in A
#endif
/* Step B past this column: k steps * 2 FLOATs. */
B += k << 1;
}
return 0;
}

View File

@ -104,7 +104,7 @@
*
READ( NIN, FMT = * )SUMMRY
READ( NIN, FMT = * )NOUT
OPEN( NOUT, FILE = SUMMRY, STATUS = 'NEW' )
OPEN( NOUT, FILE = SUMMRY, STATUS = 'REPLACE' )
NOUTC = NOUT
*
* Read name and unit number for snapshot output file and open file.
@ -113,7 +113,7 @@
READ( NIN, FMT = * )NTRA
TRACE = NTRA.GE.0
IF( TRACE )THEN
OPEN( NTRA, FILE = SNAPS, STATUS = 'NEW' )
OPEN( NTRA, FILE = SNAPS, STATUS = 'REPLACE' )
END IF
* Read the flag that directs rewinding of the snapshot file.
READ( NIN, FMT = * )REWI
@ -3439,4 +3439,3 @@
* End of XERBLA
*
END

View File

@ -105,7 +105,7 @@
*
READ( NIN, FMT = * )SUMMRY
READ( NIN, FMT = * )NOUT
OPEN( NOUT, FILE = SUMMRY, STATUS = 'NEW' )
OPEN( NOUT, FILE = SUMMRY, STATUS = 'REPLACE' )
NOUTC = NOUT
*
* Read name and unit number for snapshot output file and open file.
@ -114,7 +114,7 @@
READ( NIN, FMT = * )NTRA
TRACE = NTRA.GE.0
IF( TRACE )THEN
OPEN( NTRA, FILE = SNAPS, STATUS = 'NEW' )
OPEN( NTRA, FILE = SNAPS, STATUS = 'REPLACE' )
END IF
* Read the flag that directs rewinding of the snapshot file.
READ( NIN, FMT = * )REWI

View File

@ -81,6 +81,28 @@ static void cgemmt_trusted(char api, enum CBLAS_ORDER order, char uplo, char tra
ldc *= 2;
#ifndef NO_CBLAS
if (order == CblasRowMajor) {
if (uplo == 'U' || uplo == CblasUpper)
{
for (i = 0; i < m; i++)
for (j = i * 2; j < m * 2; j+=2){
data_cgemmt.c_verify[i * ldc + j] =
data_cgemmt.c_gemm[i * ldc + j];
data_cgemmt.c_verify[i * ldc + j + 1] =
data_cgemmt.c_gemm[i * ldc + j + 1];
}
} else {
for (i = 0; i < m; i++)
for (j = 0; j <= i * 2; j+=2){
data_cgemmt.c_verify[i * ldc + j] =
data_cgemmt.c_gemm[i * ldc + j];
data_cgemmt.c_verify[i * ldc + j + 1] =
data_cgemmt.c_gemm[i * ldc + j + 1];
}
}
} else
#endif
if (uplo == 'L' || uplo == CblasLower)
{
for (i = 0; i < m; i++)

View File

@ -77,6 +77,21 @@ static void dgemmt_trusted(char api, enum CBLAS_ORDER order, char uplo, char tra
else
cblas_dgemm(order, transa, transb, m, m, k, alpha, data_dgemmt.a_test, lda,
data_dgemmt.b_test, ldb, beta, data_dgemmt.c_gemm, ldc);
if (order == CblasRowMajor) {
if (uplo == 'U' || uplo == CblasUpper)
{
for (i = 0; i < m; i++)
for (j = i; j < m; j++)
data_dgemmt.c_verify[i * ldc + j] =
data_dgemmt.c_gemm[i * ldc + j];
} else {
for (i = 0; i < m; i++)
for (j = 0; j <= i; j++)
data_dgemmt.c_verify[i * ldc + j] =
data_dgemmt.c_gemm[i * ldc + j];
}
}else
#endif
if (uplo == 'L' || uplo == CblasLower)

View File

@ -77,6 +77,21 @@ static void sgemmt_trusted(char api, enum CBLAS_ORDER order, char uplo, char tra
else
cblas_sgemm(order, transa, transb, m, m, k, alpha, data_sgemmt.a_test, lda,
data_sgemmt.b_test, ldb, beta, data_sgemmt.c_gemm, ldc);
if (order == CblasRowMajor) {
if (uplo == 'U' || uplo == CblasUpper)
{
for (i = 0; i < m; i++)
for (j = i; j < m; j++)
data_sgemmt.c_verify[i * ldc + j] =
data_sgemmt.c_gemm[i * ldc + j];
} else {
for (i = 0; i < m; i++)
for (j = 0; j <= i; j++)
data_sgemmt.c_verify[i * ldc + j] =
data_sgemmt.c_gemm[i * ldc + j];
}
} else
#endif
if (uplo == 'L' || uplo == CblasLower)

View File

@ -80,7 +80,28 @@ static void zgemmt_trusted(char api, enum CBLAS_ORDER order, char uplo, char tra
#endif
ldc *= 2;
#ifndef NO_CBLAS
if (order == CblasRowMajor) {
if (uplo == 'U' || uplo == CblasUpper)
{
for (i = 0; i < m; i++)
for (j = i * 2; j < m * 2; j+=2){
data_zgemmt.c_verify[i * ldc + j] =
data_zgemmt.c_gemm[i * ldc + j];
data_zgemmt.c_verify[i * ldc + j + 1] =
data_zgemmt.c_gemm[i * ldc + j + 1];
}
} else {
for (i = 0; i < m; i++)
for (j = 0; j <= i * 2; j+=2){
data_zgemmt.c_verify[i * ldc + j] =
data_zgemmt.c_gemm[i * ldc + j];
data_zgemmt.c_verify[i * ldc + j + 1] =
data_zgemmt.c_gemm[i * ldc + j + 1];
}
}
}else
#endif
if (uplo == 'L' || uplo == CblasLower)
{
for (i = 0; i < m; i++)