Merge remote-tracking branch 'origin/develop' into vectorizeBF16GEMV

This commit is contained in:
Chip Kerchner 2024-10-11 11:10:20 -05:00
commit c8f53b85ce
32 changed files with 2142 additions and 114 deletions

View File

@ -94,16 +94,8 @@ task:
name: AppleM1/LLVM armv7-androidndk xbuild name: AppleM1/LLVM armv7-androidndk xbuild
compile_script: compile_script:
- brew install --cask android-ndk - brew install --cask android-ndk
- export #PATH=/opt/homebrew/opt/llvm/bin:$PATH
- export #LDFLAGS="-L/opt/homebrew/opt/llvm/lib"
- export #CPPFLAGS="-I/opt/homebrew/opt/llvm/include"
- export ANDROID_NDK_HOME="/opt/homebrew/share/android-ndk" - export ANDROID_NDK_HOME="/opt/homebrew/share/android-ndk"
- ls /opt/homebrew - export CC=/opt/homebrew/share/android-ndk/toolchains/llvm/prebuilt/darwin-x86_64/bin/armv7a-linux-androideabi23-clang
- ls -l /System/Volumes/Data/opt/homebrew/Caskroom/android-ndk
- find /opt/homebrew -name "armv7a-linux-androideabi*-ranlib"
- #export CC=/Applications/Xcode-13.4.1.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang
- #export CFLAGS="-O2 -unwindlib=none -Wno-macro-redefined -isysroot /Applications/Xcode-13.4.1.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS16.0.sdk -arch arm64 -miphoneos-version-min=10.0"
- export CC=/System/Volumes/Data/opt/homebrew/Caskroom/android-ndk/27/AndroidNDK*.app/Contents/NDK/toolchains/llvm/prebuilt/darwin-x86_64/bin/armv7a-linux-androideabi23-clang
- make TARGET=ARMV7 ARM_SOFTFP_ABI=1 NUM_THREADS=32 HOSTCC=clang NOFORTRAN=1 RANLIB="ls -l" - make TARGET=ARMV7 ARM_SOFTFP_ABI=1 NUM_THREADS=32 HOSTCC=clang NOFORTRAN=1 RANLIB="ls -l"
always: always:
config_artifacts: config_artifacts:

View File

@ -95,7 +95,7 @@ if (DYNAMIC_ARCH)
endif () endif ()
if (LOONGARCH64) if (LOONGARCH64)
set(DYNAMIC_CORE LOONGSONGENERIC LOONGSON2K1000 LOONGSON3R5) set(DYNAMIC_CORE LA64_GENERIC LA264 LA464)
endif () endif ()
if (EXISTS ${PROJECT_SOURCE_DIR}/config_kernel.h) if (EXISTS ${PROJECT_SOURCE_DIR}/config_kernel.h)

View File

@ -1349,7 +1349,7 @@ endif ()
"#define DTB_DEFAULT_ENTRIES 128\n" "#define DTB_DEFAULT_ENTRIES 128\n"
"#define DTB_SIZE 4096\n" "#define DTB_SIZE 4096\n"
"#define L2_ASSOCIATIVE 4\n") "#define L2_ASSOCIATIVE 4\n")
elseif ("${TCORE}" STREQUAL "LOONGSONGENERIC") elseif ("${TCORE}" STREQUAL "LA64_GENERIC")
file(APPEND ${TARGET_CONF_TEMP} file(APPEND ${TARGET_CONF_TEMP}
"#define DTB_DEFAULT_ENTRIES 64\n") "#define DTB_DEFAULT_ENTRIES 64\n")
set(SGEMM_UNROLL_M 2) set(SGEMM_UNROLL_M 2)
@ -1364,7 +1364,7 @@ endif ()
set(CGEMM3M_UNROLL_N 8) set(CGEMM3M_UNROLL_N 8)
set(ZGEMM3M_UNROLL_M 2) set(ZGEMM3M_UNROLL_M 2)
set(ZGEMM3M_UNROLL_N 8) set(ZGEMM3M_UNROLL_N 8)
elseif ("${TCORE}" STREQUAL "LOONGSON2K1000") elseif ("${TCORE}" STREQUAL "LA264")
file(APPEND ${TARGET_CONF_TEMP} file(APPEND ${TARGET_CONF_TEMP}
"#define DTB_DEFAULT_ENTRIES 64\n") "#define DTB_DEFAULT_ENTRIES 64\n")
set(HAVE_LSX 1) set(HAVE_LSX 1)
@ -1380,7 +1380,7 @@ endif ()
set(CGEMM3M_UNROLL_N 8) set(CGEMM3M_UNROLL_N 8)
set(ZGEMM3M_UNROLL_M 8) set(ZGEMM3M_UNROLL_M 8)
set(ZGEMM3M_UNROLL_N 4) set(ZGEMM3M_UNROLL_N 4)
elseif ("${TCORE}" STREQUAL "LOONGSON3R5") elseif ("${TCORE}" STREQUAL "LA464")
file(APPEND ${TARGET_CONF_TEMP} file(APPEND ${TARGET_CONF_TEMP}
"#define DTB_DEFAULT_ENTRIES 64\n") "#define DTB_DEFAULT_ENTRIES 64\n")
set(HAVE_LASX 1) set(HAVE_LASX 1)

View File

@ -55,6 +55,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#ifndef ASSEMBLER #ifndef ASSEMBLER
#ifndef NO_AFFINITY
static __inline int WhereAmI(void){ static __inline int WhereAmI(void){
uint64_t ret; uint64_t ret;
__asm__ volatile ( __asm__ volatile (
@ -67,6 +68,7 @@ static __inline int WhereAmI(void){
if ((int)ret <0) ret = 0; if ((int)ret <0) ret = 0;
return (int)ret; return (int)ret;
} }
#endif
static __inline void blas_lock(volatile BLASULONG *address){ static __inline void blas_lock(volatile BLASULONG *address){

View File

@ -1689,6 +1689,7 @@ int get_cpuname(void){
return CPUTYPE_BARCELONA; return CPUTYPE_BARCELONA;
} }
case 10: // Zen3/4 case 10: // Zen3/4
case 11: // Zen5
#ifndef NO_AVX512 #ifndef NO_AVX512
if(support_avx512_bf16()) if(support_avx512_bf16())
return CPUTYPE_COOPERLAKE; return CPUTYPE_COOPERLAKE;
@ -2479,7 +2480,7 @@ int get_coretype(void){
} }
break; break;
} }
} else if (exfamily == 8 || exfamily == 10) { } else if (exfamily == 8 || exfamily == 10 || exfamily == 11) {
switch (model) { switch (model) {
case 1: case 1:
// AMD Ryzen // AMD Ryzen

View File

@ -38,9 +38,12 @@
CALL CHECK1(SFAC) CALL CHECK1(SFAC)
END IF END IF
* -- Print * -- Print
IF (PASS) WRITE (NOUT,99998) IF (PASS) THEN
WRITE (NOUT,99998)
ELSE
CALL ABORT
END IF
20 CONTINUE 20 CONTINUE
STOP
* *
99999 FORMAT (' Complex CBLAS Test Program Results',/1X) 99999 FORMAT (' Complex CBLAS Test Program Results',/1X)
99998 FORMAT (' ----- PASS -----') 99998 FORMAT (' ----- PASS -----')
@ -228,7 +231,7 @@
CALL ITEST1(ICAMAXTEST(N,CX,INCX),ITRUE3(NP1)) CALL ITEST1(ICAMAXTEST(N,CX,INCX),ITRUE3(NP1))
ELSE ELSE
WRITE (NOUT,*) ' Shouldn''t be here in CHECK1' WRITE (NOUT,*) ' Shouldn''t be here in CHECK1'
STOP CALL ABORT
END IF END IF
* *
40 CONTINUE 40 CONTINUE
@ -512,7 +515,7 @@
CALL CTEST(LENY,CY,CT10Y(1,KN,KI),CSIZE3,1.0E0) CALL CTEST(LENY,CY,CT10Y(1,KN,KI),CSIZE3,1.0E0)
ELSE ELSE
WRITE (NOUT,*) ' Shouldn''t be here in CHECK2' WRITE (NOUT,*) ' Shouldn''t be here in CHECK2'
STOP CALL ABORT
END IF END IF
* *
40 CONTINUE 40 CONTINUE

View File

@ -10,7 +10,7 @@
* 'CBLAT2.SNAP' NAME OF SNAPSHOT OUTPUT FILE * 'CBLAT2.SNAP' NAME OF SNAPSHOT OUTPUT FILE
* -1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0) * -1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0)
* F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD. * F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD.
* F LOGICAL FLAG, T TO STOP ON FAILURES. * F LOGICAL FLAG, T TO CALL ABORT ON FAILURES.
* T LOGICAL FLAG, T TO TEST ERROR EXITS. * T LOGICAL FLAG, T TO TEST ERROR EXITS.
* 2 0 TO TEST COLUMN-MAJOR, 1 TO TEST ROW-MAJOR, 2 TO TEST BOTH * 2 0 TO TEST COLUMN-MAJOR, 1 TO TEST ROW-MAJOR, 2 TO TEST BOTH
* 16.0 THRESHOLD VALUE OF TEST RATIO * 16.0 THRESHOLD VALUE OF TEST RATIO
@ -243,7 +243,7 @@
$ GO TO 70 $ GO TO 70
60 CONTINUE 60 CONTINUE
WRITE( NOUT, FMT = 9986 )SNAMET WRITE( NOUT, FMT = 9986 )SNAMET
STOP CALL ABORT
70 LTEST( I ) = LTESTT 70 LTEST( I ) = LTESTT
GO TO 50 GO TO 50
* *
@ -283,7 +283,7 @@
SAME = LCE( YY, YT, N ) SAME = LCE( YY, YT, N )
IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN
WRITE( NOUT, FMT = 9985 )TRANS, SAME, ERR WRITE( NOUT, FMT = 9985 )TRANS, SAME, ERR
STOP CALL ABORT
END IF END IF
TRANS = 'T' TRANS = 'T'
CALL CMVCH( TRANS, N, N, ONE, A, NMAX, X, -1, ZERO, Y, -1, YT, G, CALL CMVCH( TRANS, N, N, ONE, A, NMAX, X, -1, ZERO, Y, -1, YT, G,
@ -291,7 +291,7 @@
SAME = LCE( YY, YT, N ) SAME = LCE( YY, YT, N )
IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN
WRITE( NOUT, FMT = 9985 )TRANS, SAME, ERR WRITE( NOUT, FMT = 9985 )TRANS, SAME, ERR
STOP CALL ABORT
END IF END IF
* *
* Test each subroutine in turn. * Test each subroutine in turn.
@ -418,7 +418,9 @@
IF( TRACE ) IF( TRACE )
$ CLOSE ( NTRA ) $ CLOSE ( NTRA )
CLOSE ( NOUT ) CLOSE ( NOUT )
STOP IF( FATAL ) THEN
CALL ABORT
END IF
* *
10002 FORMAT( ' COLUMN-MAJOR AND ROW-MAJOR DATA LAYOUTS ARE TESTED' ) 10002 FORMAT( ' COLUMN-MAJOR AND ROW-MAJOR DATA LAYOUTS ARE TESTED' )
10001 FORMAT( ' ROW-MAJOR DATA LAYOUT IS TESTED' ) 10001 FORMAT( ' ROW-MAJOR DATA LAYOUT IS TESTED' )

View File

@ -10,7 +10,7 @@
* 'CBLAT3.SNAP' NAME OF SNAPSHOT OUTPUT FILE * 'CBLAT3.SNAP' NAME OF SNAPSHOT OUTPUT FILE
* -1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0) * -1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0)
* F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD. * F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD.
* F LOGICAL FLAG, T TO STOP ON FAILURES. * F LOGICAL FLAG, T TO CALL ABORT ON FAILURES.
* T LOGICAL FLAG, T TO TEST ERROR EXITS. * T LOGICAL FLAG, T TO TEST ERROR EXITS.
* 2 0 TO TEST COLUMN-MAJOR, 1 TO TEST ROW-MAJOR, 2 TO TEST BOTH * 2 0 TO TEST COLUMN-MAJOR, 1 TO TEST ROW-MAJOR, 2 TO TEST BOTH
* 16.0 THRESHOLD VALUE OF TEST RATIO * 16.0 THRESHOLD VALUE OF TEST RATIO
@ -194,7 +194,7 @@
$ GO TO 50 $ GO TO 50
40 CONTINUE 40 CONTINUE
WRITE( NOUT, FMT = 9990 )SNAMET WRITE( NOUT, FMT = 9990 )SNAMET
STOP CALL ABORT
50 LTEST( I ) = LTESTT 50 LTEST( I ) = LTESTT
GO TO 30 GO TO 30
* *
@ -237,7 +237,7 @@
SAME = LCE( CC, CT, N ) SAME = LCE( CC, CT, N )
IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN
WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR
STOP CALL ABORT
END IF END IF
TRANSB = 'C' TRANSB = 'C'
CALL CMMCH( TRANSA, TRANSB, N, 1, N, ONE, AB, NMAX, CALL CMMCH( TRANSA, TRANSB, N, 1, N, ONE, AB, NMAX,
@ -246,7 +246,7 @@
SAME = LCE( CC, CT, N ) SAME = LCE( CC, CT, N )
IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN
WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR
STOP CALL ABORT
END IF END IF
DO 120 J = 1, N DO 120 J = 1, N
AB( J, NMAX + 1 ) = N - J + 1 AB( J, NMAX + 1 ) = N - J + 1
@ -264,7 +264,7 @@
SAME = LCE( CC, CT, N ) SAME = LCE( CC, CT, N )
IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN
WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR
STOP CALL ABORT
END IF END IF
TRANSB = 'C' TRANSB = 'C'
CALL CMMCH( TRANSA, TRANSB, N, 1, N, ONE, AB, NMAX, CALL CMMCH( TRANSA, TRANSB, N, 1, N, ONE, AB, NMAX,
@ -273,7 +273,7 @@
SAME = LCE( CC, CT, N ) SAME = LCE( CC, CT, N )
IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN
WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR
STOP CALL ABORT
END IF END IF
* *
* Test each subroutine in turn. * Test each subroutine in turn.
@ -385,7 +385,9 @@
IF( TRACE ) IF( TRACE )
$ CLOSE ( NTRA ) $ CLOSE ( NTRA )
CLOSE ( NOUT ) CLOSE ( NOUT )
STOP IF( FATAL ) THEN
CALL ABORT
END IF
* *
10002 FORMAT( ' COLUMN-MAJOR AND ROW-MAJOR DATA LAYOUTS ARE TESTED' ) 10002 FORMAT( ' COLUMN-MAJOR AND ROW-MAJOR DATA LAYOUTS ARE TESTED' )
10001 FORMAT(' ROW-MAJOR DATA LAYOUT IS TESTED' ) 10001 FORMAT(' ROW-MAJOR DATA LAYOUT IS TESTED' )

View File

@ -10,7 +10,7 @@
* 'CBLAT3.SNAP' NAME OF SNAPSHOT OUTPUT FILE * 'CBLAT3.SNAP' NAME OF SNAPSHOT OUTPUT FILE
* -1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0) * -1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0)
* F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD. * F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD.
* F LOGICAL FLAG, T TO STOP ON FAILURES. * F LOGICAL FLAG, T TO CALL ABORT ON FAILURES.
* T LOGICAL FLAG, T TO TEST ERROR EXITS. * T LOGICAL FLAG, T TO TEST ERROR EXITS.
* 2 0 TO TEST COLUMN-MAJOR, 1 TO TEST ROW-MAJOR, 2 TO TEST BOTH * 2 0 TO TEST COLUMN-MAJOR, 1 TO TEST ROW-MAJOR, 2 TO TEST BOTH
* 16.0 THRESHOLD VALUE OF TEST RATIO * 16.0 THRESHOLD VALUE OF TEST RATIO
@ -194,7 +194,7 @@
$ GO TO 50 $ GO TO 50
40 CONTINUE 40 CONTINUE
WRITE( NOUT, FMT = 9990 )SNAMET WRITE( NOUT, FMT = 9990 )SNAMET
STOP CALL ABORT
50 LTEST( I ) = LTESTT 50 LTEST( I ) = LTESTT
GO TO 30 GO TO 30
* *
@ -237,7 +237,7 @@
SAME = LCE( CC, CT, N ) SAME = LCE( CC, CT, N )
IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN
WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR
STOP CALL ABORT
END IF END IF
TRANSB = 'C' TRANSB = 'C'
CALL CMMCH( TRANSA, TRANSB, N, 1, N, ONE, AB, NMAX, CALL CMMCH( TRANSA, TRANSB, N, 1, N, ONE, AB, NMAX,
@ -246,7 +246,7 @@
SAME = LCE( CC, CT, N ) SAME = LCE( CC, CT, N )
IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN
WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR
STOP CALL ABORT
END IF END IF
DO 120 J = 1, N DO 120 J = 1, N
AB( J, NMAX + 1 ) = N - J + 1 AB( J, NMAX + 1 ) = N - J + 1
@ -264,7 +264,7 @@
SAME = LCE( CC, CT, N ) SAME = LCE( CC, CT, N )
IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN
WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR
STOP CALL ABORT
END IF END IF
TRANSB = 'C' TRANSB = 'C'
CALL CMMCH( TRANSA, TRANSB, N, 1, N, ONE, AB, NMAX, CALL CMMCH( TRANSA, TRANSB, N, 1, N, ONE, AB, NMAX,
@ -273,7 +273,7 @@
SAME = LCE( CC, CT, N ) SAME = LCE( CC, CT, N )
IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN
WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR
STOP CALL ABORT
END IF END IF
* *
* Test each subroutine in turn. * Test each subroutine in turn.
@ -385,7 +385,9 @@
IF( TRACE ) IF( TRACE )
$ CLOSE ( NTRA ) $ CLOSE ( NTRA )
CLOSE ( NOUT ) CLOSE ( NOUT )
STOP IF( FATAL ) THEN
CALL ABORT
END IF
* *
10002 FORMAT( ' COLUMN-MAJOR AND ROW-MAJOR DATA LAYOUTS ARE TESTED' ) 10002 FORMAT( ' COLUMN-MAJOR AND ROW-MAJOR DATA LAYOUTS ARE TESTED' )
10001 FORMAT(' ROW-MAJOR DATA LAYOUT IS TESTED' ) 10001 FORMAT(' ROW-MAJOR DATA LAYOUT IS TESTED' )

View File

@ -44,9 +44,12 @@
CALL CHECK3(SFAC) CALL CHECK3(SFAC)
END IF END IF
* -- Print * -- Print
IF (PASS) WRITE (NOUT,99998) IF (PASS) THEN
WRITE (NOUT,99998)
ELSE
CALL ABORT
END IF
20 CONTINUE 20 CONTINUE
STOP
* *
99999 FORMAT (' Real CBLAS Test Program Results',/1X) 99999 FORMAT (' Real CBLAS Test Program Results',/1X)
99998 FORMAT (' ----- PASS -----') 99998 FORMAT (' ----- PASS -----')
@ -136,7 +139,7 @@
CALL STEST1(SS,DS1(K),DS1(K),SFAC) CALL STEST1(SS,DS1(K),DS1(K),SFAC)
ELSE ELSE
WRITE (NOUT,*) ' Shouldn''t be here in CHECK0' WRITE (NOUT,*) ' Shouldn''t be here in CHECK0'
STOP CALL ABORT
END IF END IF
20 CONTINUE 20 CONTINUE
40 RETURN 40 RETURN
@ -229,7 +232,7 @@
CALL ITEST1(IDAMAXTEST(N,SX,INCX),ITRUE2(NP1)) CALL ITEST1(IDAMAXTEST(N,SX,INCX),ITRUE2(NP1))
ELSE ELSE
WRITE (NOUT,*) ' Shouldn''t be here in CHECK1' WRITE (NOUT,*) ' Shouldn''t be here in CHECK1'
STOP CALL ABORT
END IF END IF
60 CONTINUE 60 CONTINUE
80 CONTINUE 80 CONTINUE
@ -384,7 +387,7 @@
CALL STEST(LENY,SY,STY,SSIZE2(1,1),1.0D0) CALL STEST(LENY,SY,STY,SSIZE2(1,1),1.0D0)
ELSE ELSE
WRITE (NOUT,*) ' Shouldn''t be here in CHECK2' WRITE (NOUT,*) ' Shouldn''t be here in CHECK2'
STOP CALL ABORT
END IF END IF
100 CONTINUE 100 CONTINUE
120 CONTINUE 120 CONTINUE
@ -472,7 +475,7 @@
70 CONTINUE 70 CONTINUE
ELSE ELSE
WRITE (NOUT,*) ' Shouldn''t be here in CHECK3' WRITE (NOUT,*) ' Shouldn''t be here in CHECK3'
STOP CALL ABORT
END IF END IF
40 CONTINUE 40 CONTINUE
60 CONTINUE 60 CONTINUE

View File

@ -10,7 +10,7 @@
* 'DBLAT2.SNAP' NAME OF SNAPSHOT OUTPUT FILE * 'DBLAT2.SNAP' NAME OF SNAPSHOT OUTPUT FILE
* -1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0) * -1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0)
* F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD. * F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD.
* F LOGICAL FLAG, T TO STOP ON FAILURES. * F LOGICAL FLAG, T TO CALL ABORT ON FAILURES.
* T LOGICAL FLAG, T TO TEST ERROR EXITS. * T LOGICAL FLAG, T TO TEST ERROR EXITS.
* 2 0 TO TEST COLUMN-MAJOR, 1 TO TEST ROW-MAJOR, 2 TO TEST BOTH * 2 0 TO TEST COLUMN-MAJOR, 1 TO TEST ROW-MAJOR, 2 TO TEST BOTH
* 16.0 THRESHOLD VALUE OF TEST RATIO * 16.0 THRESHOLD VALUE OF TEST RATIO
@ -239,7 +239,7 @@
$ GO TO 70 $ GO TO 70
60 CONTINUE 60 CONTINUE
WRITE( NOUT, FMT = 9986 )SNAMET WRITE( NOUT, FMT = 9986 )SNAMET
STOP CALL ABORT
70 LTEST( I ) = LTESTT 70 LTEST( I ) = LTESTT
GO TO 50 GO TO 50
* *
@ -279,7 +279,7 @@
SAME = LDE( YY, YT, N ) SAME = LDE( YY, YT, N )
IF( .NOT.SAME.OR.ERR.NE.ZERO )THEN IF( .NOT.SAME.OR.ERR.NE.ZERO )THEN
WRITE( NOUT, FMT = 9985 )TRANS, SAME, ERR WRITE( NOUT, FMT = 9985 )TRANS, SAME, ERR
STOP CALL ABORT
END IF END IF
TRANS = 'T' TRANS = 'T'
CALL DMVCH( TRANS, N, N, ONE, A, NMAX, X, -1, ZERO, Y, -1, YT, G, CALL DMVCH( TRANS, N, N, ONE, A, NMAX, X, -1, ZERO, Y, -1, YT, G,
@ -287,7 +287,7 @@
SAME = LDE( YY, YT, N ) SAME = LDE( YY, YT, N )
IF( .NOT.SAME.OR.ERR.NE.ZERO )THEN IF( .NOT.SAME.OR.ERR.NE.ZERO )THEN
WRITE( NOUT, FMT = 9985 )TRANS, SAME, ERR WRITE( NOUT, FMT = 9985 )TRANS, SAME, ERR
STOP CALL ABORT
END IF END IF
* *
* Test each subroutine in turn. * Test each subroutine in turn.
@ -414,7 +414,9 @@
IF( TRACE ) IF( TRACE )
$ CLOSE ( NTRA ) $ CLOSE ( NTRA )
CLOSE ( NOUT ) CLOSE ( NOUT )
STOP IF( FATAL ) THEN
CALL ABORT
END IF
* *
10002 FORMAT( ' COLUMN-MAJOR AND ROW-MAJOR DATA LAYOUTS ARE TESTED' ) 10002 FORMAT( ' COLUMN-MAJOR AND ROW-MAJOR DATA LAYOUTS ARE TESTED' )
10001 FORMAT( ' ROW-MAJOR DATA LAYOUT IS TESTED' ) 10001 FORMAT( ' ROW-MAJOR DATA LAYOUT IS TESTED' )

View File

@ -10,7 +10,7 @@
* 'DBLAT3.SNAP' NAME OF SNAPSHOT OUTPUT FILE * 'DBLAT3.SNAP' NAME OF SNAPSHOT OUTPUT FILE
* -1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0) * -1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0)
* F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD. * F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD.
* F LOGICAL FLAG, T TO STOP ON FAILURES. * F LOGICAL FLAG, T TO CALL ABORT ON FAILURES.
* T LOGICAL FLAG, T TO TEST ERROR EXITS. * T LOGICAL FLAG, T TO TEST ERROR EXITS.
* 2 0 TO TEST COLUMN-MAJOR, 1 TO TEST ROW-MAJOR, 2 TO TEST BOTH * 2 0 TO TEST COLUMN-MAJOR, 1 TO TEST ROW-MAJOR, 2 TO TEST BOTH
* 16.0 THRESHOLD VALUE OF TEST RATIO * 16.0 THRESHOLD VALUE OF TEST RATIO
@ -189,7 +189,7 @@
$ GO TO 50 $ GO TO 50
40 CONTINUE 40 CONTINUE
WRITE( NOUT, FMT = 9990 )SNAMET WRITE( NOUT, FMT = 9990 )SNAMET
STOP CALL ABORT
50 LTEST( I ) = LTESTT 50 LTEST( I ) = LTESTT
GO TO 30 GO TO 30
* *
@ -232,7 +232,7 @@
SAME = LDE( CC, CT, N ) SAME = LDE( CC, CT, N )
IF( .NOT.SAME.OR.ERR.NE.ZERO )THEN IF( .NOT.SAME.OR.ERR.NE.ZERO )THEN
WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR
STOP CALL ABORT
END IF END IF
TRANSB = 'T' TRANSB = 'T'
CALL DMMCH( TRANSA, TRANSB, N, 1, N, ONE, AB, NMAX, CALL DMMCH( TRANSA, TRANSB, N, 1, N, ONE, AB, NMAX,
@ -241,7 +241,7 @@
SAME = LDE( CC, CT, N ) SAME = LDE( CC, CT, N )
IF( .NOT.SAME.OR.ERR.NE.ZERO )THEN IF( .NOT.SAME.OR.ERR.NE.ZERO )THEN
WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR
STOP CALL ABORT
END IF END IF
DO 120 J = 1, N DO 120 J = 1, N
AB( J, NMAX + 1 ) = N - J + 1 AB( J, NMAX + 1 ) = N - J + 1
@ -259,7 +259,7 @@
SAME = LDE( CC, CT, N ) SAME = LDE( CC, CT, N )
IF( .NOT.SAME.OR.ERR.NE.ZERO )THEN IF( .NOT.SAME.OR.ERR.NE.ZERO )THEN
WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR
STOP CALL ABORT
END IF END IF
TRANSB = 'T' TRANSB = 'T'
CALL DMMCH( TRANSA, TRANSB, N, 1, N, ONE, AB, NMAX, CALL DMMCH( TRANSA, TRANSB, N, 1, N, ONE, AB, NMAX,
@ -268,7 +268,7 @@
SAME = LDE( CC, CT, N ) SAME = LDE( CC, CT, N )
IF( .NOT.SAME.OR.ERR.NE.ZERO )THEN IF( .NOT.SAME.OR.ERR.NE.ZERO )THEN
WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR
STOP CALL ABORT
END IF END IF
* *
* Test each subroutine in turn. * Test each subroutine in turn.
@ -379,7 +379,9 @@
IF( TRACE ) IF( TRACE )
$ CLOSE ( NTRA ) $ CLOSE ( NTRA )
CLOSE ( NOUT ) CLOSE ( NOUT )
STOP IF( FATAL ) THEN
CALL ABORT
END IF
* *
10002 FORMAT( ' COLUMN-MAJOR AND ROW-MAJOR DATA LAYOUTS ARE TESTED' ) 10002 FORMAT( ' COLUMN-MAJOR AND ROW-MAJOR DATA LAYOUTS ARE TESTED' )
10001 FORMAT( ' ROW-MAJOR DATA LAYOUT IS TESTED' ) 10001 FORMAT( ' ROW-MAJOR DATA LAYOUT IS TESTED' )

View File

@ -44,9 +44,12 @@
CALL CHECK3(SFAC) CALL CHECK3(SFAC)
END IF END IF
* -- Print * -- Print
IF (PASS) WRITE (NOUT,99998) IF (PASS) THEN
WRITE (NOUT,99998)
ELSE
CALL ABORT
END IF
20 CONTINUE 20 CONTINUE
STOP
* *
99999 FORMAT (' Real CBLAS Test Program Results',/1X) 99999 FORMAT (' Real CBLAS Test Program Results',/1X)
99998 FORMAT (' ----- PASS -----') 99998 FORMAT (' ----- PASS -----')
@ -136,7 +139,7 @@
CALL STEST1(SS,DS1(K),DS1(K),SFAC) CALL STEST1(SS,DS1(K),DS1(K),SFAC)
ELSE ELSE
WRITE (NOUT,*) ' Shouldn''t be here in CHECK0' WRITE (NOUT,*) ' Shouldn''t be here in CHECK0'
STOP CALL ABORT
END IF END IF
20 CONTINUE 20 CONTINUE
40 RETURN 40 RETURN
@ -229,7 +232,7 @@
CALL ITEST1(ISAMAXTEST(N,SX,INCX),ITRUE2(NP1)) CALL ITEST1(ISAMAXTEST(N,SX,INCX),ITRUE2(NP1))
ELSE ELSE
WRITE (NOUT,*) ' Shouldn''t be here in CHECK1' WRITE (NOUT,*) ' Shouldn''t be here in CHECK1'
STOP CALL ABORT
END IF END IF
60 CONTINUE 60 CONTINUE
80 CONTINUE 80 CONTINUE
@ -384,7 +387,7 @@
CALL STEST(LENY,SY,STY,SSIZE2(1,1),1.0E0) CALL STEST(LENY,SY,STY,SSIZE2(1,1),1.0E0)
ELSE ELSE
WRITE (NOUT,*) ' Shouldn''t be here in CHECK2' WRITE (NOUT,*) ' Shouldn''t be here in CHECK2'
STOP CALL ABORT
END IF END IF
100 CONTINUE 100 CONTINUE
120 CONTINUE 120 CONTINUE
@ -479,7 +482,7 @@
70 CONTINUE 70 CONTINUE
ELSE ELSE
WRITE (NOUT,*) ' Shouldn''t be here in CHECK3' WRITE (NOUT,*) ' Shouldn''t be here in CHECK3'
STOP CALL ABORT
END IF END IF
40 CONTINUE 40 CONTINUE
60 CONTINUE 60 CONTINUE

View File

@ -10,7 +10,7 @@
* 'SBLAT2.SNAP' NAME OF SNAPSHOT OUTPUT FILE * 'SBLAT2.SNAP' NAME OF SNAPSHOT OUTPUT FILE
* -1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0) * -1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0)
* F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD. * F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD.
* F LOGICAL FLAG, T TO STOP ON FAILURES. * F LOGICAL FLAG, T TO CALL ABORT ON FAILURES.
* T LOGICAL FLAG, T TO TEST ERROR EXITS. * T LOGICAL FLAG, T TO TEST ERROR EXITS.
* 2 0 TO TEST COLUMN-MAJOR, 1 TO TEST ROW-MAJOR, 2 TO TEST BOTH * 2 0 TO TEST COLUMN-MAJOR, 1 TO TEST ROW-MAJOR, 2 TO TEST BOTH
* 16.0 THRESHOLD VALUE OF TEST RATIO * 16.0 THRESHOLD VALUE OF TEST RATIO
@ -239,7 +239,7 @@
$ GO TO 70 $ GO TO 70
60 CONTINUE 60 CONTINUE
WRITE( NOUT, FMT = 9986 )SNAMET WRITE( NOUT, FMT = 9986 )SNAMET
STOP CALL ABORT
70 LTEST( I ) = LTESTT 70 LTEST( I ) = LTESTT
GO TO 50 GO TO 50
* *
@ -279,7 +279,7 @@
SAME = LSE( YY, YT, N ) SAME = LSE( YY, YT, N )
IF( .NOT.SAME.OR.ERR.NE.ZERO )THEN IF( .NOT.SAME.OR.ERR.NE.ZERO )THEN
WRITE( NOUT, FMT = 9985 )TRANS, SAME, ERR WRITE( NOUT, FMT = 9985 )TRANS, SAME, ERR
STOP CALL ABORT
END IF END IF
TRANS = 'T' TRANS = 'T'
CALL SMVCH( TRANS, N, N, ONE, A, NMAX, X, -1, ZERO, Y, -1, YT, G, CALL SMVCH( TRANS, N, N, ONE, A, NMAX, X, -1, ZERO, Y, -1, YT, G,
@ -287,7 +287,7 @@
SAME = LSE( YY, YT, N ) SAME = LSE( YY, YT, N )
IF( .NOT.SAME.OR.ERR.NE.ZERO )THEN IF( .NOT.SAME.OR.ERR.NE.ZERO )THEN
WRITE( NOUT, FMT = 9985 )TRANS, SAME, ERR WRITE( NOUT, FMT = 9985 )TRANS, SAME, ERR
STOP CALL ABORT
END IF END IF
* *
* Test each subroutine in turn. * Test each subroutine in turn.
@ -414,7 +414,9 @@
IF( TRACE ) IF( TRACE )
$ CLOSE ( NTRA ) $ CLOSE ( NTRA )
CLOSE ( NOUT ) CLOSE ( NOUT )
STOP IF( FATAL ) THEN
CALL ABORT
END IF
* *
10002 FORMAT( ' COLUMN-MAJOR AND ROW-MAJOR DATA LAYOUTS ARE TESTED' ) 10002 FORMAT( ' COLUMN-MAJOR AND ROW-MAJOR DATA LAYOUTS ARE TESTED' )
10001 FORMAT( ' ROW-MAJOR DATA LAYOUT IS TESTED' ) 10001 FORMAT( ' ROW-MAJOR DATA LAYOUT IS TESTED' )

View File

@ -10,7 +10,7 @@
* 'SBLAT3.SNAP' NAME OF SNAPSHOT OUTPUT FILE * 'SBLAT3.SNAP' NAME OF SNAPSHOT OUTPUT FILE
* -1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0) * -1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0)
* F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD. * F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD.
* F LOGICAL FLAG, T TO STOP ON FAILURES. * F LOGICAL FLAG, T TO CALL ABORT ON FAILURES.
* T LOGICAL FLAG, T TO TEST ERROR EXITS. * T LOGICAL FLAG, T TO TEST ERROR EXITS.
* 2 0 TO TEST COLUMN-MAJOR, 1 TO TEST ROW-MAJOR, 2 TO TEST BOTH * 2 0 TO TEST COLUMN-MAJOR, 1 TO TEST ROW-MAJOR, 2 TO TEST BOTH
* 16.0 THRESHOLD VALUE OF TEST RATIO * 16.0 THRESHOLD VALUE OF TEST RATIO
@ -188,7 +188,7 @@
$ GO TO 50 $ GO TO 50
40 CONTINUE 40 CONTINUE
WRITE( NOUT, FMT = 9990 )SNAMET WRITE( NOUT, FMT = 9990 )SNAMET
STOP CALL ABORT
50 LTEST( I ) = LTESTT 50 LTEST( I ) = LTESTT
GO TO 30 GO TO 30
* *
@ -231,7 +231,7 @@
SAME = LSE( CC, CT, N ) SAME = LSE( CC, CT, N )
IF( .NOT.SAME.OR.ERR.NE.ZERO )THEN IF( .NOT.SAME.OR.ERR.NE.ZERO )THEN
WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR
STOP CALL ABORT
END IF END IF
TRANSB = 'T' TRANSB = 'T'
CALL SMMCH( TRANSA, TRANSB, N, 1, N, ONE, AB, NMAX, CALL SMMCH( TRANSA, TRANSB, N, 1, N, ONE, AB, NMAX,
@ -240,7 +240,7 @@
SAME = LSE( CC, CT, N ) SAME = LSE( CC, CT, N )
IF( .NOT.SAME.OR.ERR.NE.ZERO )THEN IF( .NOT.SAME.OR.ERR.NE.ZERO )THEN
WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR
STOP CALL ABORT
END IF END IF
DO 120 J = 1, N DO 120 J = 1, N
AB( J, NMAX + 1 ) = N - J + 1 AB( J, NMAX + 1 ) = N - J + 1
@ -258,7 +258,7 @@
SAME = LSE( CC, CT, N ) SAME = LSE( CC, CT, N )
IF( .NOT.SAME.OR.ERR.NE.ZERO )THEN IF( .NOT.SAME.OR.ERR.NE.ZERO )THEN
WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR
STOP CALL ABORT
END IF END IF
TRANSB = 'T' TRANSB = 'T'
CALL SMMCH( TRANSA, TRANSB, N, 1, N, ONE, AB, NMAX, CALL SMMCH( TRANSA, TRANSB, N, 1, N, ONE, AB, NMAX,
@ -267,7 +267,7 @@
SAME = LSE( CC, CT, N ) SAME = LSE( CC, CT, N )
IF( .NOT.SAME.OR.ERR.NE.ZERO )THEN IF( .NOT.SAME.OR.ERR.NE.ZERO )THEN
WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR
STOP CALL ABORT
END IF END IF
* *
* Test each subroutine in turn. * Test each subroutine in turn.
@ -378,7 +378,9 @@
IF( TRACE ) IF( TRACE )
$ CLOSE ( NTRA ) $ CLOSE ( NTRA )
CLOSE ( NOUT ) CLOSE ( NOUT )
STOP IF( FATAL ) THEN
CALL ABORT
END IF
* *
10002 FORMAT( ' COLUMN-MAJOR AND ROW-MAJOR DATA LAYOUTS ARE TESTED' ) 10002 FORMAT( ' COLUMN-MAJOR AND ROW-MAJOR DATA LAYOUTS ARE TESTED' )
10001 FORMAT( ' ROW-MAJOR DATA LAYOUT IS TESTED' ) 10001 FORMAT( ' ROW-MAJOR DATA LAYOUT IS TESTED' )

View File

@ -38,9 +38,12 @@
CALL CHECK1(SFAC) CALL CHECK1(SFAC)
END IF END IF
* -- Print * -- Print
IF (PASS) WRITE (NOUT,99998) IF (PASS) THEN
WRITE (NOUT,99998)
ELSE
CALL ABORT
END IF
20 CONTINUE 20 CONTINUE
STOP
* *
99999 FORMAT (' Complex CBLAS Test Program Results',/1X) 99999 FORMAT (' Complex CBLAS Test Program Results',/1X)
99998 FORMAT (' ----- PASS -----') 99998 FORMAT (' ----- PASS -----')
@ -228,7 +231,7 @@
CALL ITEST1(IZAMAXTEST(N,CX,INCX),ITRUE3(NP1)) CALL ITEST1(IZAMAXTEST(N,CX,INCX),ITRUE3(NP1))
ELSE ELSE
WRITE (NOUT,*) ' Shouldn''t be here in CHECK1' WRITE (NOUT,*) ' Shouldn''t be here in CHECK1'
STOP CALL ABORT
END IF END IF
* *
40 CONTINUE 40 CONTINUE
@ -512,7 +515,7 @@
CALL CTEST(LENY,CY,CT10Y(1,KN,KI),CSIZE3,1.0D0) CALL CTEST(LENY,CY,CT10Y(1,KN,KI),CSIZE3,1.0D0)
ELSE ELSE
WRITE (NOUT,*) ' Shouldn''t be here in CHECK2' WRITE (NOUT,*) ' Shouldn''t be here in CHECK2'
STOP CALL ABORT
END IF END IF
* *
40 CONTINUE 40 CONTINUE

View File

@ -10,7 +10,7 @@
* 'CBLAT2.SNAP' NAME OF SNAPSHOT OUTPUT FILE * 'CBLAT2.SNAP' NAME OF SNAPSHOT OUTPUT FILE
* -1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0) * -1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0)
* F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD. * F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD.
* F LOGICAL FLAG, T TO STOP ON FAILURES. * F LOGICAL FLAG, T TO CALL ABORT ON FAILURES.
* T LOGICAL FLAG, T TO TEST ERROR EXITS. * T LOGICAL FLAG, T TO TEST ERROR EXITS.
* 2 0 TO TEST COLUMN-MAJOR, 1 TO TEST ROW-MAJOR, 2 TO TEST BOTH * 2 0 TO TEST COLUMN-MAJOR, 1 TO TEST ROW-MAJOR, 2 TO TEST BOTH
* 16.0 THRESHOLD VALUE OF TEST RATIO * 16.0 THRESHOLD VALUE OF TEST RATIO
@ -243,7 +243,7 @@
$ GO TO 70 $ GO TO 70
60 CONTINUE 60 CONTINUE
WRITE( NOUT, FMT = 9986 )SNAMET WRITE( NOUT, FMT = 9986 )SNAMET
STOP CALL ABORT
70 LTEST( I ) = LTESTT 70 LTEST( I ) = LTESTT
GO TO 50 GO TO 50
* *
@ -283,7 +283,7 @@
SAME = LZE( YY, YT, N ) SAME = LZE( YY, YT, N )
IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN
WRITE( NOUT, FMT = 9985 )TRANS, SAME, ERR WRITE( NOUT, FMT = 9985 )TRANS, SAME, ERR
STOP CALL ABORT
END IF END IF
TRANS = 'T' TRANS = 'T'
CALL ZMVCH( TRANS, N, N, ONE, A, NMAX, X, -1, ZERO, Y, -1, YT, G, CALL ZMVCH( TRANS, N, N, ONE, A, NMAX, X, -1, ZERO, Y, -1, YT, G,
@ -291,7 +291,7 @@
SAME = LZE( YY, YT, N ) SAME = LZE( YY, YT, N )
IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN
WRITE( NOUT, FMT = 9985 )TRANS, SAME, ERR WRITE( NOUT, FMT = 9985 )TRANS, SAME, ERR
STOP CALL ABORT
END IF END IF
* *
* Test each subroutine in turn. * Test each subroutine in turn.
@ -418,7 +418,9 @@
IF( TRACE ) IF( TRACE )
$ CLOSE ( NTRA ) $ CLOSE ( NTRA )
CLOSE ( NOUT ) CLOSE ( NOUT )
STOP IF( FATAL ) THEN
CALL ABORT
END IF
* *
10002 FORMAT( ' COLUMN-MAJOR AND ROW-MAJOR DATA LAYOUTS ARE TESTED' ) 10002 FORMAT( ' COLUMN-MAJOR AND ROW-MAJOR DATA LAYOUTS ARE TESTED' )
10001 FORMAT( ' ROW-MAJOR DATA LAYOUT IS TESTED' ) 10001 FORMAT( ' ROW-MAJOR DATA LAYOUT IS TESTED' )

View File

@ -10,7 +10,7 @@
* 'CBLAT3.SNAP' NAME OF SNAPSHOT OUTPUT FILE * 'CBLAT3.SNAP' NAME OF SNAPSHOT OUTPUT FILE
* -1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0) * -1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0)
* F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD. * F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD.
* F LOGICAL FLAG, T TO STOP ON FAILURES. * F LOGICAL FLAG, T TO CALL ABORT ON FAILURES.
* T LOGICAL FLAG, T TO TEST ERROR EXITS. * T LOGICAL FLAG, T TO TEST ERROR EXITS.
* 2 0 TO TEST COLUMN-MAJOR, 1 TO TEST ROW-MAJOR, 2 TO TEST BOTH * 2 0 TO TEST COLUMN-MAJOR, 1 TO TEST ROW-MAJOR, 2 TO TEST BOTH
* 16.0 THRESHOLD VALUE OF TEST RATIO * 16.0 THRESHOLD VALUE OF TEST RATIO
@ -195,7 +195,7 @@
$ GO TO 50 $ GO TO 50
40 CONTINUE 40 CONTINUE
WRITE( NOUT, FMT = 9990 )SNAMET WRITE( NOUT, FMT = 9990 )SNAMET
STOP CALL ABORT
50 LTEST( I ) = LTESTT 50 LTEST( I ) = LTESTT
GO TO 30 GO TO 30
* *
@ -238,7 +238,7 @@
SAME = LZE( CC, CT, N ) SAME = LZE( CC, CT, N )
IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN
WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR
STOP CALL ABORT
END IF END IF
TRANSB = 'C' TRANSB = 'C'
CALL ZMMCH( TRANSA, TRANSB, N, 1, N, ONE, AB, NMAX, CALL ZMMCH( TRANSA, TRANSB, N, 1, N, ONE, AB, NMAX,
@ -247,7 +247,7 @@
SAME = LZE( CC, CT, N ) SAME = LZE( CC, CT, N )
IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN
WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR
STOP CALL ABORT
END IF END IF
DO 120 J = 1, N DO 120 J = 1, N
AB( J, NMAX + 1 ) = N - J + 1 AB( J, NMAX + 1 ) = N - J + 1
@ -265,7 +265,7 @@
SAME = LZE( CC, CT, N ) SAME = LZE( CC, CT, N )
IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN
WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR
STOP CALL ABORT
END IF END IF
TRANSB = 'C' TRANSB = 'C'
CALL ZMMCH( TRANSA, TRANSB, N, 1, N, ONE, AB, NMAX, CALL ZMMCH( TRANSA, TRANSB, N, 1, N, ONE, AB, NMAX,
@ -274,7 +274,7 @@
SAME = LZE( CC, CT, N ) SAME = LZE( CC, CT, N )
IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN
WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR
STOP CALL ABORT
END IF END IF
* *
* Test each subroutine in turn. * Test each subroutine in turn.
@ -386,7 +386,9 @@
IF( TRACE ) IF( TRACE )
$ CLOSE ( NTRA ) $ CLOSE ( NTRA )
CLOSE ( NOUT ) CLOSE ( NOUT )
STOP IF( FATAL ) THEN
CALL ABORT
END IF
* *
10002 FORMAT( ' COLUMN-MAJOR AND ROW-MAJOR DATA LAYOUTS ARE TESTED' ) 10002 FORMAT( ' COLUMN-MAJOR AND ROW-MAJOR DATA LAYOUTS ARE TESTED' )
10001 FORMAT(' ROW-MAJOR DATA LAYOUT IS TESTED' ) 10001 FORMAT(' ROW-MAJOR DATA LAYOUT IS TESTED' )

View File

@ -10,7 +10,7 @@
* 'CBLAT3.SNAP' NAME OF SNAPSHOT OUTPUT FILE * 'CBLAT3.SNAP' NAME OF SNAPSHOT OUTPUT FILE
* -1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0) * -1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0)
* F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD. * F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD.
* F LOGICAL FLAG, T TO STOP ON FAILURES. * F LOGICAL FLAG, T TO CALL ABORT ON FAILURES.
* T LOGICAL FLAG, T TO TEST ERROR EXITS. * T LOGICAL FLAG, T TO TEST ERROR EXITS.
* 2 0 TO TEST COLUMN-MAJOR, 1 TO TEST ROW-MAJOR, 2 TO TEST BOTH * 2 0 TO TEST COLUMN-MAJOR, 1 TO TEST ROW-MAJOR, 2 TO TEST BOTH
* 16.0 THRESHOLD VALUE OF TEST RATIO * 16.0 THRESHOLD VALUE OF TEST RATIO
@ -195,7 +195,7 @@
$ GO TO 50 $ GO TO 50
40 CONTINUE 40 CONTINUE
WRITE( NOUT, FMT = 9990 )SNAMET WRITE( NOUT, FMT = 9990 )SNAMET
STOP CALL ABORT
50 LTEST( I ) = LTESTT 50 LTEST( I ) = LTESTT
GO TO 30 GO TO 30
* *
@ -238,7 +238,7 @@
SAME = LZE( CC, CT, N ) SAME = LZE( CC, CT, N )
IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN
WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR
STOP CALL ABORT
END IF END IF
TRANSB = 'C' TRANSB = 'C'
CALL ZMMCH( TRANSA, TRANSB, N, 1, N, ONE, AB, NMAX, CALL ZMMCH( TRANSA, TRANSB, N, 1, N, ONE, AB, NMAX,
@ -247,7 +247,7 @@
SAME = LZE( CC, CT, N ) SAME = LZE( CC, CT, N )
IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN
WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR
STOP CALL ABORT
END IF END IF
DO 120 J = 1, N DO 120 J = 1, N
AB( J, NMAX + 1 ) = N - J + 1 AB( J, NMAX + 1 ) = N - J + 1
@ -265,7 +265,7 @@
SAME = LZE( CC, CT, N ) SAME = LZE( CC, CT, N )
IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN
WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR
STOP CALL ABORT
END IF END IF
TRANSB = 'C' TRANSB = 'C'
CALL ZMMCH( TRANSA, TRANSB, N, 1, N, ONE, AB, NMAX, CALL ZMMCH( TRANSA, TRANSB, N, 1, N, ONE, AB, NMAX,
@ -274,7 +274,7 @@
SAME = LZE( CC, CT, N ) SAME = LZE( CC, CT, N )
IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN
WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR
STOP CALL ABORT
END IF END IF
* *
* Test each subroutine in turn. * Test each subroutine in turn.
@ -386,7 +386,9 @@
IF( TRACE ) IF( TRACE )
$ CLOSE ( NTRA ) $ CLOSE ( NTRA )
CLOSE ( NOUT ) CLOSE ( NOUT )
STOP IF( FATAL ) THEN
CALL ABORT
END IF
* *
10002 FORMAT( ' COLUMN-MAJOR AND ROW-MAJOR DATA LAYOUTS ARE TESTED' ) 10002 FORMAT( ' COLUMN-MAJOR AND ROW-MAJOR DATA LAYOUTS ARE TESTED' )
10001 FORMAT(' ROW-MAJOR DATA LAYOUT IS TESTED' ) 10001 FORMAT(' ROW-MAJOR DATA LAYOUT IS TESTED' )

View File

@ -742,7 +742,7 @@ static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG
num_parts = 0; num_parts = 0;
while (n > 0){ while (n > 0){
width = blas_quickdivide(n + nthreads - num_parts - 1, nthreads - num_parts); width = blas_quickdivide(n + nthreads - num_parts - 1, nthreads - num_parts);
if (width < switch_ratio) { if (width < switch_ratio && width > 1) {
width = switch_ratio; width = switch_ratio;
} }
width = round_up(n, width, GEMM_PREFERED_SIZE); width = round_up(n, width, GEMM_PREFERED_SIZE);

View File

@ -319,8 +319,8 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,
lda = LDB; lda = LDB;
ldb = LDA; ldb = LDA;
if (Uplo == CblasUpper) uplo = 0; if (Uplo == CblasUpper) uplo = 1;
if (Uplo == CblasLower) uplo = 1; if (Uplo == CblasLower) uplo = 0;
if (TransB == CblasNoTrans) if (TransB == CblasNoTrans)
transa = 0; transa = 0;

View File

@ -17,11 +17,15 @@ SBGEMMOTCOPYOBJ = sbgemm_otcopy$(TSUFFIX).$(SUFFIX)
STRMMKERNEL = sgemm_kernel_power10.c STRMMKERNEL = sgemm_kernel_power10.c
DTRMMKERNEL = dgemm_kernel_power10.c DTRMMKERNEL = dgemm_kernel_power10.c
ifeq ($(OSNAME), AIX) ifeq ($(OSNAME), AIX)
CTRMMKERNEL = ctrmm_kernel_8x4_power8.S #CTRMMKERNEL = ctrmm_kernel_8x4_power8.S
ZTRMMKERNEL = ztrmm_kernel_8x2_power8.S #ZTRMMKERNEL = ztrmm_kernel_8x2_power8.S
CTRMMKERNEL = cgemm_kernel_power10.c
ZTRMMKERNEL = zgemm_kernel_power10.c
else else
CTRMMKERNEL = cgemm_kernel_power10.S #CTRMMKERNEL = cgemm_kernel_power10.S
ZTRMMKERNEL = zgemm_kernel_power10.S #ZTRMMKERNEL = zgemm_kernel_power10.S
CTRMMKERNEL = cgemm_kernel_power10.c
ZTRMMKERNEL = zgemm_kernel_power10.c
endif endif
SGEMMKERNEL = sgemm_kernel_power10.c SGEMMKERNEL = sgemm_kernel_power10.c
@ -65,9 +69,11 @@ DGEMM_SMALL_K_TN = dgemm_small_kernel_tn_power10.c
DGEMM_SMALL_K_B0_TN = dgemm_small_kernel_tn_power10.c DGEMM_SMALL_K_B0_TN = dgemm_small_kernel_tn_power10.c
ifeq ($(OSNAME), AIX) ifeq ($(OSNAME), AIX)
CGEMMKERNEL = cgemm_kernel_8x4_power8.S #CGEMMKERNEL = cgemm_kernel_8x4_power8.S
CGEMMKERNEL = cgemm_kernel_power10.c
else else
CGEMMKERNEL = cgemm_kernel_power10.S #CGEMMKERNEL = cgemm_kernel_power10.S
CGEMMKERNEL = cgemm_kernel_power10.c
endif endif
#CGEMMKERNEL = cgemm_kernel_8x4_power8.S #CGEMMKERNEL = cgemm_kernel_8x4_power8.S
CGEMMINCOPY = ../generic/zgemm_ncopy_8.c CGEMMINCOPY = ../generic/zgemm_ncopy_8.c
@ -84,9 +90,11 @@ CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX)
CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX)
ifeq ($(OSNAME), AIX) ifeq ($(OSNAME), AIX)
ZGEMMKERNEL = zgemm_kernel_8x2_power8.S #ZGEMMKERNEL = zgemm_kernel_8x2_power8.S
ZGEMMKERNEL = zgemm_kernel_power10.c
else else
ZGEMMKERNEL = zgemm_kernel_power10.S #ZGEMMKERNEL = zgemm_kernel_power10.S
ZGEMMKERNEL = zgemm_kernel_power10.c
endif endif
ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c
ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c

File diff suppressed because it is too large Load Diff

View File

@ -63,6 +63,8 @@
#endif #endif
#endif #endif
#define FLAG r11
#define FZERO f0 #define FZERO f0
#define ALPHA f1 #define ALPHA f1
@ -88,6 +90,10 @@
fcmpu cr0, FZERO, ALPHA fcmpu cr0, FZERO, ALPHA
bne- cr0, LL(A1I1) bne- cr0, LL(A1I1)
lwz FLAG, FRAMESLOT(0)(SP)
cmpwi cr0, FLAG, 1
beq- cr0, LL(A1I1)
srawi. r0, N, 4 srawi. r0, N, 4
mtspr CTR, r0 mtspr CTR, r0
beq- cr0, LL(A0I1_Remain) beq- cr0, LL(A0I1_Remain)

View File

@ -0,0 +1,761 @@
/*********************************************************************************
Copyright (c) 2020, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
**********************************************************************************/
#include "common.h"
#include <altivec.h>
/* Raw 16-byte vector of unsigned chars; the kernel below loads B operands
   through this type before passing them to the MMA builtins. */
typedef __vector unsigned char vec_t;
/* 16-byte vector of FLOAT; an array of these receives the disassembled
   accumulators.  NOTE(review): despite the "4sf" in the name, the lane count
   is 16/sizeof(FLOAT) -- presumably 2 doubles in this xvf64 kernel; confirm
   against the FLOAT definition in common.h. */
typedef FLOAT v4sf_t __attribute__ ((vector_size (16)));
/* Clear all eight MMA accumulators (acc0..acc7) with xxsetaccz.
   The accumulators must be declared at the expansion site.  The expansion
   already ends in ';', so call sites write SET_ACC_ZERO() with no trailing
   semicolon. */
#define SET_ACC_ZERO() \
__builtin_mma_xxsetaccz (&acc0); \
__builtin_mma_xxsetaccz (&acc1); \
__builtin_mma_xxsetaccz (&acc2); \
__builtin_mma_xxsetaccz (&acc3); \
__builtin_mma_xxsetaccz (&acc4); \
__builtin_mma_xxsetaccz (&acc5); \
__builtin_mma_xxsetaccz (&acc6); \
__builtin_mma_xxsetaccz (&acc7);
/* COMP_MUL / COMP_MAC combine four real partial products into the real and
   imaginary parts of one complex product.  COMP_MUL assigns (=), COMP_MAC
   accumulates (+=).  Reading the argument names as _arbr = a_r*b_r,
   _aibi = a_i*b_i, _arbi = a_r*b_i and _aibr = a_i*b_r, the four variants
   below differ only in signs, selected by the build-time variant macro.
   Each expansion is a braced block, so no trailing ';' is needed. */
/* NN/NT/TN/TT: real = arbr - aibi, imag = arbi + aibr, i.e. a * b. */
#if (defined(NN) || defined(NT) || defined(TN) || defined(TT))
#define COMP_MUL(_real, _arbr, _aibi, _imag, _arbi, _aibr) { _real = _arbr - _aibi; _imag = _arbi + _aibr; }
#define COMP_MAC(_real, _arbr, _aibi, _imag, _arbi, _aibr) { _real += _arbr - _aibi; _imag += _arbi + _aibr; }
#endif
/* NR/NC/TR/TC: real = arbr + aibi, imag = -arbi + aibr, i.e. a * conj(b). */
#if (defined(NR) || defined(NC) || defined(TR) || defined(TC))
#define COMP_MUL(_real, _arbr, _aibi, _imag, _arbi, _aibr) { _real = _arbr + _aibi; _imag = -_arbi + _aibr; }
#define COMP_MAC(_real, _arbr, _aibi, _imag, _arbi, _aibr) { _real += _arbr + _aibi; _imag += -_arbi + _aibr; }
#endif
/* RN/RT/CN/CT: real = arbr + aibi, imag = arbi - aibr, i.e. conj(a) * b. */
#if (defined(RN) || defined(RT) || defined(CN) || defined(CT))
#define COMP_MUL(_real, _arbr, _aibi, _imag, _arbi, _aibr) { _real = _arbr + _aibi; _imag = _arbi - _aibr; }
#define COMP_MAC(_real, _arbr, _aibi, _imag, _arbi, _aibr) { _real += _arbr + _aibi; _imag += _arbi - _aibr; }
#endif
/* RR/RC/CR/CC: real = arbr - aibi, imag = -arbi - aibr, i.e. conj(a) * conj(b). */
#if (defined(RR) || defined(RC) || defined(CR) || defined(CC))
#define COMP_MUL(_real, _arbr, _aibi, _imag, _arbi, _aibr) { _real = _arbr - _aibi; _imag = -_arbi - _aibr; }
#define COMP_MAC(_real, _arbr, _aibi, _imag, _arbi, _aibr) { _real += _arbr - _aibi; _imag += -_arbi - _aibr; }
#endif
/* Operator used when writing results into C: plain assignment when built as
   a TRMM kernel, accumulation (+=) otherwise. */
#if defined(TRMMKERNEL)
#define A_OP =
#else
#define A_OP +=
#endif
/* Copy all eight accumulators out to the `result` array: accN goes to
   result[4*N .. 4*N+3].  Requires `result` and acc0..acc7 at the expansion
   site; the expansion ends in ';'. */
#define BUILTIN_MMA_DISASSEMBLE_ACC_8 \
__builtin_mma_disassemble_acc ((void *)result, &acc0); \
__builtin_mma_disassemble_acc ((void *)&result[4], &acc1); \
__builtin_mma_disassemble_acc ((void *)&result[8], &acc2); \
__builtin_mma_disassemble_acc ((void *)&result[12], &acc3); \
__builtin_mma_disassemble_acc ((void *)&result[16], &acc4); \
__builtin_mma_disassemble_acc ((void *)&result[20], &acc5); \
__builtin_mma_disassemble_acc ((void *)&result[24], &acc6); \
__builtin_mma_disassemble_acc ((void *)&result[28], &acc7);
/* Reduce eight accumulators to ONE complex value and store it.
   After disassembling acc0..acc7 into `result`, the partial products at
   res[8j .. 8j+3] (j = 0..7) are all folded into tr[0]/ti[0]; the sum is
   multiplied by (alpha_r + i*alpha_i) and written to CO[0] (real) and
   CO[1] (imag) using A_OP.
   Uses result, res, tr, ti, CO, alpha_r, alpha_i from the expansion site. */
#define SAVE_ACC_COMPLEX_11 \
BUILTIN_MMA_DISASSEMBLE_ACC_8 \
COMP_MUL(tr[0], res[ 0], res[ 3], ti[0], res[ 1], res[ 2]) \
COMP_MAC(tr[0], res[ 8], res[11], ti[0], res[ 9], res[10]) \
COMP_MAC(tr[0], res[16], res[19], ti[0], res[17], res[18]) \
COMP_MAC(tr[0], res[24], res[27], ti[0], res[25], res[26]) \
COMP_MAC(tr[0], res[32], res[35], ti[0], res[33], res[34]) \
COMP_MAC(tr[0], res[40], res[43], ti[0], res[41], res[42]) \
COMP_MAC(tr[0], res[48], res[51], ti[0], res[49], res[50]) \
COMP_MAC(tr[0], res[56], res[59], ti[0], res[57], res[58]) \
CO[0] A_OP tr[0] * alpha_r - ti[0] * alpha_i; \
CO[1] A_OP ti[0] * alpha_r + tr[0] * alpha_i;
/* Reduce eight accumulators to TWO complex values, one per C column.
   After disassembly, groups res[8j .. 8j+3] with even j fold into
   tr[0]/ti[0] and those with odd j into tr[1]/ti[1].  Both sums are scaled
   by (alpha_r + i*alpha_i); value 0 goes to CO[0..1] and value 1 to
   CO[2*ldc .. 2*ldc+1].
   Uses result, res, tr, ti, CO, ldc, alpha_r, alpha_i from the expansion
   site. */
#define SAVE_ACC_COMPLEX_12 \
BUILTIN_MMA_DISASSEMBLE_ACC_8 \
COMP_MUL(tr[0], res[ 0], res[ 3], ti[0], res[ 1], res[ 2]) \
COMP_MUL(tr[1], res[ 8], res[11], ti[1], res[ 9], res[10]) \
COMP_MAC(tr[0], res[16], res[19], ti[0], res[17], res[18]) \
COMP_MAC(tr[1], res[24], res[27], ti[1], res[25], res[26]) \
COMP_MAC(tr[0], res[32], res[35], ti[0], res[33], res[34]) \
COMP_MAC(tr[1], res[40], res[43], ti[1], res[41], res[42]) \
COMP_MAC(tr[0], res[48], res[51], ti[0], res[49], res[50]) \
COMP_MAC(tr[1], res[56], res[59], ti[1], res[57], res[58]) \
CO[0] A_OP tr[0] * alpha_r - ti[0] * alpha_i; \
CO[1] A_OP ti[0] * alpha_r + tr[0] * alpha_i; \
CO[2*ldc+0] A_OP tr[1] * alpha_r - ti[1] * alpha_i; \
CO[2*ldc+1] A_OP ti[1] * alpha_r + tr[1] * alpha_i;
/* Reduce eight accumulators to TWO complex values stored contiguously.
   After disassembly, the 4-element groups res[4j .. 4j+3] (j = 0..15) are
   folded alternately: even j into tr[0]/ti[0], odd j into tr[1]/ti[1].
   Both sums are scaled by (alpha_r + i*alpha_i) and written to CO[0..3]
   with A_OP.
   Uses result, res, tr, ti, CO, alpha_r, alpha_i from the expansion site. */
#define SAVE_ACC_COMPLEX_21_1 \
BUILTIN_MMA_DISASSEMBLE_ACC_8 \
COMP_MUL(tr[0], res[ 0], res[ 3], ti[0], res[ 1], res[ 2]) \
COMP_MUL(tr[1], res[ 4], res[ 7], ti[1], res[ 5], res[ 6]) \
COMP_MAC(tr[0], res[ 8], res[11], ti[0], res[ 9], res[10]) \
COMP_MAC(tr[1], res[12], res[15], ti[1], res[13], res[14]) \
COMP_MAC(tr[0], res[16], res[19], ti[0], res[17], res[18]) \
COMP_MAC(tr[1], res[20], res[23], ti[1], res[21], res[22]) \
COMP_MAC(tr[0], res[24], res[27], ti[0], res[25], res[26]) \
COMP_MAC(tr[1], res[28], res[31], ti[1], res[29], res[30]) \
COMP_MAC(tr[0], res[32], res[35], ti[0], res[33], res[34]) \
COMP_MAC(tr[1], res[36], res[39], ti[1], res[37], res[38]) \
COMP_MAC(tr[0], res[40], res[43], ti[0], res[41], res[42]) \
COMP_MAC(tr[1], res[44], res[47], ti[1], res[45], res[46]) \
COMP_MAC(tr[0], res[48], res[51], ti[0], res[49], res[50]) \
COMP_MAC(tr[1], res[52], res[55], ti[1], res[53], res[54]) \
COMP_MAC(tr[0], res[56], res[59], ti[0], res[57], res[58]) \
COMP_MAC(tr[1], res[60], res[63], ti[1], res[61], res[62]) \
CO[0] A_OP tr[0] * alpha_r - ti[0] * alpha_i; \
CO[1] A_OP ti[0] * alpha_r + tr[0] * alpha_i; \
CO[2] A_OP tr[1] * alpha_r - ti[1] * alpha_i; \
CO[3] A_OP ti[1] * alpha_r + tr[1] * alpha_i;
/* Reduce eight accumulators to FOUR complex values stored contiguously.
   After disassembly, group res[4j .. 4j+3] (j = 0..15) is folded into
   tr[j % 4]/ti[j % 4]: the first four groups initialise the sums
   (COMP_MUL), the remaining twelve accumulate (COMP_MAC).  The four sums
   are scaled by (alpha_r + i*alpha_i) and written to CO[0..7] with A_OP.
   Uses result, res, tr, ti, CO, alpha_r, alpha_i from the expansion site. */
#define SAVE_ACC_COMPLEX_21_2 \
BUILTIN_MMA_DISASSEMBLE_ACC_8 \
COMP_MUL(tr[0], res[ 0], res[ 3], ti[0], res[ 1], res[ 2]) \
COMP_MUL(tr[1], res[ 4], res[ 7], ti[1], res[ 5], res[ 6]) \
COMP_MUL(tr[2], res[ 8], res[11], ti[2], res[ 9], res[10]) \
COMP_MUL(tr[3], res[12], res[15], ti[3], res[13], res[14]) \
COMP_MAC(tr[0], res[16], res[19], ti[0], res[17], res[18]) \
COMP_MAC(tr[1], res[20], res[23], ti[1], res[21], res[22]) \
COMP_MAC(tr[2], res[24], res[27], ti[2], res[25], res[26]) \
COMP_MAC(tr[3], res[28], res[31], ti[3], res[29], res[30]) \
COMP_MAC(tr[0], res[32], res[35], ti[0], res[33], res[34]) \
COMP_MAC(tr[1], res[36], res[39], ti[1], res[37], res[38]) \
COMP_MAC(tr[2], res[40], res[43], ti[2], res[41], res[42]) \
COMP_MAC(tr[3], res[44], res[47], ti[3], res[45], res[46]) \
COMP_MAC(tr[0], res[48], res[51], ti[0], res[49], res[50]) \
COMP_MAC(tr[1], res[52], res[55], ti[1], res[53], res[54]) \
COMP_MAC(tr[2], res[56], res[59], ti[2], res[57], res[58]) \
COMP_MAC(tr[3], res[60], res[63], ti[3], res[61], res[62]) \
CO[0] A_OP tr[0] * alpha_r - ti[0] * alpha_i; \
CO[1] A_OP ti[0] * alpha_r + tr[0] * alpha_i; \
CO[2] A_OP tr[1] * alpha_r - ti[1] * alpha_i; \
CO[3] A_OP ti[1] * alpha_r + tr[1] * alpha_i; \
CO[4] A_OP tr[2] * alpha_r - ti[2] * alpha_i; \
CO[5] A_OP ti[2] * alpha_r + tr[2] * alpha_i; \
CO[6] A_OP tr[3] * alpha_r - ti[3] * alpha_i; \
CO[7] A_OP ti[3] * alpha_r + tr[3] * alpha_i;
/* Reduce eight accumulators to EIGHT complex values stored contiguously.
   After disassembly, group res[4j .. 4j+3] (j = 0..15) is folded into
   tr[j % 8]/ti[j % 8]: groups 0..7 initialise the sums (COMP_MUL), groups
   8..15 accumulate (COMP_MAC).  The eight sums are scaled by
   (alpha_r + i*alpha_i) and written to CO[0..15] with A_OP.
   Uses result, res, tr, ti, CO, alpha_r, alpha_i from the expansion site. */
#define SAVE_ACC_COMPLEX_21_4 \
BUILTIN_MMA_DISASSEMBLE_ACC_8 \
COMP_MUL(tr[0], res[ 0], res[ 3], ti[0], res[ 1], res[ 2]) \
COMP_MUL(tr[1], res[ 4], res[ 7], ti[1], res[ 5], res[ 6]) \
COMP_MUL(tr[2], res[ 8], res[11], ti[2], res[ 9], res[10]) \
COMP_MUL(tr[3], res[12], res[15], ti[3], res[13], res[14]) \
COMP_MUL(tr[4], res[16], res[19], ti[4], res[17], res[18]) \
COMP_MUL(tr[5], res[20], res[23], ti[5], res[21], res[22]) \
COMP_MUL(tr[6], res[24], res[27], ti[6], res[25], res[26]) \
COMP_MUL(tr[7], res[28], res[31], ti[7], res[29], res[30]) \
COMP_MAC(tr[0], res[32], res[35], ti[0], res[33], res[34]) \
COMP_MAC(tr[1], res[36], res[39], ti[1], res[37], res[38]) \
COMP_MAC(tr[2], res[40], res[43], ti[2], res[41], res[42]) \
COMP_MAC(tr[3], res[44], res[47], ti[3], res[45], res[46]) \
COMP_MAC(tr[4], res[48], res[51], ti[4], res[49], res[50]) \
COMP_MAC(tr[5], res[52], res[55], ti[5], res[53], res[54]) \
COMP_MAC(tr[6], res[56], res[59], ti[6], res[57], res[58]) \
COMP_MAC(tr[7], res[60], res[63], ti[7], res[61], res[62]) \
CO[ 0] A_OP tr[0] * alpha_r - ti[0] * alpha_i; \
CO[ 1] A_OP ti[0] * alpha_r + tr[0] * alpha_i; \
CO[ 2] A_OP tr[1] * alpha_r - ti[1] * alpha_i; \
CO[ 3] A_OP ti[1] * alpha_r + tr[1] * alpha_i; \
CO[ 4] A_OP tr[2] * alpha_r - ti[2] * alpha_i; \
CO[ 5] A_OP ti[2] * alpha_r + tr[2] * alpha_i; \
CO[ 6] A_OP tr[3] * alpha_r - ti[3] * alpha_i; \
CO[ 7] A_OP ti[3] * alpha_r + tr[3] * alpha_i; \
CO[ 8] A_OP tr[4] * alpha_r - ti[4] * alpha_i; \
CO[ 9] A_OP ti[4] * alpha_r + tr[4] * alpha_i; \
CO[10] A_OP tr[5] * alpha_r - ti[5] * alpha_i; \
CO[11] A_OP ti[5] * alpha_r + tr[5] * alpha_i; \
CO[12] A_OP tr[6] * alpha_r - ti[6] * alpha_i; \
CO[13] A_OP ti[6] * alpha_r + tr[6] * alpha_i; \
CO[14] A_OP tr[7] * alpha_r - ti[7] * alpha_i; \
CO[15] A_OP ti[7] * alpha_r + tr[7] * alpha_i;
/* Store a 2x2 tile of complex results taken from two accumulators.
   ACC1 and ACC2 are accumulator addresses (e.g. &acc0); CI is the offset,
   in FLOAT elements, added to every C index.
   ACC1 is disassembled into result[0..3] and ACC2 into result[4..7]; the
   four 4-element groups res[0..15] become tr[0..3]/ti[0..3].  Each value is
   scaled by (alpha_r + i*alpha_i): values 0 and 1 go to CO[CI .. CI+3],
   values 2 and 3 to CO[2*ldc+CI .. 2*ldc+CI+3], all written with A_OP.
   Uses result, res, tr, ti, CO, ldc, alpha_r, alpha_i from the expansion
   site; the expansion ends in ';'. */
#define SAVE_ACC_COMPLEX_22_2(ACC1, ACC2, CI) \
__builtin_mma_disassemble_acc ((void *)result, ACC1); \
__builtin_mma_disassemble_acc ((void *)(&result[4]), ACC2); \
COMP_MUL(tr[0], res[0], res[3], ti[0], res[1], res[2]) \
COMP_MUL(tr[1], res[4], res[7], ti[1], res[5], res[6]) \
COMP_MUL(tr[2], res[8], res[11], ti[2], res[9], res[10]) \
COMP_MUL(tr[3], res[12], res[15], ti[3], res[13], res[14]) \
CO[CI+0] A_OP tr[0] * alpha_r - ti[0] * alpha_i; \
CO[CI+1] A_OP ti[0] * alpha_r + tr[0] * alpha_i; \
CO[CI+2] A_OP tr[1] * alpha_r - ti[1] * alpha_i; \
CO[CI+3] A_OP ti[1] * alpha_r + tr[1] * alpha_i; \
CO[2*ldc+CI+0] A_OP tr[2] * alpha_r - ti[2] * alpha_i; \
CO[2*ldc+CI+1] A_OP ti[2] * alpha_r + tr[2] * alpha_i; \
CO[2*ldc+CI+2] A_OP tr[3] * alpha_r - ti[3] * alpha_i; \
CO[2*ldc+CI+3] A_OP ti[3] * alpha_r + tr[3] * alpha_i;
/* Fixed-operand form of the macro above: stores the tile held in acc0 and
   acc1 at C offset 0.  It expands to the same statements as before, with
   the indices written as CO[0+k] instead of CO[k]. */
#define SAVE_ACC_COMPLEX_22_1 \
SAVE_ACC_COMPLEX_22_2(&acc0, &acc1, 0)
/* Emit a PowerPC `dcbt` instruction with x and y as its two register
   operands (y uses the "b" base-register constraint).  The "memory" clobber
   stops the compiler from moving memory accesses across it.  The expansion
   already ends in ';'. */
#define PREFETCH1(x, y) asm volatile ("dcbt %0, %1" : : "r" (x), "b" (y) : "memory");
/* TRMM helper: set `temp`, which the kernel uses as its inner-loop trip
   count, for a tile of x rows of A by y columns of B.
     LEFT without TRANSA, or TRANSA without LEFT : temp = k - off
     otherwise, LEFT                             : temp = off + x
     otherwise                                   : temp = off + y
   Uses temp, k and off from the expansion site. */
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
#define REFRESH_TEMP_BK(x, y) \
temp = k - off;
#elif defined(LEFT)
#define REFRESH_TEMP_BK(x, y) \
temp = off + x;
#else
#define REFRESH_TEMP_BK(x, y) \
temp = off + y;
#endif
/* TRMM helper run at the start of each tile: position AO/BO and compute
   `temp` via REFRESH_TEMP_BK.
     LEFT with TRANSA, or neither : BO restarts at B; AO is left untouched.
     otherwise                    : AO advances by off*(2*x) and BO is set to
                                    B + off*(2*y).
   The factor 2 matches the two FLOATs (real, imag) the kernel stores per
   complex element.  Uses AO, BO, B, off, k, temp from the expansion site. */
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
#define REFRESH_POINTERS(x, y) \
BO = B; \
REFRESH_TEMP_BK(x, y)
#else
#define REFRESH_POINTERS(x, y) \
AO += off * (2*x); \
BO = B + off * (2*y); \
REFRESH_TEMP_BK(x, y)
#endif
/* TRMM helper: when LEFT is defined, advance `off` by x; otherwise expands
   to nothing. */
#ifdef LEFT
#define REFRESH_OFF(x) \
off += x;
#else
#define REFRESH_OFF(x)
#endif
/* TRMM helper: subtract the tile dimension from `temp` -- x when LEFT is
   defined, y otherwise. */
#ifdef LEFT
#define UPDATE_TEMP(x, y) \
temp -= x;
#else
#define UPDATE_TEMP(x, y) \
temp -= y;
#endif
/* TRMM helper run after a tile has been stored.
   For LEFT with TRANSA, or neither: recompute temp = k - off, reduce it by
   the tile dimension (UPDATE_TEMP), then advance AO by temp*(2*x) and BO by
   temp*(2*y).  In the other configurations it expands to nothing.
   Uses temp, k, off, AO, BO from the expansion site. */
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
#define REFRESH_TMP_AFTER_SAVE(x, y) \
temp = k - off; \
UPDATE_TEMP(x, y) \
AO += temp * (2*x); \
BO += temp * (2*y);
#else
#define REFRESH_TMP_AFTER_SAVE(x, y)
#endif
/* TRMM post-store bookkeeping for an x-by-y tile: expands to
   REFRESH_TMP_AFTER_SAVE(x, y) followed by REFRESH_OFF(x). */
#define REFRESH_AFTER_SAVE(x,y) \
REFRESH_TMP_AFTER_SAVE(x, y) \
REFRESH_OFF(x)
/*************************************************************************************
* GEMM Kernel
*************************************************************************************/
int
CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r, FLOAT alpha_i, FLOAT * A, FLOAT * B,
FLOAT * C, BLASLONG ldc
#ifdef TRMMKERNEL
, BLASLONG offset
#endif
)
{
BLASLONG i1, i, l, temp;
FLOAT *AO, *BO, *CO;
#if defined(TRMMKERNEL)
BLASLONG off;
#endif
#if defined(TRMMKERNEL) && !defined(LEFT)
off = -offset;
#endif
__vector_quad acc0, acc1, acc2, acc3, acc4, acc5, acc6, acc7;
v4sf_t result[32];
FLOAT *res, tr[16], ti[16];
res = (FLOAT *) result;
for (i1 = 0; i1 < (n >> 1); i1++)
{
#if defined(TRMMKERNEL) && defined(LEFT)
off = offset;
#endif
AO = A;
CO = C;
C += ldc<<2;
for (i = 0; i < (m >> 3); i++)
{
#if defined(TRMMKERNEL)
REFRESH_POINTERS (8, 2)
#else
BO = B;
temp = k;
#endif
SET_ACC_ZERO()
for (l = 0; l < temp; ++l)
{
__vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<4]));
__vector_pair rowA2 = *((__vector_pair *)((void *)&AO[(l<<4)+4]));
__vector_pair rowA3 = *((__vector_pair *)((void *)&AO[(l<<4)+8]));
__vector_pair rowA4 = *((__vector_pair *)((void *)&AO[(l<<4)+12]));
vec_t rowB1 = *(vec_t *) & BO[l<<2];
vec_t rowB2 = *(vec_t *) & BO[(l<<2)+2];
__builtin_mma_xvf64gerpp(&acc0, rowA1, rowB1);
__builtin_mma_xvf64gerpp(&acc1, rowA2, rowB1);
__builtin_mma_xvf64gerpp(&acc2, rowA3, rowB1);
__builtin_mma_xvf64gerpp(&acc3, rowA4, rowB1);
__builtin_mma_xvf64gerpp(&acc4, rowA1, rowB2);
__builtin_mma_xvf64gerpp(&acc5, rowA2, rowB2);
__builtin_mma_xvf64gerpp(&acc6, rowA3, rowB2);
__builtin_mma_xvf64gerpp(&acc7, rowA4, rowB2);
}
__builtin_mma_disassemble_acc ((void *)result, &acc0);
__builtin_mma_disassemble_acc ((void *)(&result[ 4]), &acc1);
__builtin_mma_disassemble_acc ((void *)(&result[ 8]), &acc2);
__builtin_mma_disassemble_acc ((void *)(&result[12]), &acc3);
__builtin_mma_disassemble_acc ((void *)(&result[16]), &acc4);
__builtin_mma_disassemble_acc ((void *)(&result[20]), &acc5);
__builtin_mma_disassemble_acc ((void *)(&result[24]), &acc6);
__builtin_mma_disassemble_acc ((void *)(&result[28]), &acc7);
COMP_MUL(tr[ 0], res[ 0], res[ 3], ti[ 0], res[ 1], res[ 2])
COMP_MUL(tr[ 1], res[ 4], res[ 7], ti[ 1], res[ 5], res[ 6])
COMP_MUL(tr[ 2], res[ 8], res[11], ti[ 2], res[ 9], res[10])
COMP_MUL(tr[ 3], res[12], res[15], ti[ 3], res[13], res[14])
COMP_MUL(tr[ 4], res[16], res[19], ti[ 4], res[17], res[18])
COMP_MUL(tr[ 5], res[20], res[23], ti[ 5], res[21], res[22])
COMP_MUL(tr[ 6], res[24], res[27], ti[ 6], res[25], res[26])
COMP_MUL(tr[ 7], res[28], res[31], ti[ 7], res[29], res[30])
COMP_MUL(tr[ 8], res[32], res[35], ti[ 8], res[33], res[34])
COMP_MUL(tr[ 9], res[36], res[39], ti[ 9], res[37], res[38])
COMP_MUL(tr[10], res[40], res[43], ti[10], res[41], res[42])
COMP_MUL(tr[11], res[44], res[47], ti[11], res[45], res[46])
COMP_MUL(tr[12], res[48], res[51], ti[12], res[49], res[50])
COMP_MUL(tr[13], res[52], res[55], ti[13], res[53], res[54])
COMP_MUL(tr[14], res[56], res[59], ti[14], res[57], res[58])
COMP_MUL(tr[15], res[60], res[63], ti[15], res[61], res[62])
CO[ 0] A_OP tr[0] * alpha_r - ti[0] * alpha_i;
CO[ 1] A_OP ti[0] * alpha_r + tr[0] * alpha_i;
CO[ 2] A_OP tr[1] * alpha_r - ti[1] * alpha_i;
CO[ 3] A_OP ti[1] * alpha_r + tr[1] * alpha_i;
CO[ 4] A_OP tr[2] * alpha_r - ti[2] * alpha_i;
CO[ 5] A_OP ti[2] * alpha_r + tr[2] * alpha_i;
CO[ 6] A_OP tr[3] * alpha_r - ti[3] * alpha_i;
CO[ 7] A_OP ti[3] * alpha_r + tr[3] * alpha_i;
CO[ 8] A_OP tr[4] * alpha_r - ti[4] * alpha_i;
CO[ 9] A_OP ti[4] * alpha_r + tr[4] * alpha_i;
CO[10] A_OP tr[5] * alpha_r - ti[5] * alpha_i;
CO[11] A_OP ti[5] * alpha_r + tr[5] * alpha_i;
CO[12] A_OP tr[6] * alpha_r - ti[6] * alpha_i;
CO[13] A_OP ti[6] * alpha_r + tr[6] * alpha_i;
CO[14] A_OP tr[7] * alpha_r - ti[7] * alpha_i;
CO[15] A_OP ti[7] * alpha_r + tr[7] * alpha_i;
CO[2*ldc+ 0] A_OP tr[ 8] * alpha_r - ti[ 8] * alpha_i;
CO[2*ldc+ 1] A_OP ti[ 8] * alpha_r + tr[ 8] * alpha_i;
CO[2*ldc+ 2] A_OP tr[ 9] * alpha_r - ti[ 9] * alpha_i;
CO[2*ldc+ 3] A_OP ti[ 9] * alpha_r + tr[ 9] * alpha_i;
CO[2*ldc+ 4] A_OP tr[10] * alpha_r - ti[10] * alpha_i;
CO[2*ldc+ 5] A_OP ti[10] * alpha_r + tr[10] * alpha_i;
CO[2*ldc+ 6] A_OP tr[11] * alpha_r - ti[11] * alpha_i;
CO[2*ldc+ 7] A_OP ti[11] * alpha_r + tr[11] * alpha_i;
CO[2*ldc+ 8] A_OP tr[12] * alpha_r - ti[12] * alpha_i;
CO[2*ldc+ 9] A_OP ti[12] * alpha_r + tr[12] * alpha_i;
CO[2*ldc+10] A_OP tr[13] * alpha_r - ti[13] * alpha_i;
CO[2*ldc+11] A_OP ti[13] * alpha_r + tr[13] * alpha_i;
CO[2*ldc+12] A_OP tr[14] * alpha_r - ti[14] * alpha_i;
CO[2*ldc+13] A_OP ti[14] * alpha_r + tr[14] * alpha_i;
CO[2*ldc+14] A_OP tr[15] * alpha_r - ti[15] * alpha_i;
CO[2*ldc+15] A_OP ti[15] * alpha_r + tr[15] * alpha_i;
AO += temp << 4;
BO += temp << 2;
CO += 16;
#if defined(TRMMKERNEL)
REFRESH_AFTER_SAVE (8, 2)
#endif
}
if (m & 4)
{
#if defined(TRMMKERNEL)
REFRESH_POINTERS (4, 2)
#else
BO = B;
temp = k;
#endif
SET_ACC_ZERO()
for (l = 0; l < (temp & (~1)); l+=2)
{
__vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<3]));
__vector_pair rowA2 = *((__vector_pair *)((void *)&AO[(l<<3)+4]));
__vector_pair rowA3 = *((__vector_pair *)((void *)&AO[(l<<3)+8]));
__vector_pair rowA4 = *((__vector_pair *)((void *)&AO[(l<<3)+12]));
vec_t rowB1 = *(vec_t *) & BO[l<<2];
vec_t rowB2 = *(vec_t *) & BO[(l<<2)+2];
vec_t rowB3 = *(vec_t *) & BO[(l<<2)+4];
vec_t rowB4 = *(vec_t *) & BO[(l<<2)+6];
__builtin_mma_xvf64gerpp(&acc0, rowA1, rowB1);
__builtin_mma_xvf64gerpp(&acc1, rowA2, rowB1);
__builtin_mma_xvf64gerpp(&acc2, rowA1, rowB2);
__builtin_mma_xvf64gerpp(&acc3, rowA2, rowB2);
__builtin_mma_xvf64gerpp(&acc0, rowA3, rowB3);
__builtin_mma_xvf64gerpp(&acc1, rowA4, rowB3);
__builtin_mma_xvf64gerpp(&acc2, rowA3, rowB4);
__builtin_mma_xvf64gerpp(&acc3, rowA4, rowB4);
}
for (l = (temp & (~1)); l < temp; ++l)
{
__vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<3]));
__vector_pair rowA2 = *((__vector_pair *)((void *)&AO[(l<<3)+4]));
vec_t rowB1 = *(vec_t *) & BO[l<<2];
vec_t rowB2 = *(vec_t *) & BO[(l<<2)+2];
__builtin_mma_xvf64gerpp(&acc0, rowA1, rowB1);
__builtin_mma_xvf64gerpp(&acc1, rowA2, rowB1);
__builtin_mma_xvf64gerpp(&acc2, rowA1, rowB2);
__builtin_mma_xvf64gerpp(&acc3, rowA2, rowB2);
}
SAVE_ACC_COMPLEX_22_2(&acc0, &acc2, 0)
SAVE_ACC_COMPLEX_22_2(&acc1, &acc3, 4)
AO += temp << 3;
BO += temp << 2;
CO += 8;
#if defined(TRMMKERNEL)
REFRESH_AFTER_SAVE (4, 2)
#endif
}
if (m & 2)
{
#if defined(TRMMKERNEL)
REFRESH_POINTERS (2, 2)
#else
BO = B;
temp = k;
#endif
SET_ACC_ZERO()
for (l = 0; l < (temp & (~3)); l+=4)
{
__vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<2]));
__vector_pair rowA2 = *((__vector_pair *)((void *)&AO[(l<<2)+4]));
__vector_pair rowA3 = *((__vector_pair *)((void *)&AO[(l<<2)+8]));
__vector_pair rowA4 = *((__vector_pair *)((void *)&AO[(l<<2)+12]));
vec_t rowB1 = *(vec_t *) & BO[l<<2];
vec_t rowB2 = *(vec_t *) & BO[(l<<2)+2];
vec_t rowB3 = *(vec_t *) & BO[(l<<2)+4];
vec_t rowB4 = *(vec_t *) & BO[(l<<2)+6];
vec_t rowB5 = *(vec_t *) & BO[(l<<2)+8];
vec_t rowB6 = *(vec_t *) & BO[(l<<2)+10];
vec_t rowB7 = *(vec_t *) & BO[(l<<2)+12];
vec_t rowB8 = *(vec_t *) & BO[(l<<2)+14];
__builtin_mma_xvf64gerpp(&acc0, rowA1, rowB1);
__builtin_mma_xvf64gerpp(&acc1, rowA1, rowB2);
__builtin_mma_xvf64gerpp(&acc0, rowA2, rowB3);
__builtin_mma_xvf64gerpp(&acc1, rowA2, rowB4);
__builtin_mma_xvf64gerpp(&acc0, rowA3, rowB5);
__builtin_mma_xvf64gerpp(&acc1, rowA3, rowB6);
__builtin_mma_xvf64gerpp(&acc0, rowA4, rowB7);
__builtin_mma_xvf64gerpp(&acc1, rowA4, rowB8);
}
for (l = (temp & (~3)); l < temp; ++l)
{
__vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<2]));
vec_t rowB1 = *(vec_t *) & BO[l<<2];
vec_t rowB2 = *(vec_t *) & BO[(l<<2)+2];
__builtin_mma_xvf64gerpp(&acc0, rowA1, rowB1);
__builtin_mma_xvf64gerpp(&acc1, rowA1, rowB2);
}
SAVE_ACC_COMPLEX_22_1
AO += temp << 2;
BO += temp << 2;
CO += 4;
#if defined(TRMMKERNEL)
REFRESH_AFTER_SAVE (2, 2)
#endif
}
if (m & 1)
{
#if defined(TRMMKERNEL)
REFRESH_POINTERS (1, 2)
#else
BO = B;
temp = k;
#endif
// RIP OUT MMA STUFF!
SET_ACC_ZERO()
for (l = 0; l < (temp & (~3)); l+=4)
{
__vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<1]));
__vector_pair rowA2 = *((__vector_pair *)((void *)&AO[(l<<1)+2]));
__vector_pair rowA3 = *((__vector_pair *)((void *)&AO[(l<<1)+4]));
__vector_pair rowA4 = *((__vector_pair *)((void *)&AO[(l<<1)+6]));
vec_t rowB1 = *(vec_t *) & BO[l<<2];
vec_t rowB2 = *(vec_t *) & BO[(l<<2)+2];
vec_t rowB3 = *(vec_t *) & BO[(l<<2)+4];
vec_t rowB4 = *(vec_t *) & BO[(l<<2)+6];
vec_t rowB5 = *(vec_t *) & BO[(l<<2)+8];
vec_t rowB6 = *(vec_t *) & BO[(l<<2)+10];
vec_t rowB7 = *(vec_t *) & BO[(l<<2)+12];
vec_t rowB8 = *(vec_t *) & BO[(l<<2)+14];
__builtin_mma_xvf64gerpp(&acc0, rowA1, rowB1);
__builtin_mma_xvf64gerpp(&acc1, rowA1, rowB2);
__builtin_mma_xvf64gerpp(&acc0, rowA2, rowB3);
__builtin_mma_xvf64gerpp(&acc1, rowA2, rowB4);
__builtin_mma_xvf64gerpp(&acc0, rowA3, rowB5);
__builtin_mma_xvf64gerpp(&acc1, rowA3, rowB6);
__builtin_mma_xvf64gerpp(&acc0, rowA4, rowB7);
__builtin_mma_xvf64gerpp(&acc1, rowA4, rowB8);
}
for (l = (temp & (~3)); l < temp; ++l)
{
__vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<1]));
vec_t rowB1 = *(vec_t *) & BO[l<<2];
vec_t rowB2 = *(vec_t *) & BO[(l<<2)+2];
__builtin_mma_xvf64gerpp(&acc0, rowA1, rowB1);
__builtin_mma_xvf64gerpp(&acc1, rowA1, rowB2);
}
SAVE_ACC_COMPLEX_12
AO += temp << 1;
BO += temp << 2;
CO += 2;
#if defined(TRMMKERNEL)
REFRESH_AFTER_SAVE (1, 2)
#endif
}
#if defined(TRMMKERNEL) && !defined(LEFT)
off += 2; // number of values in A
#endif
B += k << 2;
}
if (n & 1)
{
#if defined(TRMMKERNEL) && defined(LEFT)
off = offset;
#endif
AO = A;
CO = C;
C += ldc<<1;
for (i = 0; i < (m >> 3); i++)
{
#if defined(TRMMKERNEL)
REFRESH_POINTERS (8, 1)
#else
BO = B;
temp = k;
#endif
SET_ACC_ZERO()
for (l = 0; l < (temp & (~1)); l+=2)
{
__vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<4]));
__vector_pair rowA2 = *((__vector_pair *)((void *)&AO[(l<<4)+4]));
__vector_pair rowA3 = *((__vector_pair *)((void *)&AO[(l<<4)+8]));
__vector_pair rowA4 = *((__vector_pair *)((void *)&AO[(l<<4)+12]));
__vector_pair rowA5 = *((__vector_pair *)((void *)&AO[(l<<4)+16]));
__vector_pair rowA6 = *((__vector_pair *)((void *)&AO[(l<<4)+20]));
__vector_pair rowA7 = *((__vector_pair *)((void *)&AO[(l<<4)+24]));
__vector_pair rowA8 = *((__vector_pair *)((void *)&AO[(l<<4)+28]));
vec_t rowB1 = *(vec_t *) & BO[l<<1];
vec_t rowB2 = *(vec_t *) & BO[(l<<1)+2];
__builtin_mma_xvf64gerpp(&acc0, rowA1, rowB1);
__builtin_mma_xvf64gerpp(&acc1, rowA2, rowB1);
__builtin_mma_xvf64gerpp(&acc2, rowA3, rowB1);
__builtin_mma_xvf64gerpp(&acc3, rowA4, rowB1);
__builtin_mma_xvf64gerpp(&acc0, rowA5, rowB2);
__builtin_mma_xvf64gerpp(&acc1, rowA6, rowB2);
__builtin_mma_xvf64gerpp(&acc2, rowA7, rowB2);
__builtin_mma_xvf64gerpp(&acc3, rowA8, rowB2);
}
for (l = (temp & (~1)); l < temp; ++l)
{
__vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<4]));
__vector_pair rowA2 = *((__vector_pair *)((void *)&AO[(l<<4)+4]));
__vector_pair rowA3 = *((__vector_pair *)((void *)&AO[(l<<4)+8]));
__vector_pair rowA4 = *((__vector_pair *)((void *)&AO[(l<<4)+12]));
vec_t rowB1 = *(vec_t *) & BO[l<<1];
__builtin_mma_xvf64gerpp(&acc0, rowA1, rowB1);
__builtin_mma_xvf64gerpp(&acc1, rowA2, rowB1);
__builtin_mma_xvf64gerpp(&acc2, rowA3, rowB1);
__builtin_mma_xvf64gerpp(&acc3, rowA4, rowB1);
}
SAVE_ACC_COMPLEX_21_4
AO += temp << 4;
BO += temp << 1;
CO += 16;
#if defined(TRMMKERNEL)
REFRESH_AFTER_SAVE (8, 1)
#endif
}
if (m & 4)
{
#if defined(TRMMKERNEL)
REFRESH_POINTERS (4, 1)
#else
BO = B;
temp = k;
#endif
SET_ACC_ZERO()
for (l = 0; l < (temp & (~3)); l+=4)
{
__vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<3]));
__vector_pair rowA2 = *((__vector_pair *)((void *)&AO[(l<<3)+4]));
__vector_pair rowA3 = *((__vector_pair *)((void *)&AO[(l<<3)+8]));
__vector_pair rowA4 = *((__vector_pair *)((void *)&AO[(l<<3)+12]));
__vector_pair rowA5 = *((__vector_pair *)((void *)&AO[(l<<3)+16]));
__vector_pair rowA6 = *((__vector_pair *)((void *)&AO[(l<<3)+20]));
__vector_pair rowA7 = *((__vector_pair *)((void *)&AO[(l<<3)+24]));
__vector_pair rowA8 = *((__vector_pair *)((void *)&AO[(l<<3)+28]));
vec_t rowB1 = *(vec_t *) & BO[l<<1];
vec_t rowB2 = *(vec_t *) & BO[(l<<1)+2];
vec_t rowB3 = *(vec_t *) & BO[(l<<1)+4];
vec_t rowB4 = *(vec_t *) & BO[(l<<1)+6];
__builtin_mma_xvf64gerpp(&acc0, rowA1, rowB1);
__builtin_mma_xvf64gerpp(&acc1, rowA2, rowB1);
__builtin_mma_xvf64gerpp(&acc2, rowA3, rowB2);
__builtin_mma_xvf64gerpp(&acc3, rowA4, rowB2);
__builtin_mma_xvf64gerpp(&acc4, rowA5, rowB3);
__builtin_mma_xvf64gerpp(&acc5, rowA6, rowB3);
__builtin_mma_xvf64gerpp(&acc6, rowA7, rowB4);
__builtin_mma_xvf64gerpp(&acc7, rowA8, rowB4);
}
for (l = (temp & (~3)); l < temp; ++l)
{
__vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<3]));
__vector_pair rowA2 = *((__vector_pair *)((void *)&AO[(l<<3)+4]));
vec_t rowB1 = *(vec_t *) & BO[l<<1];
__builtin_mma_xvf64gerpp(&acc0, rowA1, rowB1);
__builtin_mma_xvf64gerpp(&acc1, rowA2, rowB1);
}
SAVE_ACC_COMPLEX_21_2
AO += temp << 3;
BO += temp << 1;
CO += 8;
#if defined(TRMMKERNEL)
REFRESH_AFTER_SAVE (4, 1)
#endif
} if (m & 2)
{
#if defined(TRMMKERNEL)
REFRESH_POINTERS (2, 1)
#else
BO = B;
temp = k;
#endif
SET_ACC_ZERO()
for (l = 0; l < (temp & (~7)); l+=8)
{
__vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<2]));
__vector_pair rowA2 = *((__vector_pair *)((void *)&AO[(l<<2)+4]));
__vector_pair rowA3 = *((__vector_pair *)((void *)&AO[(l<<2)+8]));
__vector_pair rowA4 = *((__vector_pair *)((void *)&AO[(l<<2)+12]));
__vector_pair rowA5 = *((__vector_pair *)((void *)&AO[(l<<2)+16]));
__vector_pair rowA6 = *((__vector_pair *)((void *)&AO[(l<<2)+20]));
__vector_pair rowA7 = *((__vector_pair *)((void *)&AO[(l<<2)+24]));
__vector_pair rowA8 = *((__vector_pair *)((void *)&AO[(l<<2)+28]));
vec_t rowB1 = *(vec_t *) & BO[l<<1];
vec_t rowB2 = *(vec_t *) & BO[(l<<1)+2];
vec_t rowB3 = *(vec_t *) & BO[(l<<1)+4];
vec_t rowB4 = *(vec_t *) & BO[(l<<1)+6];
vec_t rowB5 = *(vec_t *) & BO[(l<<1)+8];
vec_t rowB6 = *(vec_t *) & BO[(l<<1)+10];
vec_t rowB7 = *(vec_t *) & BO[(l<<1)+12];
vec_t rowB8 = *(vec_t *) & BO[(l<<1)+14];
__builtin_mma_xvf64gerpp(&acc0, rowA1, rowB1);
__builtin_mma_xvf64gerpp(&acc1, rowA2, rowB2);
__builtin_mma_xvf64gerpp(&acc2, rowA3, rowB3);
__builtin_mma_xvf64gerpp(&acc3, rowA4, rowB4);
__builtin_mma_xvf64gerpp(&acc4, rowA5, rowB5);
__builtin_mma_xvf64gerpp(&acc5, rowA6, rowB6);
__builtin_mma_xvf64gerpp(&acc6, rowA7, rowB7);
__builtin_mma_xvf64gerpp(&acc7, rowA8, rowB8);
}
for (l = (temp & (~7)); l < temp; ++l)
{
__vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<2]));
vec_t rowB1 = *(vec_t *) & BO[l<<1];
__builtin_mma_xvf64gerpp(&acc0, rowA1, rowB1);
}
SAVE_ACC_COMPLEX_21_1
AO += temp << 2;
BO += temp << 1;
CO += 4;
#if defined(TRMMKERNEL)
REFRESH_AFTER_SAVE (2, 1)
#endif
}
if (m & 1)
{
#if defined(TRMMKERNEL)
REFRESH_POINTERS (1, 1)
#else
BO = B;
temp = k;
#endif
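// TODO(review): original note was "RIP OUT MMA STUFF!" -- this m & 1 tail still
// goes through the MMA path. AO advances 2 doubles per k step here (AO[l<<1]),
// while the m & 4 and m & 2 paths space their __vector_pair loads 4 doubles
// apart, so the loads below presumably overlap the next k step and the final
// one reads past AO + (temp << 1). Confirm that over-read is safe, or replace
// this path with plain vector code.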
SET_ACC_ZERO()
for (l = 0; l < (temp & (~7)); l+=8)
{
__vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<1]));
__vector_pair rowA2 = *((__vector_pair *)((void *)&AO[(l<<1)+2]));
__vector_pair rowA3 = *((__vector_pair *)((void *)&AO[(l<<1)+4]));
__vector_pair rowA4 = *((__vector_pair *)((void *)&AO[(l<<1)+6]));
__vector_pair rowA5 = *((__vector_pair *)((void *)&AO[(l<<1)+8]));
__vector_pair rowA6 = *((__vector_pair *)((void *)&AO[(l<<1)+10]));
__vector_pair rowA7 = *((__vector_pair *)((void *)&AO[(l<<1)+12]));
__vector_pair rowA8 = *((__vector_pair *)((void *)&AO[(l<<1)+14]));
vec_t rowB1 = *(vec_t *) & BO[l<<1];
vec_t rowB2 = *(vec_t *) & BO[(l<<1)+2];
vec_t rowB3 = *(vec_t *) & BO[(l<<1)+4];
vec_t rowB4 = *(vec_t *) & BO[(l<<1)+6];
vec_t rowB5 = *(vec_t *) & BO[(l<<1)+8];
vec_t rowB6 = *(vec_t *) & BO[(l<<1)+10];
vec_t rowB7 = *(vec_t *) & BO[(l<<1)+12];
vec_t rowB8 = *(vec_t *) & BO[(l<<1)+14];
__builtin_mma_xvf64gerpp(&acc0, rowA1, rowB1);
__builtin_mma_xvf64gerpp(&acc1, rowA2, rowB2);
__builtin_mma_xvf64gerpp(&acc2, rowA3, rowB3);
__builtin_mma_xvf64gerpp(&acc3, rowA4, rowB4);
__builtin_mma_xvf64gerpp(&acc4, rowA5, rowB5);
__builtin_mma_xvf64gerpp(&acc5, rowA6, rowB6);
__builtin_mma_xvf64gerpp(&acc6, rowA7, rowB7);
__builtin_mma_xvf64gerpp(&acc7, rowA8, rowB8);
}
for (l = (temp & (~7)); l < temp; ++l)
{
__vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<1]));
vec_t rowB1 = *(vec_t *) & BO[l<<1];
__builtin_mma_xvf64gerpp(&acc0, rowA1, rowB1);
}
SAVE_ACC_COMPLEX_11
AO += temp << 1;
BO += temp << 1;
CO += 2;
#if defined(TRMMKERNEL)
REFRESH_AFTER_SAVE (1, 1)
#endif
}
#if defined(TRMMKERNEL) && !defined(LEFT)
off += 1; // number of values in A
#endif
B += k << 1;
}
return 0;
}

View File

@ -104,7 +104,7 @@
* *
READ( NIN, FMT = * )SUMMRY READ( NIN, FMT = * )SUMMRY
READ( NIN, FMT = * )NOUT READ( NIN, FMT = * )NOUT
OPEN( NOUT, FILE = SUMMRY, STATUS = 'NEW' ) OPEN( NOUT, FILE = SUMMRY, STATUS = 'REPLACE' )
NOUTC = NOUT NOUTC = NOUT
* *
* Read name and unit number for snapshot output file and open file. * Read name and unit number for snapshot output file and open file.
@ -113,7 +113,7 @@
READ( NIN, FMT = * )NTRA READ( NIN, FMT = * )NTRA
TRACE = NTRA.GE.0 TRACE = NTRA.GE.0
IF( TRACE )THEN IF( TRACE )THEN
OPEN( NTRA, FILE = SNAPS, STATUS = 'NEW' ) OPEN( NTRA, FILE = SNAPS, STATUS = 'REPLACE' )
END IF END IF
* Read the flag that directs rewinding of the snapshot file. * Read the flag that directs rewinding of the snapshot file.
READ( NIN, FMT = * )REWI READ( NIN, FMT = * )REWI
@ -3439,4 +3439,3 @@
* End of XERBLA * End of XERBLA
* *
END END

View File

@ -105,7 +105,7 @@
* *
READ( NIN, FMT = * )SUMMRY READ( NIN, FMT = * )SUMMRY
READ( NIN, FMT = * )NOUT READ( NIN, FMT = * )NOUT
OPEN( NOUT, FILE = SUMMRY, STATUS = 'NEW' ) OPEN( NOUT, FILE = SUMMRY, STATUS = 'REPLACE' )
NOUTC = NOUT NOUTC = NOUT
* *
* Read name and unit number for snapshot output file and open file. * Read name and unit number for snapshot output file and open file.
@ -114,7 +114,7 @@
READ( NIN, FMT = * )NTRA READ( NIN, FMT = * )NTRA
TRACE = NTRA.GE.0 TRACE = NTRA.GE.0
IF( TRACE )THEN IF( TRACE )THEN
OPEN( NTRA, FILE = SNAPS, STATUS = 'NEW' ) OPEN( NTRA, FILE = SNAPS, STATUS = 'REPLACE' )
END IF END IF
* Read the flag that directs rewinding of the snapshot file. * Read the flag that directs rewinding of the snapshot file.
READ( NIN, FMT = * )REWI READ( NIN, FMT = * )REWI

View File

@ -81,6 +81,28 @@ static void cgemmt_trusted(char api, enum CBLAS_ORDER order, char uplo, char tra
ldc *= 2; ldc *= 2;
#ifndef NO_CBLAS
if (order == CblasRowMajor) {
if (uplo == 'U' || uplo == CblasUpper)
{
for (i = 0; i < m; i++)
for (j = i * 2; j < m * 2; j+=2){
data_cgemmt.c_verify[i * ldc + j] =
data_cgemmt.c_gemm[i * ldc + j];
data_cgemmt.c_verify[i * ldc + j + 1] =
data_cgemmt.c_gemm[i * ldc + j + 1];
}
} else {
for (i = 0; i < m; i++)
for (j = 0; j <= i * 2; j+=2){
data_cgemmt.c_verify[i * ldc + j] =
data_cgemmt.c_gemm[i * ldc + j];
data_cgemmt.c_verify[i * ldc + j + 1] =
data_cgemmt.c_gemm[i * ldc + j + 1];
}
}
} else
#endif
if (uplo == 'L' || uplo == CblasLower) if (uplo == 'L' || uplo == CblasLower)
{ {
for (i = 0; i < m; i++) for (i = 0; i < m; i++)

View File

@ -77,6 +77,21 @@ static void dgemmt_trusted(char api, enum CBLAS_ORDER order, char uplo, char tra
else else
cblas_dgemm(order, transa, transb, m, m, k, alpha, data_dgemmt.a_test, lda, cblas_dgemm(order, transa, transb, m, m, k, alpha, data_dgemmt.a_test, lda,
data_dgemmt.b_test, ldb, beta, data_dgemmt.c_gemm, ldc); data_dgemmt.b_test, ldb, beta, data_dgemmt.c_gemm, ldc);
if (order == CblasRowMajor) {
if (uplo == 'U' || uplo == CblasUpper)
{
for (i = 0; i < m; i++)
for (j = i; j < m; j++)
data_dgemmt.c_verify[i * ldc + j] =
data_dgemmt.c_gemm[i * ldc + j];
} else {
for (i = 0; i < m; i++)
for (j = 0; j <= i; j++)
data_dgemmt.c_verify[i * ldc + j] =
data_dgemmt.c_gemm[i * ldc + j];
}
}else
#endif #endif
if (uplo == 'L' || uplo == CblasLower) if (uplo == 'L' || uplo == CblasLower)

View File

@ -77,6 +77,21 @@ static void sgemmt_trusted(char api, enum CBLAS_ORDER order, char uplo, char tra
else else
cblas_sgemm(order, transa, transb, m, m, k, alpha, data_sgemmt.a_test, lda, cblas_sgemm(order, transa, transb, m, m, k, alpha, data_sgemmt.a_test, lda,
data_sgemmt.b_test, ldb, beta, data_sgemmt.c_gemm, ldc); data_sgemmt.b_test, ldb, beta, data_sgemmt.c_gemm, ldc);
if (order == CblasRowMajor) {
if (uplo == 'U' || uplo == CblasUpper)
{
for (i = 0; i < m; i++)
for (j = i; j < m; j++)
data_sgemmt.c_verify[i * ldc + j] =
data_sgemmt.c_gemm[i * ldc + j];
} else {
for (i = 0; i < m; i++)
for (j = 0; j <= i; j++)
data_sgemmt.c_verify[i * ldc + j] =
data_sgemmt.c_gemm[i * ldc + j];
}
} else
#endif #endif
if (uplo == 'L' || uplo == CblasLower) if (uplo == 'L' || uplo == CblasLower)

View File

@ -80,7 +80,28 @@ static void zgemmt_trusted(char api, enum CBLAS_ORDER order, char uplo, char tra
#endif #endif
ldc *= 2; ldc *= 2;
#ifndef NO_CBLAS
if (order == CblasRowMajor) {
if (uplo == 'U' || uplo == CblasUpper)
{
for (i = 0; i < m; i++)
for (j = i * 2; j < m * 2; j+=2){
data_zgemmt.c_verify[i * ldc + j] =
data_zgemmt.c_gemm[i * ldc + j];
data_zgemmt.c_verify[i * ldc + j + 1] =
data_zgemmt.c_gemm[i * ldc + j + 1];
}
} else {
for (i = 0; i < m; i++)
for (j = 0; j <= i * 2; j+=2){
data_zgemmt.c_verify[i * ldc + j] =
data_zgemmt.c_gemm[i * ldc + j];
data_zgemmt.c_verify[i * ldc + j + 1] =
data_zgemmt.c_gemm[i * ldc + j + 1];
}
}
}else
#endif
if (uplo == 'L' || uplo == CblasLower) if (uplo == 'L' || uplo == CblasLower)
{ {
for (i = 0; i < m; i++) for (i = 0; i < m; i++)