Merge remote-tracking branch 'origin/develop' into vectorizeBF16GEMV
This commit is contained in:
commit
c8f53b85ce
10
.cirrus.yml
10
.cirrus.yml
|
@ -94,16 +94,8 @@ task:
|
|||
name: AppleM1/LLVM armv7-androidndk xbuild
|
||||
compile_script:
|
||||
- brew install --cask android-ndk
|
||||
- export #PATH=/opt/homebrew/opt/llvm/bin:$PATH
|
||||
- export #LDFLAGS="-L/opt/homebrew/opt/llvm/lib"
|
||||
- export #CPPFLAGS="-I/opt/homebrew/opt/llvm/include"
|
||||
- export ANDROID_NDK_HOME="/opt/homebrew/share/android-ndk"
|
||||
- ls /opt/homebrew
|
||||
- ls -l /System/Volumes/Data/opt/homebrew/Caskroom/android-ndk
|
||||
- find /opt/homebrew -name "armv7a-linux-androideabi*-ranlib"
|
||||
- #export CC=/Applications/Xcode-13.4.1.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang
|
||||
- #export CFLAGS="-O2 -unwindlib=none -Wno-macro-redefined -isysroot /Applications/Xcode-13.4.1.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS16.0.sdk -arch arm64 -miphoneos-version-min=10.0"
|
||||
- export CC=/System/Volumes/Data/opt/homebrew/Caskroom/android-ndk/27/AndroidNDK*.app/Contents/NDK/toolchains/llvm/prebuilt/darwin-x86_64/bin/armv7a-linux-androideabi23-clang
|
||||
- export CC=/opt/homebrew/share/android-ndk/toolchains/llvm/prebuilt/darwin-x86_64/bin/armv7a-linux-androideabi23-clang
|
||||
- make TARGET=ARMV7 ARM_SOFTFP_ABI=1 NUM_THREADS=32 HOSTCC=clang NOFORTRAN=1 RANLIB="ls -l"
|
||||
always:
|
||||
config_artifacts:
|
||||
|
|
|
@ -95,7 +95,7 @@ if (DYNAMIC_ARCH)
|
|||
endif ()
|
||||
|
||||
if (LOONGARCH64)
|
||||
set(DYNAMIC_CORE LOONGSONGENERIC LOONGSON2K1000 LOONGSON3R5)
|
||||
set(DYNAMIC_CORE LA64_GENERIC LA264 LA464)
|
||||
endif ()
|
||||
|
||||
if (EXISTS ${PROJECT_SOURCE_DIR}/config_kernel.h)
|
||||
|
|
|
@ -1349,7 +1349,7 @@ endif ()
|
|||
"#define DTB_DEFAULT_ENTRIES 128\n"
|
||||
"#define DTB_SIZE 4096\n"
|
||||
"#define L2_ASSOCIATIVE 4\n")
|
||||
elseif ("${TCORE}" STREQUAL "LOONGSONGENERIC")
|
||||
elseif ("${TCORE}" STREQUAL "LA64_GENERIC")
|
||||
file(APPEND ${TARGET_CONF_TEMP}
|
||||
"#define DTB_DEFAULT_ENTRIES 64\n")
|
||||
set(SGEMM_UNROLL_M 2)
|
||||
|
@ -1364,7 +1364,7 @@ endif ()
|
|||
set(CGEMM3M_UNROLL_N 8)
|
||||
set(ZGEMM3M_UNROLL_M 2)
|
||||
set(ZGEMM3M_UNROLL_N 8)
|
||||
elseif ("${TCORE}" STREQUAL "LOONGSON2K1000")
|
||||
elseif ("${TCORE}" STREQUAL "LA264")
|
||||
file(APPEND ${TARGET_CONF_TEMP}
|
||||
"#define DTB_DEFAULT_ENTRIES 64\n")
|
||||
set(HAVE_LSX 1)
|
||||
|
@ -1380,7 +1380,7 @@ endif ()
|
|||
set(CGEMM3M_UNROLL_N 8)
|
||||
set(ZGEMM3M_UNROLL_M 8)
|
||||
set(ZGEMM3M_UNROLL_N 4)
|
||||
elseif ("${TCORE}" STREQUAL "LOONGSON3R5")
|
||||
elseif ("${TCORE}" STREQUAL "LA464")
|
||||
file(APPEND ${TARGET_CONF_TEMP}
|
||||
"#define DTB_DEFAULT_ENTRIES 64\n")
|
||||
set(HAVE_LASX 1)
|
||||
|
|
|
@ -55,6 +55,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
#ifndef ASSEMBLER
|
||||
|
||||
|
||||
#ifndef NO_AFFINITY
|
||||
static __inline int WhereAmI(void){
|
||||
uint64_t ret;
|
||||
__asm__ volatile (
|
||||
|
@ -67,6 +68,7 @@ static __inline int WhereAmI(void){
|
|||
if ((int)ret <0) ret = 0;
|
||||
return (int)ret;
|
||||
}
|
||||
#endif
|
||||
|
||||
static __inline void blas_lock(volatile BLASULONG *address){
|
||||
|
||||
|
|
|
@ -1689,6 +1689,7 @@ int get_cpuname(void){
|
|||
return CPUTYPE_BARCELONA;
|
||||
}
|
||||
case 10: // Zen3/4
|
||||
case 11: // Zen5
|
||||
#ifndef NO_AVX512
|
||||
if(support_avx512_bf16())
|
||||
return CPUTYPE_COOPERLAKE;
|
||||
|
@ -2479,7 +2480,7 @@ int get_coretype(void){
|
|||
}
|
||||
break;
|
||||
}
|
||||
} else if (exfamily == 8 || exfamily == 10) {
|
||||
} else if (exfamily == 8 || exfamily == 10 || exfamily == 11) {
|
||||
switch (model) {
|
||||
case 1:
|
||||
// AMD Ryzen
|
||||
|
|
|
@ -38,9 +38,12 @@
|
|||
CALL CHECK1(SFAC)
|
||||
END IF
|
||||
* -- Print
|
||||
IF (PASS) WRITE (NOUT,99998)
|
||||
IF (PASS) THEN
|
||||
WRITE (NOUT,99998)
|
||||
ELSE
|
||||
CALL ABORT
|
||||
END IF
|
||||
20 CONTINUE
|
||||
STOP
|
||||
*
|
||||
99999 FORMAT (' Complex CBLAS Test Program Results',/1X)
|
||||
99998 FORMAT (' ----- PASS -----')
|
||||
|
@ -228,7 +231,7 @@
|
|||
CALL ITEST1(ICAMAXTEST(N,CX,INCX),ITRUE3(NP1))
|
||||
ELSE
|
||||
WRITE (NOUT,*) ' Shouldn''t be here in CHECK1'
|
||||
STOP
|
||||
CALL ABORT
|
||||
END IF
|
||||
*
|
||||
40 CONTINUE
|
||||
|
@ -512,7 +515,7 @@
|
|||
CALL CTEST(LENY,CY,CT10Y(1,KN,KI),CSIZE3,1.0E0)
|
||||
ELSE
|
||||
WRITE (NOUT,*) ' Shouldn''t be here in CHECK2'
|
||||
STOP
|
||||
CALL ABORT
|
||||
END IF
|
||||
*
|
||||
40 CONTINUE
|
||||
|
|
|
@ -10,7 +10,7 @@
|
|||
* 'CBLAT2.SNAP' NAME OF SNAPSHOT OUTPUT FILE
|
||||
* -1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0)
|
||||
* F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD.
|
||||
* F LOGICAL FLAG, T TO STOP ON FAILURES.
|
||||
* F LOGICAL FLAG, T TO CALL ABORT ON FAILURES.
|
||||
* T LOGICAL FLAG, T TO TEST ERROR EXITS.
|
||||
* 2 0 TO TEST COLUMN-MAJOR, 1 TO TEST ROW-MAJOR, 2 TO TEST BOTH
|
||||
* 16.0 THRESHOLD VALUE OF TEST RATIO
|
||||
|
@ -243,7 +243,7 @@
|
|||
$ GO TO 70
|
||||
60 CONTINUE
|
||||
WRITE( NOUT, FMT = 9986 )SNAMET
|
||||
STOP
|
||||
CALL ABORT
|
||||
70 LTEST( I ) = LTESTT
|
||||
GO TO 50
|
||||
*
|
||||
|
@ -283,7 +283,7 @@
|
|||
SAME = LCE( YY, YT, N )
|
||||
IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN
|
||||
WRITE( NOUT, FMT = 9985 )TRANS, SAME, ERR
|
||||
STOP
|
||||
CALL ABORT
|
||||
END IF
|
||||
TRANS = 'T'
|
||||
CALL CMVCH( TRANS, N, N, ONE, A, NMAX, X, -1, ZERO, Y, -1, YT, G,
|
||||
|
@ -291,7 +291,7 @@
|
|||
SAME = LCE( YY, YT, N )
|
||||
IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN
|
||||
WRITE( NOUT, FMT = 9985 )TRANS, SAME, ERR
|
||||
STOP
|
||||
CALL ABORT
|
||||
END IF
|
||||
*
|
||||
* Test each subroutine in turn.
|
||||
|
@ -418,7 +418,9 @@
|
|||
IF( TRACE )
|
||||
$ CLOSE ( NTRA )
|
||||
CLOSE ( NOUT )
|
||||
STOP
|
||||
IF( FATAL ) THEN
|
||||
CALL ABORT
|
||||
END IF
|
||||
*
|
||||
10002 FORMAT( ' COLUMN-MAJOR AND ROW-MAJOR DATA LAYOUTS ARE TESTED' )
|
||||
10001 FORMAT( ' ROW-MAJOR DATA LAYOUT IS TESTED' )
|
||||
|
|
|
@ -10,7 +10,7 @@
|
|||
* 'CBLAT3.SNAP' NAME OF SNAPSHOT OUTPUT FILE
|
||||
* -1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0)
|
||||
* F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD.
|
||||
* F LOGICAL FLAG, T TO STOP ON FAILURES.
|
||||
* F LOGICAL FLAG, T TO CALL ABORT ON FAILURES.
|
||||
* T LOGICAL FLAG, T TO TEST ERROR EXITS.
|
||||
* 2 0 TO TEST COLUMN-MAJOR, 1 TO TEST ROW-MAJOR, 2 TO TEST BOTH
|
||||
* 16.0 THRESHOLD VALUE OF TEST RATIO
|
||||
|
@ -194,7 +194,7 @@
|
|||
$ GO TO 50
|
||||
40 CONTINUE
|
||||
WRITE( NOUT, FMT = 9990 )SNAMET
|
||||
STOP
|
||||
CALL ABORT
|
||||
50 LTEST( I ) = LTESTT
|
||||
GO TO 30
|
||||
*
|
||||
|
@ -237,7 +237,7 @@
|
|||
SAME = LCE( CC, CT, N )
|
||||
IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN
|
||||
WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR
|
||||
STOP
|
||||
CALL ABORT
|
||||
END IF
|
||||
TRANSB = 'C'
|
||||
CALL CMMCH( TRANSA, TRANSB, N, 1, N, ONE, AB, NMAX,
|
||||
|
@ -246,7 +246,7 @@
|
|||
SAME = LCE( CC, CT, N )
|
||||
IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN
|
||||
WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR
|
||||
STOP
|
||||
CALL ABORT
|
||||
END IF
|
||||
DO 120 J = 1, N
|
||||
AB( J, NMAX + 1 ) = N - J + 1
|
||||
|
@ -264,7 +264,7 @@
|
|||
SAME = LCE( CC, CT, N )
|
||||
IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN
|
||||
WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR
|
||||
STOP
|
||||
CALL ABORT
|
||||
END IF
|
||||
TRANSB = 'C'
|
||||
CALL CMMCH( TRANSA, TRANSB, N, 1, N, ONE, AB, NMAX,
|
||||
|
@ -273,7 +273,7 @@
|
|||
SAME = LCE( CC, CT, N )
|
||||
IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN
|
||||
WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR
|
||||
STOP
|
||||
CALL ABORT
|
||||
END IF
|
||||
*
|
||||
* Test each subroutine in turn.
|
||||
|
@ -385,7 +385,9 @@
|
|||
IF( TRACE )
|
||||
$ CLOSE ( NTRA )
|
||||
CLOSE ( NOUT )
|
||||
STOP
|
||||
IF( FATAL ) THEN
|
||||
CALL ABORT
|
||||
END IF
|
||||
*
|
||||
10002 FORMAT( ' COLUMN-MAJOR AND ROW-MAJOR DATA LAYOUTS ARE TESTED' )
|
||||
10001 FORMAT(' ROW-MAJOR DATA LAYOUT IS TESTED' )
|
||||
|
|
|
@ -10,7 +10,7 @@
|
|||
* 'CBLAT3.SNAP' NAME OF SNAPSHOT OUTPUT FILE
|
||||
* -1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0)
|
||||
* F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD.
|
||||
* F LOGICAL FLAG, T TO STOP ON FAILURES.
|
||||
* F LOGICAL FLAG, T TO CALL ABORT ON FAILURES.
|
||||
* T LOGICAL FLAG, T TO TEST ERROR EXITS.
|
||||
* 2 0 TO TEST COLUMN-MAJOR, 1 TO TEST ROW-MAJOR, 2 TO TEST BOTH
|
||||
* 16.0 THRESHOLD VALUE OF TEST RATIO
|
||||
|
@ -194,7 +194,7 @@
|
|||
$ GO TO 50
|
||||
40 CONTINUE
|
||||
WRITE( NOUT, FMT = 9990 )SNAMET
|
||||
STOP
|
||||
CALL ABORT
|
||||
50 LTEST( I ) = LTESTT
|
||||
GO TO 30
|
||||
*
|
||||
|
@ -237,7 +237,7 @@
|
|||
SAME = LCE( CC, CT, N )
|
||||
IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN
|
||||
WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR
|
||||
STOP
|
||||
CALL ABORT
|
||||
END IF
|
||||
TRANSB = 'C'
|
||||
CALL CMMCH( TRANSA, TRANSB, N, 1, N, ONE, AB, NMAX,
|
||||
|
@ -246,7 +246,7 @@
|
|||
SAME = LCE( CC, CT, N )
|
||||
IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN
|
||||
WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR
|
||||
STOP
|
||||
CALL ABORT
|
||||
END IF
|
||||
DO 120 J = 1, N
|
||||
AB( J, NMAX + 1 ) = N - J + 1
|
||||
|
@ -264,7 +264,7 @@
|
|||
SAME = LCE( CC, CT, N )
|
||||
IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN
|
||||
WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR
|
||||
STOP
|
||||
CALL ABORT
|
||||
END IF
|
||||
TRANSB = 'C'
|
||||
CALL CMMCH( TRANSA, TRANSB, N, 1, N, ONE, AB, NMAX,
|
||||
|
@ -273,7 +273,7 @@
|
|||
SAME = LCE( CC, CT, N )
|
||||
IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN
|
||||
WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR
|
||||
STOP
|
||||
CALL ABORT
|
||||
END IF
|
||||
*
|
||||
* Test each subroutine in turn.
|
||||
|
@ -385,7 +385,9 @@
|
|||
IF( TRACE )
|
||||
$ CLOSE ( NTRA )
|
||||
CLOSE ( NOUT )
|
||||
STOP
|
||||
IF( FATAL ) THEN
|
||||
CALL ABORT
|
||||
END IF
|
||||
*
|
||||
10002 FORMAT( ' COLUMN-MAJOR AND ROW-MAJOR DATA LAYOUTS ARE TESTED' )
|
||||
10001 FORMAT(' ROW-MAJOR DATA LAYOUT IS TESTED' )
|
||||
|
|
|
@ -44,9 +44,12 @@
|
|||
CALL CHECK3(SFAC)
|
||||
END IF
|
||||
* -- Print
|
||||
IF (PASS) WRITE (NOUT,99998)
|
||||
IF (PASS) THEN
|
||||
WRITE (NOUT,99998)
|
||||
ELSE
|
||||
CALL ABORT
|
||||
END IF
|
||||
20 CONTINUE
|
||||
STOP
|
||||
*
|
||||
99999 FORMAT (' Real CBLAS Test Program Results',/1X)
|
||||
99998 FORMAT (' ----- PASS -----')
|
||||
|
@ -136,7 +139,7 @@
|
|||
CALL STEST1(SS,DS1(K),DS1(K),SFAC)
|
||||
ELSE
|
||||
WRITE (NOUT,*) ' Shouldn''t be here in CHECK0'
|
||||
STOP
|
||||
CALL ABORT
|
||||
END IF
|
||||
20 CONTINUE
|
||||
40 RETURN
|
||||
|
@ -229,7 +232,7 @@
|
|||
CALL ITEST1(IDAMAXTEST(N,SX,INCX),ITRUE2(NP1))
|
||||
ELSE
|
||||
WRITE (NOUT,*) ' Shouldn''t be here in CHECK1'
|
||||
STOP
|
||||
CALL ABORT
|
||||
END IF
|
||||
60 CONTINUE
|
||||
80 CONTINUE
|
||||
|
@ -384,7 +387,7 @@
|
|||
CALL STEST(LENY,SY,STY,SSIZE2(1,1),1.0D0)
|
||||
ELSE
|
||||
WRITE (NOUT,*) ' Shouldn''t be here in CHECK2'
|
||||
STOP
|
||||
CALL ABORT
|
||||
END IF
|
||||
100 CONTINUE
|
||||
120 CONTINUE
|
||||
|
@ -472,7 +475,7 @@
|
|||
70 CONTINUE
|
||||
ELSE
|
||||
WRITE (NOUT,*) ' Shouldn''t be here in CHECK3'
|
||||
STOP
|
||||
CALL ABORT
|
||||
END IF
|
||||
40 CONTINUE
|
||||
60 CONTINUE
|
||||
|
|
|
@ -10,7 +10,7 @@
|
|||
* 'DBLAT2.SNAP' NAME OF SNAPSHOT OUTPUT FILE
|
||||
* -1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0)
|
||||
* F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD.
|
||||
* F LOGICAL FLAG, T TO STOP ON FAILURES.
|
||||
* F LOGICAL FLAG, T TO CALL ABORT ON FAILURES.
|
||||
* T LOGICAL FLAG, T TO TEST ERROR EXITS.
|
||||
* 2 0 TO TEST COLUMN-MAJOR, 1 TO TEST ROW-MAJOR, 2 TO TEST BOTH
|
||||
* 16.0 THRESHOLD VALUE OF TEST RATIO
|
||||
|
@ -239,7 +239,7 @@
|
|||
$ GO TO 70
|
||||
60 CONTINUE
|
||||
WRITE( NOUT, FMT = 9986 )SNAMET
|
||||
STOP
|
||||
CALL ABORT
|
||||
70 LTEST( I ) = LTESTT
|
||||
GO TO 50
|
||||
*
|
||||
|
@ -279,7 +279,7 @@
|
|||
SAME = LDE( YY, YT, N )
|
||||
IF( .NOT.SAME.OR.ERR.NE.ZERO )THEN
|
||||
WRITE( NOUT, FMT = 9985 )TRANS, SAME, ERR
|
||||
STOP
|
||||
CALL ABORT
|
||||
END IF
|
||||
TRANS = 'T'
|
||||
CALL DMVCH( TRANS, N, N, ONE, A, NMAX, X, -1, ZERO, Y, -1, YT, G,
|
||||
|
@ -287,7 +287,7 @@
|
|||
SAME = LDE( YY, YT, N )
|
||||
IF( .NOT.SAME.OR.ERR.NE.ZERO )THEN
|
||||
WRITE( NOUT, FMT = 9985 )TRANS, SAME, ERR
|
||||
STOP
|
||||
CALL ABORT
|
||||
END IF
|
||||
*
|
||||
* Test each subroutine in turn.
|
||||
|
@ -414,7 +414,9 @@
|
|||
IF( TRACE )
|
||||
$ CLOSE ( NTRA )
|
||||
CLOSE ( NOUT )
|
||||
STOP
|
||||
IF( FATAL ) THEN
|
||||
CALL ABORT
|
||||
END IF
|
||||
*
|
||||
10002 FORMAT( ' COLUMN-MAJOR AND ROW-MAJOR DATA LAYOUTS ARE TESTED' )
|
||||
10001 FORMAT( ' ROW-MAJOR DATA LAYOUT IS TESTED' )
|
||||
|
|
|
@ -10,7 +10,7 @@
|
|||
* 'DBLAT3.SNAP' NAME OF SNAPSHOT OUTPUT FILE
|
||||
* -1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0)
|
||||
* F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD.
|
||||
* F LOGICAL FLAG, T TO STOP ON FAILURES.
|
||||
* F LOGICAL FLAG, T TO CALL ABORT ON FAILURES.
|
||||
* T LOGICAL FLAG, T TO TEST ERROR EXITS.
|
||||
* 2 0 TO TEST COLUMN-MAJOR, 1 TO TEST ROW-MAJOR, 2 TO TEST BOTH
|
||||
* 16.0 THRESHOLD VALUE OF TEST RATIO
|
||||
|
@ -189,7 +189,7 @@
|
|||
$ GO TO 50
|
||||
40 CONTINUE
|
||||
WRITE( NOUT, FMT = 9990 )SNAMET
|
||||
STOP
|
||||
CALL ABORT
|
||||
50 LTEST( I ) = LTESTT
|
||||
GO TO 30
|
||||
*
|
||||
|
@ -232,7 +232,7 @@
|
|||
SAME = LDE( CC, CT, N )
|
||||
IF( .NOT.SAME.OR.ERR.NE.ZERO )THEN
|
||||
WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR
|
||||
STOP
|
||||
CALL ABORT
|
||||
END IF
|
||||
TRANSB = 'T'
|
||||
CALL DMMCH( TRANSA, TRANSB, N, 1, N, ONE, AB, NMAX,
|
||||
|
@ -241,7 +241,7 @@
|
|||
SAME = LDE( CC, CT, N )
|
||||
IF( .NOT.SAME.OR.ERR.NE.ZERO )THEN
|
||||
WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR
|
||||
STOP
|
||||
CALL ABORT
|
||||
END IF
|
||||
DO 120 J = 1, N
|
||||
AB( J, NMAX + 1 ) = N - J + 1
|
||||
|
@ -259,7 +259,7 @@
|
|||
SAME = LDE( CC, CT, N )
|
||||
IF( .NOT.SAME.OR.ERR.NE.ZERO )THEN
|
||||
WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR
|
||||
STOP
|
||||
CALL ABORT
|
||||
END IF
|
||||
TRANSB = 'T'
|
||||
CALL DMMCH( TRANSA, TRANSB, N, 1, N, ONE, AB, NMAX,
|
||||
|
@ -268,7 +268,7 @@
|
|||
SAME = LDE( CC, CT, N )
|
||||
IF( .NOT.SAME.OR.ERR.NE.ZERO )THEN
|
||||
WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR
|
||||
STOP
|
||||
CALL ABORT
|
||||
END IF
|
||||
*
|
||||
* Test each subroutine in turn.
|
||||
|
@ -379,7 +379,9 @@
|
|||
IF( TRACE )
|
||||
$ CLOSE ( NTRA )
|
||||
CLOSE ( NOUT )
|
||||
STOP
|
||||
IF( FATAL ) THEN
|
||||
CALL ABORT
|
||||
END IF
|
||||
*
|
||||
10002 FORMAT( ' COLUMN-MAJOR AND ROW-MAJOR DATA LAYOUTS ARE TESTED' )
|
||||
10001 FORMAT( ' ROW-MAJOR DATA LAYOUT IS TESTED' )
|
||||
|
|
|
@ -44,9 +44,12 @@
|
|||
CALL CHECK3(SFAC)
|
||||
END IF
|
||||
* -- Print
|
||||
IF (PASS) WRITE (NOUT,99998)
|
||||
IF (PASS) THEN
|
||||
WRITE (NOUT,99998)
|
||||
ELSE
|
||||
CALL ABORT
|
||||
END IF
|
||||
20 CONTINUE
|
||||
STOP
|
||||
*
|
||||
99999 FORMAT (' Real CBLAS Test Program Results',/1X)
|
||||
99998 FORMAT (' ----- PASS -----')
|
||||
|
@ -136,7 +139,7 @@
|
|||
CALL STEST1(SS,DS1(K),DS1(K),SFAC)
|
||||
ELSE
|
||||
WRITE (NOUT,*) ' Shouldn''t be here in CHECK0'
|
||||
STOP
|
||||
CALL ABORT
|
||||
END IF
|
||||
20 CONTINUE
|
||||
40 RETURN
|
||||
|
@ -229,7 +232,7 @@
|
|||
CALL ITEST1(ISAMAXTEST(N,SX,INCX),ITRUE2(NP1))
|
||||
ELSE
|
||||
WRITE (NOUT,*) ' Shouldn''t be here in CHECK1'
|
||||
STOP
|
||||
CALL ABORT
|
||||
END IF
|
||||
60 CONTINUE
|
||||
80 CONTINUE
|
||||
|
@ -384,7 +387,7 @@
|
|||
CALL STEST(LENY,SY,STY,SSIZE2(1,1),1.0E0)
|
||||
ELSE
|
||||
WRITE (NOUT,*) ' Shouldn''t be here in CHECK2'
|
||||
STOP
|
||||
CALL ABORT
|
||||
END IF
|
||||
100 CONTINUE
|
||||
120 CONTINUE
|
||||
|
@ -479,7 +482,7 @@
|
|||
70 CONTINUE
|
||||
ELSE
|
||||
WRITE (NOUT,*) ' Shouldn''t be here in CHECK3'
|
||||
STOP
|
||||
CALL ABORT
|
||||
END IF
|
||||
40 CONTINUE
|
||||
60 CONTINUE
|
||||
|
@ -759,4 +762,4 @@
|
|||
END IF
|
||||
END IF
|
||||
RETURN
|
||||
END
|
||||
END
|
||||
|
|
|
@ -10,7 +10,7 @@
|
|||
* 'SBLAT2.SNAP' NAME OF SNAPSHOT OUTPUT FILE
|
||||
* -1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0)
|
||||
* F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD.
|
||||
* F LOGICAL FLAG, T TO STOP ON FAILURES.
|
||||
* F LOGICAL FLAG, T TO CALL ABORT ON FAILURES.
|
||||
* T LOGICAL FLAG, T TO TEST ERROR EXITS.
|
||||
* 2 0 TO TEST COLUMN-MAJOR, 1 TO TEST ROW-MAJOR, 2 TO TEST BOTH
|
||||
* 16.0 THRESHOLD VALUE OF TEST RATIO
|
||||
|
@ -239,7 +239,7 @@
|
|||
$ GO TO 70
|
||||
60 CONTINUE
|
||||
WRITE( NOUT, FMT = 9986 )SNAMET
|
||||
STOP
|
||||
CALL ABORT
|
||||
70 LTEST( I ) = LTESTT
|
||||
GO TO 50
|
||||
*
|
||||
|
@ -279,7 +279,7 @@
|
|||
SAME = LSE( YY, YT, N )
|
||||
IF( .NOT.SAME.OR.ERR.NE.ZERO )THEN
|
||||
WRITE( NOUT, FMT = 9985 )TRANS, SAME, ERR
|
||||
STOP
|
||||
CALL ABORT
|
||||
END IF
|
||||
TRANS = 'T'
|
||||
CALL SMVCH( TRANS, N, N, ONE, A, NMAX, X, -1, ZERO, Y, -1, YT, G,
|
||||
|
@ -287,7 +287,7 @@
|
|||
SAME = LSE( YY, YT, N )
|
||||
IF( .NOT.SAME.OR.ERR.NE.ZERO )THEN
|
||||
WRITE( NOUT, FMT = 9985 )TRANS, SAME, ERR
|
||||
STOP
|
||||
CALL ABORT
|
||||
END IF
|
||||
*
|
||||
* Test each subroutine in turn.
|
||||
|
@ -414,7 +414,9 @@
|
|||
IF( TRACE )
|
||||
$ CLOSE ( NTRA )
|
||||
CLOSE ( NOUT )
|
||||
STOP
|
||||
IF( FATAL ) THEN
|
||||
CALL ABORT
|
||||
END IF
|
||||
*
|
||||
10002 FORMAT( ' COLUMN-MAJOR AND ROW-MAJOR DATA LAYOUTS ARE TESTED' )
|
||||
10001 FORMAT( ' ROW-MAJOR DATA LAYOUT IS TESTED' )
|
||||
|
|
|
@ -10,7 +10,7 @@
|
|||
* 'SBLAT3.SNAP' NAME OF SNAPSHOT OUTPUT FILE
|
||||
* -1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0)
|
||||
* F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD.
|
||||
* F LOGICAL FLAG, T TO STOP ON FAILURES.
|
||||
* F LOGICAL FLAG, T TO CALL ABORT ON FAILURES.
|
||||
* T LOGICAL FLAG, T TO TEST ERROR EXITS.
|
||||
* 2 0 TO TEST COLUMN-MAJOR, 1 TO TEST ROW-MAJOR, 2 TO TEST BOTH
|
||||
* 16.0 THRESHOLD VALUE OF TEST RATIO
|
||||
|
@ -188,7 +188,7 @@
|
|||
$ GO TO 50
|
||||
40 CONTINUE
|
||||
WRITE( NOUT, FMT = 9990 )SNAMET
|
||||
STOP
|
||||
CALL ABORT
|
||||
50 LTEST( I ) = LTESTT
|
||||
GO TO 30
|
||||
*
|
||||
|
@ -231,7 +231,7 @@
|
|||
SAME = LSE( CC, CT, N )
|
||||
IF( .NOT.SAME.OR.ERR.NE.ZERO )THEN
|
||||
WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR
|
||||
STOP
|
||||
CALL ABORT
|
||||
END IF
|
||||
TRANSB = 'T'
|
||||
CALL SMMCH( TRANSA, TRANSB, N, 1, N, ONE, AB, NMAX,
|
||||
|
@ -240,7 +240,7 @@
|
|||
SAME = LSE( CC, CT, N )
|
||||
IF( .NOT.SAME.OR.ERR.NE.ZERO )THEN
|
||||
WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR
|
||||
STOP
|
||||
CALL ABORT
|
||||
END IF
|
||||
DO 120 J = 1, N
|
||||
AB( J, NMAX + 1 ) = N - J + 1
|
||||
|
@ -258,7 +258,7 @@
|
|||
SAME = LSE( CC, CT, N )
|
||||
IF( .NOT.SAME.OR.ERR.NE.ZERO )THEN
|
||||
WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR
|
||||
STOP
|
||||
CALL ABORT
|
||||
END IF
|
||||
TRANSB = 'T'
|
||||
CALL SMMCH( TRANSA, TRANSB, N, 1, N, ONE, AB, NMAX,
|
||||
|
@ -267,7 +267,7 @@
|
|||
SAME = LSE( CC, CT, N )
|
||||
IF( .NOT.SAME.OR.ERR.NE.ZERO )THEN
|
||||
WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR
|
||||
STOP
|
||||
CALL ABORT
|
||||
END IF
|
||||
*
|
||||
* Test each subroutine in turn.
|
||||
|
@ -378,7 +378,9 @@
|
|||
IF( TRACE )
|
||||
$ CLOSE ( NTRA )
|
||||
CLOSE ( NOUT )
|
||||
STOP
|
||||
IF( FATAL ) THEN
|
||||
CALL ABORT
|
||||
END IF
|
||||
*
|
||||
10002 FORMAT( ' COLUMN-MAJOR AND ROW-MAJOR DATA LAYOUTS ARE TESTED' )
|
||||
10001 FORMAT( ' ROW-MAJOR DATA LAYOUT IS TESTED' )
|
||||
|
|
|
@ -38,9 +38,12 @@
|
|||
CALL CHECK1(SFAC)
|
||||
END IF
|
||||
* -- Print
|
||||
IF (PASS) WRITE (NOUT,99998)
|
||||
IF (PASS) THEN
|
||||
WRITE (NOUT,99998)
|
||||
ELSE
|
||||
CALL ABORT
|
||||
END IF
|
||||
20 CONTINUE
|
||||
STOP
|
||||
*
|
||||
99999 FORMAT (' Complex CBLAS Test Program Results',/1X)
|
||||
99998 FORMAT (' ----- PASS -----')
|
||||
|
@ -228,7 +231,7 @@
|
|||
CALL ITEST1(IZAMAXTEST(N,CX,INCX),ITRUE3(NP1))
|
||||
ELSE
|
||||
WRITE (NOUT,*) ' Shouldn''t be here in CHECK1'
|
||||
STOP
|
||||
CALL ABORT
|
||||
END IF
|
||||
*
|
||||
40 CONTINUE
|
||||
|
@ -512,7 +515,7 @@
|
|||
CALL CTEST(LENY,CY,CT10Y(1,KN,KI),CSIZE3,1.0D0)
|
||||
ELSE
|
||||
WRITE (NOUT,*) ' Shouldn''t be here in CHECK2'
|
||||
STOP
|
||||
CALL ABORT
|
||||
END IF
|
||||
*
|
||||
40 CONTINUE
|
||||
|
|
|
@ -10,7 +10,7 @@
|
|||
* 'CBLAT2.SNAP' NAME OF SNAPSHOT OUTPUT FILE
|
||||
* -1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0)
|
||||
* F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD.
|
||||
* F LOGICAL FLAG, T TO STOP ON FAILURES.
|
||||
* F LOGICAL FLAG, T TO CALL ABORT ON FAILURES.
|
||||
* T LOGICAL FLAG, T TO TEST ERROR EXITS.
|
||||
* 2 0 TO TEST COLUMN-MAJOR, 1 TO TEST ROW-MAJOR, 2 TO TEST BOTH
|
||||
* 16.0 THRESHOLD VALUE OF TEST RATIO
|
||||
|
@ -243,7 +243,7 @@
|
|||
$ GO TO 70
|
||||
60 CONTINUE
|
||||
WRITE( NOUT, FMT = 9986 )SNAMET
|
||||
STOP
|
||||
CALL ABORT
|
||||
70 LTEST( I ) = LTESTT
|
||||
GO TO 50
|
||||
*
|
||||
|
@ -283,7 +283,7 @@
|
|||
SAME = LZE( YY, YT, N )
|
||||
IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN
|
||||
WRITE( NOUT, FMT = 9985 )TRANS, SAME, ERR
|
||||
STOP
|
||||
CALL ABORT
|
||||
END IF
|
||||
TRANS = 'T'
|
||||
CALL ZMVCH( TRANS, N, N, ONE, A, NMAX, X, -1, ZERO, Y, -1, YT, G,
|
||||
|
@ -291,7 +291,7 @@
|
|||
SAME = LZE( YY, YT, N )
|
||||
IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN
|
||||
WRITE( NOUT, FMT = 9985 )TRANS, SAME, ERR
|
||||
STOP
|
||||
CALL ABORT
|
||||
END IF
|
||||
*
|
||||
* Test each subroutine in turn.
|
||||
|
@ -418,7 +418,9 @@
|
|||
IF( TRACE )
|
||||
$ CLOSE ( NTRA )
|
||||
CLOSE ( NOUT )
|
||||
STOP
|
||||
IF( FATAL ) THEN
|
||||
CALL ABORT
|
||||
END IF
|
||||
*
|
||||
10002 FORMAT( ' COLUMN-MAJOR AND ROW-MAJOR DATA LAYOUTS ARE TESTED' )
|
||||
10001 FORMAT( ' ROW-MAJOR DATA LAYOUT IS TESTED' )
|
||||
|
|
|
@ -10,7 +10,7 @@
|
|||
* 'CBLAT3.SNAP' NAME OF SNAPSHOT OUTPUT FILE
|
||||
* -1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0)
|
||||
* F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD.
|
||||
* F LOGICAL FLAG, T TO STOP ON FAILURES.
|
||||
* F LOGICAL FLAG, T TO CALL ABORT ON FAILURES.
|
||||
* T LOGICAL FLAG, T TO TEST ERROR EXITS.
|
||||
* 2 0 TO TEST COLUMN-MAJOR, 1 TO TEST ROW-MAJOR, 2 TO TEST BOTH
|
||||
* 16.0 THRESHOLD VALUE OF TEST RATIO
|
||||
|
@ -195,7 +195,7 @@
|
|||
$ GO TO 50
|
||||
40 CONTINUE
|
||||
WRITE( NOUT, FMT = 9990 )SNAMET
|
||||
STOP
|
||||
CALL ABORT
|
||||
50 LTEST( I ) = LTESTT
|
||||
GO TO 30
|
||||
*
|
||||
|
@ -238,7 +238,7 @@
|
|||
SAME = LZE( CC, CT, N )
|
||||
IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN
|
||||
WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR
|
||||
STOP
|
||||
CALL ABORT
|
||||
END IF
|
||||
TRANSB = 'C'
|
||||
CALL ZMMCH( TRANSA, TRANSB, N, 1, N, ONE, AB, NMAX,
|
||||
|
@ -247,7 +247,7 @@
|
|||
SAME = LZE( CC, CT, N )
|
||||
IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN
|
||||
WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR
|
||||
STOP
|
||||
CALL ABORT
|
||||
END IF
|
||||
DO 120 J = 1, N
|
||||
AB( J, NMAX + 1 ) = N - J + 1
|
||||
|
@ -265,7 +265,7 @@
|
|||
SAME = LZE( CC, CT, N )
|
||||
IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN
|
||||
WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR
|
||||
STOP
|
||||
CALL ABORT
|
||||
END IF
|
||||
TRANSB = 'C'
|
||||
CALL ZMMCH( TRANSA, TRANSB, N, 1, N, ONE, AB, NMAX,
|
||||
|
@ -274,7 +274,7 @@
|
|||
SAME = LZE( CC, CT, N )
|
||||
IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN
|
||||
WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR
|
||||
STOP
|
||||
CALL ABORT
|
||||
END IF
|
||||
*
|
||||
* Test each subroutine in turn.
|
||||
|
@ -386,7 +386,9 @@
|
|||
IF( TRACE )
|
||||
$ CLOSE ( NTRA )
|
||||
CLOSE ( NOUT )
|
||||
STOP
|
||||
IF( FATAL ) THEN
|
||||
CALL ABORT
|
||||
END IF
|
||||
*
|
||||
10002 FORMAT( ' COLUMN-MAJOR AND ROW-MAJOR DATA LAYOUTS ARE TESTED' )
|
||||
10001 FORMAT(' ROW-MAJOR DATA LAYOUT IS TESTED' )
|
||||
|
|
|
@ -10,7 +10,7 @@
|
|||
* 'CBLAT3.SNAP' NAME OF SNAPSHOT OUTPUT FILE
|
||||
* -1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0)
|
||||
* F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD.
|
||||
* F LOGICAL FLAG, T TO STOP ON FAILURES.
|
||||
* F LOGICAL FLAG, T TO CALL ABORT ON FAILURES.
|
||||
* T LOGICAL FLAG, T TO TEST ERROR EXITS.
|
||||
* 2 0 TO TEST COLUMN-MAJOR, 1 TO TEST ROW-MAJOR, 2 TO TEST BOTH
|
||||
* 16.0 THRESHOLD VALUE OF TEST RATIO
|
||||
|
@ -195,7 +195,7 @@
|
|||
$ GO TO 50
|
||||
40 CONTINUE
|
||||
WRITE( NOUT, FMT = 9990 )SNAMET
|
||||
STOP
|
||||
CALL ABORT
|
||||
50 LTEST( I ) = LTESTT
|
||||
GO TO 30
|
||||
*
|
||||
|
@ -238,7 +238,7 @@
|
|||
SAME = LZE( CC, CT, N )
|
||||
IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN
|
||||
WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR
|
||||
STOP
|
||||
CALL ABORT
|
||||
END IF
|
||||
TRANSB = 'C'
|
||||
CALL ZMMCH( TRANSA, TRANSB, N, 1, N, ONE, AB, NMAX,
|
||||
|
@ -247,7 +247,7 @@
|
|||
SAME = LZE( CC, CT, N )
|
||||
IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN
|
||||
WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR
|
||||
STOP
|
||||
CALL ABORT
|
||||
END IF
|
||||
DO 120 J = 1, N
|
||||
AB( J, NMAX + 1 ) = N - J + 1
|
||||
|
@ -265,7 +265,7 @@
|
|||
SAME = LZE( CC, CT, N )
|
||||
IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN
|
||||
WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR
|
||||
STOP
|
||||
CALL ABORT
|
||||
END IF
|
||||
TRANSB = 'C'
|
||||
CALL ZMMCH( TRANSA, TRANSB, N, 1, N, ONE, AB, NMAX,
|
||||
|
@ -274,7 +274,7 @@
|
|||
SAME = LZE( CC, CT, N )
|
||||
IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN
|
||||
WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR
|
||||
STOP
|
||||
CALL ABORT
|
||||
END IF
|
||||
*
|
||||
* Test each subroutine in turn.
|
||||
|
@ -386,7 +386,9 @@
|
|||
IF( TRACE )
|
||||
$ CLOSE ( NTRA )
|
||||
CLOSE ( NOUT )
|
||||
STOP
|
||||
IF( FATAL ) THEN
|
||||
CALL ABORT
|
||||
END IF
|
||||
*
|
||||
10002 FORMAT( ' COLUMN-MAJOR AND ROW-MAJOR DATA LAYOUTS ARE TESTED' )
|
||||
10001 FORMAT(' ROW-MAJOR DATA LAYOUT IS TESTED' )
|
||||
|
|
|
@ -742,7 +742,7 @@ static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG
|
|||
num_parts = 0;
|
||||
while (n > 0){
|
||||
width = blas_quickdivide(n + nthreads - num_parts - 1, nthreads - num_parts);
|
||||
if (width < switch_ratio) {
|
||||
if (width < switch_ratio && width > 1) {
|
||||
width = switch_ratio;
|
||||
}
|
||||
width = round_up(n, width, GEMM_PREFERED_SIZE);
|
||||
|
|
|
@ -319,8 +319,8 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,
|
|||
lda = LDB;
|
||||
ldb = LDA;
|
||||
|
||||
if (Uplo == CblasUpper) uplo = 0;
|
||||
if (Uplo == CblasLower) uplo = 1;
|
||||
if (Uplo == CblasUpper) uplo = 1;
|
||||
if (Uplo == CblasLower) uplo = 0;
|
||||
|
||||
if (TransB == CblasNoTrans)
|
||||
transa = 0;
|
||||
|
|
|
@ -17,11 +17,15 @@ SBGEMMOTCOPYOBJ = sbgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
|||
STRMMKERNEL = sgemm_kernel_power10.c
|
||||
DTRMMKERNEL = dgemm_kernel_power10.c
|
||||
ifeq ($(OSNAME), AIX)
|
||||
CTRMMKERNEL = ctrmm_kernel_8x4_power8.S
|
||||
ZTRMMKERNEL = ztrmm_kernel_8x2_power8.S
|
||||
#CTRMMKERNEL = ctrmm_kernel_8x4_power8.S
|
||||
#ZTRMMKERNEL = ztrmm_kernel_8x2_power8.S
|
||||
CTRMMKERNEL = cgemm_kernel_power10.c
|
||||
ZTRMMKERNEL = zgemm_kernel_power10.c
|
||||
else
|
||||
CTRMMKERNEL = cgemm_kernel_power10.S
|
||||
ZTRMMKERNEL = zgemm_kernel_power10.S
|
||||
#CTRMMKERNEL = cgemm_kernel_power10.S
|
||||
#ZTRMMKERNEL = zgemm_kernel_power10.S
|
||||
CTRMMKERNEL = cgemm_kernel_power10.c
|
||||
ZTRMMKERNEL = zgemm_kernel_power10.c
|
||||
endif
|
||||
|
||||
SGEMMKERNEL = sgemm_kernel_power10.c
|
||||
|
@ -65,9 +69,11 @@ DGEMM_SMALL_K_TN = dgemm_small_kernel_tn_power10.c
|
|||
DGEMM_SMALL_K_B0_TN = dgemm_small_kernel_tn_power10.c
|
||||
|
||||
ifeq ($(OSNAME), AIX)
|
||||
CGEMMKERNEL = cgemm_kernel_8x4_power8.S
|
||||
#CGEMMKERNEL = cgemm_kernel_8x4_power8.S
|
||||
CGEMMKERNEL = cgemm_kernel_power10.c
|
||||
else
|
||||
CGEMMKERNEL = cgemm_kernel_power10.S
|
||||
#CGEMMKERNEL = cgemm_kernel_power10.S
|
||||
CGEMMKERNEL = cgemm_kernel_power10.c
|
||||
endif
|
||||
#CGEMMKERNEL = cgemm_kernel_8x4_power8.S
|
||||
CGEMMINCOPY = ../generic/zgemm_ncopy_8.c
|
||||
|
@ -84,9 +90,11 @@ CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX)
|
|||
CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX)
|
||||
|
||||
ifeq ($(OSNAME), AIX)
|
||||
ZGEMMKERNEL = zgemm_kernel_8x2_power8.S
|
||||
#ZGEMMKERNEL = zgemm_kernel_8x2_power8.S
|
||||
ZGEMMKERNEL = zgemm_kernel_power10.c
|
||||
else
|
||||
ZGEMMKERNEL = zgemm_kernel_power10.S
|
||||
#ZGEMMKERNEL = zgemm_kernel_power10.S
|
||||
ZGEMMKERNEL = zgemm_kernel_power10.c
|
||||
endif
|
||||
ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c
|
||||
ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c
|
||||
|
|
File diff suppressed because it is too large
Load Diff
|
@ -63,6 +63,8 @@
|
|||
#endif
|
||||
#endif
|
||||
|
||||
#define FLAG r11
|
||||
|
||||
#define FZERO f0
|
||||
#define ALPHA f1
|
||||
|
||||
|
@ -88,6 +90,10 @@
|
|||
fcmpu cr0, FZERO, ALPHA
|
||||
bne- cr0, LL(A1I1)
|
||||
|
||||
lwz FLAG, FRAMESLOT(0)(SP)
|
||||
cmpwi cr0, FLAG, 1
|
||||
beq- cr0, LL(A1I1)
|
||||
|
||||
srawi. r0, N, 4
|
||||
mtspr CTR, r0
|
||||
beq- cr0, LL(A0I1_Remain)
|
||||
|
|
|
@ -0,0 +1,761 @@
|
|||
/*********************************************************************************
|
||||
Copyright (c) 2020, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
**********************************************************************************/
|
||||
#include "common.h"
|
||||
#include <altivec.h>
|
||||
|
||||
typedef __vector unsigned char vec_t;
|
||||
typedef FLOAT v4sf_t __attribute__ ((vector_size (16)));
|
||||
|
||||
#define SET_ACC_ZERO() \
|
||||
__builtin_mma_xxsetaccz (&acc0); \
|
||||
__builtin_mma_xxsetaccz (&acc1); \
|
||||
__builtin_mma_xxsetaccz (&acc2); \
|
||||
__builtin_mma_xxsetaccz (&acc3); \
|
||||
__builtin_mma_xxsetaccz (&acc4); \
|
||||
__builtin_mma_xxsetaccz (&acc5); \
|
||||
__builtin_mma_xxsetaccz (&acc6); \
|
||||
__builtin_mma_xxsetaccz (&acc7);
|
||||
|
||||
/*
 * Complex combine helpers.
 *
 * The arguments are the four real partial products of one complex multiply
 * (parameters _arbr, _aibi, _arbi, _aibr) and the two destinations
 * (_real, _imag).  COMP_MUL overwrites the destinations, COMP_MAC adds to
 * them.  Which sign pattern is used depends on the build-variant macro
 * (NN, NR, RN, RR, ...) that is defined.
 *
 * Each expansion is a braced block, so call sites use no trailing semicolon.
 * Every parameter is parenthesised so that an expression argument keeps its
 * meaning inside the expansion, in particular under the unary minus.
 */
#if (defined(NN) || defined(NT) || defined(TN) || defined(TT))
/* real = arbr - aibi, imag = arbi + aibr */
#define COMP_MUL(_real, _arbr, _aibi, _imag, _arbi, _aibr) { (_real) = (_arbr) - (_aibi); (_imag) = (_arbi) + (_aibr); }
#define COMP_MAC(_real, _arbr, _aibi, _imag, _arbi, _aibr) { (_real) += (_arbr) - (_aibi); (_imag) += (_arbi) + (_aibr); }
#endif

#if (defined(NR) || defined(NC) || defined(TR) || defined(TC))
/* real = arbr + aibi, imag = -arbi + aibr */
#define COMP_MUL(_real, _arbr, _aibi, _imag, _arbi, _aibr) { (_real) = (_arbr) + (_aibi); (_imag) = -(_arbi) + (_aibr); }
#define COMP_MAC(_real, _arbr, _aibi, _imag, _arbi, _aibr) { (_real) += (_arbr) + (_aibi); (_imag) += -(_arbi) + (_aibr); }
#endif

#if (defined(RN) || defined(RT) || defined(CN) || defined(CT))
/* real = arbr + aibi, imag = arbi - aibr */
#define COMP_MUL(_real, _arbr, _aibi, _imag, _arbi, _aibr) { (_real) = (_arbr) + (_aibi); (_imag) = (_arbi) - (_aibr); }
#define COMP_MAC(_real, _arbr, _aibi, _imag, _arbi, _aibr) { (_real) += (_arbr) + (_aibi); (_imag) += (_arbi) - (_aibr); }
#endif

#if (defined(RR) || defined(RC) || defined(CR) || defined(CC))
/* real = arbr - aibi, imag = -arbi - aibr */
#define COMP_MUL(_real, _arbr, _aibi, _imag, _arbi, _aibr) { (_real) = (_arbr) - (_aibi); (_imag) = -(_arbi) - (_aibr); }
#define COMP_MAC(_real, _arbr, _aibi, _imag, _arbi, _aibr) { (_real) += (_arbr) - (_aibi); (_imag) += -(_arbi) - (_aibr); }
#endif
|
||||
|
||||
/* Operator used when storing a computed value into C: plain GEMM builds
   accumulate into C (+=), TRMM builds overwrite it (=). */
#ifndef TRMMKERNEL
#define A_OP +=
#else
#define A_OP =
#endif
|
||||
|
||||
/* Unpack acc0..acc7 into the `result` scratch array, four entries per
   accumulator (acc0 -> result[0..3], acc1 -> result[4..7], ...).  Both
   `result` and the accumulators must be in scope at the expansion site. */
#define BUILTIN_MMA_DISASSEMBLE_ACC_8                               \
  __builtin_mma_disassemble_acc ((void *)&result[ 0], &acc0);       \
  __builtin_mma_disassemble_acc ((void *)&result[ 4], &acc1);       \
  __builtin_mma_disassemble_acc ((void *)&result[ 8], &acc2);       \
  __builtin_mma_disassemble_acc ((void *)&result[12], &acc3);       \
  __builtin_mma_disassemble_acc ((void *)&result[16], &acc4);       \
  __builtin_mma_disassemble_acc ((void *)&result[20], &acc5);       \
  __builtin_mma_disassemble_acc ((void *)&result[24], &acc6);       \
  __builtin_mma_disassemble_acc ((void *)&result[28], &acc7);
|
||||
|
||||
/* Store one complex element.  All eight accumulators are unpacked; the first
   four FLOATs of each (res[8*a .. 8*a+3]) are combined, in accumulator order,
   into tr[0]/ti[0], which is then scaled by alpha and written to CO[0..1]. */
#define SAVE_ACC_COMPLEX_11                                         \
  BUILTIN_MMA_DISASSEMBLE_ACC_8                                     \
  COMP_MUL(tr[0], res[ 0], res[ 3], ti[0], res[ 1], res[ 2])        \
  COMP_MAC(tr[0], res[ 8], res[11], ti[0], res[ 9], res[10])        \
  COMP_MAC(tr[0], res[16], res[19], ti[0], res[17], res[18])        \
  COMP_MAC(tr[0], res[24], res[27], ti[0], res[25], res[26])        \
  COMP_MAC(tr[0], res[32], res[35], ti[0], res[33], res[34])        \
  COMP_MAC(tr[0], res[40], res[43], ti[0], res[41], res[42])        \
  COMP_MAC(tr[0], res[48], res[51], ti[0], res[49], res[50])        \
  COMP_MAC(tr[0], res[56], res[59], ti[0], res[57], res[58])        \
  CO[0] A_OP tr[0] * alpha_r - ti[0] * alpha_i;                     \
  CO[1] A_OP ti[0] * alpha_r + tr[0] * alpha_i;
|
||||
|
||||
/* Store one complex element in each of two columns.  Even accumulators
   (acc0, acc2, acc4, acc6) feed tr[0]/ti[0]; odd ones feed tr[1]/ti[1].
   The two chains are independent, so they are written out one after the
   other; within each chain the accumulation order is unchanged.  The second
   column lives 2*ldc FLOATs after the first. */
#define SAVE_ACC_COMPLEX_12                                         \
  BUILTIN_MMA_DISASSEMBLE_ACC_8                                     \
  /* column 0 */                                                    \
  COMP_MUL(tr[0], res[ 0], res[ 3], ti[0], res[ 1], res[ 2])        \
  COMP_MAC(tr[0], res[16], res[19], ti[0], res[17], res[18])        \
  COMP_MAC(tr[0], res[32], res[35], ti[0], res[33], res[34])        \
  COMP_MAC(tr[0], res[48], res[51], ti[0], res[49], res[50])        \
  /* column 1 */                                                    \
  COMP_MUL(tr[1], res[ 8], res[11], ti[1], res[ 9], res[10])        \
  COMP_MAC(tr[1], res[24], res[27], ti[1], res[25], res[26])        \
  COMP_MAC(tr[1], res[40], res[43], ti[1], res[41], res[42])        \
  COMP_MAC(tr[1], res[56], res[59], ti[1], res[57], res[58])        \
  CO[0] A_OP tr[0] * alpha_r - ti[0] * alpha_i;                     \
  CO[1] A_OP ti[0] * alpha_r + tr[0] * alpha_i;                     \
  CO[2*ldc+0] A_OP tr[1] * alpha_r - ti[1] * alpha_i;               \
  CO[2*ldc+1] A_OP ti[1] * alpha_r + tr[1] * alpha_i;
|
||||
|
||||
/* Store two complex elements of a single column.  Every accumulator
   contributes its first four FLOATs to tr[0]/ti[0] and its last four to
   tr[1]/ti[1]; the eight accumulators are summed in order acc0..acc7.  The
   two chains are independent and are listed one after the other. */
#define SAVE_ACC_COMPLEX_21_1                                       \
  BUILTIN_MMA_DISASSEMBLE_ACC_8                                     \
  /* element 0 */                                                   \
  COMP_MUL(tr[0], res[ 0], res[ 3], ti[0], res[ 1], res[ 2])        \
  COMP_MAC(tr[0], res[ 8], res[11], ti[0], res[ 9], res[10])        \
  COMP_MAC(tr[0], res[16], res[19], ti[0], res[17], res[18])        \
  COMP_MAC(tr[0], res[24], res[27], ti[0], res[25], res[26])        \
  COMP_MAC(tr[0], res[32], res[35], ti[0], res[33], res[34])        \
  COMP_MAC(tr[0], res[40], res[43], ti[0], res[41], res[42])        \
  COMP_MAC(tr[0], res[48], res[51], ti[0], res[49], res[50])        \
  COMP_MAC(tr[0], res[56], res[59], ti[0], res[57], res[58])        \
  /* element 1 */                                                   \
  COMP_MUL(tr[1], res[ 4], res[ 7], ti[1], res[ 5], res[ 6])        \
  COMP_MAC(tr[1], res[12], res[15], ti[1], res[13], res[14])        \
  COMP_MAC(tr[1], res[20], res[23], ti[1], res[21], res[22])        \
  COMP_MAC(tr[1], res[28], res[31], ti[1], res[29], res[30])        \
  COMP_MAC(tr[1], res[36], res[39], ti[1], res[37], res[38])        \
  COMP_MAC(tr[1], res[44], res[47], ti[1], res[45], res[46])        \
  COMP_MAC(tr[1], res[52], res[55], ti[1], res[53], res[54])        \
  COMP_MAC(tr[1], res[60], res[63], ti[1], res[61], res[62])        \
  CO[0] A_OP tr[0] * alpha_r - ti[0] * alpha_i;                     \
  CO[1] A_OP ti[0] * alpha_r + tr[0] * alpha_i;                     \
  CO[2] A_OP tr[1] * alpha_r - ti[1] * alpha_i;                     \
  CO[3] A_OP ti[1] * alpha_r + tr[1] * alpha_i;
|
||||
|
||||
/* Store four complex elements of a single column.  Accumulator pairs
   (acc0,acc1), (acc2,acc3), (acc4,acc5), (acc6,acc7) each hold a partial sum
   for the same four elements; they are added up in that order.  The four
   per-element chains are independent and are listed one after the other. */
#define SAVE_ACC_COMPLEX_21_2                                       \
  BUILTIN_MMA_DISASSEMBLE_ACC_8                                     \
  /* element 0 */                                                   \
  COMP_MUL(tr[0], res[ 0], res[ 3], ti[0], res[ 1], res[ 2])        \
  COMP_MAC(tr[0], res[16], res[19], ti[0], res[17], res[18])        \
  COMP_MAC(tr[0], res[32], res[35], ti[0], res[33], res[34])        \
  COMP_MAC(tr[0], res[48], res[51], ti[0], res[49], res[50])        \
  /* element 1 */                                                   \
  COMP_MUL(tr[1], res[ 4], res[ 7], ti[1], res[ 5], res[ 6])        \
  COMP_MAC(tr[1], res[20], res[23], ti[1], res[21], res[22])        \
  COMP_MAC(tr[1], res[36], res[39], ti[1], res[37], res[38])        \
  COMP_MAC(tr[1], res[52], res[55], ti[1], res[53], res[54])        \
  /* element 2 */                                                   \
  COMP_MUL(tr[2], res[ 8], res[11], ti[2], res[ 9], res[10])        \
  COMP_MAC(tr[2], res[24], res[27], ti[2], res[25], res[26])        \
  COMP_MAC(tr[2], res[40], res[43], ti[2], res[41], res[42])        \
  COMP_MAC(tr[2], res[56], res[59], ti[2], res[57], res[58])        \
  /* element 3 */                                                   \
  COMP_MUL(tr[3], res[12], res[15], ti[3], res[13], res[14])        \
  COMP_MAC(tr[3], res[28], res[31], ti[3], res[29], res[30])        \
  COMP_MAC(tr[3], res[44], res[47], ti[3], res[45], res[46])        \
  COMP_MAC(tr[3], res[60], res[63], ti[3], res[61], res[62])        \
  CO[0] A_OP tr[0] * alpha_r - ti[0] * alpha_i;                     \
  CO[1] A_OP ti[0] * alpha_r + tr[0] * alpha_i;                     \
  CO[2] A_OP tr[1] * alpha_r - ti[1] * alpha_i;                     \
  CO[3] A_OP ti[1] * alpha_r + tr[1] * alpha_i;                     \
  CO[4] A_OP tr[2] * alpha_r - ti[2] * alpha_i;                     \
  CO[5] A_OP ti[2] * alpha_r + tr[2] * alpha_i;                     \
  CO[6] A_OP tr[3] * alpha_r - ti[3] * alpha_i;                     \
  CO[7] A_OP ti[3] * alpha_r + tr[3] * alpha_i;
|
||||
|
||||
/* Store eight complex elements of a single column.  acc0..acc3 supply the
   initial value of tr[0..7]/ti[0..7] and acc4..acc7 are then added on top.
   Each element's MUL/MAC pair is independent of the others and is listed
   together. */
#define SAVE_ACC_COMPLEX_21_4                                       \
  BUILTIN_MMA_DISASSEMBLE_ACC_8                                     \
  COMP_MUL(tr[0], res[ 0], res[ 3], ti[0], res[ 1], res[ 2])        \
  COMP_MAC(tr[0], res[32], res[35], ti[0], res[33], res[34])        \
  COMP_MUL(tr[1], res[ 4], res[ 7], ti[1], res[ 5], res[ 6])        \
  COMP_MAC(tr[1], res[36], res[39], ti[1], res[37], res[38])        \
  COMP_MUL(tr[2], res[ 8], res[11], ti[2], res[ 9], res[10])        \
  COMP_MAC(tr[2], res[40], res[43], ti[2], res[41], res[42])        \
  COMP_MUL(tr[3], res[12], res[15], ti[3], res[13], res[14])        \
  COMP_MAC(tr[3], res[44], res[47], ti[3], res[45], res[46])        \
  COMP_MUL(tr[4], res[16], res[19], ti[4], res[17], res[18])        \
  COMP_MAC(tr[4], res[48], res[51], ti[4], res[49], res[50])        \
  COMP_MUL(tr[5], res[20], res[23], ti[5], res[21], res[22])        \
  COMP_MAC(tr[5], res[52], res[55], ti[5], res[53], res[54])        \
  COMP_MUL(tr[6], res[24], res[27], ti[6], res[25], res[26])        \
  COMP_MAC(tr[6], res[56], res[59], ti[6], res[57], res[58])        \
  COMP_MUL(tr[7], res[28], res[31], ti[7], res[29], res[30])        \
  COMP_MAC(tr[7], res[60], res[63], ti[7], res[61], res[62])        \
  CO[ 0] A_OP tr[0] * alpha_r - ti[0] * alpha_i;                    \
  CO[ 1] A_OP ti[0] * alpha_r + tr[0] * alpha_i;                    \
  CO[ 2] A_OP tr[1] * alpha_r - ti[1] * alpha_i;                    \
  CO[ 3] A_OP ti[1] * alpha_r + tr[1] * alpha_i;                    \
  CO[ 4] A_OP tr[2] * alpha_r - ti[2] * alpha_i;                    \
  CO[ 5] A_OP ti[2] * alpha_r + tr[2] * alpha_i;                    \
  CO[ 6] A_OP tr[3] * alpha_r - ti[3] * alpha_i;                    \
  CO[ 7] A_OP ti[3] * alpha_r + tr[3] * alpha_i;                    \
  CO[ 8] A_OP tr[4] * alpha_r - ti[4] * alpha_i;                    \
  CO[ 9] A_OP ti[4] * alpha_r + tr[4] * alpha_i;                    \
  CO[10] A_OP tr[5] * alpha_r - ti[5] * alpha_i;                    \
  CO[11] A_OP ti[5] * alpha_r + tr[5] * alpha_i;                    \
  CO[12] A_OP tr[6] * alpha_r - ti[6] * alpha_i;                    \
  CO[13] A_OP ti[6] * alpha_r + tr[6] * alpha_i;                    \
  CO[14] A_OP tr[7] * alpha_r - ti[7] * alpha_i;                    \
  CO[15] A_OP ti[7] * alpha_r + tr[7] * alpha_i;
|
||||
|
||||
/* Store a 2x2 block of complex elements held in acc0 (first column) and acc1
   (second column, 2*ldc FLOATs further on).  This is the general
   SAVE_ACC_COMPLEX_22_2 form with those accumulators and a column offset of
   zero; the expansion is the same sequence of statements. */
#define SAVE_ACC_COMPLEX_22_1 \
  SAVE_ACC_COMPLEX_22_2(&acc0, &acc1, 0)
|
||||
|
||||
/* Store a 2x2 block of complex elements.
     ACC1 - pointer to the accumulator for the first column
     ACC2 - pointer to the accumulator for the second column
     CI   - FLOAT offset added to every CO index
   ACC1 is unpacked into result[0..3] and ACC2 into result[4..7]; each of the
   four elements is then combined and written out in turn.  The second column
   is addressed 2*ldc FLOATs after the first. */
#define SAVE_ACC_COMPLEX_22_2(ACC1, ACC2, CI)                       \
  __builtin_mma_disassemble_acc ((void *)result, ACC1);             \
  __builtin_mma_disassemble_acc ((void *)(&result[4]), ACC2);       \
  /* first column, element 0 */                                     \
  COMP_MUL(tr[0], res[0], res[3], ti[0], res[1], res[2])            \
  CO[CI+0] A_OP tr[0] * alpha_r - ti[0] * alpha_i;                  \
  CO[CI+1] A_OP ti[0] * alpha_r + tr[0] * alpha_i;                  \
  /* first column, element 1 */                                     \
  COMP_MUL(tr[1], res[4], res[7], ti[1], res[5], res[6])            \
  CO[CI+2] A_OP tr[1] * alpha_r - ti[1] * alpha_i;                  \
  CO[CI+3] A_OP ti[1] * alpha_r + tr[1] * alpha_i;                  \
  /* second column, element 0 */                                    \
  COMP_MUL(tr[2], res[8], res[11], ti[2], res[9], res[10])          \
  CO[2*ldc+CI+0] A_OP tr[2] * alpha_r - ti[2] * alpha_i;            \
  CO[2*ldc+CI+1] A_OP ti[2] * alpha_r + tr[2] * alpha_i;            \
  /* second column, element 1 */                                    \
  COMP_MUL(tr[3], res[12], res[15], ti[3], res[13], res[14])        \
  CO[2*ldc+CI+2] A_OP tr[3] * alpha_r - ti[3] * alpha_i;            \
  CO[2*ldc+CI+3] A_OP ti[3] * alpha_r + tr[3] * alpha_i;
|
||||
|
||||
/* Issue a "dcbt" instruction on the address formed from x and y.  The
   expansion already ends in ';'. */
#define PREFETCH1(x, y)               \
  asm volatile ("dcbt %0, %1"         \
                :                     \
                : "r" (x), "b" (y)    \
                : "memory");
|
||||
|
||||
/* Set `temp`, the number of accumulation steps for the current TRMM tile.
   x and y are the tile's row and column counts.
     exactly one of LEFT / TRANSA defined : temp = k - off
     both defined (LEFT branch)           : temp = off + x
     neither defined                      : temp = off + y  */
#if defined(LEFT) != defined(TRANSA)
#define REFRESH_TEMP_BK(x, y) temp = k - off;
#elif defined(LEFT)
#define REFRESH_TEMP_BK(x, y) temp = off + x;
#else
#define REFRESH_TEMP_BK(x, y) temp = off + y;
#endif
|
||||
/* Position AO/BO at the start of the current TRMM tile and refresh `temp`.
   When LEFT and TRANSA are both defined or both undefined, BO restarts at B
   and AO is left alone; otherwise both pointers skip the first `off` steps
   (2*x FLOATs of A and 2*y FLOATs of B per step). */
#if defined(LEFT) != defined(TRANSA)
#define REFRESH_POINTERS(x, y) \
  AO += off * (2*x);           \
  BO = B + off * (2*y);        \
  REFRESH_TEMP_BK(x, y)
#else
#define REFRESH_POINTERS(x, y) \
  BO = B;                      \
  REFRESH_TEMP_BK(x, y)
#endif
|
||||
|
||||
/* Advance `off` by x after a tile has been stored; expands to nothing unless
   LEFT is defined. */
#ifndef LEFT
#define REFRESH_OFF(x)
#else
#define REFRESH_OFF(x) off += x;
#endif
|
||||
|
||||
/* Reduce `temp` by the tile's row count x when LEFT is defined, otherwise by
   its column count y. */
#ifndef LEFT
#define UPDATE_TEMP(x, y) temp -= y;
#else
#define UPDATE_TEMP(x, y) temp -= x;
#endif
|
||||
|
||||
/* After a TRMM tile has been stored, step AO and BO over the part of the k
   range the tile did not consume.  This is needed only when LEFT and TRANSA
   are both defined or both undefined; in the other configurations the macro
   expands to nothing. */
#if defined(LEFT) != defined(TRANSA)
#define REFRESH_TMP_AFTER_SAVE(x, y)
#else
#define REFRESH_TMP_AFTER_SAVE(x, y) \
  temp = k - off;                    \
  UPDATE_TEMP(x, y)                  \
  AO += temp * (2*x);                \
  BO += temp * (2*y);
#endif
|
||||
|
||||
/* TRMM bookkeeping run after each tile is stored: first fix up AO/BO/temp,
   then advance `off`. */
#define REFRESH_AFTER_SAVE(x,y)  \
  REFRESH_TMP_AFTER_SAVE(x, y)   \
  REFRESH_OFF(x)
|
||||
/*************************************************************************************
* GEMM Kernel
*
* Complex GEMM micro-kernel (TRMM when TRMMKERNEL is defined) built on the
* __builtin_mma_* accumulator builtins.
*
* For every tile of C the kernel zeroes the accumulators, accumulates outer
* products of A and B over `temp` steps with __builtin_mma_xvf64gerpp, combines
* the partial products into real/imaginary parts with COMP_MUL / COMP_MAC,
* scales them by (alpha_r + i*alpha_i) and writes them to C with A_OP.
*
* Columns are handled two at a time (n >> 1) followed by one leftover column
* (n & 1); inside each, rows are handled in tiles of 8 (m >> 3), then 4, 2, 1.
*
* m, n        row / column counts driving the tile loops
* k           number of accumulation steps (non-TRMM builds use temp = k)
* alpha_r/_i  real and imaginary part of the scaling factor
* A, B        input panels, read sequentially through AO / BO
*             (NOTE(review): presumably pre-packed by the caller - confirm)
* C, ldc      output and its leading dimension; one column spans 2*ldc FLOATs
*             (presumably ldc counts complex elements - confirm)
* offset      TRMM only: seed for `off` (negated when LEFT is not defined)
*
* Always returns 0.
*************************************************************************************/
|
||||
int
|
||||
CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r, FLOAT alpha_i, FLOAT * A, FLOAT * B,
|
||||
FLOAT * C, BLASLONG ldc
|
||||
#ifdef TRMMKERNEL
|
||||
, BLASLONG offset
|
||||
#endif
|
||||
)
|
||||
{
|
||||
BLASLONG i1, i, l, temp;
|
||||
FLOAT *AO, *BO, *CO;
|
||||
#if defined(TRMMKERNEL)
|
||||
BLASLONG off;
|
||||
#endif
|
||||
#if defined(TRMMKERNEL) && !defined(LEFT)
|
||||
off = -offset;
|
||||
#endif
|
||||
__vector_quad acc0, acc1, acc2, acc3, acc4, acc5, acc6, acc7;
|
||||
|
||||
/* Scratch for the disassembled accumulators (four entries per accumulator);
   `res` views the same storage as individual FLOATs.  tr/ti collect the
   combined real and imaginary parts before scaling by alpha. */
v4sf_t result[32];
|
||||
FLOAT *res, tr[16], ti[16];
|
||||
res = (FLOAT *) result;
|
||||
|
||||
/* ---- pairs of columns ---- */
for (i1 = 0; i1 < (n >> 1); i1++)
|
||||
{
|
||||
#if defined(TRMMKERNEL) && defined(LEFT)
|
||||
off = offset;
|
||||
#endif
|
||||
AO = A;
|
||||
CO = C;
|
||||
/* Two columns of 2*ldc FLOATs each. */
C += ldc<<2;
|
||||
/* 8-row tiles: every step l reads 16 FLOATs of A (four vector pairs) and
   4 FLOATs of B (two vectors).  acc0..acc3 take the first B vector,
   acc4..acc7 the second. */
for (i = 0; i < (m >> 3); i++)
|
||||
{
|
||||
#if defined(TRMMKERNEL)
|
||||
REFRESH_POINTERS (8, 2)
|
||||
#else
|
||||
BO = B;
|
||||
temp = k;
|
||||
#endif
|
||||
SET_ACC_ZERO()
|
||||
for (l = 0; l < temp; ++l)
|
||||
{
|
||||
__vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<4]));
|
||||
__vector_pair rowA2 = *((__vector_pair *)((void *)&AO[(l<<4)+4]));
|
||||
__vector_pair rowA3 = *((__vector_pair *)((void *)&AO[(l<<4)+8]));
|
||||
__vector_pair rowA4 = *((__vector_pair *)((void *)&AO[(l<<4)+12]));
|
||||
vec_t rowB1 = *(vec_t *) & BO[l<<2];
|
||||
vec_t rowB2 = *(vec_t *) & BO[(l<<2)+2];
|
||||
__builtin_mma_xvf64gerpp(&acc0, rowA1, rowB1);
|
||||
__builtin_mma_xvf64gerpp(&acc1, rowA2, rowB1);
|
||||
__builtin_mma_xvf64gerpp(&acc2, rowA3, rowB1);
|
||||
__builtin_mma_xvf64gerpp(&acc3, rowA4, rowB1);
|
||||
__builtin_mma_xvf64gerpp(&acc4, rowA1, rowB2);
|
||||
__builtin_mma_xvf64gerpp(&acc5, rowA2, rowB2);
|
||||
__builtin_mma_xvf64gerpp(&acc6, rowA3, rowB2);
|
||||
__builtin_mma_xvf64gerpp(&acc7, rowA4, rowB2);
|
||||
}
|
||||
/* Unpack all eight accumulators into result[] (read back through res). */
__builtin_mma_disassemble_acc ((void *)result, &acc0);
|
||||
__builtin_mma_disassemble_acc ((void *)(&result[ 4]), &acc1);
|
||||
__builtin_mma_disassemble_acc ((void *)(&result[ 8]), &acc2);
|
||||
__builtin_mma_disassemble_acc ((void *)(&result[12]), &acc3);
|
||||
__builtin_mma_disassemble_acc ((void *)(&result[16]), &acc4);
|
||||
__builtin_mma_disassemble_acc ((void *)(&result[20]), &acc5);
|
||||
__builtin_mma_disassemble_acc ((void *)(&result[24]), &acc6);
|
||||
__builtin_mma_disassemble_acc ((void *)(&result[28]), &acc7);
|
||||
/* tr/ti[0..7] come from acc0..acc3 and are stored at CO[0..15];
   tr/ti[8..15] come from acc4..acc7 and are stored 2*ldc FLOATs further on. */
COMP_MUL(tr[ 0], res[ 0], res[ 3], ti[ 0], res[ 1], res[ 2])
|
||||
COMP_MUL(tr[ 1], res[ 4], res[ 7], ti[ 1], res[ 5], res[ 6])
|
||||
COMP_MUL(tr[ 2], res[ 8], res[11], ti[ 2], res[ 9], res[10])
|
||||
COMP_MUL(tr[ 3], res[12], res[15], ti[ 3], res[13], res[14])
|
||||
COMP_MUL(tr[ 4], res[16], res[19], ti[ 4], res[17], res[18])
|
||||
COMP_MUL(tr[ 5], res[20], res[23], ti[ 5], res[21], res[22])
|
||||
COMP_MUL(tr[ 6], res[24], res[27], ti[ 6], res[25], res[26])
|
||||
COMP_MUL(tr[ 7], res[28], res[31], ti[ 7], res[29], res[30])
|
||||
COMP_MUL(tr[ 8], res[32], res[35], ti[ 8], res[33], res[34])
|
||||
COMP_MUL(tr[ 9], res[36], res[39], ti[ 9], res[37], res[38])
|
||||
COMP_MUL(tr[10], res[40], res[43], ti[10], res[41], res[42])
|
||||
COMP_MUL(tr[11], res[44], res[47], ti[11], res[45], res[46])
|
||||
COMP_MUL(tr[12], res[48], res[51], ti[12], res[49], res[50])
|
||||
COMP_MUL(tr[13], res[52], res[55], ti[13], res[53], res[54])
|
||||
COMP_MUL(tr[14], res[56], res[59], ti[14], res[57], res[58])
|
||||
COMP_MUL(tr[15], res[60], res[63], ti[15], res[61], res[62])
|
||||
CO[ 0] A_OP tr[0] * alpha_r - ti[0] * alpha_i;
|
||||
CO[ 1] A_OP ti[0] * alpha_r + tr[0] * alpha_i;
|
||||
CO[ 2] A_OP tr[1] * alpha_r - ti[1] * alpha_i;
|
||||
CO[ 3] A_OP ti[1] * alpha_r + tr[1] * alpha_i;
|
||||
CO[ 4] A_OP tr[2] * alpha_r - ti[2] * alpha_i;
|
||||
CO[ 5] A_OP ti[2] * alpha_r + tr[2] * alpha_i;
|
||||
CO[ 6] A_OP tr[3] * alpha_r - ti[3] * alpha_i;
|
||||
CO[ 7] A_OP ti[3] * alpha_r + tr[3] * alpha_i;
|
||||
CO[ 8] A_OP tr[4] * alpha_r - ti[4] * alpha_i;
|
||||
CO[ 9] A_OP ti[4] * alpha_r + tr[4] * alpha_i;
|
||||
CO[10] A_OP tr[5] * alpha_r - ti[5] * alpha_i;
|
||||
CO[11] A_OP ti[5] * alpha_r + tr[5] * alpha_i;
|
||||
CO[12] A_OP tr[6] * alpha_r - ti[6] * alpha_i;
|
||||
CO[13] A_OP ti[6] * alpha_r + tr[6] * alpha_i;
|
||||
CO[14] A_OP tr[7] * alpha_r - ti[7] * alpha_i;
|
||||
CO[15] A_OP ti[7] * alpha_r + tr[7] * alpha_i;
|
||||
CO[2*ldc+ 0] A_OP tr[ 8] * alpha_r - ti[ 8] * alpha_i;
|
||||
CO[2*ldc+ 1] A_OP ti[ 8] * alpha_r + tr[ 8] * alpha_i;
|
||||
CO[2*ldc+ 2] A_OP tr[ 9] * alpha_r - ti[ 9] * alpha_i;
|
||||
CO[2*ldc+ 3] A_OP ti[ 9] * alpha_r + tr[ 9] * alpha_i;
|
||||
CO[2*ldc+ 4] A_OP tr[10] * alpha_r - ti[10] * alpha_i;
|
||||
CO[2*ldc+ 5] A_OP ti[10] * alpha_r + tr[10] * alpha_i;
|
||||
CO[2*ldc+ 6] A_OP tr[11] * alpha_r - ti[11] * alpha_i;
|
||||
CO[2*ldc+ 7] A_OP ti[11] * alpha_r + tr[11] * alpha_i;
|
||||
CO[2*ldc+ 8] A_OP tr[12] * alpha_r - ti[12] * alpha_i;
|
||||
CO[2*ldc+ 9] A_OP ti[12] * alpha_r + tr[12] * alpha_i;
|
||||
CO[2*ldc+10] A_OP tr[13] * alpha_r - ti[13] * alpha_i;
|
||||
CO[2*ldc+11] A_OP ti[13] * alpha_r + tr[13] * alpha_i;
|
||||
CO[2*ldc+12] A_OP tr[14] * alpha_r - ti[14] * alpha_i;
|
||||
CO[2*ldc+13] A_OP ti[14] * alpha_r + tr[14] * alpha_i;
|
||||
CO[2*ldc+14] A_OP tr[15] * alpha_r - ti[15] * alpha_i;
|
||||
CO[2*ldc+15] A_OP ti[15] * alpha_r + tr[15] * alpha_i;
|
||||
|
||||
/* Step past the data this tile consumed and the 16 FLOATs of C it wrote. */
AO += temp << 4;
|
||||
BO += temp << 2;
|
||||
CO += 16;
|
||||
#if defined(TRMMKERNEL)
|
||||
REFRESH_AFTER_SAVE (8, 2)
|
||||
#endif
|
||||
}
|
||||
/* 4-row tile: the main loop consumes two l steps per iteration, both
   accumulating into acc0..acc3; the second loop handles an odd final step. */
if (m & 4)
|
||||
{
|
||||
#if defined(TRMMKERNEL)
|
||||
REFRESH_POINTERS (4, 2)
|
||||
#else
|
||||
BO = B;
|
||||
temp = k;
|
||||
#endif
|
||||
SET_ACC_ZERO()
|
||||
for (l = 0; l < (temp & (~1)); l+=2)
|
||||
{
|
||||
__vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<3]));
|
||||
__vector_pair rowA2 = *((__vector_pair *)((void *)&AO[(l<<3)+4]));
|
||||
__vector_pair rowA3 = *((__vector_pair *)((void *)&AO[(l<<3)+8]));
|
||||
__vector_pair rowA4 = *((__vector_pair *)((void *)&AO[(l<<3)+12]));
|
||||
vec_t rowB1 = *(vec_t *) & BO[l<<2];
|
||||
vec_t rowB2 = *(vec_t *) & BO[(l<<2)+2];
|
||||
vec_t rowB3 = *(vec_t *) & BO[(l<<2)+4];
|
||||
vec_t rowB4 = *(vec_t *) & BO[(l<<2)+6];
|
||||
__builtin_mma_xvf64gerpp(&acc0, rowA1, rowB1);
|
||||
__builtin_mma_xvf64gerpp(&acc1, rowA2, rowB1);
|
||||
__builtin_mma_xvf64gerpp(&acc2, rowA1, rowB2);
|
||||
__builtin_mma_xvf64gerpp(&acc3, rowA2, rowB2);
|
||||
__builtin_mma_xvf64gerpp(&acc0, rowA3, rowB3);
|
||||
__builtin_mma_xvf64gerpp(&acc1, rowA4, rowB3);
|
||||
__builtin_mma_xvf64gerpp(&acc2, rowA3, rowB4);
|
||||
__builtin_mma_xvf64gerpp(&acc3, rowA4, rowB4);
|
||||
}
|
||||
for (l = (temp & (~1)); l < temp; ++l)
|
||||
{
|
||||
__vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<3]));
|
||||
__vector_pair rowA2 = *((__vector_pair *)((void *)&AO[(l<<3)+4]));
|
||||
vec_t rowB1 = *(vec_t *) & BO[l<<2];
|
||||
vec_t rowB2 = *(vec_t *) & BO[(l<<2)+2];
|
||||
__builtin_mma_xvf64gerpp(&acc0, rowA1, rowB1);
|
||||
__builtin_mma_xvf64gerpp(&acc1, rowA2, rowB1);
|
||||
__builtin_mma_xvf64gerpp(&acc2, rowA1, rowB2);
|
||||
__builtin_mma_xvf64gerpp(&acc3, rowA2, rowB2);
|
||||
}
|
||||
/* acc0/acc2 hold rows 0-1 of both columns, acc1/acc3 rows 2-3. */
SAVE_ACC_COMPLEX_22_2(&acc0, &acc2, 0)
|
||||
SAVE_ACC_COMPLEX_22_2(&acc1, &acc3, 4)
|
||||
AO += temp << 3;
|
||||
BO += temp << 2;
|
||||
CO += 8;
|
||||
#if defined(TRMMKERNEL)
|
||||
REFRESH_AFTER_SAVE (4, 2)
|
||||
#endif
|
||||
}
|
||||
/* 2-row tile: the main loop consumes four l steps per iteration into
   acc0/acc1; the second loop handles the remaining 0..3 steps. */
if (m & 2)
|
||||
{
|
||||
#if defined(TRMMKERNEL)
|
||||
REFRESH_POINTERS (2, 2)
|
||||
#else
|
||||
BO = B;
|
||||
temp = k;
|
||||
#endif
|
||||
SET_ACC_ZERO()
|
||||
for (l = 0; l < (temp & (~3)); l+=4)
|
||||
{
|
||||
__vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<2]));
|
||||
__vector_pair rowA2 = *((__vector_pair *)((void *)&AO[(l<<2)+4]));
|
||||
__vector_pair rowA3 = *((__vector_pair *)((void *)&AO[(l<<2)+8]));
|
||||
__vector_pair rowA4 = *((__vector_pair *)((void *)&AO[(l<<2)+12]));
|
||||
vec_t rowB1 = *(vec_t *) & BO[l<<2];
|
||||
vec_t rowB2 = *(vec_t *) & BO[(l<<2)+2];
|
||||
vec_t rowB3 = *(vec_t *) & BO[(l<<2)+4];
|
||||
vec_t rowB4 = *(vec_t *) & BO[(l<<2)+6];
|
||||
vec_t rowB5 = *(vec_t *) & BO[(l<<2)+8];
|
||||
vec_t rowB6 = *(vec_t *) & BO[(l<<2)+10];
|
||||
vec_t rowB7 = *(vec_t *) & BO[(l<<2)+12];
|
||||
vec_t rowB8 = *(vec_t *) & BO[(l<<2)+14];
|
||||
__builtin_mma_xvf64gerpp(&acc0, rowA1, rowB1);
|
||||
__builtin_mma_xvf64gerpp(&acc1, rowA1, rowB2);
|
||||
__builtin_mma_xvf64gerpp(&acc0, rowA2, rowB3);
|
||||
__builtin_mma_xvf64gerpp(&acc1, rowA2, rowB4);
|
||||
__builtin_mma_xvf64gerpp(&acc0, rowA3, rowB5);
|
||||
__builtin_mma_xvf64gerpp(&acc1, rowA3, rowB6);
|
||||
__builtin_mma_xvf64gerpp(&acc0, rowA4, rowB7);
|
||||
__builtin_mma_xvf64gerpp(&acc1, rowA4, rowB8);
|
||||
}
|
||||
for (l = (temp & (~3)); l < temp; ++l)
|
||||
{
|
||||
__vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<2]));
|
||||
vec_t rowB1 = *(vec_t *) & BO[l<<2];
|
||||
vec_t rowB2 = *(vec_t *) & BO[(l<<2)+2];
|
||||
__builtin_mma_xvf64gerpp(&acc0, rowA1, rowB1);
|
||||
__builtin_mma_xvf64gerpp(&acc1, rowA1, rowB2);
|
||||
}
|
||||
SAVE_ACC_COMPLEX_22_1
|
||||
AO += temp << 2;
|
||||
BO += temp << 2;
|
||||
CO += 4;
|
||||
#if defined(TRMMKERNEL)
|
||||
REFRESH_AFTER_SAVE (2, 2)
|
||||
#endif
|
||||
}
|
||||
/* 1-row tile: same loop shape as the 2-row tile, but A advances only
   2 FLOATs per step. */
if (m & 1)
|
||||
{
|
||||
#if defined(TRMMKERNEL)
|
||||
REFRESH_POINTERS (1, 2)
|
||||
#else
|
||||
BO = B;
|
||||
temp = k;
|
||||
#endif
|
||||
/* NOTE(review): a __vector_pair load appears to span 4 FLOATs (the 8-row
   tile places them 4 FLOATs apart), so the loads below overlap and each
   one reads 2 FLOATs beyond step l's A data.  SAVE_ACC_COMPLEX_12 consumes
   only the first four FLOATs of each accumulator, so those extra values are
   not used - confirm the A buffer has enough slack for the final read. */
// RIP OUT MMA STUFF!
|
||||
SET_ACC_ZERO()
|
||||
for (l = 0; l < (temp & (~3)); l+=4)
|
||||
{
|
||||
__vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<1]));
|
||||
__vector_pair rowA2 = *((__vector_pair *)((void *)&AO[(l<<1)+2]));
|
||||
__vector_pair rowA3 = *((__vector_pair *)((void *)&AO[(l<<1)+4]));
|
||||
__vector_pair rowA4 = *((__vector_pair *)((void *)&AO[(l<<1)+6]));
|
||||
vec_t rowB1 = *(vec_t *) & BO[l<<2];
|
||||
vec_t rowB2 = *(vec_t *) & BO[(l<<2)+2];
|
||||
vec_t rowB3 = *(vec_t *) & BO[(l<<2)+4];
|
||||
vec_t rowB4 = *(vec_t *) & BO[(l<<2)+6];
|
||||
vec_t rowB5 = *(vec_t *) & BO[(l<<2)+8];
|
||||
vec_t rowB6 = *(vec_t *) & BO[(l<<2)+10];
|
||||
vec_t rowB7 = *(vec_t *) & BO[(l<<2)+12];
|
||||
vec_t rowB8 = *(vec_t *) & BO[(l<<2)+14];
|
||||
__builtin_mma_xvf64gerpp(&acc0, rowA1, rowB1);
|
||||
__builtin_mma_xvf64gerpp(&acc1, rowA1, rowB2);
|
||||
__builtin_mma_xvf64gerpp(&acc0, rowA2, rowB3);
|
||||
__builtin_mma_xvf64gerpp(&acc1, rowA2, rowB4);
|
||||
__builtin_mma_xvf64gerpp(&acc0, rowA3, rowB5);
|
||||
__builtin_mma_xvf64gerpp(&acc1, rowA3, rowB6);
|
||||
__builtin_mma_xvf64gerpp(&acc0, rowA4, rowB7);
|
||||
__builtin_mma_xvf64gerpp(&acc1, rowA4, rowB8);
|
||||
}
|
||||
for (l = (temp & (~3)); l < temp; ++l)
|
||||
{
|
||||
__vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<1]));
|
||||
vec_t rowB1 = *(vec_t *) & BO[l<<2];
|
||||
vec_t rowB2 = *(vec_t *) & BO[(l<<2)+2];
|
||||
__builtin_mma_xvf64gerpp(&acc0, rowA1, rowB1);
|
||||
__builtin_mma_xvf64gerpp(&acc1, rowA1, rowB2);
|
||||
}
|
||||
SAVE_ACC_COMPLEX_12
|
||||
AO += temp << 1;
|
||||
BO += temp << 2;
|
||||
CO += 2;
|
||||
#if defined(TRMMKERNEL)
|
||||
REFRESH_AFTER_SAVE (1, 2)
|
||||
#endif
|
||||
}
|
||||
#if defined(TRMMKERNEL) && !defined(LEFT)
|
||||
off += 2; // number of values in A
|
||||
#endif
|
||||
/* Advance B past the two columns just processed (4 FLOATs per k step). */
B += k << 2;
|
||||
}
|
||||
/* ---- leftover single column ---- */
if (n & 1)
|
||||
{
|
||||
#if defined(TRMMKERNEL) && defined(LEFT)
|
||||
off = offset;
|
||||
#endif
|
||||
AO = A;
|
||||
CO = C;
|
||||
C += ldc<<1;
|
||||
/* 8-row tiles: two l steps per iteration, both accumulating into
   acc0..acc3; the second loop handles an odd final step. */
for (i = 0; i < (m >> 3); i++)
|
||||
{
|
||||
#if defined(TRMMKERNEL)
|
||||
REFRESH_POINTERS (8, 1)
|
||||
#else
|
||||
BO = B;
|
||||
temp = k;
|
||||
#endif
|
||||
SET_ACC_ZERO()
|
||||
for (l = 0; l < (temp & (~1)); l+=2)
|
||||
{
|
||||
__vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<4]));
|
||||
__vector_pair rowA2 = *((__vector_pair *)((void *)&AO[(l<<4)+4]));
|
||||
__vector_pair rowA3 = *((__vector_pair *)((void *)&AO[(l<<4)+8]));
|
||||
__vector_pair rowA4 = *((__vector_pair *)((void *)&AO[(l<<4)+12]));
|
||||
__vector_pair rowA5 = *((__vector_pair *)((void *)&AO[(l<<4)+16]));
|
||||
__vector_pair rowA6 = *((__vector_pair *)((void *)&AO[(l<<4)+20]));
|
||||
__vector_pair rowA7 = *((__vector_pair *)((void *)&AO[(l<<4)+24]));
|
||||
__vector_pair rowA8 = *((__vector_pair *)((void *)&AO[(l<<4)+28]));
|
||||
vec_t rowB1 = *(vec_t *) & BO[l<<1];
|
||||
vec_t rowB2 = *(vec_t *) & BO[(l<<1)+2];
|
||||
__builtin_mma_xvf64gerpp(&acc0, rowA1, rowB1);
|
||||
__builtin_mma_xvf64gerpp(&acc1, rowA2, rowB1);
|
||||
__builtin_mma_xvf64gerpp(&acc2, rowA3, rowB1);
|
||||
__builtin_mma_xvf64gerpp(&acc3, rowA4, rowB1);
|
||||
__builtin_mma_xvf64gerpp(&acc0, rowA5, rowB2);
|
||||
__builtin_mma_xvf64gerpp(&acc1, rowA6, rowB2);
|
||||
__builtin_mma_xvf64gerpp(&acc2, rowA7, rowB2);
|
||||
__builtin_mma_xvf64gerpp(&acc3, rowA8, rowB2);
|
||||
}
|
||||
for (l = (temp & (~1)); l < temp; ++l)
|
||||
{
|
||||
__vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<4]));
|
||||
__vector_pair rowA2 = *((__vector_pair *)((void *)&AO[(l<<4)+4]));
|
||||
__vector_pair rowA3 = *((__vector_pair *)((void *)&AO[(l<<4)+8]));
|
||||
__vector_pair rowA4 = *((__vector_pair *)((void *)&AO[(l<<4)+12]));
|
||||
vec_t rowB1 = *(vec_t *) & BO[l<<1];
|
||||
__builtin_mma_xvf64gerpp(&acc0, rowA1, rowB1);
|
||||
__builtin_mma_xvf64gerpp(&acc1, rowA2, rowB1);
|
||||
__builtin_mma_xvf64gerpp(&acc2, rowA3, rowB1);
|
||||
__builtin_mma_xvf64gerpp(&acc3, rowA4, rowB1);
|
||||
}
|
||||
SAVE_ACC_COMPLEX_21_4
|
||||
|
||||
AO += temp << 4;
|
||||
BO += temp << 1;
|
||||
CO += 16;
|
||||
#if defined(TRMMKERNEL)
|
||||
REFRESH_AFTER_SAVE (8, 1)
|
||||
#endif
|
||||
}
|
||||
/* 4-row tile: four l steps per iteration, each step into its own
   accumulator pair (acc0/1, acc2/3, acc4/5, acc6/7); the leftover loop uses
   acc0/acc1 and SAVE_ACC_COMPLEX_21_2 adds the four pairs together. */
if (m & 4)
|
||||
{
|
||||
#if defined(TRMMKERNEL)
|
||||
REFRESH_POINTERS (4, 1)
|
||||
#else
|
||||
BO = B;
|
||||
temp = k;
|
||||
#endif
|
||||
SET_ACC_ZERO()
|
||||
for (l = 0; l < (temp & (~3)); l+=4)
|
||||
{
|
||||
__vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<3]));
|
||||
__vector_pair rowA2 = *((__vector_pair *)((void *)&AO[(l<<3)+4]));
|
||||
__vector_pair rowA3 = *((__vector_pair *)((void *)&AO[(l<<3)+8]));
|
||||
__vector_pair rowA4 = *((__vector_pair *)((void *)&AO[(l<<3)+12]));
|
||||
__vector_pair rowA5 = *((__vector_pair *)((void *)&AO[(l<<3)+16]));
|
||||
__vector_pair rowA6 = *((__vector_pair *)((void *)&AO[(l<<3)+20]));
|
||||
__vector_pair rowA7 = *((__vector_pair *)((void *)&AO[(l<<3)+24]));
|
||||
__vector_pair rowA8 = *((__vector_pair *)((void *)&AO[(l<<3)+28]));
|
||||
vec_t rowB1 = *(vec_t *) & BO[l<<1];
|
||||
vec_t rowB2 = *(vec_t *) & BO[(l<<1)+2];
|
||||
vec_t rowB3 = *(vec_t *) & BO[(l<<1)+4];
|
||||
vec_t rowB4 = *(vec_t *) & BO[(l<<1)+6];
|
||||
__builtin_mma_xvf64gerpp(&acc0, rowA1, rowB1);
|
||||
__builtin_mma_xvf64gerpp(&acc1, rowA2, rowB1);
|
||||
__builtin_mma_xvf64gerpp(&acc2, rowA3, rowB2);
|
||||
__builtin_mma_xvf64gerpp(&acc3, rowA4, rowB2);
|
||||
__builtin_mma_xvf64gerpp(&acc4, rowA5, rowB3);
|
||||
__builtin_mma_xvf64gerpp(&acc5, rowA6, rowB3);
|
||||
__builtin_mma_xvf64gerpp(&acc6, rowA7, rowB4);
|
||||
__builtin_mma_xvf64gerpp(&acc7, rowA8, rowB4);
|
||||
}
|
||||
for (l = (temp & (~3)); l < temp; ++l)
|
||||
{
|
||||
__vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<3]));
|
||||
__vector_pair rowA2 = *((__vector_pair *)((void *)&AO[(l<<3)+4]));
|
||||
vec_t rowB1 = *(vec_t *) & BO[l<<1];
|
||||
__builtin_mma_xvf64gerpp(&acc0, rowA1, rowB1);
|
||||
__builtin_mma_xvf64gerpp(&acc1, rowA2, rowB1);
|
||||
}
|
||||
SAVE_ACC_COMPLEX_21_2
|
||||
AO += temp << 3;
|
||||
BO += temp << 1;
|
||||
CO += 8;
|
||||
#if defined(TRMMKERNEL)
|
||||
REFRESH_AFTER_SAVE (4, 1)
|
||||
#endif
|
||||
/* 2-row tile: eight l steps per iteration, one accumulator per step; the
   leftover loop uses acc0 and SAVE_ACC_COMPLEX_21_1 adds all eight together. */
} if (m & 2)
|
||||
{
|
||||
#if defined(TRMMKERNEL)
|
||||
REFRESH_POINTERS (2, 1)
|
||||
#else
|
||||
BO = B;
|
||||
temp = k;
|
||||
#endif
|
||||
SET_ACC_ZERO()
|
||||
for (l = 0; l < (temp & (~7)); l+=8)
|
||||
{
|
||||
__vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<2]));
|
||||
__vector_pair rowA2 = *((__vector_pair *)((void *)&AO[(l<<2)+4]));
|
||||
__vector_pair rowA3 = *((__vector_pair *)((void *)&AO[(l<<2)+8]));
|
||||
__vector_pair rowA4 = *((__vector_pair *)((void *)&AO[(l<<2)+12]));
|
||||
__vector_pair rowA5 = *((__vector_pair *)((void *)&AO[(l<<2)+16]));
|
||||
__vector_pair rowA6 = *((__vector_pair *)((void *)&AO[(l<<2)+20]));
|
||||
__vector_pair rowA7 = *((__vector_pair *)((void *)&AO[(l<<2)+24]));
|
||||
__vector_pair rowA8 = *((__vector_pair *)((void *)&AO[(l<<2)+28]));
|
||||
vec_t rowB1 = *(vec_t *) & BO[l<<1];
|
||||
vec_t rowB2 = *(vec_t *) & BO[(l<<1)+2];
|
||||
vec_t rowB3 = *(vec_t *) & BO[(l<<1)+4];
|
||||
vec_t rowB4 = *(vec_t *) & BO[(l<<1)+6];
|
||||
vec_t rowB5 = *(vec_t *) & BO[(l<<1)+8];
|
||||
vec_t rowB6 = *(vec_t *) & BO[(l<<1)+10];
|
||||
vec_t rowB7 = *(vec_t *) & BO[(l<<1)+12];
|
||||
vec_t rowB8 = *(vec_t *) & BO[(l<<1)+14];
|
||||
__builtin_mma_xvf64gerpp(&acc0, rowA1, rowB1);
|
||||
__builtin_mma_xvf64gerpp(&acc1, rowA2, rowB2);
|
||||
__builtin_mma_xvf64gerpp(&acc2, rowA3, rowB3);
|
||||
__builtin_mma_xvf64gerpp(&acc3, rowA4, rowB4);
|
||||
__builtin_mma_xvf64gerpp(&acc4, rowA5, rowB5);
|
||||
__builtin_mma_xvf64gerpp(&acc5, rowA6, rowB6);
|
||||
__builtin_mma_xvf64gerpp(&acc6, rowA7, rowB7);
|
||||
__builtin_mma_xvf64gerpp(&acc7, rowA8, rowB8);
|
||||
}
|
||||
for (l = (temp & (~7)); l < temp; ++l)
|
||||
{
|
||||
__vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<2]));
|
||||
vec_t rowB1 = *(vec_t *) & BO[l<<1];
|
||||
__builtin_mma_xvf64gerpp(&acc0, rowA1, rowB1);
|
||||
}
|
||||
SAVE_ACC_COMPLEX_21_1
|
||||
AO += temp << 2;
|
||||
BO += temp << 1;
|
||||
CO += 4;
|
||||
#if defined(TRMMKERNEL)
|
||||
REFRESH_AFTER_SAVE (2, 1)
|
||||
#endif
|
||||
}
|
||||
/* 1-row tile: same loop shape as the 2-row tile, but A advances only
   2 FLOATs per step. */
if (m & 1)
|
||||
{
|
||||
#if defined(TRMMKERNEL)
|
||||
REFRESH_POINTERS (1, 1)
|
||||
#else
|
||||
BO = B;
|
||||
temp = k;
|
||||
#endif
|
||||
/* NOTE(review): as in the two-column 1-row tile, consecutive __vector_pair
   loads start only 2 FLOATs apart, so they appear to overlap and read past
   step l's A data.  SAVE_ACC_COMPLEX_11 consumes only the first four FLOATs
   of each accumulator - confirm the A buffer has slack for the final read. */
// RIP OUT MMA STUFF!
|
||||
SET_ACC_ZERO()
|
||||
for (l = 0; l < (temp & (~7)); l+=8)
|
||||
{
|
||||
__vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<1]));
|
||||
__vector_pair rowA2 = *((__vector_pair *)((void *)&AO[(l<<1)+2]));
|
||||
__vector_pair rowA3 = *((__vector_pair *)((void *)&AO[(l<<1)+4]));
|
||||
__vector_pair rowA4 = *((__vector_pair *)((void *)&AO[(l<<1)+6]));
|
||||
__vector_pair rowA5 = *((__vector_pair *)((void *)&AO[(l<<1)+8]));
|
||||
__vector_pair rowA6 = *((__vector_pair *)((void *)&AO[(l<<1)+10]));
|
||||
__vector_pair rowA7 = *((__vector_pair *)((void *)&AO[(l<<1)+12]));
|
||||
__vector_pair rowA8 = *((__vector_pair *)((void *)&AO[(l<<1)+14]));
|
||||
vec_t rowB1 = *(vec_t *) & BO[l<<1];
|
||||
vec_t rowB2 = *(vec_t *) & BO[(l<<1)+2];
|
||||
vec_t rowB3 = *(vec_t *) & BO[(l<<1)+4];
|
||||
vec_t rowB4 = *(vec_t *) & BO[(l<<1)+6];
|
||||
vec_t rowB5 = *(vec_t *) & BO[(l<<1)+8];
|
||||
vec_t rowB6 = *(vec_t *) & BO[(l<<1)+10];
|
||||
vec_t rowB7 = *(vec_t *) & BO[(l<<1)+12];
|
||||
vec_t rowB8 = *(vec_t *) & BO[(l<<1)+14];
|
||||
__builtin_mma_xvf64gerpp(&acc0, rowA1, rowB1);
|
||||
__builtin_mma_xvf64gerpp(&acc1, rowA2, rowB2);
|
||||
__builtin_mma_xvf64gerpp(&acc2, rowA3, rowB3);
|
||||
__builtin_mma_xvf64gerpp(&acc3, rowA4, rowB4);
|
||||
__builtin_mma_xvf64gerpp(&acc4, rowA5, rowB5);
|
||||
__builtin_mma_xvf64gerpp(&acc5, rowA6, rowB6);
|
||||
__builtin_mma_xvf64gerpp(&acc6, rowA7, rowB7);
|
||||
__builtin_mma_xvf64gerpp(&acc7, rowA8, rowB8);
|
||||
}
|
||||
for (l = (temp & (~7)); l < temp; ++l)
|
||||
{
|
||||
__vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<1]));
|
||||
vec_t rowB1 = *(vec_t *) & BO[l<<1];
|
||||
__builtin_mma_xvf64gerpp(&acc0, rowA1, rowB1);
|
||||
}
|
||||
SAVE_ACC_COMPLEX_11
|
||||
AO += temp << 1;
|
||||
BO += temp << 1;
|
||||
CO += 2;
|
||||
#if defined(TRMMKERNEL)
|
||||
REFRESH_AFTER_SAVE (1, 1)
|
||||
#endif
|
||||
}
|
||||
#if defined(TRMMKERNEL) && !defined(LEFT)
|
||||
off += 1; // number of values in A
|
||||
#endif
|
||||
/* Advance B past the single column just processed (2 FLOATs per k step). */
B += k << 1;
|
||||
}
|
||||
return 0;
|
||||
}
|
|
@ -104,7 +104,7 @@
|
|||
*
|
||||
READ( NIN, FMT = * )SUMMRY
|
||||
READ( NIN, FMT = * )NOUT
|
||||
OPEN( NOUT, FILE = SUMMRY, STATUS = 'NEW' )
|
||||
OPEN( NOUT, FILE = SUMMRY, STATUS = 'REPLACE' )
|
||||
NOUTC = NOUT
|
||||
*
|
||||
* Read name and unit number for snapshot output file and open file.
|
||||
|
@ -113,7 +113,7 @@
|
|||
READ( NIN, FMT = * )NTRA
|
||||
TRACE = NTRA.GE.0
|
||||
IF( TRACE )THEN
|
||||
OPEN( NTRA, FILE = SNAPS, STATUS = 'NEW' )
|
||||
OPEN( NTRA, FILE = SNAPS, STATUS = 'REPLACE' )
|
||||
END IF
|
||||
* Read the flag that directs rewinding of the snapshot file.
|
||||
READ( NIN, FMT = * )REWI
|
||||
|
@ -3439,4 +3439,3 @@
|
|||
* End of XERBLA
|
||||
*
|
||||
END
|
||||
|
||||
|
|
|
@ -105,7 +105,7 @@
|
|||
*
|
||||
READ( NIN, FMT = * )SUMMRY
|
||||
READ( NIN, FMT = * )NOUT
|
||||
OPEN( NOUT, FILE = SUMMRY, STATUS = 'NEW' )
|
||||
OPEN( NOUT, FILE = SUMMRY, STATUS = 'REPLACE' )
|
||||
NOUTC = NOUT
|
||||
*
|
||||
* Read name and unit number for snapshot output file and open file.
|
||||
|
@ -114,7 +114,7 @@
|
|||
READ( NIN, FMT = * )NTRA
|
||||
TRACE = NTRA.GE.0
|
||||
IF( TRACE )THEN
|
||||
OPEN( NTRA, FILE = SNAPS, STATUS = 'NEW' )
|
||||
OPEN( NTRA, FILE = SNAPS, STATUS = 'REPLACE' )
|
||||
END IF
|
||||
* Read the flag that directs rewinding of the snapshot file.
|
||||
READ( NIN, FMT = * )REWI
|
||||
|
|
|
@ -81,6 +81,28 @@ static void cgemmt_trusted(char api, enum CBLAS_ORDER order, char uplo, char tra
|
|||
|
||||
ldc *= 2;
|
||||
|
||||
#ifndef NO_CBLAS
|
||||
if (order == CblasRowMajor) {
|
||||
if (uplo == 'U' || uplo == CblasUpper)
|
||||
{
|
||||
for (i = 0; i < m; i++)
|
||||
for (j = i * 2; j < m * 2; j+=2){
|
||||
data_cgemmt.c_verify[i * ldc + j] =
|
||||
data_cgemmt.c_gemm[i * ldc + j];
|
||||
data_cgemmt.c_verify[i * ldc + j + 1] =
|
||||
data_cgemmt.c_gemm[i * ldc + j + 1];
|
||||
}
|
||||
} else {
|
||||
for (i = 0; i < m; i++)
|
||||
for (j = 0; j <= i * 2; j+=2){
|
||||
data_cgemmt.c_verify[i * ldc + j] =
|
||||
data_cgemmt.c_gemm[i * ldc + j];
|
||||
data_cgemmt.c_verify[i * ldc + j + 1] =
|
||||
data_cgemmt.c_gemm[i * ldc + j + 1];
|
||||
}
|
||||
}
|
||||
} else
|
||||
#endif
|
||||
if (uplo == 'L' || uplo == CblasLower)
|
||||
{
|
||||
for (i = 0; i < m; i++)
|
||||
|
|
|
@ -77,6 +77,21 @@ static void dgemmt_trusted(char api, enum CBLAS_ORDER order, char uplo, char tra
|
|||
else
|
||||
cblas_dgemm(order, transa, transb, m, m, k, alpha, data_dgemmt.a_test, lda,
|
||||
data_dgemmt.b_test, ldb, beta, data_dgemmt.c_gemm, ldc);
|
||||
|
||||
if (order == CblasRowMajor) {
|
||||
if (uplo == 'U' || uplo == CblasUpper)
|
||||
{
|
||||
for (i = 0; i < m; i++)
|
||||
for (j = i; j < m; j++)
|
||||
data_dgemmt.c_verify[i * ldc + j] =
|
||||
data_dgemmt.c_gemm[i * ldc + j];
|
||||
} else {
|
||||
for (i = 0; i < m; i++)
|
||||
for (j = 0; j <= i; j++)
|
||||
data_dgemmt.c_verify[i * ldc + j] =
|
||||
data_dgemmt.c_gemm[i * ldc + j];
|
||||
}
|
||||
}else
|
||||
#endif
|
||||
|
||||
if (uplo == 'L' || uplo == CblasLower)
|
||||
|
|
|
@ -77,6 +77,21 @@ static void sgemmt_trusted(char api, enum CBLAS_ORDER order, char uplo, char tra
|
|||
else
|
||||
cblas_sgemm(order, transa, transb, m, m, k, alpha, data_sgemmt.a_test, lda,
|
||||
data_sgemmt.b_test, ldb, beta, data_sgemmt.c_gemm, ldc);
|
||||
if (order == CblasRowMajor) {
|
||||
if (uplo == 'U' || uplo == CblasUpper)
|
||||
{
|
||||
for (i = 0; i < m; i++)
|
||||
for (j = i; j < m; j++)
|
||||
data_sgemmt.c_verify[i * ldc + j] =
|
||||
data_sgemmt.c_gemm[i * ldc + j];
|
||||
} else {
|
||||
for (i = 0; i < m; i++)
|
||||
for (j = 0; j <= i; j++)
|
||||
data_sgemmt.c_verify[i * ldc + j] =
|
||||
data_sgemmt.c_gemm[i * ldc + j];
|
||||
}
|
||||
|
||||
} else
|
||||
#endif
|
||||
|
||||
if (uplo == 'L' || uplo == CblasLower)
|
||||
|
|
|
@ -80,7 +80,28 @@ static void zgemmt_trusted(char api, enum CBLAS_ORDER order, char uplo, char tra
|
|||
#endif
|
||||
|
||||
ldc *= 2;
|
||||
|
||||
#ifndef NO_CBLAS
|
||||
if (order == CblasRowMajor) {
|
||||
if (uplo == 'U' || uplo == CblasUpper)
|
||||
{
|
||||
for (i = 0; i < m; i++)
|
||||
for (j = i * 2; j < m * 2; j+=2){
|
||||
data_zgemmt.c_verify[i * ldc + j] =
|
||||
data_zgemmt.c_gemm[i * ldc + j];
|
||||
data_zgemmt.c_verify[i * ldc + j + 1] =
|
||||
data_zgemmt.c_gemm[i * ldc + j + 1];
|
||||
}
|
||||
} else {
|
||||
for (i = 0; i < m; i++)
|
||||
for (j = 0; j <= i * 2; j+=2){
|
||||
data_zgemmt.c_verify[i * ldc + j] =
|
||||
data_zgemmt.c_gemm[i * ldc + j];
|
||||
data_zgemmt.c_verify[i * ldc + j + 1] =
|
||||
data_zgemmt.c_gemm[i * ldc + j + 1];
|
||||
}
|
||||
}
|
||||
}else
|
||||
#endif
|
||||
if (uplo == 'L' || uplo == CblasLower)
|
||||
{
|
||||
for (i = 0; i < m; i++)
|
||||
|
|
Loading…
Reference in New Issue