Merge branch 'develop' into betterPowerGEMVTail
This commit is contained in:
commit
a0aeba631d
11
.cirrus.yml
11
.cirrus.yml
|
@ -89,20 +89,21 @@ task:
|
||||||
type: text/plain
|
type: text/plain
|
||||||
|
|
||||||
macos_instance:
|
macos_instance:
|
||||||
image: ghcr.io/cirruslabs/macos-monterey-xcode:latest
|
image: ghcr.io/cirruslabs/macos-sonoma-xcode:latest
|
||||||
task:
|
task:
|
||||||
name: AppleM1/LLVM armv7-androidndk xbuild
|
name: AppleM1/LLVM armv7-androidndk xbuild
|
||||||
compile_script:
|
compile_script:
|
||||||
- brew install android-ndk
|
- brew install --cask android-ndk
|
||||||
- export #PATH=/opt/homebrew/opt/llvm/bin:$PATH
|
- export #PATH=/opt/homebrew/opt/llvm/bin:$PATH
|
||||||
- export #LDFLAGS="-L/opt/homebrew/opt/llvm/lib"
|
- export #LDFLAGS="-L/opt/homebrew/opt/llvm/lib"
|
||||||
- export #CPPFLAGS="-I/opt/homebrew/opt/llvm/include"
|
- export #CPPFLAGS="-I/opt/homebrew/opt/llvm/include"
|
||||||
- ls /System/Volumes/Data/opt/homebrew
|
- export ANDROID_NDK_HOME="/opt/homebrew/share/android-ndk"
|
||||||
|
- ls /opt/homebrew
|
||||||
- ls -l /System/Volumes/Data/opt/homebrew/Caskroom/android-ndk
|
- ls -l /System/Volumes/Data/opt/homebrew/Caskroom/android-ndk
|
||||||
- find /System/Volumes/Data/opt/homebrew -name "armv7a-linux-androideabi*-ranlib"
|
- find /opt/homebrew -name "armv7a-linux-androideabi*-ranlib"
|
||||||
- #export CC=/Applications/Xcode-13.4.1.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang
|
- #export CC=/Applications/Xcode-13.4.1.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang
|
||||||
- #export CFLAGS="-O2 -unwindlib=none -Wno-macro-redefined -isysroot /Applications/Xcode-13.4.1.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS16.0.sdk -arch arm64 -miphoneos-version-min=10.0"
|
- #export CFLAGS="-O2 -unwindlib=none -Wno-macro-redefined -isysroot /Applications/Xcode-13.4.1.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS16.0.sdk -arch arm64 -miphoneos-version-min=10.0"
|
||||||
- export CC=/System/Volumes/Data/opt/homebrew/Caskroom/android-ndk/26d/AndroidNDK*.app/Contents/NDK/toolchains/llvm/prebuilt/darwin-x86_64/bin/armv7a-linux-androideabi23-clang
|
- export CC=/System/Volumes/Data/opt/homebrew/Caskroom/android-ndk/27/AndroidNDK*.app/Contents/NDK/toolchains/llvm/prebuilt/darwin-x86_64/bin/armv7a-linux-androideabi23-clang
|
||||||
- make TARGET=ARMV7 ARM_SOFTFP_ABI=1 NUM_THREADS=32 HOSTCC=clang NOFORTRAN=1 RANLIB="ls -l"
|
- make TARGET=ARMV7 ARM_SOFTFP_ABI=1 NUM_THREADS=32 HOSTCC=clang NOFORTRAN=1 RANLIB="ls -l"
|
||||||
always:
|
always:
|
||||||
config_artifacts:
|
config_artifacts:
|
||||||
|
|
|
@ -85,6 +85,8 @@ Examples:
|
||||||
make CC=loongcc FC=loongf95 HOSTCC=gcc TARGET=LOONGSON3A CROSS=1 CROSS_SUFFIX=mips64el-st-linux-gnu- NO_LAPACKE=1 NO_SHARED=1 BINARY=32
|
make CC=loongcc FC=loongf95 HOSTCC=gcc TARGET=LOONGSON3A CROSS=1 CROSS_SUFFIX=mips64el-st-linux-gnu- NO_LAPACKE=1 NO_SHARED=1 BINARY=32
|
||||||
```
|
```
|
||||||
|
|
||||||
|
When compiling for a more modern CPU TARGET of the same architecture, e.g. TARGET=SKYLAKEX on a HASWELL host, option "CROSS=1" can be used to suppress the automatic invocation of the tests at the end of the build.
|
||||||
|
|
||||||
### Debug version
|
### Debug version
|
||||||
|
|
||||||
A debug version can be built using `make DEBUG=1`.
|
A debug version can be built using `make DEBUG=1`.
|
||||||
|
|
28
cpuid_x86.c
28
cpuid_x86.c
|
@ -1527,6 +1527,19 @@ int get_cpuname(void){
|
||||||
break;
|
break;
|
||||||
case 10: //family 6 exmodel 10
|
case 10: //family 6 exmodel 10
|
||||||
switch (model) {
|
switch (model) {
|
||||||
|
case 13: // Granite Rapids
|
||||||
|
if(support_amx_bf16())
|
||||||
|
return CPUTYPE_SAPPHIRERAPIDS;
|
||||||
|
if(support_avx512_bf16())
|
||||||
|
return CPUTYPE_COOPERLAKE;
|
||||||
|
if(support_avx512())
|
||||||
|
return CPUTYPE_SKYLAKEX;
|
||||||
|
if(support_avx2())
|
||||||
|
return CPUTYPE_HASWELL;
|
||||||
|
if(support_avx())
|
||||||
|
return CPUTYPE_SANDYBRIDGE;
|
||||||
|
else
|
||||||
|
return CPUTYPE_NEHALEM;
|
||||||
case 5: // Comet Lake H and S
|
case 5: // Comet Lake H and S
|
||||||
case 6: // Comet Lake U
|
case 6: // Comet Lake U
|
||||||
case 10: // Meteor Lake
|
case 10: // Meteor Lake
|
||||||
|
@ -2352,8 +2365,22 @@ int get_coretype(void){
|
||||||
|
|
||||||
case 10:
|
case 10:
|
||||||
switch (model) {
|
switch (model) {
|
||||||
|
case 13: // Granite Rapids
|
||||||
|
if(support_amx_bf16())
|
||||||
|
return CORE_SAPPHIRERAPIDS;
|
||||||
|
if(support_avx512_bf16())
|
||||||
|
return CORE_COOPERLAKE;
|
||||||
|
if(support_avx512())
|
||||||
|
return CORE_SKYLAKEX;
|
||||||
|
if(support_avx2())
|
||||||
|
return CORE_HASWELL;
|
||||||
|
if(support_avx())
|
||||||
|
return CORE_SANDYBRIDGE;
|
||||||
|
else
|
||||||
|
return CORE_NEHALEM;
|
||||||
case 5: // Comet Lake H and S
|
case 5: // Comet Lake H and S
|
||||||
case 6: // Comet Lake U
|
case 6: // Comet Lake U
|
||||||
|
case 10: // Meteor Lake
|
||||||
if(support_avx())
|
if(support_avx())
|
||||||
#ifndef NO_AVX2
|
#ifndef NO_AVX2
|
||||||
return CORE_HASWELL;
|
return CORE_HASWELL;
|
||||||
|
@ -2362,6 +2389,7 @@ int get_coretype(void){
|
||||||
#endif
|
#endif
|
||||||
else
|
else
|
||||||
return CORE_NEHALEM;
|
return CORE_NEHALEM;
|
||||||
|
case 0: // Meteor Lake
|
||||||
case 7:// Rocket Lake
|
case 7:// Rocket Lake
|
||||||
#ifndef NO_AVX512
|
#ifndef NO_AVX512
|
||||||
if(support_avx512())
|
if(support_avx512())
|
||||||
|
|
|
@ -1076,6 +1076,8 @@ fprintf(STDERR, "Server[%2ld] Calculation started. Mode = 0x%03x M = %3ld N=%3l
|
||||||
main_status[cpu] = MAIN_RUNNING1;
|
main_status[cpu] = MAIN_RUNNING1;
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
if (buffer == NULL) blas_thread_buffer[cpu] = blas_memory_alloc(2);
|
||||||
|
|
||||||
//For target LOONGSON3R5, applying an offset to the buffer is essential
|
//For target LOONGSON3R5, applying an offset to the buffer is essential
|
||||||
//for minimizing cache conflicts and optimizing performance.
|
//for minimizing cache conflicts and optimizing performance.
|
||||||
#if defined(ARCH_LOONGARCH64) && !defined(NO_AFFINITY)
|
#if defined(ARCH_LOONGARCH64) && !defined(NO_AFFINITY)
|
||||||
|
|
|
@ -880,10 +880,8 @@ lapackobjs2c="$lapackobjs2c
|
||||||
# clatrs3
|
# clatrs3
|
||||||
|
|
||||||
lapackobjs2d="$lapackobjs2d
|
lapackobjs2d="$lapackobjs2d
|
||||||
dgelqs
|
|
||||||
dgelst
|
dgelst
|
||||||
dgeqp3rk
|
dgeqp3rk
|
||||||
dgeqrs
|
|
||||||
dlaqp2rk
|
dlaqp2rk
|
||||||
dlaqp3rk
|
dlaqp3rk
|
||||||
dlarmm
|
dlarmm
|
||||||
|
@ -897,10 +895,8 @@ lapackobjs2d="$lapackobjs2d
|
||||||
# dlaqz4
|
# dlaqz4
|
||||||
|
|
||||||
lapackobjs2z="$lapackobjs2z
|
lapackobjs2z="$lapackobjs2z
|
||||||
zgelqs
|
|
||||||
zgelst
|
zgelst
|
||||||
zgeqp3rk
|
zgeqp3rk
|
||||||
zgeqrs
|
|
||||||
zlaqp2rk
|
zlaqp2rk
|
||||||
zlaqp3rk
|
zlaqp3rk
|
||||||
zlatrs3
|
zlatrs3
|
||||||
|
@ -918,6 +914,7 @@ lapack_extendedprecision_objs="
|
||||||
"
|
"
|
||||||
|
|
||||||
lapack_deprecated_objsc="
|
lapack_deprecated_objsc="
|
||||||
|
cgelqs cgeqrs
|
||||||
cgegs cggsvd
|
cgegs cggsvd
|
||||||
cgegv cggsvp
|
cgegv cggsvp
|
||||||
cgelsx clahrd
|
cgelsx clahrd
|
||||||
|
@ -926,6 +923,7 @@ lapack_deprecated_objsc="
|
||||||
"
|
"
|
||||||
|
|
||||||
lapack_deprecated_objsd="
|
lapack_deprecated_objsd="
|
||||||
|
dgelqs dgeqrs
|
||||||
dgegs dgeqpf
|
dgegs dgeqpf
|
||||||
dgegv dggsvd
|
dgegv dggsvd
|
||||||
dgelsx dggsvp
|
dgelsx dggsvp
|
||||||
|
@ -933,6 +931,8 @@ lapack_deprecated_objsd="
|
||||||
dlatzm dtzrqf"
|
dlatzm dtzrqf"
|
||||||
|
|
||||||
lapack_deprecated_objss="
|
lapack_deprecated_objss="
|
||||||
|
sgelqs
|
||||||
|
sgeqrs
|
||||||
sgelsx
|
sgelsx
|
||||||
sgegs
|
sgegs
|
||||||
sgegv
|
sgegv
|
||||||
|
@ -945,6 +945,8 @@ lapack_deprecated_objss="
|
||||||
"
|
"
|
||||||
|
|
||||||
lapack_deprecated_objsz="
|
lapack_deprecated_objsz="
|
||||||
|
zgelqs
|
||||||
|
zgeqrs
|
||||||
zgegs
|
zgegs
|
||||||
zgegv
|
zgegv
|
||||||
zgelsx
|
zgelsx
|
||||||
|
|
|
@ -131,11 +131,11 @@
|
||||||
sd $21, 40($sp)
|
sd $21, 40($sp)
|
||||||
sd $22, 48($sp)
|
sd $22, 48($sp)
|
||||||
|
|
||||||
ST $f24, 56($sp)
|
sdc1 $f24, 56($sp)
|
||||||
ST $f25, 64($sp)
|
sdc1 $f25, 64($sp)
|
||||||
ST $f26, 72($sp)
|
sdc1 $f26, 72($sp)
|
||||||
ST $f27, 80($sp)
|
sdc1 $f27, 80($sp)
|
||||||
ST $f28, 88($sp)
|
sdc1 $f28, 88($sp)
|
||||||
|
|
||||||
#if defined(TRMMKERNEL)
|
#if defined(TRMMKERNEL)
|
||||||
sd $23, 96($sp)
|
sd $23, 96($sp)
|
||||||
|
@ -146,10 +146,10 @@
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#ifndef __64BIT__
|
#ifndef __64BIT__
|
||||||
ST $f20,120($sp)
|
sdc1 $f20,120($sp)
|
||||||
ST $f21,128($sp)
|
sdc1 $f21,128($sp)
|
||||||
ST $f22,136($sp)
|
sdc1 $f22,136($sp)
|
||||||
ST $f23,144($sp)
|
sdc1 $f23,144($sp)
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
.align 4
|
.align 4
|
||||||
|
@ -4000,11 +4000,11 @@
|
||||||
ld $21, 40($sp)
|
ld $21, 40($sp)
|
||||||
ld $22, 48($sp)
|
ld $22, 48($sp)
|
||||||
|
|
||||||
LD $f24, 56($sp)
|
ldc1 $f24, 56($sp)
|
||||||
LD $f25, 64($sp)
|
ldc1 $f25, 64($sp)
|
||||||
LD $f26, 72($sp)
|
ldc1 $f26, 72($sp)
|
||||||
LD $f27, 80($sp)
|
ldc1 $f27, 80($sp)
|
||||||
LD $f28, 88($sp)
|
ldc1 $f28, 88($sp)
|
||||||
|
|
||||||
#if defined(TRMMKERNEL)
|
#if defined(TRMMKERNEL)
|
||||||
ld $23, 96($sp)
|
ld $23, 96($sp)
|
||||||
|
@ -4013,10 +4013,10 @@
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#ifndef __64BIT__
|
#ifndef __64BIT__
|
||||||
LD $f20,120($sp)
|
ldc1 $f20,120($sp)
|
||||||
LD $f21,128($sp)
|
ldc1 $f21,128($sp)
|
||||||
LD $f22,136($sp)
|
ldc1 $f22,136($sp)
|
||||||
LD $f23,144($sp)
|
ldc1 $f23,144($sp)
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
daddiu $sp,$sp,STACKSIZE
|
daddiu $sp,$sp,STACKSIZE
|
||||||
|
|
|
@ -29,6 +29,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
#include "../common.h"
|
#include "../common.h"
|
||||||
#define SGEMM BLASFUNC(sgemm)
|
#define SGEMM BLASFUNC(sgemm)
|
||||||
#define SBGEMM BLASFUNC(sbgemm)
|
#define SBGEMM BLASFUNC(sbgemm)
|
||||||
|
#define SGEMV BLASFUNC(sgemv)
|
||||||
|
#define SBGEMV BLASFUNC(sbgemv)
|
||||||
typedef union
|
typedef union
|
||||||
{
|
{
|
||||||
unsigned short v;
|
unsigned short v;
|
||||||
|
@ -187,7 +189,79 @@ main (int argc, char *argv[])
|
||||||
free(CC);
|
free(CC);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (ret != 0)
|
if (ret != 0) {
|
||||||
fprintf (stderr, "FATAL ERROR SBGEMM - Return code: %d\n", ret);
|
fprintf (stderr, "FATAL ERROR SBGEMM - Return code: %d\n", ret);
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
|
|
||||||
|
k = 1;
|
||||||
|
for (x = 1; x <= loop; x++)
|
||||||
|
{
|
||||||
|
float *A = (float *)malloc(x * x * sizeof(FLOAT));
|
||||||
|
float *B = (float *)malloc(x * sizeof(FLOAT));
|
||||||
|
float *C = (float *)malloc(x * sizeof(FLOAT));
|
||||||
|
bfloat16_bits *AA = (bfloat16_bits *)malloc(x * x * sizeof(bfloat16_bits));
|
||||||
|
bfloat16_bits *BB = (bfloat16_bits *)malloc(x * sizeof(bfloat16_bits));
|
||||||
|
float *DD = (float *)malloc(x * sizeof(FLOAT));
|
||||||
|
float *CC = (float *)malloc(x * sizeof(FLOAT));
|
||||||
|
if ((A == NULL) || (B == NULL) || (C == NULL) || (AA == NULL) || (BB == NULL) ||
|
||||||
|
(DD == NULL) || (CC == NULL))
|
||||||
|
return 1;
|
||||||
|
bfloat16 atmp, btmp;
|
||||||
|
blasint one = 1;
|
||||||
|
|
||||||
|
for (j = 0; j < x; j++)
|
||||||
|
{
|
||||||
|
for (i = 0; i < x; i++)
|
||||||
|
{
|
||||||
|
A[j * x + i] = ((FLOAT) rand () / (FLOAT) RAND_MAX) + 0.5;
|
||||||
|
sbstobf16_(&one, &A[j*x+i], &one, &atmp, &one);
|
||||||
|
AA[j * x + i].v = atmp;
|
||||||
|
}
|
||||||
|
B[j] = ((FLOAT) rand () / (FLOAT) RAND_MAX) + 0.5;
|
||||||
|
sbstobf16_(&one, &B[j], &one, &btmp, &one);
|
||||||
|
BB[j].v = btmp;
|
||||||
|
}
|
||||||
|
for (y = 0; y < 2; y++)
|
||||||
|
{
|
||||||
|
if (y == 0) {
|
||||||
|
transA = 'N';
|
||||||
|
} else {
|
||||||
|
transA = 'T';
|
||||||
|
}
|
||||||
|
|
||||||
|
memset(CC, 0, x * sizeof(FLOAT));
|
||||||
|
memset(DD, 0, x * sizeof(FLOAT));
|
||||||
|
memset(C, 0, x * sizeof(FLOAT));
|
||||||
|
|
||||||
|
SGEMV (&transA, &x, &x, &alpha, A, &x, B, &k, &beta, C, &k);
|
||||||
|
SBGEMV (&transA, &x, &x, &alpha, (bfloat16*) AA, &x, (bfloat16*) BB, &k, &beta, CC, &k);
|
||||||
|
|
||||||
|
for (j = 0; j < x; j++)
|
||||||
|
for (i = 0; i < x; i++)
|
||||||
|
if (transA == 'N') {
|
||||||
|
DD[i] += float16to32 (AA[j * x + i]) * float16to32 (BB[j]);
|
||||||
|
} else if (transA == 'T') {
|
||||||
|
DD[j] += float16to32 (AA[j * x + i]) * float16to32 (BB[i]);
|
||||||
|
}
|
||||||
|
|
||||||
|
for (j = 0; j < x; j++) {
|
||||||
|
if (fabs (CC[j] - C[j]) > 1.0)
|
||||||
|
ret++;
|
||||||
|
if (fabs (CC[j] - DD[j]) > 1.0)
|
||||||
|
ret++;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
free(A);
|
||||||
|
free(B);
|
||||||
|
free(C);
|
||||||
|
free(AA);
|
||||||
|
free(BB);
|
||||||
|
free(DD);
|
||||||
|
free(CC);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (ret != 0)
|
||||||
|
fprintf (stderr, "FATAL ERROR SBGEMV - Return code: %d\n", ret);
|
||||||
return ret;
|
return ret;
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue