Merge branch 'develop' into betterPowerGEMVTail
This commit is contained in:
commit
a0aeba631d
13
.cirrus.yml
13
.cirrus.yml
|
@ -89,20 +89,21 @@ task:
|
|||
type: text/plain
|
||||
|
||||
macos_instance:
|
||||
image: ghcr.io/cirruslabs/macos-monterey-xcode:latest
|
||||
image: ghcr.io/cirruslabs/macos-sonoma-xcode:latest
|
||||
task:
|
||||
name: AppleM1/LLVM armv7-androidndk xbuild
|
||||
compile_script:
|
||||
- brew install android-ndk
|
||||
- brew install --cask android-ndk
|
||||
- export #PATH=/opt/homebrew/opt/llvm/bin:$PATH
|
||||
- export #LDFLAGS="-L/opt/homebrew/opt/llvm/lib"
|
||||
- export #CPPFLAGS="-I/opt/homebrew/opt/llvm/include"
|
||||
- ls /System/Volumes/Data/opt/homebrew
|
||||
- export #CPPFLAGS="-I/opt/homebrew/opt/llvm/include"
|
||||
- export ANDROID_NDK_HOME="/opt/homebrew/share/android-ndk"
|
||||
- ls /opt/homebrew
|
||||
- ls -l /System/Volumes/Data/opt/homebrew/Caskroom/android-ndk
|
||||
- find /System/Volumes/Data/opt/homebrew -name "armv7a-linux-androideabi*-ranlib"
|
||||
- find /opt/homebrew -name "armv7a-linux-androideabi*-ranlib"
|
||||
- #export CC=/Applications/Xcode-13.4.1.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang
|
||||
- #export CFLAGS="-O2 -unwindlib=none -Wno-macro-redefined -isysroot /Applications/Xcode-13.4.1.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS16.0.sdk -arch arm64 -miphoneos-version-min=10.0"
|
||||
- export CC=/System/Volumes/Data/opt/homebrew/Caskroom/android-ndk/26d/AndroidNDK*.app/Contents/NDK/toolchains/llvm/prebuilt/darwin-x86_64/bin/armv7a-linux-androideabi23-clang
|
||||
- export CC=/System/Volumes/Data/opt/homebrew/Caskroom/android-ndk/27/AndroidNDK*.app/Contents/NDK/toolchains/llvm/prebuilt/darwin-x86_64/bin/armv7a-linux-androideabi23-clang
|
||||
- make TARGET=ARMV7 ARM_SOFTFP_ABI=1 NUM_THREADS=32 HOSTCC=clang NOFORTRAN=1 RANLIB="ls -l"
|
||||
always:
|
||||
config_artifacts:
|
||||
|
|
|
@ -85,6 +85,8 @@ Examples:
|
|||
make CC=loongcc FC=loongf95 HOSTCC=gcc TARGET=LOONGSON3A CROSS=1 CROSS_SUFFIX=mips64el-st-linux-gnu- NO_LAPACKE=1 NO_SHARED=1 BINARY=32
|
||||
```
|
||||
|
||||
When compiling for a more modern CPU TARGET of the same architecture, e.g. TARGET=SKYLAKEX on a HASWELL host, the option `CROSS=1` can be used to suppress the automatic invocation of the tests at the end of the build.
|
||||
|
||||
### Debug version
|
||||
|
||||
A debug version can be built using `make DEBUG=1`.
|
||||
|
|
28
cpuid_x86.c
28
cpuid_x86.c
|
@ -1527,6 +1527,19 @@ int get_cpuname(void){
|
|||
break;
|
||||
case 10: //family 6 exmodel 10
|
||||
switch (model) {
|
||||
case 13: // Granite Rapids
|
||||
if(support_amx_bf16())
|
||||
return CPUTYPE_SAPPHIRERAPIDS;
|
||||
if(support_avx512_bf16())
|
||||
return CPUTYPE_COOPERLAKE;
|
||||
if(support_avx512())
|
||||
return CPUTYPE_SKYLAKEX;
|
||||
if(support_avx2())
|
||||
return CPUTYPE_HASWELL;
|
||||
if(support_avx())
|
||||
return CPUTYPE_SANDYBRIDGE;
|
||||
else
|
||||
return CPUTYPE_NEHALEM;
|
||||
case 5: // Comet Lake H and S
|
||||
case 6: // Comet Lake U
|
||||
case 10: // Meteor Lake
|
||||
|
@ -2352,8 +2365,22 @@ int get_coretype(void){
|
|||
|
||||
case 10:
|
||||
switch (model) {
|
||||
case 13: // Granite Rapids
|
||||
if(support_amx_bf16())
|
||||
return CORE_SAPPHIRERAPIDS;
|
||||
if(support_avx512_bf16())
|
||||
return CORE_COOPERLAKE;
|
||||
if(support_avx512())
|
||||
return CORE_SKYLAKEX;
|
||||
if(support_avx2())
|
||||
return CORE_HASWELL;
|
||||
if(support_avx())
|
||||
return CORE_SANDYBRIDGE;
|
||||
else
|
||||
return CORE_NEHALEM;
|
||||
case 5: // Comet Lake H and S
|
||||
case 6: // Comet Lake U
|
||||
case 10: // Meteor Lake
|
||||
if(support_avx())
|
||||
#ifndef NO_AVX2
|
||||
return CORE_HASWELL;
|
||||
|
@ -2362,6 +2389,7 @@ int get_coretype(void){
|
|||
#endif
|
||||
else
|
||||
return CORE_NEHALEM;
|
||||
case 0: // Meteor Lake
|
||||
case 7:// Rocket Lake
|
||||
#ifndef NO_AVX512
|
||||
if(support_avx512())
|
||||
|
|
|
@ -1076,6 +1076,8 @@ fprintf(STDERR, "Server[%2ld] Calculation started. Mode = 0x%03x M = %3ld N=%3l
|
|||
main_status[cpu] = MAIN_RUNNING1;
|
||||
#endif
|
||||
|
||||
if (buffer == NULL) blas_thread_buffer[cpu] = blas_memory_alloc(2);
|
||||
|
||||
//For target LOONGSON3R5, applying an offset to the buffer is essential
|
||||
//for minimizing cache conflicts and optimizing performance.
|
||||
#if defined(ARCH_LOONGARCH64) && !defined(NO_AFFINITY)
|
||||
|
|
|
@ -880,10 +880,8 @@ lapackobjs2c="$lapackobjs2c
|
|||
# clatrs3
|
||||
|
||||
lapackobjs2d="$lapackobjs2d
|
||||
dgelqs
|
||||
dgelst
|
||||
dgeqp3rk
|
||||
dgeqrs
|
||||
dlaqp2rk
|
||||
dlaqp3rk
|
||||
dlarmm
|
||||
|
@ -897,10 +895,8 @@ lapackobjs2d="$lapackobjs2d
|
|||
# dlaqz4
|
||||
|
||||
lapackobjs2z="$lapackobjs2z
|
||||
zgelqs
|
||||
zgelst
|
||||
zgeqp3rk
|
||||
zgeqrs
|
||||
zlaqp2rk
|
||||
zlaqp3rk
|
||||
zlatrs3
|
||||
|
@ -918,6 +914,7 @@ lapack_extendedprecision_objs="
|
|||
"
|
||||
|
||||
lapack_deprecated_objsc="
|
||||
cgelqs cgeqrs
|
||||
cgegs cggsvd
|
||||
cgegv cggsvp
|
||||
cgelsx clahrd
|
||||
|
@ -926,6 +923,7 @@ lapack_deprecated_objsc="
|
|||
"
|
||||
|
||||
lapack_deprecated_objsd="
|
||||
dgelqs dgeqrs
|
||||
dgegs dgeqpf
|
||||
dgegv dggsvd
|
||||
dgelsx dggsvp
|
||||
|
@ -933,6 +931,8 @@ lapack_deprecated_objsd="
|
|||
dlatzm dtzrqf"
|
||||
|
||||
lapack_deprecated_objss="
|
||||
sgelqs
|
||||
sgeqrs
|
||||
sgelsx
|
||||
sgegs
|
||||
sgegv
|
||||
|
@ -945,6 +945,8 @@ lapack_deprecated_objss="
|
|||
"
|
||||
|
||||
lapack_deprecated_objsz="
|
||||
zgelqs
|
||||
zgeqrs
|
||||
zgegs
|
||||
zgegv
|
||||
zgelsx
|
||||
|
|
|
@ -131,11 +131,11 @@
|
|||
sd $21, 40($sp)
|
||||
sd $22, 48($sp)
|
||||
|
||||
ST $f24, 56($sp)
|
||||
ST $f25, 64($sp)
|
||||
ST $f26, 72($sp)
|
||||
ST $f27, 80($sp)
|
||||
ST $f28, 88($sp)
|
||||
sdc1 $f24, 56($sp)
|
||||
sdc1 $f25, 64($sp)
|
||||
sdc1 $f26, 72($sp)
|
||||
sdc1 $f27, 80($sp)
|
||||
sdc1 $f28, 88($sp)
|
||||
|
||||
#if defined(TRMMKERNEL)
|
||||
sd $23, 96($sp)
|
||||
|
@ -146,10 +146,10 @@
|
|||
#endif
|
||||
|
||||
#ifndef __64BIT__
|
||||
ST $f20,120($sp)
|
||||
ST $f21,128($sp)
|
||||
ST $f22,136($sp)
|
||||
ST $f23,144($sp)
|
||||
sdc1 $f20,120($sp)
|
||||
sdc1 $f21,128($sp)
|
||||
sdc1 $f22,136($sp)
|
||||
sdc1 $f23,144($sp)
|
||||
#endif
|
||||
|
||||
.align 4
|
||||
|
@ -4000,11 +4000,11 @@
|
|||
ld $21, 40($sp)
|
||||
ld $22, 48($sp)
|
||||
|
||||
LD $f24, 56($sp)
|
||||
LD $f25, 64($sp)
|
||||
LD $f26, 72($sp)
|
||||
LD $f27, 80($sp)
|
||||
LD $f28, 88($sp)
|
||||
ldc1 $f24, 56($sp)
|
||||
ldc1 $f25, 64($sp)
|
||||
ldc1 $f26, 72($sp)
|
||||
ldc1 $f27, 80($sp)
|
||||
ldc1 $f28, 88($sp)
|
||||
|
||||
#if defined(TRMMKERNEL)
|
||||
ld $23, 96($sp)
|
||||
|
@ -4013,10 +4013,10 @@
|
|||
#endif
|
||||
|
||||
#ifndef __64BIT__
|
||||
LD $f20,120($sp)
|
||||
LD $f21,128($sp)
|
||||
LD $f22,136($sp)
|
||||
LD $f23,144($sp)
|
||||
ldc1 $f20,120($sp)
|
||||
ldc1 $f21,128($sp)
|
||||
ldc1 $f22,136($sp)
|
||||
ldc1 $f23,144($sp)
|
||||
#endif
|
||||
|
||||
daddiu $sp,$sp,STACKSIZE
|
||||
|
|
|
@ -29,6 +29,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
#include "../common.h"
|
||||
#define SGEMM BLASFUNC(sgemm)
|
||||
#define SBGEMM BLASFUNC(sbgemm)
|
||||
#define SGEMV BLASFUNC(sgemv)
|
||||
#define SBGEMV BLASFUNC(sbgemv)
|
||||
typedef union
|
||||
{
|
||||
unsigned short v;
|
||||
|
@ -187,7 +189,79 @@ main (int argc, char *argv[])
|
|||
free(CC);
|
||||
}
|
||||
|
||||
if (ret != 0)
|
||||
if (ret != 0) {
|
||||
fprintf (stderr, "FATAL ERROR SBGEMM - Return code: %d\n", ret);
|
||||
return ret;
|
||||
}
|
||||
|
||||
k = 1;
|
||||
for (x = 1; x <= loop; x++)
|
||||
{
|
||||
float *A = (float *)malloc(x * x * sizeof(FLOAT));
|
||||
float *B = (float *)malloc(x * sizeof(FLOAT));
|
||||
float *C = (float *)malloc(x * sizeof(FLOAT));
|
||||
bfloat16_bits *AA = (bfloat16_bits *)malloc(x * x * sizeof(bfloat16_bits));
|
||||
bfloat16_bits *BB = (bfloat16_bits *)malloc(x * sizeof(bfloat16_bits));
|
||||
float *DD = (float *)malloc(x * sizeof(FLOAT));
|
||||
float *CC = (float *)malloc(x * sizeof(FLOAT));
|
||||
if ((A == NULL) || (B == NULL) || (C == NULL) || (AA == NULL) || (BB == NULL) ||
|
||||
(DD == NULL) || (CC == NULL))
|
||||
return 1;
|
||||
bfloat16 atmp, btmp;
|
||||
blasint one = 1;
|
||||
|
||||
for (j = 0; j < x; j++)
|
||||
{
|
||||
for (i = 0; i < x; i++)
|
||||
{
|
||||
A[j * x + i] = ((FLOAT) rand () / (FLOAT) RAND_MAX) + 0.5;
|
||||
sbstobf16_(&one, &A[j*x+i], &one, &atmp, &one);
|
||||
AA[j * x + i].v = atmp;
|
||||
}
|
||||
B[j] = ((FLOAT) rand () / (FLOAT) RAND_MAX) + 0.5;
|
||||
sbstobf16_(&one, &B[j], &one, &btmp, &one);
|
||||
BB[j].v = btmp;
|
||||
}
|
||||
for (y = 0; y < 2; y++)
|
||||
{
|
||||
if (y == 0) {
|
||||
transA = 'N';
|
||||
} else {
|
||||
transA = 'T';
|
||||
}
|
||||
|
||||
memset(CC, 0, x * sizeof(FLOAT));
|
||||
memset(DD, 0, x * sizeof(FLOAT));
|
||||
memset(C, 0, x * sizeof(FLOAT));
|
||||
|
||||
SGEMV (&transA, &x, &x, &alpha, A, &x, B, &k, &beta, C, &k);
|
||||
SBGEMV (&transA, &x, &x, &alpha, (bfloat16*) AA, &x, (bfloat16*) BB, &k, &beta, CC, &k);
|
||||
|
||||
for (j = 0; j < x; j++)
|
||||
for (i = 0; i < x; i++)
|
||||
if (transA == 'N') {
|
||||
DD[i] += float16to32 (AA[j * x + i]) * float16to32 (BB[j]);
|
||||
} else if (transA == 'T') {
|
||||
DD[j] += float16to32 (AA[j * x + i]) * float16to32 (BB[i]);
|
||||
}
|
||||
|
||||
for (j = 0; j < x; j++) {
|
||||
if (fabs (CC[j] - C[j]) > 1.0)
|
||||
ret++;
|
||||
if (fabs (CC[j] - DD[j]) > 1.0)
|
||||
ret++;
|
||||
}
|
||||
}
|
||||
free(A);
|
||||
free(B);
|
||||
free(C);
|
||||
free(AA);
|
||||
free(BB);
|
||||
free(DD);
|
||||
free(CC);
|
||||
}
|
||||
|
||||
if (ret != 0)
|
||||
fprintf (stderr, "FATAL ERROR SBGEMV - Return code: %d\n", ret);
|
||||
return ret;
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue