Merge branch 'develop' into betterPowerGEMVTail

This commit is contained in:
Chip Kerchner 2024-08-15 08:00:00 -05:00
commit a0aeba631d
7 changed files with 138 additions and 29 deletions

View File

@ -89,20 +89,21 @@ task:
type: text/plain
macos_instance:
image: ghcr.io/cirruslabs/macos-monterey-xcode:latest
image: ghcr.io/cirruslabs/macos-sonoma-xcode:latest
task:
name: AppleM1/LLVM armv7-androidndk xbuild
compile_script:
- brew install android-ndk
- brew install --cask android-ndk
- export #PATH=/opt/homebrew/opt/llvm/bin:$PATH
- export #LDFLAGS="-L/opt/homebrew/opt/llvm/lib"
- export #CPPFLAGS="-I/opt/homebrew/opt/llvm/include"
- ls /System/Volumes/Data/opt/homebrew
- export #CPPFLAGS="-I/opt/homebrew/opt/llvm/include"
- export ANDROID_NDK_HOME="/opt/homebrew/share/android-ndk"
- ls /opt/homebrew
- ls -l /System/Volumes/Data/opt/homebrew/Caskroom/android-ndk
- find /System/Volumes/Data/opt/homebrew -name "armv7a-linux-androideabi*-ranlib"
- find /opt/homebrew -name "armv7a-linux-androideabi*-ranlib"
- #export CC=/Applications/Xcode-13.4.1.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang
- #export CFLAGS="-O2 -unwindlib=none -Wno-macro-redefined -isysroot /Applications/Xcode-13.4.1.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS16.0.sdk -arch arm64 -miphoneos-version-min=10.0"
- export CC=/System/Volumes/Data/opt/homebrew/Caskroom/android-ndk/26d/AndroidNDK*.app/Contents/NDK/toolchains/llvm/prebuilt/darwin-x86_64/bin/armv7a-linux-androideabi23-clang
- export CC=/System/Volumes/Data/opt/homebrew/Caskroom/android-ndk/27/AndroidNDK*.app/Contents/NDK/toolchains/llvm/prebuilt/darwin-x86_64/bin/armv7a-linux-androideabi23-clang
- make TARGET=ARMV7 ARM_SOFTFP_ABI=1 NUM_THREADS=32 HOSTCC=clang NOFORTRAN=1 RANLIB="ls -l"
always:
config_artifacts:

View File

@ -85,6 +85,8 @@ Examples:
make CC=loongcc FC=loongf95 HOSTCC=gcc TARGET=LOONGSON3A CROSS=1 CROSS_SUFFIX=mips64el-st-linux-gnu- NO_LAPACKE=1 NO_SHARED=1 BINARY=32
```
When compiling for a more modern CPU TARGET of the same architecture (e.g. TARGET=SKYLAKEX on a HASWELL host), the option `CROSS=1` can be used to suppress the automatic invocation of the tests at the end of the build.
### Debug version
A debug version can be built using `make DEBUG=1`.

View File

@ -1527,6 +1527,19 @@ int get_cpuname(void){
break;
case 10: //family 6 exmodel 10
switch (model) {
case 13: // Granite Rapids
if(support_amx_bf16())
return CPUTYPE_SAPPHIRERAPIDS;
if(support_avx512_bf16())
return CPUTYPE_COOPERLAKE;
if(support_avx512())
return CPUTYPE_SKYLAKEX;
if(support_avx2())
return CPUTYPE_HASWELL;
if(support_avx())
return CPUTYPE_SANDYBRIDGE;
else
return CPUTYPE_NEHALEM;
case 5: // Comet Lake H and S
case 6: // Comet Lake U
case 10: // Meteor Lake
@ -2352,8 +2365,22 @@ int get_coretype(void){
case 10:
switch (model) {
case 13: // Granite Rapids
if(support_amx_bf16())
return CORE_SAPPHIRERAPIDS;
if(support_avx512_bf16())
return CORE_COOPERLAKE;
if(support_avx512())
return CORE_SKYLAKEX;
if(support_avx2())
return CORE_HASWELL;
if(support_avx())
return CORE_SANDYBRIDGE;
else
return CORE_NEHALEM;
case 5: // Comet Lake H and S
case 6: // Comet Lake U
case 10: // Meteor Lake
if(support_avx())
#ifndef NO_AVX2
return CORE_HASWELL;
@ -2362,6 +2389,7 @@ int get_coretype(void){
#endif
else
return CORE_NEHALEM;
case 0: // Meteor Lake
case 7:// Rocket Lake
#ifndef NO_AVX512
if(support_avx512())

View File

@ -1076,6 +1076,8 @@ fprintf(STDERR, "Server[%2ld] Calculation started. Mode = 0x%03x M = %3ld N=%3l
main_status[cpu] = MAIN_RUNNING1;
#endif
if (buffer == NULL) blas_thread_buffer[cpu] = blas_memory_alloc(2);
//For target LOONGSON3R5, applying an offset to the buffer is essential
//for minimizing cache conflicts and optimizing performance.
#if defined(ARCH_LOONGARCH64) && !defined(NO_AFFINITY)

View File

@ -880,10 +880,8 @@ lapackobjs2c="$lapackobjs2c
# clatrs3
lapackobjs2d="$lapackobjs2d
dgelqs
dgelst
dgeqp3rk
dgeqrs
dlaqp2rk
dlaqp3rk
dlarmm
@ -897,10 +895,8 @@ lapackobjs2d="$lapackobjs2d
# dlaqz4
lapackobjs2z="$lapackobjs2z
zgelqs
zgelst
zgeqp3rk
zgeqrs
zlaqp2rk
zlaqp3rk
zlatrs3
@ -918,6 +914,7 @@ lapack_extendedprecision_objs="
"
lapack_deprecated_objsc="
cgelqs cgeqrs
cgegs cggsvd
cgegv cggsvp
cgelsx clahrd
@ -926,6 +923,7 @@ lapack_deprecated_objsc="
"
lapack_deprecated_objsd="
dgelqs dgeqrs
dgegs dgeqpf
dgegv dggsvd
dgelsx dggsvp
@ -933,6 +931,8 @@ lapack_deprecated_objsd="
dlatzm dtzrqf"
lapack_deprecated_objss="
sgelqs
sgeqrs
sgelsx
sgegs
sgegv
@ -945,6 +945,8 @@ lapack_deprecated_objss="
"
lapack_deprecated_objsz="
zgelqs
zgeqrs
zgegs
zgegv
zgelsx

View File

@ -131,11 +131,11 @@
sd $21, 40($sp)
sd $22, 48($sp)
ST $f24, 56($sp)
ST $f25, 64($sp)
ST $f26, 72($sp)
ST $f27, 80($sp)
ST $f28, 88($sp)
sdc1 $f24, 56($sp)
sdc1 $f25, 64($sp)
sdc1 $f26, 72($sp)
sdc1 $f27, 80($sp)
sdc1 $f28, 88($sp)
#if defined(TRMMKERNEL)
sd $23, 96($sp)
@ -146,10 +146,10 @@
#endif
#ifndef __64BIT__
ST $f20,120($sp)
ST $f21,128($sp)
ST $f22,136($sp)
ST $f23,144($sp)
sdc1 $f20,120($sp)
sdc1 $f21,128($sp)
sdc1 $f22,136($sp)
sdc1 $f23,144($sp)
#endif
.align 4
@ -4000,11 +4000,11 @@
ld $21, 40($sp)
ld $22, 48($sp)
LD $f24, 56($sp)
LD $f25, 64($sp)
LD $f26, 72($sp)
LD $f27, 80($sp)
LD $f28, 88($sp)
ldc1 $f24, 56($sp)
ldc1 $f25, 64($sp)
ldc1 $f26, 72($sp)
ldc1 $f27, 80($sp)
ldc1 $f28, 88($sp)
#if defined(TRMMKERNEL)
ld $23, 96($sp)
@ -4013,10 +4013,10 @@
#endif
#ifndef __64BIT__
LD $f20,120($sp)
LD $f21,128($sp)
LD $f22,136($sp)
LD $f23,144($sp)
ldc1 $f20,120($sp)
ldc1 $f21,128($sp)
ldc1 $f22,136($sp)
ldc1 $f23,144($sp)
#endif
daddiu $sp,$sp,STACKSIZE

View File

@ -29,6 +29,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "../common.h"
#define SGEMM BLASFUNC(sgemm)
#define SBGEMM BLASFUNC(sbgemm)
#define SGEMV BLASFUNC(sgemv)
#define SBGEMV BLASFUNC(sbgemv)
typedef union
{
unsigned short v;
@ -187,7 +189,79 @@ main (int argc, char *argv[])
free(CC);
}
if (ret != 0)
if (ret != 0) {
fprintf (stderr, "FATAL ERROR SBGEMM - Return code: %d\n", ret);
return ret;
}
/*
 * SBGEMV consistency check.
 *
 * For every square size x in 1..loop, build a random x-by-x matrix and a
 * random vector in single precision, derive bfloat16 copies of both, and
 * compare three results for the 'N' and 'T' cases:
 *   C  - SGEMV on the original float data,
 *   CC - SBGEMV on the bfloat16 data,
 *   DD - a hand-written reference accumulated from the bfloat16 data.
 * Every element of CC that differs from C or from DD by more than 1.0
 * increments `ret`.
 *
 * `loop`, `alpha`, `beta`, `transA` and the index variables are defined
 * outside this section.
 */
/* k is passed as the vector increment arguments of SGEMV/SBGEMV below. */
k = 1;
for (x = 1; x <= loop; x++)
{
/* Single-precision inputs (A, B) and the SGEMV output (C). */
float *A = (float *)malloc(x * x * sizeof(FLOAT));
float *B = (float *)malloc(x * sizeof(FLOAT));
float *C = (float *)malloc(x * sizeof(FLOAT));
/* bfloat16 copies of A and B that feed SBGEMV and the reference loop. */
bfloat16_bits *AA = (bfloat16_bits *)malloc(x * x * sizeof(bfloat16_bits));
bfloat16_bits *BB = (bfloat16_bits *)malloc(x * sizeof(bfloat16_bits));
/* DD holds the hand-computed reference, CC the SBGEMV output. */
float *DD = (float *)malloc(x * sizeof(FLOAT));
float *CC = (float *)malloc(x * sizeof(FLOAT));
/* NOTE(review): on allocation failure this returns without freeing the
   buffers that did succeed and without printing a diagnostic. */
if ((A == NULL) || (B == NULL) || (C == NULL) || (AA == NULL) || (BB == NULL) ||
(DD == NULL) || (CC == NULL))
return 1;
bfloat16 atmp, btmp;
blasint one = 1;
for (j = 0; j < x; j++)
{
for (i = 0; i < x; i++)
{
/* rand()/RAND_MAX lies in [0, 1], so each value lies in [0.5, 1.5]. */
A[j * x + i] = ((FLOAT) rand () / (FLOAT) RAND_MAX) + 0.5;
/* Convert this one element (count and both strides are `one`) to bfloat16. */
sbstobf16_(&one, &A[j*x+i], &one, &atmp, &one);
AA[j * x + i].v = atmp;
}
B[j] = ((FLOAT) rand () / (FLOAT) RAND_MAX) + 0.5;
sbstobf16_(&one, &B[j], &one, &btmp, &one);
BB[j].v = btmp;
}
/* y == 0 exercises the non-transposed case, y == 1 the transposed one. */
for (y = 0; y < 2; y++)
{
if (y == 0) {
transA = 'N';
} else {
transA = 'T';
}
/* Clear all three output vectors before each run. */
memset(CC, 0, x * sizeof(FLOAT));
memset(DD, 0, x * sizeof(FLOAT));
memset(C, 0, x * sizeof(FLOAT));
SGEMV (&transA, &x, &x, &alpha, A, &x, B, &k, &beta, C, &k);
SBGEMV (&transA, &x, &x, &alpha, (bfloat16*) AA, &x, (bfloat16*) BB, &k, &beta, CC, &k);
/* Reference product from the bfloat16 data; element (i, j) of the matrix
   is read from index j * x + i. alpha and beta are not applied here. */
for (j = 0; j < x; j++)
for (i = 0; i < x; i++)
if (transA == 'N') {
DD[i] += float16to32 (AA[j * x + i]) * float16to32 (BB[j]);
} else if (transA == 'T') {
DD[j] += float16to32 (AA[j * x + i]) * float16to32 (BB[i]);
}
/* Count every element where SBGEMV disagrees with SGEMV or with the
   reference by more than an absolute difference of 1.0. */
for (j = 0; j < x; j++) {
if (fabs (CC[j] - C[j]) > 1.0)
ret++;
if (fabs (CC[j] - DD[j]) > 1.0)
ret++;
}
}
/* Release the per-size buffers before moving to the next x. */
free(A);
free(B);
free(C);
free(AA);
free(BB);
free(DD);
free(CC);
}
if (ret != 0)
fprintf (stderr, "FATAL ERROR SBGEMV - Return code: %d\n", ret);
return ret;
}