diff --git a/.cirrus.yml b/.cirrus.yml index d0e1eeff7..17e4eb7e8 100644 --- a/.cirrus.yml +++ b/.cirrus.yml @@ -89,20 +89,21 @@ task: type: text/plain macos_instance: - image: ghcr.io/cirruslabs/macos-monterey-xcode:latest + image: ghcr.io/cirruslabs/macos-sonoma-xcode:latest task: name: AppleM1/LLVM armv7-androidndk xbuild compile_script: - - brew install android-ndk + - brew install --cask android-ndk - export #PATH=/opt/homebrew/opt/llvm/bin:$PATH - export #LDFLAGS="-L/opt/homebrew/opt/llvm/lib" - - export #CPPFLAGS="-I/opt/homebrew/opt/llvm/include" - - ls /System/Volumes/Data/opt/homebrew + - export #CPPFLAGS="-I/opt/homebrew/opt/llvm/include" + - export ANDROID_NDK_HOME="/opt/homebrew/share/android-ndk" + - ls /opt/homebrew - ls -l /System/Volumes/Data/opt/homebrew/Caskroom/android-ndk - - find /System/Volumes/Data/opt/homebrew -name "armv7a-linux-androideabi*-ranlib" + - find /opt/homebrew -name "armv7a-linux-androideabi*-ranlib" - #export CC=/Applications/Xcode-13.4.1.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang - #export CFLAGS="-O2 -unwindlib=none -Wno-macro-redefined -isysroot /Applications/Xcode-13.4.1.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS16.0.sdk -arch arm64 -miphoneos-version-min=10.0" - - export CC=/System/Volumes/Data/opt/homebrew/Caskroom/android-ndk/26d/AndroidNDK*.app/Contents/NDK/toolchains/llvm/prebuilt/darwin-x86_64/bin/armv7a-linux-androideabi23-clang + - export CC=/System/Volumes/Data/opt/homebrew/Caskroom/android-ndk/27/AndroidNDK*.app/Contents/NDK/toolchains/llvm/prebuilt/darwin-x86_64/bin/armv7a-linux-androideabi23-clang - make TARGET=ARMV7 ARM_SOFTFP_ABI=1 NUM_THREADS=32 HOSTCC=clang NOFORTRAN=1 RANLIB="ls -l" always: config_artifacts: diff --git a/README.md b/README.md index 169087cec..45bcf10e7 100644 --- a/README.md +++ b/README.md @@ -85,6 +85,8 @@ Examples: make CC=loongcc FC=loongf95 HOSTCC=gcc TARGET=LOONGSON3A CROSS=1 CROSS_SUFFIX=mips64el-st-linux-gnu- NO_LAPACKE=1 NO_SHARED=1 BINARY=32 ``` +When compiling for a more modern CPU TARGET of the same architecture, e.g. TARGET=SKYLAKEX on a HASWELL host, option "CROSS=1" can be used to suppress the automatic invocation of the tests at the end of the build. + ### Debug version A debug version can be built using `make DEBUG=1`. diff --git a/cpuid_x86.c b/cpuid_x86.c index f77cca1d8..9b2b7a51e 100644 --- a/cpuid_x86.c +++ b/cpuid_x86.c @@ -1527,6 +1527,19 @@ int get_cpuname(void){ break; case 10: //family 6 exmodel 10 switch (model) { + case 13: // Granite Rapids + if(support_amx_bf16()) + return CPUTYPE_SAPPHIRERAPIDS; + if(support_avx512_bf16()) + return CPUTYPE_COOPERLAKE; + if(support_avx512()) + return CPUTYPE_SKYLAKEX; + if(support_avx2()) + return CPUTYPE_HASWELL; + if(support_avx()) + return CPUTYPE_SANDYBRIDGE; + else + return CPUTYPE_NEHALEM; case 5: // Comet Lake H and S case 6: // Comet Lake U case 10: // Meteor Lake @@ -2352,8 +2365,22 @@ int get_coretype(void){ case 10: switch (model) { + case 13: // Granite Rapids + if(support_amx_bf16()) + return CORE_SAPPHIRERAPIDS; + if(support_avx512_bf16()) + return CORE_COOPERLAKE; + if(support_avx512()) + return CORE_SKYLAKEX; + if(support_avx2()) + return CORE_HASWELL; + if(support_avx()) + return CORE_SANDYBRIDGE; + else + return CORE_NEHALEM; case 5: // Comet Lake H and S case 6: // Comet Lake U + case 10: // Meteor Lake if(support_avx()) #ifndef NO_AVX2 return CORE_HASWELL; @@ -2362,6 +2389,7 @@ int get_coretype(void){ #endif else return CORE_NEHALEM; + case 0: // Meteor Lake case 7:// Rocket Lake #ifndef NO_AVX512 if(support_avx512()) diff --git a/driver/others/blas_server.c b/driver/others/blas_server.c index 765511d8c..b9a7674c1 100644 --- a/driver/others/blas_server.c +++ b/driver/others/blas_server.c @@ -1076,6 +1076,8 @@ fprintf(STDERR, "Server[%2ld] Calculation started. Mode = 0x%03x M = %3ld N=%3l main_status[cpu] = MAIN_RUNNING1; #endif +if (buffer == NULL) blas_thread_buffer[cpu] = blas_memory_alloc(2); + //For target LOONGSON3R5, applying an offset to the buffer is essential //for minimizing cache conflicts and optimizing performance. #if defined(ARCH_LOONGARCH64) && !defined(NO_AFFINITY) diff --git a/exports/gensymbol b/exports/gensymbol index 28dd883f2..d53b98051 100755 --- a/exports/gensymbol +++ b/exports/gensymbol @@ -880,10 +880,8 @@ lapackobjs2c="$lapackobjs2c # clatrs3 lapackobjs2d="$lapackobjs2d - dgelqs dgelst dgeqp3rk - dgeqrs dlaqp2rk dlaqp3rk dlarmm @@ -897,10 +895,8 @@ lapackobjs2d="$lapackobjs2d # dlaqz4 lapackobjs2z="$lapackobjs2z - zgelqs zgelst zgeqp3rk - zgeqrs zlaqp2rk zlaqp3rk zlatrs3 @@ -918,6 +914,7 @@ lapack_extendedprecision_objs=" " lapack_deprecated_objsc=" + cgelqs cgeqrs cgegs cggsvd cgegv cggsvp cgelsx clahrd @@ -926,6 +923,7 @@ lapack_deprecated_objsc=" " lapack_deprecated_objsd=" + dgelqs dgeqrs dgegs dgeqpf dgegv dggsvd dgelsx dggsvp @@ -933,6 +931,8 @@ lapack_deprecated_objsd=" dlatzm dtzrqf" lapack_deprecated_objss=" + sgelqs + sgeqrs sgelsx sgegs sgegv @@ -945,6 +945,8 @@ lapack_deprecated_objss=" " lapack_deprecated_objsz=" + zgelqs + zgeqrs zgegs zgegv zgelsx diff --git a/kernel/mips64/cgemm_kernel_loongson3a_4x2_ps.S b/kernel/mips64/cgemm_kernel_loongson3a_4x2_ps.S index 489b12445..f2d05faf8 100644 --- a/kernel/mips64/cgemm_kernel_loongson3a_4x2_ps.S +++ b/kernel/mips64/cgemm_kernel_loongson3a_4x2_ps.S @@ -131,11 +131,11 @@ sd $21, 40($sp) sd $22, 48($sp) - ST $f24, 56($sp) - ST $f25, 64($sp) - ST $f26, 72($sp) - ST $f27, 80($sp) - ST $f28, 88($sp) + sdc1 $f24, 56($sp) + sdc1 $f25, 64($sp) + sdc1 $f26, 72($sp) + sdc1 $f27, 80($sp) + sdc1 $f28, 88($sp) #if defined(TRMMKERNEL) sd $23, 96($sp) @@ -146,10 +146,10 @@ #endif #ifndef __64BIT__ - ST $f20,120($sp) - ST $f21,128($sp) - ST $f22,136($sp) - ST $f23,144($sp) + sdc1 $f20,120($sp) + sdc1 $f21,128($sp) + sdc1 $f22,136($sp) + sdc1 $f23,144($sp) #endif .align 4 @@ -4000,11 +4000,11 @@ ld $21, 40($sp) ld $22, 48($sp) - LD $f24, 56($sp) - LD $f25, 64($sp) - LD $f26, 72($sp) - LD $f27, 80($sp) - LD $f28, 88($sp) + ldc1 $f24, 56($sp) + ldc1 $f25, 64($sp) + ldc1 $f26, 72($sp) + ldc1 $f27, 80($sp) + ldc1 $f28, 88($sp) #if defined(TRMMKERNEL) ld $23, 96($sp) @@ -4013,10 +4013,10 @@ #endif #ifndef __64BIT__ - LD $f20,120($sp) - LD $f21,128($sp) - LD $f22,136($sp) - LD $f23,144($sp) + ldc1 $f20,120($sp) + ldc1 $f21,128($sp) + ldc1 $f22,136($sp) + ldc1 $f23,144($sp) #endif daddiu $sp,$sp,STACKSIZE diff --git a/test/compare_sgemm_sbgemm.c b/test/compare_sgemm_sbgemm.c index 395317441..cd508a0cf 100644 --- a/test/compare_sgemm_sbgemm.c +++ b/test/compare_sgemm_sbgemm.c @@ -29,6 +29,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "../common.h" #define SGEMM BLASFUNC(sgemm) #define SBGEMM BLASFUNC(sbgemm) +#define SGEMV BLASFUNC(sgemv) +#define SBGEMV BLASFUNC(sbgemv) typedef union { unsigned short v; @@ -187,7 +189,79 @@ main (int argc, char *argv[]) free(CC); } - if (ret != 0) + if (ret != 0) { fprintf (stderr, "FATAL ERROR SBGEMM - Return code: %d\n", ret); + return ret; + } + + k = 1; + for (x = 1; x <= loop; x++) + { + float *A = (float *)malloc(x * x * sizeof(FLOAT)); + float *B = (float *)malloc(x * sizeof(FLOAT)); + float *C = (float *)malloc(x * sizeof(FLOAT)); + bfloat16_bits *AA = (bfloat16_bits *)malloc(x * x * sizeof(bfloat16_bits)); + bfloat16_bits *BB = (bfloat16_bits *)malloc(x * sizeof(bfloat16_bits)); + float *DD = (float *)malloc(x * sizeof(FLOAT)); + float *CC = (float *)malloc(x * sizeof(FLOAT)); + if ((A == NULL) || (B == NULL) || (C == NULL) || (AA == NULL) || (BB == NULL) || + (DD == NULL) || (CC == NULL)) + return 1; + bfloat16 atmp, btmp; + blasint one = 1; + + for (j = 0; j < x; j++) + { + for (i = 0; i < x; i++) + { + A[j * x + i] = ((FLOAT) rand () / (FLOAT) RAND_MAX) + 0.5; + sbstobf16_(&one, &A[j*x+i], &one, &atmp, &one); + AA[j * x + i].v = atmp; + } + B[j] = ((FLOAT) rand () / (FLOAT) RAND_MAX) + 0.5; + sbstobf16_(&one, &B[j], &one, &btmp, &one); + BB[j].v = btmp; + } + for (y = 0; y < 2; y++) + { + if (y == 0) { + transA = 'N'; + } else { + transA = 'T'; + } + + memset(CC, 0, x * sizeof(FLOAT)); + memset(DD, 0, x * sizeof(FLOAT)); + memset(C, 0, x * sizeof(FLOAT)); + + SGEMV (&transA, &x, &x, &alpha, A, &x, B, &k, &beta, C, &k); + SBGEMV (&transA, &x, &x, &alpha, (bfloat16*) AA, &x, (bfloat16*) BB, &k, &beta, CC, &k); + + for (j = 0; j < x; j++) + for (i = 0; i < x; i++) + if (transA == 'N') { + DD[i] += float16to32 (AA[j * x + i]) * float16to32 (BB[j]); + } else if (transA == 'T') { + DD[j] += float16to32 (AA[j * x + i]) * float16to32 (BB[i]); + } + + for (j = 0; j < x; j++) { + if (fabs (CC[j] - C[j]) > 1.0) + ret++; + if (fabs (CC[j] - DD[j]) > 1.0) + ret++; + } + } + free(A); + free(B); + free(C); + free(AA); + free(BB); + free(DD); + free(CC); + } + + if (ret != 0) + fprintf (stderr, "FATAL ERROR SBGEMV - Return code: %d\n", ret); return ret; }