diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..503f6e4 --- /dev/null +++ b/.gitignore @@ -0,0 +1,6 @@ +.DS_Store +*.bak +*.pyc +*.o +*/build/ +cp_template/*.yaml \ No newline at end of file diff --git a/README.md b/README.md index 1c017fd..cc999fe 100644 --- a/README.md +++ b/README.md @@ -32,7 +32,52 @@ --- -## 📥 如何参与提交? +## 🚀 快速上手 + +本竞赛旨在评估参赛者在GPU并行计算领域的算法优化能力。为了快速让参赛者进入比赛状态,可选择实现三个核心算法的高性能版本: +- **ReduceSum**: 高精度归约求和 +- **SortPair**: 键值对稳定排序 +- **TopkPair**: 键值对TopK选择 + +### 📥 + +### 编译和测试 + +#### 1. 全量编译和运行 +```bash +# 编译并运行所有算法测试(默认行为) +./run.sh + +# 仅编译所有算法,不运行测试 +./run.sh --build-only + +# 编译并运行单个算法测试 +./run.sh --run_reduce # ReduceSum算法 +./run.sh --run_sort # SortPair算法 +./run.sh --run_topk # TopkPair算法 +``` + +#### 2. 单独编译和运行 +```bash +# 编译并运行ReduceSum算法(默认行为) +./run_reduce_sum.sh + +# 仅编译ReduceSum算法,不运行测试 +./run_reduce_sum.sh --build-only + +# 编译并运行SortPair正确性测试 +./run_sort_pair.sh --run correctness + +# 编译并运行TopkPair性能测试 +./run_topk_pair.sh --run performance +``` + +#### 3. 手动运行测试 +```bash +./build/test_reducesum [correctness|performance|all] +./build/test_sortpair [correctness|performance|all] +./build/test_topkpair [correctness|performance|all] +``` ### ✅ 参赛要求: - 提交内容必须可以在沐曦自研 GPU **曦云 C500** 上运行。 @@ -72,7 +117,7 @@ ## 🏅 排名规则 -- 比赛周期:2 个月 +- 比赛周期:2 个月 - 排名按累计得分排序,取前 12 名! 
若得分相同: diff --git a/S1/ICTN0N/build/test_reducesum b/S1/ICTN0N/build/test_reducesum new file mode 100755 index 0000000..7d95691 Binary files /dev/null and b/S1/ICTN0N/build/test_reducesum differ diff --git a/S1/ICTN0N/build/test_sortpair b/S1/ICTN0N/build/test_sortpair new file mode 100755 index 0000000..67cb03c Binary files /dev/null and b/S1/ICTN0N/build/test_sortpair differ diff --git a/S1/ICTN0N/build/test_topkpair b/S1/ICTN0N/build/test_topkpair new file mode 100755 index 0000000..ee5a5d7 Binary files /dev/null and b/S1/ICTN0N/build/test_topkpair differ diff --git a/S1/ICTN0N/reduce_sum_performance.yaml b/S1/ICTN0N/reduce_sum_performance.yaml new file mode 100644 index 0000000..1e8822a --- /dev/null +++ b/S1/ICTN0N/reduce_sum_performance.yaml @@ -0,0 +1,26 @@ +# ReduceSum算法性能测试结果 +# 生成时间: 2025-09-03 22:34:18 + +algorithm: "ReduceSum" +data_types: + input: "float" + output: "float" +formulas: + throughput: "elements / time(s) / 1e9 (G/s)" +performance_data: + - data_size: 1000000 + time_ms: 0.048717 + throughput_gps: 20.526799 + data_type: "float" + - data_size: 134217728 + time_ms: 0.402560 + throughput_gps: 333.410496 + data_type: "float" + - data_size: 536870912 + time_ms: 1.346586 + throughput_gps: 398.690510 + data_type: "float" + - data_size: 1073741824 + time_ms: 2.639513 + throughput_gps: 406.795353 + data_type: "float" diff --git a/S1/ICTN0N/sort_pair_performance.yaml b/S1/ICTN0N/sort_pair_performance.yaml new file mode 100644 index 0000000..9af8853 --- /dev/null +++ b/S1/ICTN0N/sort_pair_performance.yaml @@ -0,0 +1,46 @@ +# SortPair算法性能测试结果 +# 生成时间: 2025-09-03 22:37:18 + +algorithm: "SortPair" +data_types: + key_type: "float" + value_type: "uint32_t" +formulas: + throughput: "elements / time(s) / 1e9 (G/s)" +performance_data: + - data_size: 1000000 + ascending: + time_ms: 0.351488 + throughput_gps: 2.845047 + descending: + time_ms: 0.343270 + throughput_gps: 2.913155 + key_type: "float" + value_type: "uint32_t" + - data_size: 134217728 + ascending: + 
time_ms: 22.273815 + throughput_gps: 6.025808 + descending: + time_ms: 22.494003 + throughput_gps: 5.966823 + key_type: "float" + value_type: "uint32_t" + - data_size: 536870912 + ascending: + time_ms: 88.856277 + throughput_gps: 6.042014 + descending: + time_ms: 89.913918 + throughput_gps: 5.970943 + key_type: "float" + value_type: "uint32_t" + - data_size: 1073741824 + ascending: + time_ms: 181.409576 + throughput_gps: 5.918882 + descending: + time_ms: 183.428955 + throughput_gps: 5.853720 + key_type: "float" + value_type: "uint32_t" diff --git a/S1/ICTN0N/topk_pair_performance.yaml b/S1/ICTN0N/topk_pair_performance.yaml new file mode 100644 index 0000000..f8dab18 --- /dev/null +++ b/S1/ICTN0N/topk_pair_performance.yaml @@ -0,0 +1,210 @@ +# TopkPair算法性能测试结果 +# 生成时间: 2025-09-03 22:40:54 + +algorithm: "TopkPair" +data_types: + key_type: "float" + value_type: "uint32_t" +formulas: + throughput: "elements / time(s) / 1e9 (G/s)" +performance_data: + - data_size: 1000000 + k_value: 32 + ascending: + time_ms: 0.402509 + throughput_gps: 2.484418 + descending: + time_ms: 0.416307 + throughput_gps: 2.402072 + key_type: "float" + value_type: "uint32_t" + - data_size: 1000000 + k_value: 50 + ascending: + time_ms: 0.404787 + throughput_gps: 2.470434 + descending: + time_ms: 0.414669 + throughput_gps: 2.411563 + key_type: "float" + value_type: "uint32_t" + - data_size: 1000000 + k_value: 100 + ascending: + time_ms: 0.398336 + throughput_gps: 2.510443 + descending: + time_ms: 0.408320 + throughput_gps: 2.449060 + key_type: "float" + value_type: "uint32_t" + - data_size: 1000000 + k_value: 256 + ascending: + time_ms: 0.410752 + throughput_gps: 2.434559 + descending: + time_ms: 0.403379 + throughput_gps: 2.479057 + key_type: "float" + value_type: "uint32_t" + - data_size: 1000000 + k_value: 1024 + ascending: + time_ms: 0.391091 + throughput_gps: 2.556949 + descending: + time_ms: 0.391142 + throughput_gps: 2.556613 + key_type: "float" + value_type: "uint32_t" + - data_size: 
134217728 + k_value: 32 + ascending: + time_ms: 22.394062 + throughput_gps: 5.993452 + descending: + time_ms: 22.263729 + throughput_gps: 6.028538 + key_type: "float" + value_type: "uint32_t" + - data_size: 134217728 + k_value: 50 + ascending: + time_ms: 22.379187 + throughput_gps: 5.997435 + descending: + time_ms: 22.228352 + throughput_gps: 6.038132 + key_type: "float" + value_type: "uint32_t" + - data_size: 134217728 + k_value: 100 + ascending: + time_ms: 22.436581 + throughput_gps: 5.982094 + descending: + time_ms: 22.229326 + throughput_gps: 6.037868 + key_type: "float" + value_type: "uint32_t" + - data_size: 134217728 + k_value: 256 + ascending: + time_ms: 22.463232 + throughput_gps: 5.974996 + descending: + time_ms: 22.319946 + throughput_gps: 6.013354 + key_type: "float" + value_type: "uint32_t" + - data_size: 134217728 + k_value: 1024 + ascending: + time_ms: 22.468454 + throughput_gps: 5.973608 + descending: + time_ms: 22.335976 + throughput_gps: 6.009038 + key_type: "float" + value_type: "uint32_t" + - data_size: 536870912 + k_value: 32 + ascending: + time_ms: 89.437294 + throughput_gps: 6.002763 + descending: + time_ms: 88.605972 + throughput_gps: 6.059083 + key_type: "float" + value_type: "uint32_t" + - data_size: 536870912 + k_value: 50 + ascending: + time_ms: 89.460587 + throughput_gps: 6.001200 + descending: + time_ms: 88.546509 + throughput_gps: 6.063152 + key_type: "float" + value_type: "uint32_t" + - data_size: 536870912 + k_value: 100 + ascending: + time_ms: 89.203011 + throughput_gps: 6.018529 + descending: + time_ms: 88.809097 + throughput_gps: 6.045224 + key_type: "float" + value_type: "uint32_t" + - data_size: 536870912 + k_value: 256 + ascending: + time_ms: 89.500465 + throughput_gps: 5.998526 + descending: + time_ms: 88.743912 + throughput_gps: 6.049665 + key_type: "float" + value_type: "uint32_t" + - data_size: 536870912 + k_value: 1024 + ascending: + time_ms: 89.405357 + throughput_gps: 6.004908 + descending: + time_ms: 88.446083 + 
throughput_gps: 6.070036 + key_type: "float" + value_type: "uint32_t" + - data_size: 1073741824 + k_value: 32 + ascending: + time_ms: 182.233307 + throughput_gps: 5.892127 + descending: + time_ms: 181.076950 + throughput_gps: 5.929754 + key_type: "float" + value_type: "uint32_t" + - data_size: 1073741824 + k_value: 50 + ascending: + time_ms: 182.273239 + throughput_gps: 5.890836 + descending: + time_ms: 180.944550 + throughput_gps: 5.934093 + key_type: "float" + value_type: "uint32_t" + - data_size: 1073741824 + k_value: 100 + ascending: + time_ms: 182.374191 + throughput_gps: 5.887576 + descending: + time_ms: 181.277100 + throughput_gps: 5.923207 + key_type: "float" + value_type: "uint32_t" + - data_size: 1073741824 + k_value: 256 + ascending: + time_ms: 182.349457 + throughput_gps: 5.888374 + descending: + time_ms: 181.248199 + throughput_gps: 5.924152 + key_type: "float" + value_type: "uint32_t" + - data_size: 1073741824 + k_value: 1024 + ascending: + time_ms: 182.378326 + throughput_gps: 5.887442 + descending: + time_ms: 181.025803 + throughput_gps: 5.931430 + key_type: "float" + value_type: "uint32_t" diff --git a/cp_guide.md b/cp_run_guide.md similarity index 51% rename from cp_guide.md rename to cp_run_guide.md index 3385ae0..b7c4ca2 100644 --- a/cp_guide.md +++ b/cp_run_guide.md @@ -1,59 +1,12 @@ # GPU 高性能并行计算算法优化竞赛 -## 🎯 竞赛概述 - -本竞赛旨在评估参赛者在GPU并行计算领域的算法优化能力。参赛者可选择实现三个核心算法的高性能版本: -- **ReduceSum**: 高精度归约求和 -- **SortPair**: 键值对稳定排序 -- **TopkPair**: 键值对TopK选择 - -## 🚀 快速开始 - -### 编译和测试 - -#### 1. 全量编译和运行 -```bash -# 编译并运行所有算法测试(默认行为) -./build_and_run.sh - -# 仅编译所有算法,不运行测试 -./build_and_run.sh --build-only - -# 编译并运行单个算法测试 -./build_and_run.sh --run_reduce # ReduceSum算法 -./build_and_run.sh --run_sort # SortPair算法 -./build_and_run.sh --run_topk # TopkPair算法 -``` - -#### 2. 
单独编译和运行 -```bash -# 编译并运行ReduceSum算法(默认行为) -./build_and_run_reduce_sum.sh - -# 仅编译ReduceSum算法,不运行测试 -./build_and_run_reduce_sum.sh --build-only - -# 编译并运行SortPair正确性测试 -./build_and_run_sort_pair.sh --run correctness - -# 编译并运行TopkPair性能测试 -./build_and_run_topk_pair.sh --run performance -``` - -#### 3. 手动运行测试 -```bash -./build/test_reducesum [correctness|performance|all] -./build/test_sortpair [correctness|performance|all] -./build/test_topkpair [correctness|performance|all] -``` - ## 📝 参赛指南 ### 实现位置 参赛者需要在以下文件中替换Thrust实现: -- `src/reduce_sum_algorithm.maca` - 替换Thrust归约求和 -- `src/sort_pair_algorithm.maca` - 替换Thrust稳定排序 -- `src/topk_pair_algorithm.maca` - 替换Thrust TopK选择 +- `reduce_sum_algorithm.maca` - 替换Thrust归约求和 +- `sort_pair_algorithm.maca` - 替换Thrust稳定排序 +- `topk_pair_algorithm.maca` - 替换Thrust TopK选择 ### 算法要求 见competition_parallel_algorithms.md @@ -92,25 +45,21 @@ - 各数据规模的详细性能数据 - 升序/降序分别统计(适用时) -## 📁 项目结构 +## 📁 提交内容结构 ``` -├── build_and_run.sh # 统一编译和运行脚本(默认编译+运行所有算法) -├── build_common.sh # 公共编译配置和函数 -├── build_and_run_reduce_sum.sh # ReduceSum独立编译和运行脚本 -├── build_and_run_sort_pair.sh # SortPair独立编译和运行脚本 -├── build_and_run_topk_pair.sh # TopkPair独立编译和运行脚本 +├── run.sh # 统一编译和运行脚本(默认编译+运行所有算法) ├── competition_parallel_algorithms.md # 详细题目说明 -├── src/ # 算法实现和工具文件 -│ ├── reduce_sum_algorithm.maca # 1. ReduceSum测试程序 -│ ├── sort_pair_algorithm.maca # 2. SortPair测试程序 -│ ├── topk_pair_algorithm.maca # 3. TopkPair测试程序 +├── reduce_sum_algorithm.maca # 1. ReduceSum测试程序 +├── sort_pair_algorithm.maca # 2. SortPair测试程序 +├── topk_pair_algorithm.maca # 3. 
TopkPair测试程序 +├── utils/ # 工具文件 │ ├── test_utils.h # 测试工具和CPU参考实现 │ ├── yaml_reporter.h # YAML性能报告生成器 │ └── performance_utils.h # 性能测试工具 -├── final_results/reduce_sum_results.yaml #ReduceSum性能数据 -├── final_results/sort_pair_results.yaml #替换Thrust稳定排序 -└── final_results/topk_pair_results.yaml #TopkPair性能数据 +├── reduce_sum_results.yaml #ReduceSum性能数据 +├── sort_pair_results.yaml #SortPair性能数据 +└── topk_pair_results.yaml #TopkPair性能数据 ``` ## 🔧 开发工具 @@ -134,7 +83,7 @@ mxcc -O3 -std=c++17 --extended-lambda -Isrc |--------|--------|------| | `COMPILER` | `mxcc` | CUDA编译器路径 | | `COMPILER_FLAGS` | `-O3 -std=c++17 --extended-lambda` | 编译标志 | -| `INCLUDE_DIR` | `src` | 头文件目录 | +| `HEADER_DIR` | `utils` | 头文件目录 | | `BUILD_DIR` | `build` | 构建输出目录 | ### 调试模式 diff --git a/competition_parallel_algorithms.md b/cp_template/competition_parallel_algorithms.md similarity index 99% rename from competition_parallel_algorithms.md rename to cp_template/competition_parallel_algorithms.md index 6cf1efd..70bf630 100644 --- a/competition_parallel_algorithms.md +++ b/cp_template/competition_parallel_algorithms.md @@ -1,11 +1,11 @@ -# 题目: +# 样例赛题说明 + ## GPU高性能并行计算算法优化 要求参赛者通过一个或多个global kernel 函数(允许配套 device 辅助函数),实现高性能算法。 在正确性、稳定性前提下,比拼算法性能。 - # 1. ReduceSum算法优化 ```cpp template @@ -23,14 +23,12 @@ public: * 系统将测试评估1M, 128M, 512M, 1G element number下的算法性能 * 假定输入d\_in数据量为num\_items - 注意事项 * 累计误差不大于cpu double golden基准的0.5% * 注意针对NAN和INF等异常值的处理 - 加分项 * 使用tensor core计算reduce @@ -62,14 +60,11 @@ public: * 需要校验结果正确性 * 结果必须稳定排序 - 加分项 * 支持其他不同数据类型的排序,如half、double、int32_t等 * 覆盖更全面的数据范围,提供良好稳定的性能表现 - - # 3. 
Topk Pair算法优化 ```cpp template @@ -95,7 +90,6 @@ public: * 结果必须稳定排序 - 加分项 * 支持其他不同数据类型的键值对,实现类型通用算法 diff --git a/run.sh b/cp_template/run.sh similarity index 99% rename from run.sh rename to cp_template/run.sh index d6b612a..a437ff8 100644 --- a/run.sh +++ b/cp_template/run.sh @@ -36,11 +36,11 @@ COMPILER=${COMPILER:-mxcc} COMPILER_FLAGS=${COMPILER_FLAGS:-"-O3 -std=c++17 --extended-lambda -DRUN_FULL_TEST"} # ***** 这里是关键修改点1:头文件目录 ***** -# 现在头文件在 includes/ 目录下 +# 现在头文件在 utils/ 目录下 HEADER_DIR=${HEADER_DIR:-utils} # ***** 这里是关键修改点2:源文件目录 ***** -# 现在源文件在 algorithms/ 目录下 +# 现在源文件在 ./ 目录下 SOURCE_CODE_DIR=${SOURCE_CODE_DIR:-} BUILD_DIR=${BUILD_DIR:-build} diff --git a/utils/performance_utils.h b/cp_template/utils/performance_utils.h similarity index 100% rename from utils/performance_utils.h rename to cp_template/utils/performance_utils.h diff --git a/utils/test_utils.h b/cp_template/utils/test_utils.h similarity index 100% rename from utils/test_utils.h rename to cp_template/utils/test_utils.h diff --git a/utils/yaml_reporter.h b/cp_template/utils/yaml_reporter.h similarity index 100% rename from utils/yaml_reporter.h rename to cp_template/utils/yaml_reporter.h