[WIP] 重构样板赛题

2025-09-03 23:03:47 +08:00
parent ab43e17fc9
commit 79e4fd6ab1
14 changed files with 352 additions and 76 deletions
--- a/cp_template/competition_parallel_algorithms.md
+++ b/cp_template/competition_parallel_algorithms.md
@@ -0,0 +1,97 @@
+# 样例赛题说明
+
+## GPU高性能并行计算算法优化
+
+要求参赛者通过一个或多个global kernel 函数（允许配套 device 辅助函数），实现高性能算法。
+
+在正确性、稳定性前提下，比拼算法性能。
+
+# 1. ReduceSum算法优化
+```cpp
+template <typename InputT = float, typename OutputT = float>
+class ReduceSumAlgorithm {
+public:
+    // 主要接口函数 - 参赛者需要实现这个函数
+    void reduce(const InputT* d_in, OutputT* d_out, int num_items, OutputT init_value) {
+        // TODO
+    }
+};
+```
+其中
+
+* 数据类型：InputT: float,  OutputT: float
+* 系统将分别在 1M、128M、512M、1G 个元素的数据规模下测试并评估算法性能
+* 输入 d\_in 的元素个数为 num\_items
+
+注意事项
+
+* 累计误差不得超过 CPU double 精度基准（golden）结果的 0.5%
+* 注意对 NaN、Inf 等特殊值的处理
+
+
+加分项
+
+* 使用tensor core计算reduce
+* 覆盖更全面的数据范围，提供良好稳定的性能表现
+
+
+# 2. Sort Pair算法优化
+```cpp
+template <typename KeyType, typename ValueType>
+class SortPairAlgorithm {
+public:
+    // 主要接口函数 - 参赛者需要实现这个函数
+    void sort(const KeyType* d_keys_in, KeyType* d_keys_out,
+              const ValueType* d_values_in, ValueType* d_values_out,
+              int num_items, bool descending) {
+                // TODO
+              }
+};
+```
+其中
+
+* 数据类型：key: float, value: int32\_t
+* 系统将分别在 1M、128M、512M、1G 个元素的数据规模下测试并评估算法性能
+* 输入、输出的 key 和 value 的元素个数一致，均为 num\_items
+
+
+注意事项
+
+* 需要校验结果正确性
+* 结果必须稳定排序
+
+加分项
+
+* 支持其他不同数据类型的排序，如half、double、int32_t等
+* 覆盖更全面的数据范围，提供良好稳定的性能表现
+
+# 3. Topk Pair算法优化
+```cpp
+template <typename KeyType, typename ValueType>
+class TopkPairAlgorithm {
+public:
+    // 主要接口函数 - 参赛者需要实现这个函数
+    void topk(const KeyType* d_keys_in, KeyType* d_keys_out,
+              const ValueType* d_values_in, ValueType* d_values_out,
+              int num_items, int k, bool descending) {
+              // TODO
+              }
+};
+```
+其中
+
+* 数据类型：key: float, value: int32\_t
+* 系统将分别在 1M、128M、512M、1G 个元素的数据规模下测试并评估算法性能
+* 输入的 key 和 value 的元素个数一致，均为 num\_items；输出的 key 和 value 的元素个数一致，均为 k
+* k 的取值：32、50、100、256、1024，且 k 不大于 num\_items
+
+
+注意事项
+
+* 结果必须稳定排序
+
+加分项
+
+* 支持其他不同数据类型的键值对，实现类型通用算法
+* 覆盖更全面的数据范围，提供良好稳定的性能表现
+
--- a/cp_template/run.sh
+++ b/cp_template/run.sh
@@ -0,0 +1,274 @@
+#!/bin/bash
+
+# GPU高性能并行计算算法优化竞赛 - 统一编译和运行脚本
+# 整合了所有算法的编译、运行和公共配置
+
+# ============================================================================
+# 公共配置和工具函数
+# ============================================================================
+
+# 设置颜色
+RED='\033[0;31m'
+GREEN='\033[0;32m'
+BLUE='\033[0;34m'
+YELLOW='\033[0;33m'
+NC='\033[0m' # No Color
+
+# Print helpers
+# print_info: write $1 to stdout behind a "[INFO]" tag coloured with BLUE.
+print_info() {
+    printf '%b\n' "${BLUE}[INFO]${NC} $1"
+}
+
+# print_success: write $1 to stdout behind a "[SUCCESS]" tag coloured with GREEN.
+print_success() {
+    printf '%b\n' "${GREEN}[SUCCESS]${NC} $1"
+}
+
+# print_error: write $1 to stdout behind an "[ERROR]" tag coloured with RED.
+print_error() {
+    printf '%b\n' "${RED}[ERROR]${NC} $1"
+}
+
+# print_warning: write $1 to stdout behind a "[WARNING]" tag coloured with YELLOW.
+print_warning() {
+    printf '%b\n' "${YELLOW}[WARNING]${NC} $1"
+}
+
+# 编译配置 - 可通过环境变量自定义
+COMPILER=${COMPILER:-mxcc}
+COMPILER_FLAGS=${COMPILER_FLAGS:-"-O3 -std=c++17 --extended-lambda -DRUN_FULL_TEST"} 
+
+# ***** 这里是关键修改点1：头文件目录 *****
+# 现在头文件在 utils/ 目录下
+HEADER_DIR=${HEADER_DIR:-utils} 
+
+# ***** 这里是关键修改点2：源文件目录 *****
+# 现在源文件在 ./ 目录下
+SOURCE_CODE_DIR=${SOURCE_CODE_DIR:-} 
+
+BUILD_DIR=${BUILD_DIR:-build}
+
+# 编译单个算法的通用函数
+# 参数: $1=算法名称, $2=源文件名（不含路径）
+compile_algorithm() {
+    local algo_name="$1"
+    local source_file_name="$2" # 例如 "reduce_sum_algorithm.maca"
+    local target_file="$BUILD_DIR/test_${algo_name,,}"  # 转换为小写
+    
+    print_info "编译 $algo_name 算法..."
+    
+    # 创建构建目录
+    mkdir -p "$BUILD_DIR"
+    
+    # ***** 这里是关键修改点3：编译命令 *****
+    # -I$HEADER_DIR 用于告诉编译器头文件在哪里
+    # $SOURCE_CODE_DIR/$source_file_name 用于指定要编译的源文件的完整路径
+    local compile_cmd="$COMPILER $COMPILER_FLAGS -I$HEADER_DIR $source_file_name -o $target_file"
+    
+    print_info "执行: $compile_cmd"
+    
+    if $compile_cmd; then
+        print_success "$algo_name 编译完成!"
+        echo ""
+        echo "运行测试:"
+        echo "   ./$target_file [correctness|performance|all]"
+        return 0
+    else
+        print_error "$algo_name 编译失败!"
+        return 1
+    fi
+}
+
+# Print the effective build configuration, one "NAME: value" line per
+# setting, followed by an empty line.
+show_build_config() {
+    print_info "编译配置:"
+    # printf re-applies the format to each NAME/value pair
+    printf '   %s: %s\n' \
+        "COMPILER" "$COMPILER" \
+        "COMPILER_FLAGS" "$COMPILER_FLAGS" \
+        "HEADER_DIR" "$HEADER_DIR" \
+        "SOURCE_CODE_DIR" "$SOURCE_CODE_DIR" \
+        "BUILD_DIR" "$BUILD_DIR"
+    printf '\n'
+}
+
+# 运行单个测试
+run_single_test() {
+    local algo_name="$1"
+    local test_mode="${2:-all}"
+    local test_file="$BUILD_DIR/test_${algo_name,,}"
+    
+    if [ -f "$test_file" ]; then
+        print_info "运行 $algo_name 测试 (模式: $test_mode)..."
+        "./$test_file" "$test_mode"
+        return $?
+    else
+        print_error "$algo_name 测试程序不存在: $test_file"
+        return 1
+    fi
+}
+
+# ============================================================================
+# 主脚本逻辑
+# ============================================================================
+
+# Print the usage text (all options plus examples) to stdout.
+show_help() {
+    cat <<EOF
+GPU算法竞赛统一编译和运行脚本
+用法: $0 [选项]
+
+选项:
+  --help              显示帮助信息
+  --build-only        仅编译所有算法，不运行测试
+  --run_reduce [MODE] 编译并运行ReduceSum算法测试 (MODE: correctness|performance|all, 默认all)
+  --run_sort [MODE]   编译并运行SortPair算法测试 (MODE: correctness|performance|all, 默认all)
+  --run_topk [MODE]   编译并运行TopkPair算法测试 (MODE: correctness|performance|all, 默认all)
+
+示例:
+  $0                  # 编译并运行所有测试（默认行为）
+  $0 --build-only     # 仅编译所有算法
+  $0 --run_sort performance # 编译并运行SortPair性能测试
+
+EOF
+}
+
+# 解析命令行参数
+RUN_MODE="run_all"  # 默认为编译并运行所有测试
+ALGO_TO_RUN=""      # 记录要运行的单个算法
+SINGLE_ALGO_TEST_MODE="all" # 单个算法的测试模式
+
+while [[ $# -gt 0 ]]; do
+    case $1 in
+        --help)
+            show_help
+            exit 0
+            ;;
+        --build-only)
+            RUN_MODE="build_only"
+            shift
+            ;;
+        --run_reduce)
+            RUN_MODE="run_single"
+            ALGO_TO_RUN="ReduceSum"
+            if [[ -n "$2" && "$2" != --* ]]; then
+                SINGLE_ALGO_TEST_MODE="$2"
+                shift
+            fi
+            shift
+            ;;
+        --run_sort)
+            RUN_MODE="run_single"
+            ALGO_TO_RUN="SortPair"
+            if [[ -n "$2" && "$2" != --* ]]; then
+                SINGLE_ALGO_TEST_MODE="$2"
+                shift
+            fi
+            shift
+            ;;
+        --run_topk)
+            RUN_MODE="run_single"
+            ALGO_TO_RUN="TopkPair"
+            if [[ -n "$2" && "$2" != --* ]]; then
+                SINGLE_ALGO_TEST_MODE="$2"
+                shift
+            fi
+            shift
+            ;;
+        *)
+            print_error "未知选项: $1"
+            show_help
+            exit 1
+            ;;
+    esac
+done
+
+if [ "$RUN_MODE" = "build_only" ]; then
+    print_info "开始编译所有算法..."
+else
+    print_info "开始编译并运行所有算法..."
+fi
+print_info "工作目录: $(pwd)"
+print_info "编译时间: $(date '+%Y-%m-%d %H:%M:%S')"
+show_build_config
+
+# 清理构建目录
+if [ -d "$BUILD_DIR" ]; then
+    print_info "清理现有构建目录: $BUILD_DIR"
+    rm -rf "$BUILD_DIR"
+fi
+
+# 核心逻辑：根据 RUN_MODE 执行操作
+case "$RUN_MODE" in
+    "build_only")
+        print_info "编译所有算法..."
+        
+        # 直接调用 compile_algorithm 函数
+        print_info "[1/3] 编译ReduceSum..."
+        if ! compile_algorithm "ReduceSum" "reduce_sum_algorithm.maca"; then
+            print_error "ReduceSum编译失败"
+            exit 1
+        fi
+        
+        print_info "[2/3] 编译SortPair..."
+        if ! compile_algorithm "SortPair" "sort_pair_algorithm.maca"; then
+            print_error "SortPair编译失败"
+            exit 1
+        fi
+        
+        print_info "[3/3] 编译TopkPair..."
+        if ! compile_algorithm "TopkPair" "topk_pair_algorithm.maca"; then
+            print_error "TopkPair编译失败"
+            exit 1
+        fi
+        
+        print_success "所有算法编译完成!"
+        echo ""
+        echo "可执行文件:"
+        echo "  $BUILD_DIR/test_reducesum    - ReduceSum算法测试"
+        echo "  $BUILD_DIR/test_sortpair     - SortPair算法测试"
+        echo "  $BUILD_DIR/test_topkpair     - TopkPair算法测试"
+        echo ""
+        echo "使用方法:"
+        echo "  ./$BUILD_DIR/test_reducesum [correctness|performance|all]"
+        echo "  ./$BUILD_DIR/test_sortpair [correctness|performance|all]"
+        echo "  ./$BUILD_DIR/test_topkpair [correctness|performance|all]"
+        ;;
+        
+    "run_all")
+        print_info "编译并运行所有算法测试..."
+        
+        # 直接调用 compile_algorithm 和 run_single_test 函数
+        print_info "[1/3] ReduceSum..."
+        if compile_algorithm "ReduceSum" "reduce_sum_algorithm.maca"; then
+            run_single_test "ReduceSum" "all"
+        else
+            exit 1
+        fi
+        
+        print_info "[2/3] SortPair..."
+        if compile_algorithm "SortPair" "sort_pair_algorithm.maca"; then
+            run_single_test "SortPair" "all"
+        else
+            exit 1
+        fi
+        
+        print_info "[3/3] TopkPair..."
+        if compile_algorithm "TopkPair" "topk_pair_algorithm.maca"; then
+            run_single_test "TopkPair" "all"
+        else
+            exit 1
+        fi
+        
+        print_success "所有测试完成!"
+        ;;
+        
+    "run_single")
+        print_info "编译并运行 ${ALGO_TO_RUN} 测试 (模式: ${SINGLE_ALGO_TEST_MODE})..."
+        local source_file_name=""
+        case "$ALGO_TO_RUN" in
+            "ReduceSum") source_file_name="reduce_sum_algorithm.maca" ;;
+            "SortPair") source_file_name="sort_pair_algorithm.maca" ;;
+            "TopkPair") source_file_name="topk_pair_algorithm.maca" ;;
+        esac
+
+        if compile_algorithm "$ALGO_TO_RUN" "$source_file_name"; then
+            run_single_test "$ALGO_TO_RUN" "$SINGLE_ALGO_TEST_MODE"
+        else
+            exit 1
+        fi
+        ;;
+esac
--- a/cp_template/utils/performance_utils.h
+++ b/cp_template/utils/performance_utils.h
@@ -0,0 +1,114 @@
+#pragma once
+#include <iostream>
+#include <iomanip>
+#include <string>
+
+// ============================================================================
+// 性能计算和显示工具
+// ============================================================================
+
+// Converts an element count and a measured time into throughput figures.
+class PerformanceCalculator {
+public:
+    // ReduceSum metrics
+    struct ReduceSumMetrics {
+        double throughput_gps;  // G elements/s
+    };
+
+    // size: number of elements processed; time_ms: elapsed time in milliseconds.
+    static ReduceSumMetrics calculateReduceSum(int size, float time_ms) {
+        ReduceSumMetrics metrics;
+        metrics.throughput_gps = throughputGps(size, time_ms);
+        return metrics;
+    }
+
+    // SortPair metrics
+    struct SortPairMetrics {
+        double throughput_gps;  // G elements/s
+    };
+
+    // size: number of elements processed; time_ms: elapsed time in milliseconds.
+    static SortPairMetrics calculateSortPair(int size, float time_ms) {
+        SortPairMetrics metrics;
+        metrics.throughput_gps = throughputGps(size, time_ms);
+        return metrics;
+    }
+
+    // TopkPair metrics
+    struct TopkPairMetrics {
+        double throughput_gps;  // G elements/s
+    };
+
+    // size: number of input elements; time_ms: elapsed time in milliseconds.
+    // k is accepted for interface symmetry but does not enter the formula.
+    static TopkPairMetrics calculateTopkPair(int size, int k, float time_ms) {
+        (void)k;
+        TopkPairMetrics metrics;
+        metrics.throughput_gps = throughputGps(size, time_ms);
+        return metrics;
+    }
+
+private:
+    // elements / time(s) / 1e9. A time that is zero, negative or NaN would
+    // produce inf/NaN, so 0 is reported for such measurements instead.
+    static double throughputGps(int size, float time_ms) {
+        if (!(time_ms > 0.0f)) {
+            return 0.0;
+        }
+        return (size / 1e9) / (time_ms / 1000.0);
+    }
+};
+
+// ============================================================================
+// 性能显示工具
+// ============================================================================
+
+// Console output for the performance tables (banners, column titles, rows).
+class PerformanceDisplay {
+public:
+    // Banner and column titles of the ReduceSum table
+    static void printReduceSumHeader() {
+        printBanner("\nReduceSum 性能测试...", "数据类型: float -> float");
+        std::cout << std::setw(12) << "数据规模" << std::setw(15) << "时间(ms)"
+                  << std::setw(20) << "吞吐量(G/s)" << std::endl;
+        printRule(47);
+    }
+
+    // Banner and column titles of the SortPair table
+    static void printSortPairHeader() {
+        printBanner("\nSortPair 性能测试...", "数据类型: <float, uint32_t>");
+        printAscDescTitles(12, "数据规模");
+        printRule(78);
+    }
+
+    // Banner of the TopkPair section (column titles are printed separately)
+    static void printTopkPairHeader() {
+        printBanner("\nTopkPair 性能测试...", "数据类型: <float, uint32_t>");
+    }
+
+    // Column titles of a TopkPair table
+    static void printTopkPairDataHeader() {
+        printAscDescTitles(8, "k值");
+        printRule(74);
+    }
+
+    // One ReduceSum row: size, time and throughput with three decimals
+    static void printReduceSumData(int size, float time_ms, const PerformanceCalculator::ReduceSumMetrics& metrics) {
+        std::cout << std::setw(12) << size;
+        std::cout << std::fixed << std::setprecision(3);
+        std::cout << std::setw(15) << time_ms << std::setw(20) << metrics.throughput_gps << std::endl;
+    }
+
+    // One SortPair row: size, then ascending/descending time and throughput
+    static void printSortPairData(int size, float asc_time, float desc_time,
+                                  const PerformanceCalculator::SortPairMetrics& asc_metrics,
+                                  const PerformanceCalculator::SortPairMetrics& desc_metrics) {
+        printAscDescRow(12, size, asc_time, desc_time,
+                        asc_metrics.throughput_gps, desc_metrics.throughput_gps);
+    }
+
+    // One TopkPair row: k, then ascending/descending time and throughput
+    static void printTopkPairData(int k, float asc_time, float desc_time,
+                                  const PerformanceCalculator::TopkPairMetrics& asc_metrics,
+                                  const PerformanceCalculator::TopkPairMetrics& desc_metrics) {
+        printAscDescRow(8, k, asc_time, desc_time,
+                        asc_metrics.throughput_gps, desc_metrics.throughput_gps);
+    }
+
+    // Tell the user where the performance results were written
+    static void printSavedMessage(const std::string& filename) {
+        std::cout << "\n性能结果已保存到: " << filename << std::endl;
+    }
+
+private:
+    // Title line, data-type line and the throughput formula
+    static void printBanner(const char* title_line, const char* type_line) {
+        std::cout << title_line << std::endl;
+        std::cout << type_line << std::endl;
+        std::cout << "计算公式:" << std::endl;
+        std::cout << "  吞吐量 = 元素数 / 时间(s) / 1e9 (G/s)" << std::endl;
+    }
+
+    // Horizontal rule made of `width` dashes
+    static void printRule(int width) {
+        std::cout << std::string(width, '-') << std::endl;
+    }
+
+    // Column titles shared by the SortPair and TopkPair tables
+    static void printAscDescTitles(int lead_width, const char* lead_title) {
+        std::cout << std::setw(lead_width) << lead_title << std::setw(15) << "升序(ms)" << std::setw(15) << "降序(ms)"
+                  << std::setw(16) << "升序(G/s)" << std::setw(16) << "降序(G/s)" << std::endl;
+    }
+
+    // Data row shared by the SortPair and TopkPair tables
+    static void printAscDescRow(int lead_width, int lead_value, float asc_time, float desc_time,
+                                double asc_gps, double desc_gps) {
+        std::cout << std::setw(lead_width) << lead_value;
+        std::cout << std::fixed << std::setprecision(3);
+        std::cout << std::setw(15) << asc_time << std::setw(15) << desc_time
+                  << std::setw(16) << asc_gps << std::setw(16) << desc_gps << std::endl;
+    }
+};
--- a/cp_template/utils/test_utils.h
+++ b/cp_template/utils/test_utils.h
@@ -0,0 +1,234 @@
+#pragma once
+#include <vector>
+#include <random>
+#include <algorithm>
+#include <mc_runtime.h>
+#include <maca_fp16.h>
+#include <iostream>
+#include <chrono>
+#include <cmath>
+
+// 引入模块化头文件
+#include "yaml_reporter.h"
+#include "performance_utils.h"
+
+// ============================================================================
+// 测试配置常量
+// ============================================================================
+// Element counts used by the tests: the reduced list unless RUN_FULL_TEST is
+// defined, the full list otherwise.
+#ifndef RUN_FULL_TEST
+const int TEST_SIZES[] = {1000000, 134217728}; // 1M, 128M
+#else
+const int TEST_SIZES[] = {1000000, 134217728, 536870912, 1073741824}; // 1M, 128M, 512M, 1G
+#endif
+
+const int NUM_TEST_SIZES = sizeof(TEST_SIZES) / sizeof(TEST_SIZES[0]);
+
+// Repetition counts for performance tests
+constexpr int WARMUP_ITERATIONS = 5;
+constexpr int BENCHMARK_ITERATIONS = 10;
+
+
+// ============================================================================
+// 错误检查宏
+// ============================================================================
+// Evaluate a MACA runtime call; when the result is not mcSuccess, print the
+// file, line and the runtime's error string to stderr and exit with status 1.
+// The argument is parenthesized and the status variable has a macro-specific
+// name, so an `error` variable used inside `call` is not shadowed.
+#define MACA_CHECK(call) \
+    do { \
+        const mcError_t maca_check_status_ = (call); \
+        if (maca_check_status_ != mcSuccess) { \
+            std::cerr << "MACA error at " << __FILE__ << ":" << __LINE__ \
+                      << " - " << mcGetErrorString(maca_check_status_) << std::endl; \
+            exit(1); \
+        } \
+    } while(0)
+
+// ============================================================================
+// 测试数据生成器
+// ============================================================================
+// Produces host-side input data for the tests from a seeded mt19937 engine.
+class TestDataGenerator {
+private:
+    std::mt19937 rng;
+
+public:
+    TestDataGenerator(uint32_t seed = 42) : rng(seed) {}
+
+    // `size` floats drawn from a uniform real distribution over [min_val, max_val)
+    std::vector<float> generateRandomFloats(int size, float min_val = -1000.0f, float max_val = 1000.0f) {
+        std::uniform_real_distribution<float> uniform(min_val, max_val);
+        std::vector<float> samples(size);
+        std::generate(samples.begin(), samples.end(), [&]() { return uniform(rng); });
+        return samples;
+    }
+
+    // `size` halfs: uniform floats over [min_val, max_val) converted with __float2half
+    std::vector<half> generateRandomHalfs(int size, float min_val = -100.0f, float max_val = 100.0f) {
+        std::uniform_real_distribution<float> uniform(min_val, max_val);
+        std::vector<half> samples(size);
+        std::generate(samples.begin(), samples.end(), [&]() { return __float2half(uniform(rng)); });
+        return samples;
+    }
+
+    // Despite the name the values are not random: element i holds i, which
+    // makes it easy to verify a stable sort.
+    std::vector<uint32_t> generateRandomUint32(int size) {
+        std::vector<uint32_t> indices(size);
+        uint32_t next = 0;
+        std::generate(indices.begin(), indices.end(), [&next]() { return next++; });
+        return indices;
+    }
+
+    // Same scheme as generateRandomUint32: element i holds i.
+    std::vector<int64_t> generateRandomInt64(int size) {
+        std::vector<int64_t> indices(size);
+        int64_t next = 0;
+        std::generate(indices.begin(), indices.end(), [&next]() { return next++; });
+        return indices;
+    }
+
+    // Random halfs in [-10, 10); when size > 100, slots 10/20/30 are replaced
+    // by NaN, +Inf and -Inf.
+    std::vector<half> generateSpecialHalfs(int size) {
+        std::vector<half> samples = generateRandomHalfs(size, -10.0f, 10.0f);
+        if (size <= 100) {
+            return samples;
+        }
+        samples[10] = __float2half(NAN);
+        samples[20] = __float2half(INFINITY);
+        samples[30] = __float2half(-INFINITY);
+        return samples;
+    }
+
+    // Random floats in [-10, 10); when size > 100, slots 10/20/30 are replaced
+    // by NaN, +Inf and -Inf.
+    std::vector<float> generateSpecialFloats(int size) {
+        std::vector<float> samples = generateRandomFloats(size, -10.0f, 10.0f);
+        if (size <= 100) {
+            return samples;
+        }
+        samples[10] = NAN;
+        samples[20] = INFINITY;
+        samples[30] = -INFINITY;
+        return samples;
+    }
+};
+
+// ============================================================================
+// 性能测试工具
+// ============================================================================
+// Times a region between startTiming() and stopTiming() with a pair of
+// runtime events created in the constructor and destroyed in the destructor.
+class PerformanceMeter {
+private:
+    mcEvent_t start, stop;
+
+public:
+    PerformanceMeter() {
+        MACA_CHECK(mcEventCreate(&start));
+        MACA_CHECK(mcEventCreate(&stop));
+    }
+
+    ~PerformanceMeter() {
+        // Return codes are ignored: a destructor has nothing useful to do on failure
+        mcEventDestroy(start);
+        mcEventDestroy(stop);
+    }
+
+    // The events are owned by exactly one object; a copy would destroy the
+    // same handles twice, so copying is disabled.
+    PerformanceMeter(const PerformanceMeter&) = delete;
+    PerformanceMeter& operator=(const PerformanceMeter&) = delete;
+
+    // Record the start event
+    void startTiming() {
+        MACA_CHECK(mcEventRecord(start));
+    }
+
+    // Record the stop event, wait for it, and return the value reported by
+    // mcEventElapsedTime for the start/stop pair (stored as milliseconds).
+    float stopTiming() {
+        MACA_CHECK(mcEventRecord(stop));
+        MACA_CHECK(mcEventSynchronize(stop));
+        float milliseconds = 0;
+        MACA_CHECK(mcEventElapsedTime(&milliseconds, start, stop));
+        return milliseconds;
+    }
+};
+
+// ============================================================================
+// 正确性验证工具
+// ============================================================================
+// Element-wise comparison of two arrays.
+//   - sizes must match;
+//   - half / floating-point elements: NaN matches only NaN, an infinity
+//     matches only the same infinity, everything else must differ by at most
+//     `tolerance`;
+//   - other element types are compared with !=.
+// Returns true when every element matches.
+template<typename T>
+bool compareArrays(const std::vector<T>& a, const std::vector<T>& b, double tolerance = 1e-6) {
+    if (a.size() != b.size()) return false;
+
+    // Shared rule for floating-point values. A NaN on only one side used to
+    // slip through because `abs(NaN) > tolerance` is false; it is now a mismatch.
+    const auto values_match = [tolerance](auto x, auto y) {
+        if (std::isnan(x) || std::isnan(y)) return std::isnan(x) && std::isnan(y);
+        if (std::isinf(x) || std::isinf(y)) return x == y;
+        return !(std::abs(x - y) > tolerance);
+    };
+
+    for (size_t i = 0; i < a.size(); i++) {
+        if constexpr (std::is_same_v<T, half>) {
+            if (!values_match(__half2float(a[i]), __half2float(b[i]))) return false;
+        } else if constexpr (std::is_floating_point_v<T>) {
+            if (!values_match(a[i], b[i])) return false;
+        } else {
+            if (a[i] != b[i]) return false;
+        }
+    }
+    return true;
+}
+
+// CPU reference implementation - stable key/value sort, in place.
+// keys and values are reordered together; elements with equal keys keep
+// their input order in both directions. `values` must hold at least
+// keys.size() elements.
+template<typename KeyType, typename ValueType>
+void cpuSortPair(std::vector<KeyType>& keys, std::vector<ValueType>& values, bool descending) {
+    std::vector<std::pair<KeyType, ValueType>> pairs;
+    pairs.reserve(keys.size());
+    for (size_t i = 0; i < keys.size(); i++) {
+        pairs.emplace_back(keys[i], values[i]);
+    }
+
+    // Compare keys only. The default pair ordering also compares the value,
+    // which reorders equal keys by value instead of keeping input order.
+    if (descending) {
+        std::stable_sort(pairs.begin(), pairs.end(),
+            [](const auto& a, const auto& b) { return a.first > b.first; });
+    } else {
+        std::stable_sort(pairs.begin(), pairs.end(),
+            [](const auto& a, const auto& b) { return a.first < b.first; });
+    }
+
+    for (size_t i = 0; i < pairs.size(); i++) {
+        keys[i] = pairs[i].first;
+        values[i] = pairs[i].second;
+    }
+}
+
+// CPU reference implementation - TopK.
+// Stable-sorts the (key, value) pairs by key (descending or ascending) and
+// copies the first k of them to keys_out / values_out. Equal keys keep their
+// input order. k is clamped to [0, keys_in.size()], so the outputs hold at
+// most keys_in.size() elements. `values_in` must hold at least
+// keys_in.size() elements.
+template<typename KeyType, typename ValueType>
+void cpuTopkPair(const std::vector<KeyType>& keys_in, const std::vector<ValueType>& values_in,
+                 std::vector<KeyType>& keys_out, std::vector<ValueType>& values_out,
+                 int k, bool descending) {
+    std::vector<std::pair<KeyType, ValueType>> pairs;
+    pairs.reserve(keys_in.size());
+    for (size_t i = 0; i < keys_in.size(); i++) {
+        pairs.emplace_back(keys_in[i], values_in[i]);
+    }
+
+    // Compare keys only. The default pair ordering also compares the value,
+    // which reorders equal keys by value instead of keeping input order.
+    if (descending) {
+        std::stable_sort(pairs.begin(), pairs.end(),
+            [](const auto& a, const auto& b) { return a.first > b.first; });
+    } else {
+        std::stable_sort(pairs.begin(), pairs.end(),
+            [](const auto& a, const auto& b) { return a.first < b.first; });
+    }
+
+    // A k beyond the input size used to read past the end of `pairs`
+    const size_t count = (k <= 0) ? 0 : std::min(static_cast<size_t>(k), pairs.size());
+
+    keys_out.resize(count);
+    values_out.resize(count);
+    for (size_t i = 0; i < count; i++) {
+        keys_out[i] = pairs[i].first;
+        values_out[i] = pairs[i].second;
+    }
+}
+
+// CPU reference implementation - ReduceSum accumulated in double precision.
+// Starts from init_value and adds every element that is not NaN; half
+// elements are converted with __half2float first.
+template<typename InputT>
+double cpuReduceSum(const std::vector<InputT>& data, double init_value) {
+    double total = init_value;
+    for (const InputT& element : data) {
+        double widened;
+        if constexpr (std::is_same_v<InputT, half>) {
+            const float as_float = __half2float(element);
+            if (std::isnan(as_float)) continue;  // NaN elements are skipped
+            widened = static_cast<double>(as_float);
+        } else {
+            if (std::isnan(element)) continue;   // NaN elements are skipped
+            widened = static_cast<double>(element);
+        }
+        total += widened;
+    }
+    return total;
+}
--- a/cp_template/utils/yaml_reporter.h
+++ b/cp_template/utils/yaml_reporter.h
@@ -0,0 +1,154 @@
+#pragma once
+#include <fstream>
+#include <vector>
+#include <map>
+#include <string>
+#include <chrono>
+#include <iomanip>
+#include <sstream>
+
+// ============================================================================
+// YAML性能报告生成器
+// ============================================================================
+
+// Writes the performance results of the three algorithms as YAML files.
+// Every result row is a string->string map; a missing key makes
+// std::map::at throw std::out_of_range.
+class YAMLPerformanceReporter {
+public:
+    struct PerformanceData {
+        std::string algorithm;
+        std::string input_type;
+        std::string output_type;
+        std::string key_type;
+        std::string value_type;
+        std::vector<std::map<std::string, std::string>> metrics;
+    };
+
+    // Row type shared by all generate*YAML functions
+    using Entry = std::map<std::string, std::string>;
+
+    // Create an empty result row
+    static std::map<std::string, std::string> createEntry() {
+        return {};
+    }
+
+    // Write the ReduceSum report. Row keys read: data_size, time_ms,
+    // throughput_gps, data_type.
+    static void generateReduceSumYAML(const std::vector<std::map<std::string, std::string>>& perf_data,
+                                      const std::string& filename = "reduce_sum_performance.yaml") {
+        std::ofstream out(filename);
+
+        writePreamble(out, "ReduceSum算法性能测试结果", "ReduceSum",
+                      "input", "float", "output", "float");
+
+        for (const Entry& row : perf_data) {
+            out << "  - data_size: " << row.at("data_size") << "\n";
+            out << "    time_ms: " << formatFloat(row.at("time_ms")) << "\n";
+            out << "    throughput_gps: " << formatFloat(row.at("throughput_gps")) << "\n";
+            out << "    data_type: \"" << row.at("data_type") << "\"\n";
+        }
+
+        out.close();
+    }
+
+    // Write the SortPair report. Row keys read: data_size, asc_time_ms,
+    // asc_throughput_gps, desc_time_ms, desc_throughput_gps, key_type, value_type.
+    static void generateSortPairYAML(const std::vector<std::map<std::string, std::string>>& perf_data,
+                                     const std::string& filename = "sort_pair_performance.yaml") {
+        std::ofstream out(filename);
+
+        writePreamble(out, "SortPair算法性能测试结果", "SortPair",
+                      "key_type", "float", "value_type", "uint32_t");
+
+        for (const Entry& row : perf_data) {
+            out << "  - data_size: " << row.at("data_size") << "\n";
+            writeAscDescBlock(out, row);
+        }
+
+        out.close();
+    }
+
+    // Write the TopkPair report. Same row keys as SortPair plus k_value.
+    static void generateTopkPairYAML(const std::vector<std::map<std::string, std::string>>& perf_data,
+                                     const std::string& filename = "topk_pair_performance.yaml") {
+        std::ofstream out(filename);
+
+        writePreamble(out, "TopkPair算法性能测试结果", "TopkPair",
+                      "key_type", "float", "value_type", "uint32_t");
+
+        for (const Entry& row : perf_data) {
+            out << "  - data_size: " << row.at("data_size") << "\n";
+            out << "    k_value: " << row.at("k_value") << "\n";
+            writeAscDescBlock(out, row);
+        }
+
+        out.close();
+    }
+
+private:
+    // Everything that precedes the rows: comment header, algorithm name,
+    // the two data-type fields, the formula and the performance_data key.
+    static void writePreamble(std::ofstream& file, const std::string& title, const std::string& algorithm,
+                              const std::string& first_field, const std::string& first_type,
+                              const std::string& second_field, const std::string& second_type) {
+        writeHeader(file, title);
+
+        file << "algorithm: \"" << algorithm << "\"\n";
+        file << "data_types:\n";
+        file << "  " << first_field << ": \"" << first_type << "\"\n";
+        file << "  " << second_field << ": \"" << second_type << "\"\n";
+
+        file << "formulas:\n";
+        file << "  throughput: \"elements / time(s) / 1e9 (G/s)\"\n";
+
+        file << "performance_data:\n";
+    }
+
+    // Ascending/descending timings plus key/value types of one row
+    static void writeAscDescBlock(std::ofstream& file, const Entry& row) {
+        file << "    ascending:\n";
+        file << "      time_ms: " << formatFloat(row.at("asc_time_ms")) << "\n";
+        file << "      throughput_gps: " << formatFloat(row.at("asc_throughput_gps")) << "\n";
+        file << "    descending:\n";
+        file << "      time_ms: " << formatFloat(row.at("desc_time_ms")) << "\n";
+        file << "      throughput_gps: " << formatFloat(row.at("desc_throughput_gps")) << "\n";
+        file << "    key_type: \"" << row.at("key_type") << "\"\n";
+        file << "    value_type: \"" << row.at("value_type") << "\"\n";
+    }
+
+    // YAML comment header: title plus the local generation time
+    static void writeHeader(std::ofstream& file, const std::string& title) {
+        file << "# " << title << "\n";
+        file << "# 生成时间: ";
+
+        const auto now = std::chrono::system_clock::now();
+        const auto generated_at = std::chrono::system_clock::to_time_t(now);
+        file << std::put_time(std::localtime(&generated_at), "%Y-%m-%d %H:%M:%S");
+        file << "\n\n";
+    }
+
+    // Re-format a numeric string with six decimals; text that std::stod
+    // rejects is returned unchanged.
+    static std::string formatFloat(const std::string& value) {
+        try {
+            const double parsed = std::stod(value);
+            std::ostringstream formatted;
+            formatted << std::fixed << std::setprecision(6) << parsed;
+            return formatted.str();
+        } catch (...) {
+            return value;
+        }
+    }
+};