update example

2025-08-29 10:06:36 +08:00
parent 0c5e03345a
commit 173437c2d0
7 changed files with 2 additions and 2 deletions
--- a/example/cp_template/performance_utils.h
+++ b/example/cp_template/performance_utils.h
@@ -0,0 +1,114 @@
+#pragma once
+#include <iostream>
+#include <iomanip>
+#include <string>
+
+// ============================================================================
+// 性能计算和显示工具
+// ============================================================================
+
+/// Converts measured run times into the throughput figures shown in the
+/// benchmark tables and reports.
+///
+/// All three benchmarks use the same metric: elements processed per second,
+/// expressed in units of 1e9 elements/s ("G/s").
+class PerformanceCalculator {
+public:
+    /// Throughput of one ReduceSum measurement.
+    struct ReduceSumMetrics {
+        double throughput_gps;  // G elements/s
+    };
+
+    /// @param size     number of input elements processed
+    /// @param time_ms  measured time in milliseconds
+    /// @return throughput in G elements/s (0 when time_ms is not positive)
+    static ReduceSumMetrics calculateReduceSum(int size, float time_ms) {
+        ReduceSumMetrics metrics;
+        metrics.throughput_gps = throughputGps(size, time_ms);
+        return metrics;
+    }
+
+    /// Throughput of one SortPair measurement.
+    struct SortPairMetrics {
+        double throughput_gps;  // G elements/s
+    };
+
+    /// @param size     number of key/value pairs processed
+    /// @param time_ms  measured time in milliseconds
+    /// @return throughput in G elements/s (0 when time_ms is not positive)
+    static SortPairMetrics calculateSortPair(int size, float time_ms) {
+        SortPairMetrics metrics;
+        metrics.throughput_gps = throughputGps(size, time_ms);
+        return metrics;
+    }
+
+    /// Throughput of one TopkPair measurement.
+    struct TopkPairMetrics {
+        double throughput_gps;  // G elements/s
+    };
+
+    /// @param size     number of input key/value pairs scanned
+    /// @param k        requested top-k count; it does not enter the metric,
+    ///                 which is based on the input size only
+    /// @param time_ms  measured time in milliseconds
+    /// @return throughput in G elements/s (0 when time_ms is not positive)
+    static TopkPairMetrics calculateTopkPair(int size, [[maybe_unused]] int k, float time_ms) {
+        TopkPairMetrics metrics;
+        metrics.throughput_gps = throughputGps(size, time_ms);
+        return metrics;
+    }
+
+private:
+    // Shared formula: (elements / 1e9) / seconds.
+    // A zero, negative or NaN time would otherwise produce inf/NaN, which then
+    // ends up verbatim in the printed table and the generated report; report 0
+    // for such unusable measurements instead.
+    static double throughputGps(int size, float time_ms) {
+        if (!(time_ms > 0.0f)) {
+            return 0.0;
+        }
+        return (size / 1e9) / (time_ms / 1000.0);
+    }
+};
+
+// ============================================================================
+// 性能显示工具
+// ============================================================================
+
+/// Console formatting for the benchmark tables (headers and data rows).
+///
+/// Everything is written to std::cout. The data-row printers switch the stream
+/// to fixed notation with 3 decimals; that state is restored before they
+/// return so later output on std::cout is not silently reformatted.
+class PerformanceDisplay {
+private:
+    // Captures std::cout's format flags and precision on construction and puts
+    // them back on destruction.
+    struct CoutFormatGuard {
+        std::ios_base::fmtflags flags = std::cout.flags();
+        std::streamsize precision = std::cout.precision();
+
+        CoutFormatGuard() = default;
+        CoutFormatGuard(const CoutFormatGuard&) = delete;
+        CoutFormatGuard& operator=(const CoutFormatGuard&) = delete;
+
+        ~CoutFormatGuard() {
+            std::cout.flags(flags);
+            std::cout.precision(precision);
+        }
+    };
+
+public:
+    /// Prints the ReduceSum banner, formula note and table header.
+    static void printReduceSumHeader() {
+        std::cout << "\nReduceSum 性能测试..." << std::endl;
+        std::cout << "数据类型: float -> float" << std::endl;
+        std::cout << "计算公式:" << std::endl;
+        std::cout << "  吞吐量 = 元素数 / 时间(s) / 1e9 (G/s)" << std::endl;
+        std::cout << std::setw(12) << "数据规模" << std::setw(15) << "时间(ms)"
+                  << std::setw(20) << "吞吐量(G/s)" << std::endl;
+        std::cout << std::string(47, '-') << std::endl;
+    }
+
+    /// Prints the SortPair banner, formula note and table header.
+    static void printSortPairHeader() {
+        std::cout << "\nSortPair 性能测试..." << std::endl;
+        std::cout << "数据类型: <float, uint32_t>" << std::endl;
+        std::cout << "计算公式:" << std::endl;
+        std::cout << "  吞吐量 = 元素数 / 时间(s) / 1e9 (G/s)" << std::endl;
+        std::cout << std::setw(12) << "数据规模" << std::setw(15) << "升序(ms)" << std::setw(15) << "降序(ms)"
+                  << std::setw(16) << "升序(G/s)" << std::setw(16) << "降序(G/s)" << std::endl;
+        std::cout << std::string(78, '-') << std::endl;
+    }
+
+    /// Prints the TopkPair banner and formula note (the column header is
+    /// printed separately by printTopkPairDataHeader).
+    static void printTopkPairHeader() {
+        std::cout << "\nTopkPair 性能测试..." << std::endl;
+        std::cout << "数据类型: <float, uint32_t>" << std::endl;
+        std::cout << "计算公式:" << std::endl;
+        std::cout << "  吞吐量 = 元素数 / 时间(s) / 1e9 (G/s)" << std::endl;
+    }
+
+    /// Prints the column header of a TopkPair table (one row per k value).
+    static void printTopkPairDataHeader() {
+        std::cout << std::setw(8) << "k值" << std::setw(15) << "升序(ms)" << std::setw(15) << "降序(ms)"
+                  << std::setw(16) << "升序(G/s)" << std::setw(16) << "降序(G/s)" << std::endl;
+        std::cout << std::string(74, '-') << std::endl;
+    }
+
+    /// Prints one ReduceSum row: size, time and throughput (3 decimals).
+    static void printReduceSumData(int size, float time_ms, const PerformanceCalculator::ReduceSumMetrics& metrics) {
+        CoutFormatGuard restore_format;
+        std::cout << std::setw(12) << size
+                  << std::fixed << std::setprecision(3)
+                  << std::setw(15) << time_ms
+                  << std::setw(20) << metrics.throughput_gps << std::endl;
+    }
+
+    /// Prints one SortPair row: size, ascending/descending times and throughputs.
+    static void printSortPairData(int size, float asc_time, float desc_time,
+                                  const PerformanceCalculator::SortPairMetrics& asc_metrics,
+                                  const PerformanceCalculator::SortPairMetrics& desc_metrics) {
+        CoutFormatGuard restore_format;
+        std::cout << std::setw(12) << size
+                  << std::fixed << std::setprecision(3)
+                  << std::setw(15) << asc_time
+                  << std::setw(15) << desc_time
+                  << std::setw(16) << asc_metrics.throughput_gps
+                  << std::setw(16) << desc_metrics.throughput_gps << std::endl;
+    }
+
+    /// Prints one TopkPair row: k, ascending/descending times and throughputs.
+    static void printTopkPairData(int k, float asc_time, float desc_time,
+                                  const PerformanceCalculator::TopkPairMetrics& asc_metrics,
+                                  const PerformanceCalculator::TopkPairMetrics& desc_metrics) {
+        CoutFormatGuard restore_format;
+        std::cout << std::setw(8) << k
+                  << std::fixed << std::setprecision(3)
+                  << std::setw(15) << asc_time
+                  << std::setw(15) << desc_time
+                  << std::setw(16) << asc_metrics.throughput_gps
+                  << std::setw(16) << desc_metrics.throughput_gps << std::endl;
+    }
+
+    /// Announces the file the performance results were written to.
+    static void printSavedMessage(const std::string& filename) {
+        std::cout << "\n性能结果已保存到: " << filename << std::endl;
+    }
+};
--- a/example/cp_template/reduce_sum_algorithm.maca
+++ b/example/cp_template/reduce_sum_algorithm.maca
@@ -0,0 +1,277 @@
+#include "test_utils.h"
+#include "performance_utils.h"
+#include "yaml_reporter.h"
+#include <iostream>
+#include <vector>
+#include <iomanip>
+
+
+// ============================================================================
+// Implementation selector macro - contestants set this to 0 once they replace
+// the default implementation with their own
+// ============================================================================
+#ifndef USE_DEFAULT_REF_IMPL
+#define USE_DEFAULT_REF_IMPL 1  // 1 = default implementation, 0 = contestant's custom implementation
+#endif
+
+// Thrust headers are only needed by the default reference implementation.
+#if USE_DEFAULT_REF_IMPL
+#include <thrust/reduce.h>
+#include <thrust/device_vector.h>
+#include <thrust/execution_policy.h>
+#include <thrust/functional.h>
+#endif
+
+// Relative error tolerance used by the correctness test
+constexpr double REDUCE_ERROR_TOLERANCE = 0.005; // 0.5%
+
+// ============================================================================
+// ReduceSum算法实现接口
+// 参赛者需要替换Thrust实现为自己的高性能kernel
+// ============================================================================
+
+/// Sum-reduction entry point exercised by the correctness test and the benchmark.
+///
+/// With USE_DEFAULT_REF_IMPL enabled the work is delegated to thrust::reduce;
+/// otherwise reduce() is an empty stub that contestants fill in with their own
+/// kernels.
+template <typename InputT = float, typename OutputT = float>
+class ReduceSumAlgorithm {
+public:
+    /// Main interface function - this is what contestants implement.
+    ///
+    /// @param d_in        input elements (wrapped as a device pointer by the default path)
+    /// @param d_out       location that receives the single reduced value
+    /// @param num_items   number of elements readable from d_in
+    /// @param init_value  initial value of the reduction
+    void reduce(const InputT* d_in, OutputT* d_out, int num_items, OutputT init_value) {
+
+#if USE_DEFAULT_REF_IMPL
+        // ========================================
+        // Default baseline implementation
+        // ========================================
+        const auto first = thrust::device_pointer_cast(d_in);
+        const auto last = first + num_items;
+
+        // Reduce the whole range with thrust and store the result through d_out.
+        const OutputT total = thrust::reduce(thrust::device, first, last, init_value);
+        *thrust::device_pointer_cast(d_out) = total;
+#else
+        // ========================================
+        // Contestant's custom implementation area
+        // ========================================
+
+        // TODO: implement your own high-performance reduction here
+
+        // Example: one or more custom kernels may be launched
+        // blockReduceKernel<<<grid, block>>>(d_in, temp_results, num_items, init_value);
+        // finalReduceKernel<<<1, block>>>(temp_results, d_out, grid.x);
+#endif
+    }
+
+    /// Reports which implementation was compiled in.
+    static const char* getImplementationStatus() {
+#if !USE_DEFAULT_REF_IMPL
+        return "CUSTOM_IMPL";
+#else
+        return "DEFAULT_REF_IMPL";
+#endif
+    }
+
+private:
+    // Contestants may add helper functions and member variables here,
+    // e.g. intermediate result buffers or multi-stage reduction state.
+};
+
+// ============================================================================
+// 测试和性能评估
+// ============================================================================
+
+/// Checks ReduceSumAlgorithm<float, float> against the double-precision CPU
+/// reference (cpuReduceSum) on random data and on data containing NaN/Inf.
+///
+/// @return true when every check passed.
+bool testCorrectness() {
+    std::cout << "ReduceSum 正确性测试..." << std::endl;
+    TestDataGenerator generator;
+    ReduceSumAlgorithm<float, float> algorithm;
+
+    // Uploads host_data, runs the algorithm once and returns the value it
+    // wrote to the output location. Device buffers are released before returning.
+    auto runOnDevice = [&algorithm](const std::vector<float>& host_data, float init_value) {
+        const size_t bytes = host_data.size() * sizeof(float);
+        float* d_in = nullptr;
+        float* d_out = nullptr;
+        MACA_CHECK(mcMalloc(&d_in, bytes));
+        MACA_CHECK(mcMalloc(&d_out, sizeof(float)));
+        MACA_CHECK(mcMemcpy(d_in, host_data.data(), bytes, mcMemcpyHostToDevice));
+
+        algorithm.reduce(d_in, d_out, static_cast<int>(host_data.size()), init_value);
+
+        float result = 0.0f;
+        MACA_CHECK(mcMemcpy(&result, d_out, sizeof(float), mcMemcpyDeviceToHost));
+
+        mcFree(d_in);
+        mcFree(d_out);
+        return result;
+    };
+
+    // Relative error of `actual` w.r.t. `expected`. When the reference is
+    // exactly zero the ratio is undefined, so the absolute error is used.
+    auto relativeError = [](double actual, double expected) {
+        const double abs_error = std::abs(actual - expected);
+        const double scale = std::abs(expected);
+        return scale > 0.0 ? abs_error / scale : abs_error;
+    };
+
+    // Written as a negated "<=" so that a NaN error (e.g. the device returned
+    // NaN for finite input) counts as a failure instead of slipping through.
+    auto withinTolerance = [](double relative_error) {
+        return relative_error <= REDUCE_ERROR_TOLERANCE;
+    };
+
+    bool allPassed = true;
+
+    // Only the first two configured sizes are used, each capped at 10000 elements.
+    for (int i = 0; i < NUM_TEST_SIZES && i < 2; i++) {
+        const int size = std::min(TEST_SIZES[i], 10000);
+        std::cout << "  测试规模: " << size << std::endl;
+
+        // Ordinary random data
+        {
+            const auto data = generator.generateRandomFloats(size, -10.0f, 10.0f);
+            const float init_value = 1.0f;
+
+            const double cpu_result = cpuReduceSum(data, static_cast<double>(init_value));
+            const float gpu_result = runOnDevice(data, init_value);
+
+            const double relative_error = relativeError(gpu_result, cpu_result);
+            if (!withinTolerance(relative_error)) {
+                std::cout << "    失败: 误差过大 " << relative_error << std::endl;
+                allPassed = false;
+            } else {
+                std::cout << "    通过 (误差: " << relative_error << ")" << std::endl;
+            }
+        }
+
+        // Special values (NaN, Inf)
+        if (size > 100) {
+            std::cout << "  测试特殊值..." << std::endl;
+            const auto data = generator.generateSpecialFloats(size);
+            const float init_value = 0.0f;
+
+            const double cpu_result = cpuReduceSum(data, static_cast<double>(init_value));
+            const float gpu_result = runOnDevice(data, init_value);
+
+            // Values are only compared numerically when both sides are finite;
+            // any non-finite outcome is accepted as-is.
+            if (std::isfinite(cpu_result) && std::isfinite(gpu_result)) {
+                if (!withinTolerance(relativeError(gpu_result, cpu_result))) {
+                    std::cout << "    失败: 特殊值处理错误" << std::endl;
+                    allPassed = false;
+                } else {
+                    std::cout << "    通过 (特殊值处理)" << std::endl;
+                }
+            } else {
+                std::cout << "    通过 (特殊值结果)" << std::endl;
+            }
+        }
+    }
+
+    return allPassed;
+}
+
+/// Times ReduceSumAlgorithm<float, float> for every entry of TEST_SIZES,
+/// prints one table row per size and writes the collected figures to
+/// "reduce_sum_performance.yaml".
+///
+/// Iteration counts come from the shared WARMUP_ITERATIONS and
+/// BENCHMARK_ITERATIONS constants in test_utils.h rather than from local
+/// copies that shadow them.
+void benchmarkPerformance() {
+    PerformanceDisplay::printReduceSumHeader();
+
+    TestDataGenerator generator;
+    PerformanceMeter meter;
+    ReduceSumAlgorithm<float, float> algorithm;
+
+    // One key/value entry per problem size, handed to the YAML reporter at the end.
+    std::vector<std::map<std::string, std::string>> perf_data;
+
+    for (int i = 0; i < NUM_TEST_SIZES; i++) {
+        const int size = TEST_SIZES[i];
+
+        // Generate the test input
+        const auto data = generator.generateRandomFloats(size);
+        const float init_value = 0.0f;
+
+        // Allocate device buffers and upload the input
+        float *d_in;
+        float *d_out;
+        MACA_CHECK(mcMalloc(&d_in, size * sizeof(float)));
+        MACA_CHECK(mcMalloc(&d_out, sizeof(float)));
+
+        MACA_CHECK(mcMemcpy(d_in, data.data(), size * sizeof(float), mcMemcpyHostToDevice));
+
+        // Warm-up runs (not timed)
+        for (int iter = 0; iter < WARMUP_ITERATIONS; iter++) {
+            algorithm.reduce(d_in, d_out, size, init_value);
+        }
+
+        // Timed runs; the reported time is the mean over all iterations
+        float total_time = 0;
+        for (int iter = 0; iter < BENCHMARK_ITERATIONS; iter++) {
+            meter.startTiming();
+            algorithm.reduce(d_in, d_out, size, init_value);
+            total_time += meter.stopTiming();
+        }
+
+        const float avg_time = total_time / BENCHMARK_ITERATIONS;
+
+        // Derive and display the performance metrics
+        const auto metrics = PerformanceCalculator::calculateReduceSum(size, avg_time);
+        PerformanceDisplay::printReduceSumData(size, avg_time, metrics);
+
+        // Collect the row for the YAML report
+        auto entry = YAMLPerformanceReporter::createEntry();
+        entry["data_size"] = std::to_string(size);
+        entry["time_ms"] = std::to_string(avg_time);
+        entry["throughput_gps"] = std::to_string(metrics.throughput_gps);
+        entry["data_type"] = "float";
+        perf_data.push_back(entry);
+
+        mcFree(d_in);
+        mcFree(d_out);
+    }
+
+    // Write the YAML performance report
+    YAMLPerformanceReporter::generateReduceSumYAML(perf_data, "reduce_sum_performance.yaml");
+    PerformanceDisplay::printSavedMessage("reduce_sum_performance.yaml");
+}
+
+// ============================================================================
+// 主函数
+// ============================================================================
+/// Entry point of the ReduceSum test program.
+///
+/// The optional first argument selects the mode: "correctness", "performance"
+/// or "all" (the default). The exit code is 0 when the correctness test passed
+/// or was not run, and 1 on a correctness failure or an exception.
+int main(int argc, char* argv[]) {
+    std::cout << "=== ReduceSum 算法测试 ===" << std::endl;
+
+    // Mode selection from the command line
+    const std::string mode = (argc > 1) ? argv[1] : "all";
+    const bool run_all = (mode == "all");
+
+    bool correctness_passed = true;
+    bool performance_completed = true;
+
+    try {
+        if (run_all || mode == "correctness") {
+            correctness_passed = testCorrectness();
+        }
+
+        if (run_all || mode == "performance") {
+            // In "performance" mode the correctness test never ran, so the flag
+            // still holds its initial value and the benchmark always executes.
+            if (!correctness_passed) {
+                std::cout << "跳过性能测试，因为正确性测试未通过" << std::endl;
+                performance_completed = false;
+            } else {
+                benchmarkPerformance();
+            }
+        }
+
+        std::cout << "\n=== 测试完成 ===" << std::endl;
+        std::cout << "实现状态: " << ReduceSumAlgorithm<float, float>::getImplementationStatus() << std::endl;
+        if (run_all) {
+            const char* correctness_text = correctness_passed ? "通过" : "失败";
+            const char* performance_text = performance_completed ? "完成" : "跳过";
+            std::cout << "正确性: " << correctness_text << std::endl;
+            std::cout << "性能测试: " << performance_text << std::endl;
+        }
+
+        if (!correctness_passed) {
+            return 1;
+        }
+        return 0;
+
+    } catch (const std::exception& e) {
+        std::cerr << "测试出错: " << e.what() << std::endl;
+        return 1;
+    }
+}
--- a/example/cp_template/sort_pair_algorithm.maca
+++ b/example/cp_template/sort_pair_algorithm.maca
@@ -0,0 +1,275 @@
+#include "test_utils.h"
+#include "performance_utils.h"
+#include "yaml_reporter.h"
+#include <iostream>
+#include <vector>
+#include <iomanip>
+
+// ============================================================================
+// Implementation selector macro - contestants set this to 0 once they replace
+// the default implementation with their own
+// ============================================================================
+#ifndef USE_DEFAULT_REF_IMPL
+#define USE_DEFAULT_REF_IMPL 1  // 1 = default implementation, 0 = contestant's custom implementation
+#endif
+
+// Thrust headers are only needed by the default reference implementation.
+#if USE_DEFAULT_REF_IMPL
+#include <thrust/sort.h>
+#include <thrust/device_vector.h>
+#include <thrust/execution_policy.h>
+#include <thrust/iterator/zip_iterator.h>
+#include <thrust/tuple.h>
+#endif
+
+// ============================================================================
+// SortPair算法实现接口
+// 参赛者需要替换Thrust实现为自己的高性能kernel
+// ============================================================================
+
+/// Key/value sort entry point exercised by the correctness test and the benchmark.
+///
+/// With USE_DEFAULT_REF_IMPL enabled the inputs are copied to the output
+/// buffers and sorted there with thrust::stable_sort_by_key; otherwise sort()
+/// is an empty stub that contestants fill in with their own kernels.
+template <typename KeyType, typename ValueType>
+class SortPairAlgorithm {
+public:
+    /// Main interface function - this is what contestants implement.
+    ///
+    /// @param d_keys_in     input keys
+    /// @param d_keys_out    receives the sorted keys
+    /// @param d_values_in   input values, paired with the keys by index
+    /// @param d_values_out  receives the values reordered along with their keys
+    /// @param num_items     number of key/value pairs
+    /// @param descending    true sorts with thrust::greater, false with thrust::less
+    void sort(const KeyType* d_keys_in, KeyType* d_keys_out,
+              const ValueType* d_values_in, ValueType* d_values_out,
+              int num_items, bool descending) {
+
+#if USE_DEFAULT_REF_IMPL
+        // ========================================
+        // Default baseline implementation
+        // ========================================
+
+        // The inputs are const, so sort a copy held in the output buffers.
+        const size_t key_bytes = num_items * sizeof(KeyType);
+        const size_t value_bytes = num_items * sizeof(ValueType);
+        MACA_CHECK(mcMemcpy(d_keys_out, d_keys_in, key_bytes, mcMemcpyDeviceToDevice));
+        MACA_CHECK(mcMemcpy(d_values_out, d_values_in, value_bytes, mcMemcpyDeviceToDevice));
+
+        const auto keys_first = thrust::device_pointer_cast(d_keys_out);
+        const auto keys_last = keys_first + num_items;
+        const auto values_first = thrust::device_pointer_cast(d_values_out);
+
+        if (!descending) {
+            thrust::stable_sort_by_key(thrust::device, keys_first, keys_last, values_first, thrust::less<KeyType>());
+        } else {
+            thrust::stable_sort_by_key(thrust::device, keys_first, keys_last, values_first, thrust::greater<KeyType>());
+        }
+#else
+        // ========================================
+        // Contestant's custom implementation area
+        // ========================================
+
+        // TODO: implement your own high-performance sort here
+
+        // Example: one or more custom kernels may be launched
+        // preprocessKernel<<<grid, block>>>(d_keys_in, d_values_in, num_items);
+        // mainSortKernel<<<grid, block>>>(d_keys_out, d_values_out, num_items, descending);
+        // postprocessKernel<<<grid, block>>>(d_keys_out, d_values_out, num_items);
+#endif
+    }
+
+    /// Reports which implementation was compiled in.
+    static const char* getImplementationStatus() {
+#if !USE_DEFAULT_REF_IMPL
+        return "CUSTOM_IMPL";
+#else
+        return "DEFAULT_REF_IMPL";
+#endif
+    }
+
+private:
+    // Contestants may add helper functions and member variables here,
+    // e.g. temporary buffers, additional kernels, streams.
+};
+
+// ============================================================================
+// 测试和性能评估
+// ============================================================================
+
+/// Checks SortPairAlgorithm<float, uint32_t> against the CPU reference
+/// (cpuSortPair) on 10000 pairs, once ascending and once descending.
+///
+/// @return true when both orders produced matching keys and values.
+bool testCorrectness() {
+    std::cout << "SortPair 正确性测试..." << std::endl;
+    TestDataGenerator generator;
+    SortPairAlgorithm<float, uint32_t> algorithm;
+
+    // Small-scale data set
+    const int size = 10000;
+    const auto keys = generator.generateRandomFloats(size);
+    const auto values = generator.generateRandomUint32(size);
+
+    const size_t key_bytes = size * sizeof(float);
+    const size_t value_bytes = size * sizeof(uint32_t);
+
+    // Device buffers: inputs are uploaded once and reused for both orders
+    float *d_keys_in, *d_keys_out;
+    uint32_t *d_values_in, *d_values_out;
+
+    MACA_CHECK(mcMalloc(&d_keys_in, key_bytes));
+    MACA_CHECK(mcMalloc(&d_keys_out, key_bytes));
+    MACA_CHECK(mcMalloc(&d_values_in, value_bytes));
+    MACA_CHECK(mcMalloc(&d_values_out, value_bytes));
+
+    MACA_CHECK(mcMemcpy(d_keys_in, keys.data(), key_bytes, mcMemcpyHostToDevice));
+    MACA_CHECK(mcMemcpy(d_values_in, values.data(), value_bytes, mcMemcpyHostToDevice));
+
+    // Ascending first, then descending
+    const bool orders[] = {false, true};
+    bool allPassed = true;
+    for (const bool descending : orders) {
+        const char* label = descending ? "降序" : "升序";
+        std::cout << "  " << label << " 测试..." << std::endl;
+
+        // CPU reference result, computed on copies of the host data
+        std::vector<float> expected_keys = keys;
+        std::vector<uint32_t> expected_values = values;
+        cpuSortPair(expected_keys, expected_values, descending);
+
+        // Result of the algorithm under test
+        algorithm.sort(d_keys_in, d_keys_out, d_values_in, d_values_out, size, descending);
+
+        std::vector<float> actual_keys(size);
+        std::vector<uint32_t> actual_values(size);
+        MACA_CHECK(mcMemcpy(actual_keys.data(), d_keys_out, key_bytes, mcMemcpyDeviceToHost));
+        MACA_CHECK(mcMemcpy(actual_values.data(), d_values_out, value_bytes, mcMemcpyDeviceToHost));
+
+        // Keys are compared with a 1e-5 tolerance, values must match exactly
+        const bool keysMatch = compareArrays(expected_keys, actual_keys, 1e-5);
+        const bool valuesMatch = compareArrays(expected_values, actual_values);
+
+        if (keysMatch && valuesMatch) {
+            std::cout << "    通过" << std::endl;
+        } else {
+            std::cout << "    失败: 结果不匹配" << std::endl;
+            allPassed = false;
+        }
+    }
+
+    // Release the device buffers
+    mcFree(d_keys_in);
+    mcFree(d_keys_out);
+    mcFree(d_values_in);
+    mcFree(d_values_out);
+
+    return allPassed;
+}
+
+/// Times SortPairAlgorithm<float, uint32_t> (ascending and descending) for
+/// every entry of TEST_SIZES, prints one table row per size and writes the
+/// collected figures to "sort_pair_performance.yaml".
+///
+/// Iteration counts come from the shared WARMUP_ITERATIONS and
+/// BENCHMARK_ITERATIONS constants in test_utils.h rather than from local
+/// copies that shadow them.
+void benchmarkPerformance() {
+    PerformanceDisplay::printSortPairHeader();
+
+    TestDataGenerator generator;
+    PerformanceMeter meter;
+    SortPairAlgorithm<float, uint32_t> algorithm;
+
+    // One key/value entry per problem size, handed to the YAML reporter at the end.
+    std::vector<std::map<std::string, std::string>> perf_data;
+
+    for (int i = 0; i < NUM_TEST_SIZES; i++) {
+        const int size = TEST_SIZES[i];
+
+        // Generate the test input
+        const auto keys = generator.generateRandomFloats(size);
+        const auto values = generator.generateRandomUint32(size);
+
+        // Allocate device buffers and upload the input
+        float *d_keys_in, *d_keys_out;
+        uint32_t *d_values_in, *d_values_out;
+
+        MACA_CHECK(mcMalloc(&d_keys_in, size * sizeof(float)));
+        MACA_CHECK(mcMalloc(&d_keys_out, size * sizeof(float)));
+        MACA_CHECK(mcMalloc(&d_values_in, size * sizeof(uint32_t)));
+        MACA_CHECK(mcMalloc(&d_values_out, size * sizeof(uint32_t)));
+
+        MACA_CHECK(mcMemcpy(d_keys_in, keys.data(), size * sizeof(float), mcMemcpyHostToDevice));
+        MACA_CHECK(mcMemcpy(d_values_in, values.data(), size * sizeof(uint32_t), mcMemcpyHostToDevice));
+
+        float asc_time = 0, desc_time = 0;
+
+        // Measure ascending first, then descending
+        for (bool descending : {false, true}) {
+            // Warm-up runs (not timed)
+            for (int iter = 0; iter < WARMUP_ITERATIONS; iter++) {
+                algorithm.sort(d_keys_in, d_keys_out, d_values_in, d_values_out, size, descending);
+            }
+
+            // Timed runs; the reported time is the mean over all iterations
+            float total_time = 0;
+            for (int iter = 0; iter < BENCHMARK_ITERATIONS; iter++) {
+                meter.startTiming();
+                algorithm.sort(d_keys_in, d_keys_out, d_values_in, d_values_out, size, descending);
+                total_time += meter.stopTiming();
+            }
+
+            const float avg_time = total_time / BENCHMARK_ITERATIONS;
+            if (descending) {
+                desc_time = avg_time;
+            } else {
+                asc_time = avg_time;
+            }
+        }
+
+        // Derive and display the performance metrics
+        const auto asc_metrics = PerformanceCalculator::calculateSortPair(size, asc_time);
+        const auto desc_metrics = PerformanceCalculator::calculateSortPair(size, desc_time);
+        PerformanceDisplay::printSortPairData(size, asc_time, desc_time, asc_metrics, desc_metrics);
+
+        // Collect the row for the YAML report
+        auto entry = YAMLPerformanceReporter::createEntry();
+        entry["data_size"] = std::to_string(size);
+        entry["asc_time_ms"] = std::to_string(asc_time);
+        entry["desc_time_ms"] = std::to_string(desc_time);
+        entry["asc_throughput_gps"] = std::to_string(asc_metrics.throughput_gps);
+        entry["desc_throughput_gps"] = std::to_string(desc_metrics.throughput_gps);
+        entry["key_type"] = "float";
+        entry["value_type"] = "uint32_t";
+        perf_data.push_back(entry);
+
+        // Release the device buffers
+        mcFree(d_keys_in);
+        mcFree(d_keys_out);
+        mcFree(d_values_in);
+        mcFree(d_values_out);
+    }
+
+    // Write the YAML performance report
+    YAMLPerformanceReporter::generateSortPairYAML(perf_data, "sort_pair_performance.yaml");
+    PerformanceDisplay::printSavedMessage("sort_pair_performance.yaml");
+}
+
+// ============================================================================
+// 主函数
+// ============================================================================
+/// Entry point of the SortPair test program.
+///
+/// The optional first argument selects the mode: "correctness", "performance"
+/// or "all" (the default). The exit code is 0 when the correctness test passed
+/// or was not run, and 1 on a correctness failure or an exception.
+int main(int argc, char* argv[]) {
+    std::cout << "=== SortPair 算法测试 ===" << std::endl;
+
+    // Mode selection from the command line
+    const std::string mode = (argc > 1) ? argv[1] : "all";
+    const bool run_all = (mode == "all");
+
+    bool correctness_passed = true;
+    bool performance_completed = true;
+
+    try {
+        if (run_all || mode == "correctness") {
+            correctness_passed = testCorrectness();
+        }
+
+        if (run_all || mode == "performance") {
+            // In "performance" mode the correctness test never ran, so the flag
+            // still holds its initial value and the benchmark always executes.
+            if (!correctness_passed) {
+                std::cout << "跳过性能测试，因为正确性测试未通过" << std::endl;
+                performance_completed = false;
+            } else {
+                benchmarkPerformance();
+            }
+        }
+
+        std::cout << "\n=== 测试完成 ===" << std::endl;
+        std::cout << "实现状态: " << SortPairAlgorithm<float, uint32_t>::getImplementationStatus() << std::endl;
+        if (run_all) {
+            const char* correctness_text = correctness_passed ? "通过" : "失败";
+            const char* performance_text = performance_completed ? "完成" : "跳过";
+            std::cout << "正确性: " << correctness_text << std::endl;
+            std::cout << "性能测试: " << performance_text << std::endl;
+        }
+
+        if (!correctness_passed) {
+            return 1;
+        }
+        return 0;
+
+    } catch (const std::exception& e) {
+        std::cerr << "测试出错: " << e.what() << std::endl;
+        return 1;
+    }
+}
--- a/example/cp_template/test_utils.h
+++ b/example/cp_template/test_utils.h
@@ -0,0 +1,234 @@
+#pragma once
+#include <vector>
+#include <random>
+#include <algorithm>
+#include <mc_runtime.h>
+#include <maca_fp16.h>
+#include <iostream>
+#include <chrono>
+#include <cmath>
+
+// 引入模块化头文件
+#include "yaml_reporter.h"
+#include "performance_utils.h"
+
+// ============================================================================
+// 测试配置常量
+// ============================================================================
+// Problem sizes (element counts). The default build uses the two smaller
+// sizes; defining RUN_FULL_TEST adds the 512M and 1G cases.
+#ifndef RUN_FULL_TEST
+const int TEST_SIZES[] = {1000000, 134217728}; // 1M, 128M
+#else
+const int TEST_SIZES[] = {1000000, 134217728, 536870912, 1073741824}; // 1M, 128M, 512M, 1G
+#endif
+
+// Number of entries in TEST_SIZES
+const int NUM_TEST_SIZES = sizeof(TEST_SIZES) / sizeof(TEST_SIZES[0]);
+
+// Repetition counts for performance tests
+constexpr int WARMUP_ITERATIONS = 5;
+constexpr int BENCHMARK_ITERATIONS = 10;
+
+
+// ============================================================================
+// 错误检查宏
+// ============================================================================
+// Evaluates `call` once and compares the resulting mcError_t with mcSuccess.
+// On failure it prints the source file, line and mcGetErrorString() text to
+// std::cerr and terminates the process with exit(1).
+// Notes: `call` is substituted without surrounding parentheses, and the macro
+// declares a local named `error` inside its do/while scope.
+#define MACA_CHECK(call) \
+    do { \
+        mcError_t error = call; \
+        if (error != mcSuccess) { \
+            std::cerr << "MACA error at " << __FILE__ << ":" << __LINE__ \
+                      << " - " << mcGetErrorString(error) << std::endl; \
+            exit(1); \
+        } \
+    } while(0)
+
+// ============================================================================
+// 测试数据生成器
+// ============================================================================
+/// Produces host-side input vectors for the tests and benchmarks.
+///
+/// Random values come from a std::mt19937 seeded in the constructor, so a
+/// generator built with the same seed and called in the same order yields the
+/// same sequence.
+class TestDataGenerator {
+public:
+    TestDataGenerator(uint32_t seed = 42) : rng(seed) {}
+
+    /// Returns `size` floats drawn from uniform_real_distribution(min_val, max_val).
+    std::vector<float> generateRandomFloats(int size, float min_val = -1000.0f, float max_val = 1000.0f) {
+        std::vector<float> data(size);
+        std::uniform_real_distribution<float> dist(min_val, max_val);
+        std::generate(data.begin(), data.end(), [this, &dist] { return dist(rng); });
+        return data;
+    }
+
+    /// Returns `size` halfs, each a uniformly drawn float converted with __float2half.
+    std::vector<half> generateRandomHalfs(int size, float min_val = -100.0f, float max_val = 100.0f) {
+        std::vector<half> data(size);
+        std::uniform_real_distribution<float> dist(min_val, max_val);
+        std::generate(data.begin(), data.end(), [this, &dist] { return __float2half(dist(rng)); });
+        return data;
+    }
+
+    /// Returns 0, 1, ..., size-1. Despite the name the values are not random:
+    /// the element index is used so that stable-sort results are easy to verify.
+    std::vector<uint32_t> generateRandomUint32(int size) {
+        std::vector<uint32_t> data(size);
+        uint32_t next = 0;
+        for (auto& slot : data) {
+            slot = next++;
+        }
+        return data;
+    }
+
+    /// Returns 0, 1, ..., size-1 as int64_t (index values, not random).
+    std::vector<int64_t> generateRandomInt64(int size) {
+        std::vector<int64_t> data(size);
+        int64_t next = 0;
+        for (auto& slot : data) {
+            slot = next++;
+        }
+        return data;
+    }
+
+    /// Random halfs in [-10, 10]; when size > 100, elements 10/20/30 are
+    /// overwritten with NaN, +Inf and -Inf.
+    std::vector<half> generateSpecialHalfs(int size) {
+        std::vector<half> data = generateRandomHalfs(size, -10.0f, 10.0f);
+        if (size <= 100) {
+            return data;
+        }
+        data[10] = __float2half(NAN);
+        data[20] = __float2half(INFINITY);
+        data[30] = __float2half(-INFINITY);
+        return data;
+    }
+
+    /// Random floats in [-10, 10]; when size > 100, elements 10/20/30 are
+    /// overwritten with NaN, +Inf and -Inf.
+    std::vector<float> generateSpecialFloats(int size) {
+        std::vector<float> data = generateRandomFloats(size, -10.0f, 10.0f);
+        if (size <= 100) {
+            return data;
+        }
+        data[10] = NAN;
+        data[20] = INFINITY;
+        data[30] = -INFINITY;
+        return data;
+    }
+
+private:
+    std::mt19937 rng;  // shared by all random generators of this instance
+};
+
+// ============================================================================
+// 性能测试工具
+// ============================================================================
+/// Measures elapsed time between startTiming() and stopTiming() using a pair
+/// of runtime events.
+///
+/// The two events are created in the constructor and destroyed in the
+/// destructor. Because the object owns those handles it is non-copyable:
+/// a copy would destroy the same events a second time.
+class PerformanceMeter {
+private:
+    mcEvent_t start, stop;
+
+public:
+    PerformanceMeter() {
+        MACA_CHECK(mcEventCreate(&start));
+        MACA_CHECK(mcEventCreate(&stop));
+    }
+
+    ~PerformanceMeter() {
+        // Return codes are deliberately ignored: MACA_CHECK would call exit()
+        // from a destructor.
+        mcEventDestroy(start);
+        mcEventDestroy(stop);
+    }
+
+    PerformanceMeter(const PerformanceMeter&) = delete;
+    PerformanceMeter& operator=(const PerformanceMeter&) = delete;
+
+    /// Records the start event.
+    void startTiming() {
+        MACA_CHECK(mcEventRecord(start));
+    }
+
+    /// Records the stop event, waits for it to complete and returns the value
+    /// reported by mcEventElapsedTime(start, stop), which callers treat as
+    /// milliseconds.
+    float stopTiming() {
+        MACA_CHECK(mcEventRecord(stop));
+        MACA_CHECK(mcEventSynchronize(stop));
+        float milliseconds = 0;
+        MACA_CHECK(mcEventElapsedTime(&milliseconds, start, stop));
+        return milliseconds;
+    }
+};
+
+// ============================================================================
+// 正确性验证工具
+// ============================================================================
+/// Element-wise comparison of two vectors.
+///
+/// - half and floating-point elements match when both are NaN, both are the
+///   same-signed infinity, or their absolute difference is <= tolerance
+///   (half values are compared after __half2float);
+/// - all other element types must be equal.
+///
+/// A NaN on only one side is a mismatch. The previous `abs(a - b) > tolerance`
+/// test evaluated to false for NaN and therefore accepted such pairs.
+///
+/// @return false on the first mismatch or when the sizes differ.
+template<typename T>
+bool compareArrays(const std::vector<T>& a, const std::vector<T>& b, double tolerance = 1e-6) {
+    if (a.size() != b.size()) return false;
+
+    // Shared rule for the half and native floating-point paths.
+    [[maybe_unused]] const auto closeEnough = [tolerance](auto x, auto y) -> bool {
+        const bool x_nan = std::isnan(x);
+        const bool y_nan = std::isnan(y);
+        if (x_nan || y_nan) return x_nan && y_nan;
+        if (std::isinf(x) && std::isinf(y) && (x > 0) == (y > 0)) return true;
+        return std::abs(x - y) <= tolerance;
+    };
+
+    for (size_t i = 0; i < a.size(); i++) {
+        if constexpr (std::is_same_v<T, half>) {
+            if (!closeEnough(__half2float(a[i]), __half2float(b[i]))) return false;
+        } else if constexpr (std::is_floating_point_v<T>) {
+            if (!closeEnough(a[i], b[i])) return false;
+        } else {
+            if (a[i] != b[i]) return false;
+        }
+    }
+    return true;
+}
+
+// CPU参考实现 - 稳定排序
+/// CPU reference: stable key/value sort, in place.
+///
+/// Pairs are ordered by key only (ascending or descending); pairs with equal
+/// keys keep their input order in both directions. The ascending branch used
+/// to rely on std::pair's operator<, which also compares the values and so
+/// reordered equal keys by value instead of preserving input order.
+///
+/// @param keys        keys to sort; overwritten with the sorted keys
+/// @param values      values paired with keys by index (must hold at least
+///                    keys.size() elements); overwritten in sorted order
+/// @param descending  true for largest key first
+template<typename KeyType, typename ValueType>
+void cpuSortPair(std::vector<KeyType>& keys, std::vector<ValueType>& values, bool descending) {
+    std::vector<std::pair<KeyType, ValueType>> pairs;
+    pairs.reserve(keys.size());
+    for (size_t i = 0; i < keys.size(); i++) {
+        pairs.emplace_back(keys[i], values[i]);
+    }
+
+    if (descending) {
+        std::stable_sort(pairs.begin(), pairs.end(),
+            [](const auto& a, const auto& b) { return a.first > b.first; });
+    } else {
+        std::stable_sort(pairs.begin(), pairs.end(),
+            [](const auto& a, const auto& b) { return a.first < b.first; });
+    }
+
+    for (size_t i = 0; i < pairs.size(); i++) {
+        keys[i] = pairs[i].first;
+        values[i] = pairs[i].second;
+    }
+}
+
+// CPU参考实现 - TopK
+/// CPU reference: top-k key/value selection.
+///
+/// Stably sorts all pairs by key only (ascending or descending, equal keys
+/// keep input order) and copies the first k pairs to the outputs.
+///
+/// @param keys_in     input keys
+/// @param values_in   values paired with keys_in by index (must hold at least
+///                    keys_in.size() elements)
+/// @param keys_out    resized to the number of selected pairs and filled
+/// @param values_out  resized to the number of selected pairs and filled
+/// @param k           requested count; clamped to [0, keys_in.size()] so an
+///                    oversized or negative k no longer reads out of bounds
+/// @param descending  true selects the largest keys, false the smallest
+template<typename KeyType, typename ValueType>
+void cpuTopkPair(const std::vector<KeyType>& keys_in, const std::vector<ValueType>& values_in,
+                 std::vector<KeyType>& keys_out, std::vector<ValueType>& values_out,
+                 int k, bool descending) {
+    std::vector<std::pair<KeyType, ValueType>> pairs;
+    pairs.reserve(keys_in.size());
+    for (size_t i = 0; i < keys_in.size(); i++) {
+        pairs.emplace_back(keys_in[i], values_in[i]);
+    }
+
+    if (descending) {
+        std::stable_sort(pairs.begin(), pairs.end(),
+            [](const auto& a, const auto& b) { return a.first > b.first; });
+    } else {
+        std::stable_sort(pairs.begin(), pairs.end(),
+            [](const auto& a, const auto& b) { return a.first < b.first; });
+    }
+
+    const size_t count = k > 0 ? std::min(static_cast<size_t>(k), pairs.size()) : 0;
+
+    keys_out.resize(count);
+    values_out.resize(count);
+    for (size_t i = 0; i < count; i++) {
+        keys_out[i] = pairs[i].first;
+        values_out[i] = pairs[i].second;
+    }
+}
+
+// CPU参考实现 - ReduceSum (使用double精度)
+/// CPU reference: sums `data` in double precision, starting from init_value.
+///
+/// NaN elements are skipped; every other element (including infinities) is
+/// added. half inputs are converted with __half2float before the NaN test.
+template<typename InputT>
+double cpuReduceSum(const std::vector<InputT>& data, double init_value) {
+    double total = init_value;
+    for (size_t idx = 0; idx < data.size(); ++idx) {
+        if constexpr (std::is_same_v<InputT, half>) {
+            const float as_float = __half2float(data[idx]);
+            if (std::isnan(as_float)) continue;
+            total += static_cast<double>(as_float);
+        } else {
+            if (std::isnan(data[idx])) continue;
+            total += static_cast<double>(data[idx]);
+        }
+    }
+    return total;
+}
--- a/example/cp_template/topk_pair_algorithm.maca
+++ b/example/cp_template/topk_pair_algorithm.maca
@@ -0,0 +1,317 @@
+#include "test_utils.h"
+#include "performance_utils.h"
+#include "yaml_reporter.h"
+#include <iostream>
+#include <vector>
+#include <iomanip>
+#include <fstream>
+#include <map>
+#include <chrono>
+
+// ============================================================================
+// Implementation selector macro - contestants set this to 0 when they supply
+// their own implementation
+// ============================================================================
+#ifndef USE_DEFAULT_REF_IMPL
+#define USE_DEFAULT_REF_IMPL 1  // 1 = default implementation, 0 = contestant's custom implementation
+#endif
+
+#if USE_DEFAULT_REF_IMPL
+#include <thrust/sort.h>
+#include <thrust/device_vector.h>
+#include <thrust/execution_policy.h>
+#include <thrust/iterator/zip_iterator.h>
+#include <thrust/tuple.h>
+#include <thrust/copy.h>
+#endif
+
+// k values used by the correctness test and the benchmark below.
+static const int TOPK_VALUES[] = {
+    32, 50, 100, 256, 1024,
+};
+// Entry count of TOPK_VALUES, derived from the array so the two cannot drift apart.
+static const int NUM_TOPK_VALUES = sizeof(TOPK_VALUES) / sizeof(*TOPK_VALUES);
+
+// ============================================================================
+// TopkPair算法实现接口
+// 参赛者需要替换Thrust实现为自己的高性能kernel
+// ============================================================================
+
+// Top-k selection over (key, value) pairs held in device memory.
+//
+// With USE_DEFAULT_REF_IMPL != 0 the baseline is used: copy the input into
+// scratch buffers, stable-sort the whole copy by key with Thrust, and copy the
+// leading k pairs to the output. With USE_DEFAULT_REF_IMPL == 0 the body of
+// topk() is left for the contestant to fill in.
+template <typename KeyType, typename ValueType>
+class TopkPairAlgorithm {
+public:
+    // Main entry point - contestants implement this function.
+    //
+    //   d_keys_in / d_values_in    device arrays holding num_items elements each
+    //   d_keys_out / d_values_out  device arrays receiving the selected pairs
+    //   num_items                  number of input pairs
+    //   k                          number of pairs requested
+    //   descending                 true -> largest keys first, false -> smallest first
+    //
+    // Does nothing when num_items <= 0 or k <= 0. A k larger than num_items is
+    // clamped to num_items, so only the first num_items output slots are
+    // written in that case.
+    void topk(const KeyType* d_keys_in, KeyType* d_keys_out,
+              const ValueType* d_values_in, ValueType* d_values_out,
+              int num_items, int k, bool descending) {
+        // Nothing to select: avoid zero-sized / negative-sized allocations and copies.
+        if (num_items <= 0 || k <= 0) {
+            return;
+        }
+        // Only num_items sorted pairs exist; copying more would read past the
+        // end of the scratch buffers.
+        if (k > num_items) {
+            k = num_items;
+        }
+
+#if !USE_DEFAULT_REF_IMPL
+        // ========================================
+        // Contestant's custom implementation area
+        // ========================================
+
+        // TODO: contestants implement their own high-performance TopK algorithm here
+
+        // Example: a custom implementation may launch several kernels
+        // TopkKernel1<<<grid, block>>>(d_keys_in, d_values_in, temp_results, num_items, k);
+        // TopkKernel2<<<grid, block>>>(temp_results, d_keys_out, d_values_out, k, descending);
+#else
+        // ========================================
+        // Default baseline implementation
+        // ========================================
+
+        // Sort a private copy so the caller's input arrays stay untouched.
+        KeyType* temp_keys;
+        ValueType* temp_values;
+        MACA_CHECK(mcMalloc(&temp_keys, num_items * sizeof(KeyType)));
+        MACA_CHECK(mcMalloc(&temp_values, num_items * sizeof(ValueType)));
+
+        MACA_CHECK(mcMemcpy(temp_keys, d_keys_in, num_items * sizeof(KeyType), mcMemcpyDeviceToDevice));
+        MACA_CHECK(mcMemcpy(temp_values, d_values_in, num_items * sizeof(ValueType), mcMemcpyDeviceToDevice));
+
+        auto key_ptr = thrust::device_pointer_cast(temp_keys);
+        auto value_ptr = thrust::device_pointer_cast(temp_values);
+
+        // greater and less are distinct types, so each direction needs its own call.
+        if (descending) {
+            thrust::stable_sort_by_key(thrust::device, key_ptr, key_ptr + num_items, value_ptr, thrust::greater<KeyType>());
+        } else {
+            thrust::stable_sort_by_key(thrust::device, key_ptr, key_ptr + num_items, value_ptr, thrust::less<KeyType>());
+        }
+
+        // The first k sorted pairs are the result.
+        MACA_CHECK(mcMemcpy(d_keys_out, temp_keys, k * sizeof(KeyType), mcMemcpyDeviceToDevice));
+        MACA_CHECK(mcMemcpy(d_values_out, temp_values, k * sizeof(ValueType), mcMemcpyDeviceToDevice));
+
+        mcFree(temp_keys);
+        mcFree(temp_values);
+#endif
+    }
+
+    // Reports which implementation was compiled in:
+    // "DEFAULT_REF_IMPL" or "CUSTOM_IMPL".
+    static const char* getImplementationStatus() {
+#if USE_DEFAULT_REF_IMPL
+        return "DEFAULT_REF_IMPL";
+#else
+        return "CUSTOM_IMPL";
+#endif
+    }
+
+private:
+    // Contestants may add helper functions and member variables here,
+    // e.g. tile sizes, temporary buffers, multi-stream handling.
+};
+
+// ============================================================================
+// 测试和性能评估
+// ============================================================================
+
+// Correctness test: runs TopkPairAlgorithm<float, uint32_t> on 10000 generated
+// pairs and compares its output with cpuTopkPair for the first (at most) four
+// entries of TOPK_VALUES, in ascending and descending order.
+// Returns true only if every tested combination matches.
+bool testCorrectness() {
+    std::cout << "TopkPair 正确性测试..." << std::endl;
+    TestDataGenerator generator;
+    TopkPairAlgorithm<float, uint32_t> algorithm;
+
+    int size = 10000;
+    auto keys = generator.generateRandomFloats(size);
+    auto values = generator.generateRandomUint32(size);
+
+    // Upload the input once; every k / direction below reuses it.
+    float* d_keys_in = nullptr;
+    uint32_t* d_values_in = nullptr;
+    MACA_CHECK(mcMalloc(&d_keys_in, size * sizeof(float)));
+    MACA_CHECK(mcMalloc(&d_values_in, size * sizeof(uint32_t)));
+    MACA_CHECK(mcMemcpy(d_keys_in, keys.data(), size * sizeof(float), mcMemcpyHostToDevice));
+    MACA_CHECK(mcMemcpy(d_values_in, values.data(), size * sizeof(uint32_t), mcMemcpyHostToDevice));
+
+    // Only the first four k values are checked, to limit the test range.
+    const int tested_k_count = (NUM_TOPK_VALUES < 4) ? NUM_TOPK_VALUES : 4;
+    const bool directions[] = {false, true};
+    bool allPassed = true;
+
+    for (int ki = 0; ki < tested_k_count; ++ki) {
+        int k = TOPK_VALUES[ki];
+        if (k > size) {
+            continue;
+        }
+
+        std::cout << "  测试 k=" << k << std::endl;
+
+        // Output buffers are sized for this k and shared by both directions.
+        float* d_keys_out = nullptr;
+        uint32_t* d_values_out = nullptr;
+        MACA_CHECK(mcMalloc(&d_keys_out, k * sizeof(float)));
+        MACA_CHECK(mcMalloc(&d_values_out, k * sizeof(uint32_t)));
+
+        for (const bool descending : directions) {
+            const char* direction_label = descending ? "降序" : "升序";
+            std::cout << "    " << direction_label << " TopK..." << std::endl;
+
+            // Expected result from the CPU reference.
+            std::vector<float> cpu_keys_out;
+            std::vector<uint32_t> cpu_values_out;
+            cpuTopkPair(keys, values, cpu_keys_out, cpu_values_out, k, descending);
+
+            // Result from the algorithm under test.
+            algorithm.topk(d_keys_in, d_keys_out, d_values_in, d_values_out, size, k, descending);
+
+            // Bring the device result back to the host.
+            std::vector<float> gpu_keys_out(k);
+            std::vector<uint32_t> gpu_values_out(k);
+            MACA_CHECK(mcMemcpy(gpu_keys_out.data(), d_keys_out, k * sizeof(float), mcMemcpyDeviceToHost));
+            MACA_CHECK(mcMemcpy(gpu_values_out.data(), d_values_out, k * sizeof(uint32_t), mcMemcpyDeviceToHost));
+
+            // Both comparisons always run; keys are compared with a 1e-5 argument.
+            const bool keysMatch = compareArrays(cpu_keys_out, gpu_keys_out, 1e-5);
+            const bool valuesMatch = compareArrays(cpu_values_out, gpu_values_out);
+
+            if (keysMatch && valuesMatch) {
+                std::cout << "    通过" << std::endl;
+            } else {
+                std::cout << "    失败: 结果不匹配" << std::endl;
+                allPassed = false;
+            }
+        }
+
+        mcFree(d_keys_out);
+        mcFree(d_values_out);
+    }
+
+    // Release the shared input buffers.
+    mcFree(d_keys_in);
+    mcFree(d_values_in);
+
+    return allPassed;
+}
+
+// Benchmark: for every entry of TEST_SIZES and every k in TOPK_VALUES that does
+// not exceed it, times TopkPairAlgorithm<float, uint32_t>::topk in both sort
+// directions (5 warm-up calls, then the mean of 10 timed calls), prints one
+// table row per k and finally writes all rows to topk_pair_performance.yaml.
+void benchmarkPerformance() {
+    std::cout << "\nTopkPair 性能测试..." << std::endl;
+    std::cout << "数据类型: <float, uint32_t>" << std::endl;
+    std::cout << "计算公式:" << std::endl;
+    std::cout << "  吞吐量 = 元素数 / 时间(s) / 1e9 (G/s)" << std::endl;
+
+    TestDataGenerator generator;
+    PerformanceMeter meter;
+    TopkPairAlgorithm<float, uint32_t> algorithm;
+
+    const int WARMUP_ITERATIONS = 5;
+    const int BENCHMARK_ITERATIONS = 10;
+    const bool directions[] = {false, true};
+
+    // Rows collected for the YAML report.
+    std::vector<std::map<std::string, std::string>> perf_data;
+
+    // One table per input size.
+    for (int size_idx = 0; size_idx < NUM_TEST_SIZES; ++size_idx) {
+        int size = TEST_SIZES[size_idx];
+
+        std::cout << "\n数据规模: " << size << std::endl;
+        std::cout << std::setw(8) << "k值" << std::setw(15) << "升序(ms)" << std::setw(15) << "降序(ms)"
+                  << std::setw(16) << "升序(G/s)" << std::setw(16) << "降序(G/s)" << std::endl;
+        std::cout << std::string(74, '-') << std::endl;
+
+        auto keys = generator.generateRandomFloats(size);
+        auto values = generator.generateRandomUint32(size);
+
+        // Upload the input once per size; all k values reuse it.
+        float* d_keys_in = nullptr;
+        uint32_t* d_values_in = nullptr;
+        MACA_CHECK(mcMalloc(&d_keys_in, size * sizeof(float)));
+        MACA_CHECK(mcMalloc(&d_values_in, size * sizeof(uint32_t)));
+        MACA_CHECK(mcMemcpy(d_keys_in, keys.data(), size * sizeof(float), mcMemcpyHostToDevice));
+        MACA_CHECK(mcMemcpy(d_values_in, values.data(), size * sizeof(uint32_t), mcMemcpyHostToDevice));
+
+        for (int ki = 0; ki < NUM_TOPK_VALUES; ++ki) {
+            int k = TOPK_VALUES[ki];
+            if (k > size) {
+                continue;
+            }
+
+            float* d_keys_out = nullptr;
+            uint32_t* d_values_out = nullptr;
+            MACA_CHECK(mcMalloc(&d_keys_out, k * sizeof(float)));
+            MACA_CHECK(mcMalloc(&d_values_out, k * sizeof(uint32_t)));
+
+            // Mean time per call in ms: slot 0 = ascending, slot 1 = descending.
+            float mean_ms[2] = {0, 0};
+
+            for (const bool descending : directions) {
+                // Warm-up phase (not timed).
+                for (int iter = 0; iter < WARMUP_ITERATIONS; ++iter) {
+                    algorithm.topk(d_keys_in, d_keys_out, d_values_in, d_values_out, size, k, descending);
+                }
+
+                // Timed phase.
+                float elapsed_ms = 0;
+                for (int iter = 0; iter < BENCHMARK_ITERATIONS; ++iter) {
+                    meter.startTiming();
+                    algorithm.topk(d_keys_in, d_keys_out, d_values_in, d_values_out, size, k, descending);
+                    elapsed_ms += meter.stopTiming();
+                }
+
+                mean_ms[descending ? 1 : 0] = elapsed_ms / BENCHMARK_ITERATIONS;
+            }
+
+            float asc_time = mean_ms[0];
+            float desc_time = mean_ms[1];
+
+            // Derive the throughput figures.
+            auto asc_metrics = PerformanceCalculator::calculateTopkPair(size, k, asc_time);
+            auto desc_metrics = PerformanceCalculator::calculateTopkPair(size, k, desc_time);
+
+            // Print this row of the table.
+            PerformanceDisplay::printTopkPairData(k, asc_time, desc_time, asc_metrics, desc_metrics);
+
+            // Record the same row for the YAML report.
+            auto entry = YAMLPerformanceReporter::createEntry();
+            entry["key_type"] = "float";
+            entry["value_type"] = "uint32_t";
+            entry["data_size"] = std::to_string(size);
+            entry["k_value"] = std::to_string(k);
+            entry["asc_time_ms"] = std::to_string(asc_time);
+            entry["asc_throughput_gps"] = std::to_string(asc_metrics.throughput_gps);
+            entry["desc_time_ms"] = std::to_string(desc_time);
+            entry["desc_throughput_gps"] = std::to_string(desc_metrics.throughput_gps);
+            perf_data.push_back(entry);
+
+            mcFree(d_keys_out);
+            mcFree(d_values_out);
+        }
+
+        mcFree(d_keys_in);
+        mcFree(d_values_in);
+    }
+
+    // Write the YAML performance report.
+    YAMLPerformanceReporter::generateTopkPairYAML(perf_data, "topk_pair_performance.yaml");
+    PerformanceDisplay::printSavedMessage("topk_pair_performance.yaml");
+}
+
+// ============================================================================
+// 主函数
+// ============================================================================
+// Entry point. The optional first argument selects what to run:
+//   "correctness"  only the correctness test
+//   "performance"  only the benchmark
+//   "all"          both (default); the benchmark is skipped if correctness fails
+//
+// Exit code: 0 on success, 1 if the correctness test failed or an exception
+// was caught, 2 if the mode argument is not one of the values above.
+int main(int argc, char* argv[]) {
+    std::cout << "=== TopkPair 算法测试 ===" << std::endl;
+
+    // Mode selection; defaults to running everything.
+    const std::string mode = (argc > 1) ? argv[1] : "all";
+    const bool run_correctness = (mode == "correctness" || mode == "all");
+    const bool run_performance = (mode == "performance" || mode == "all");
+
+    // Reject unknown modes instead of running nothing and reporting success.
+    if (!run_correctness && !run_performance) {
+        std::cerr << "未知模式: " << mode << " (可选: correctness | performance | all)" << std::endl;
+        return 2;
+    }
+
+    bool correctness_passed = true;
+    bool performance_completed = true;
+
+    try {
+        if (run_correctness) {
+            correctness_passed = testCorrectness();
+        }
+
+        if (run_performance) {
+            // In "performance" mode the correctness test never ran, so
+            // correctness_passed is still true and the benchmark always runs.
+            if (correctness_passed) {
+                benchmarkPerformance();
+            } else {
+                std::cout << "跳过性能测试，因为正确性测试未通过" << std::endl;
+                performance_completed = false;
+            }
+        }
+
+        std::cout << "\n=== 测试完成 ===" << std::endl;
+        std::cout << "实现状态: " << TopkPairAlgorithm<float, uint32_t>::getImplementationStatus() << std::endl;
+        if (mode == "all") {
+            std::cout << "正确性: " << (correctness_passed ? "通过" : "失败") << std::endl;
+            std::cout << "性能测试: " << (performance_completed ? "完成" : "跳过") << std::endl;
+        }
+
+        return correctness_passed ? 0 : 1;
+
+    } catch (const std::exception& e) {
+        std::cerr << "测试出错: " << e.what() << std::endl;
+        return 1;
+    }
+}
--- a/example/cp_template/yaml_reporter.h
+++ b/example/cp_template/yaml_reporter.h
@@ -0,0 +1,154 @@
+#pragma once
+#include <chrono>
+#include <ctime>
+#include <fstream>
+#include <iomanip>
+#include <iostream>
+#include <map>
+#include <sstream>
+#include <string>
+#include <vector>
+
+// ============================================================================
+// YAML性能报告生成器
+// ============================================================================
+
+// Writes benchmark results to YAML report files.
+//
+// Each generate*YAML function takes a list of string -> string records, one per
+// measurement, and writes a commented header, the algorithm description and a
+// `performance_data` list. If the report file cannot be opened or written, a
+// warning is printed to std::cerr and the function returns without throwing.
+// A record that lacks a required key makes std::map::at throw std::out_of_range.
+class YAMLPerformanceReporter {
+public:
+    // Description of one algorithm's report. The generators in this class do
+    // not use it; they take the record list directly.
+    struct PerformanceData {
+        std::string algorithm;
+        std::string input_type;
+        std::string output_type;
+        std::string key_type;
+        std::string value_type;
+        std::vector<std::map<std::string, std::string>> metrics;
+    };
+
+    // Returns an empty record for the caller to fill in.
+    static std::map<std::string, std::string> createEntry() {
+        return {};
+    }
+
+    // Writes the ReduceSum report.
+    // Required keys per record: data_size, time_ms, throughput_gps, data_type.
+    static void generateReduceSumYAML(const std::vector<std::map<std::string, std::string>>& perf_data,
+                                      const std::string& filename = "reduce_sum_performance.yaml") {
+        std::ofstream yaml_file(filename);
+        if (!reportOpened(yaml_file, filename)) {
+            return;
+        }
+
+        writeHeader(yaml_file, "ReduceSum算法性能测试结果");
+
+        // Algorithm description.
+        yaml_file << "algorithm: \"ReduceSum\"\n";
+        yaml_file << "data_types:\n";
+        yaml_file << "  input: \"float\"\n";
+        yaml_file << "  output: \"float\"\n";
+
+        writeFormulasAndListKey(yaml_file);
+        for (const auto& data : perf_data) {
+            yaml_file << "  - data_size: " << data.at("data_size") << "\n";
+            yaml_file << "    time_ms: " << formatFloat(data.at("time_ms")) << "\n";
+            yaml_file << "    throughput_gps: " << formatFloat(data.at("throughput_gps")) << "\n";
+            yaml_file << "    data_type: \"" << data.at("data_type") << "\"\n";
+        }
+
+        finishReport(yaml_file, filename);
+    }
+
+    // Writes the SortPair report.
+    // Required keys per record: data_size, asc_time_ms, asc_throughput_gps,
+    // desc_time_ms, desc_throughput_gps, key_type, value_type.
+    static void generateSortPairYAML(const std::vector<std::map<std::string, std::string>>& perf_data,
+                                     const std::string& filename = "sort_pair_performance.yaml") {
+        writePairReport(perf_data, filename, "SortPair", "SortPair算法性能测试结果", false);
+    }
+
+    // Writes the TopkPair report.
+    // Required keys per record: those of the SortPair report plus k_value.
+    static void generateTopkPairYAML(const std::vector<std::map<std::string, std::string>>& perf_data,
+                                     const std::string& filename = "topk_pair_performance.yaml") {
+        writePairReport(perf_data, filename, "TopkPair", "TopkPair算法性能测试结果", true);
+    }
+
+private:
+    using Entry = std::map<std::string, std::string>;
+
+    // Shared body of the SortPair / TopkPair reports; with_k adds the k_value line.
+    static void writePairReport(const std::vector<Entry>& perf_data, const std::string& filename,
+                                const std::string& algorithm, const std::string& title, bool with_k) {
+        std::ofstream yaml_file(filename);
+        if (!reportOpened(yaml_file, filename)) {
+            return;
+        }
+
+        writeHeader(yaml_file, title);
+
+        // Algorithm description.
+        yaml_file << "algorithm: \"" << algorithm << "\"\n";
+        yaml_file << "data_types:\n";
+        yaml_file << "  key_type: \"float\"\n";
+        yaml_file << "  value_type: \"uint32_t\"\n";
+
+        writeFormulasAndListKey(yaml_file);
+        for (const auto& data : perf_data) {
+            yaml_file << "  - data_size: " << data.at("data_size") << "\n";
+            if (with_k) {
+                yaml_file << "    k_value: " << data.at("k_value") << "\n";
+            }
+            writeDirection(yaml_file, data, "ascending", "asc");
+            writeDirection(yaml_file, data, "descending", "desc");
+            yaml_file << "    key_type: \"" << data.at("key_type") << "\"\n";
+            yaml_file << "    value_type: \"" << data.at("value_type") << "\"\n";
+        }
+
+        finishReport(yaml_file, filename);
+    }
+
+    // Writes the time / throughput sub-map for one sort direction, reading
+    // the "<prefix>_time_ms" and "<prefix>_throughput_gps" keys.
+    static void writeDirection(std::ostream& out, const Entry& data,
+                               const std::string& label, const std::string& prefix) {
+        out << "    " << label << ":\n";
+        out << "      time_ms: " << formatFloat(data.at(prefix + "_time_ms")) << "\n";
+        out << "      throughput_gps: " << formatFloat(data.at(prefix + "_throughput_gps")) << "\n";
+    }
+
+    // Writes the formula section and opens the performance_data list.
+    static void writeFormulasAndListKey(std::ostream& out) {
+        out << "formulas:\n";
+        out << "  throughput: \"elements / time(s) / 1e9 (G/s)\"\n";
+        out << "performance_data:\n";
+    }
+
+    // Returns true if the report file is open; otherwise warns on std::cerr.
+    static bool reportOpened(const std::ofstream& file, const std::string& filename) {
+        if (file.is_open()) {
+            return true;
+        }
+        std::cerr << "警告: 无法打开性能报告文件: " << filename << std::endl;
+        return false;
+    }
+
+    // Closes the report and warns on std::cerr if any write or the close failed.
+    static void finishReport(std::ofstream& file, const std::string& filename) {
+        file.close();
+        if (!file) {
+            std::cerr << "警告: 写入性能报告文件失败: " << filename << std::endl;
+        }
+    }
+
+    // Writes the comment header: the title and the local generation time.
+    static void writeHeader(std::ofstream& file, const std::string& title) {
+        file << "# " << title << "\n";
+        file << "# 生成时间: ";
+
+        const std::time_t now = std::chrono::system_clock::to_time_t(std::chrono::system_clock::now());
+        // std::localtime may return a null pointer; leave the timestamp blank then.
+        const std::tm* local_now = std::localtime(&now);
+        if (local_now != nullptr) {
+            file << std::put_time(local_now, "%Y-%m-%d %H:%M:%S");
+        }
+        file << "\n\n";
+    }
+
+    // Re-formats a numeric string with six fixed decimals. Text that std::stod
+    // cannot parse is returned unchanged, so a bad value never aborts the report.
+    static std::string formatFloat(const std::string& value) {
+        try {
+            const double parsed = std::stod(value);
+            std::ostringstream oss;
+            oss << std::fixed << std::setprecision(6) << parsed;
+            return oss.str();
+        } catch (...) {
+            return value;
+        }
+    }
+};