GPUKernelContest/cp_template/reduce_sum_algorithm.maca

#include "test_utils.h"
#include "performance_utils.h"
#include "yaml_reporter.h"
#include <iostream>
#include <vector>
#include <iomanip>
#include <map>       // std::map (YAML report entries)
#include <string>    // std::string
#include <algorithm> // std::min
#include <cmath>     // std::abs, std::isfinite
// ============================================================================
// Implementation switch macro - set this to 0 when replacing the default
// implementation with your own.
// ============================================================================
#ifndef USE_DEFAULT_REF_IMPL
#define USE_DEFAULT_REF_IMPL 1 // 1 = default reference implementation, 0 = contestant's custom implementation
#endif
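// Because the macro is wrapped in #ifndef, it can also be overridden at build
// time (for example with a -DUSE_DEFAULT_REF_IMPL=0 compiler flag, assuming
// your build setup passes such flags through) instead of editing this file.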
#if USE_DEFAULT_REF_IMPL
#include <thrust/reduce.h>
#include <thrust/device_vector.h>
#include <thrust/execution_policy.h>
#include <thrust/functional.h>
#endif
// Error tolerance for correctness checks
constexpr double REDUCE_ERROR_TOLERANCE = 0.005; // 0.5%

// ============================================================================
// ReduceSum algorithm implementation interface
// Contestants should replace the Thrust implementation with their own
// high-performance kernel(s).
// ============================================================================
template <typename InputT = float, typename OutputT = float>
class ReduceSumAlgorithm {
public:
    // Main interface - contestants must implement this function.
    void reduce(const InputT* d_in, OutputT* d_out, int num_items, OutputT init_value) {
#if !USE_DEFAULT_REF_IMPL
        // ========================================
        // Contestant's custom implementation area
        // ========================================
        // TODO: implement your own high-performance reduction here.
        // For example, you may launch one or more custom kernels:
        //   blockReduceKernel<<<grid, block>>>(d_in, temp_results, num_items, init_value);
        //   finalReduceKernel<<<1, block>>>(temp_results, d_out, grid.x);
#else
        // ========================================
        // Default reference implementation
        // ========================================
        auto input_ptr = thrust::device_pointer_cast(d_in);
        auto output_ptr = thrust::device_pointer_cast(d_out);
        // Reduce directly with thrust::reduce
        *output_ptr = thrust::reduce(
            thrust::device,
            input_ptr,
            input_ptr + num_items,
            static_cast<OutputT>(init_value)
        );
#endif
    }
    // Report which implementation is currently compiled in.
    static const char* getImplementationStatus() {
#if USE_DEFAULT_REF_IMPL
        return "DEFAULT_REF_IMPL";
#else
        return "CUSTOM_IMPL";
#endif
    }

private:
    // Contestants may add helper functions and member variables here,
    // e.g. intermediate result buffers for multi-stage reduction.
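    // A minimal sketch of what such members might look like (hypothetical
    // names, not required by the template):
    //     float* d_temp_ = nullptr;   // partial sums between reduction stages
    //     int    temp_capacity_ = 0;  // capacity of d_temp_ in elements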
};
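
// ----------------------------------------------------------------------------
// Illustrative sketch only (kept disabled with #if 0): one possible shape for
// the custom path described above - a classic two-stage shared-memory
// reduction. The kernel names, signatures, block size, and launch configuration
// are assumptions for illustration, not part of the template; CUDA-style kernel
// syntax is assumed, as in the template's own comments. Note that init_value is
// applied exactly once, in the final stage.
// ----------------------------------------------------------------------------
#if 0
// Stage 1: each block reduces a strided slice of the input to one partial sum.
__global__ void blockReduceKernel(const float* in, float* partial, int n) {
    __shared__ float smem[256];
    int tid = threadIdx.x;
    float sum = 0.0f;
    // Grid-stride loop so a fixed-size grid covers any n.
    for (int i = blockIdx.x * blockDim.x + tid; i < n; i += gridDim.x * blockDim.x) {
        sum += in[i];
    }
    smem[tid] = sum;
    __syncthreads();
    // Tree reduction within the block (blockDim.x must be a power of two <= 256).
    for (int s = blockDim.x / 2; s > 0; s >>= 1) {
        if (tid < s) smem[tid] += smem[tid + s];
        __syncthreads();
    }
    if (tid == 0) partial[blockIdx.x] = smem[0];
}

// Stage 2: a single block reduces the partial sums and applies init_value.
__global__ void finalReduceKernel(const float* partial, float* out,
                                  int num_partials, float init_value) {
    __shared__ float smem[256];
    int tid = threadIdx.x;
    float sum = 0.0f;
    for (int i = tid; i < num_partials; i += blockDim.x) {
        sum += partial[i];
    }
    smem[tid] = sum;
    __syncthreads();
    for (int s = blockDim.x / 2; s > 0; s >>= 1) {
        if (tid < s) smem[tid] += smem[tid + s];
        __syncthreads();
    }
    if (tid == 0) *out = smem[0] + init_value;
}
// Possible launch from reduce(), using a hypothetical d_temp_ partial-sum buffer:
//     blockReduceKernel<<<grid, 256>>>(d_in, d_temp_, num_items);
//     finalReduceKernel<<<1, 256>>>(d_temp_, d_out, grid, init_value);
#endif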
// ============================================================================
// Correctness testing and performance evaluation
// ============================================================================
bool testCorrectness() {
    std::cout << "ReduceSum correctness test..." << std::endl;
    TestDataGenerator generator;
    ReduceSumAlgorithm<float, float> algorithm;
    bool allPassed = true;

    // Test several data sizes (capped to keep the correctness run short)
    for (int i = 0; i < NUM_TEST_SIZES && i < 2; i++) {
        int size = std::min(TEST_SIZES[i], 10000);
        std::cout << "  Test size: " << size << std::endl;

        // Ordinary random data
        {
            auto data = generator.generateRandomFloats(size, -10.0f, 10.0f);
            float init_value = 1.0f;
            // CPU reference result
            double cpu_result = cpuReduceSum(data, static_cast<double>(init_value));
            // GPU result
            float *d_in;
            float *d_out;
            MACA_CHECK(mcMalloc(&d_in, size * sizeof(float)));
            MACA_CHECK(mcMalloc(&d_out, sizeof(float)));
            MACA_CHECK(mcMemcpy(d_in, data.data(), size * sizeof(float), mcMemcpyHostToDevice));
            algorithm.reduce(d_in, d_out, size, init_value);
            float gpu_result;
            MACA_CHECK(mcMemcpy(&gpu_result, d_out, sizeof(float), mcMemcpyDeviceToHost));
            // Check the relative error
            double relative_error = std::abs(gpu_result - cpu_result) / std::abs(cpu_result);
            if (relative_error > REDUCE_ERROR_TOLERANCE) {
                std::cout << "    FAILED: relative error too large: " << relative_error << std::endl;
                allPassed = false;
            } else {
                std::cout << "    PASSED (relative error: " << relative_error << ")" << std::endl;
            }
            mcFree(d_in);
            mcFree(d_out);
        }

        // Special values (NaN, Inf)
        if (size > 100) {
            std::cout << "  Testing special values..." << std::endl;
            auto data = generator.generateSpecialFloats(size);
            float init_value = 0.0f;
            double cpu_result = cpuReduceSum(data, static_cast<double>(init_value));
            float *d_in;
            float *d_out;
            MACA_CHECK(mcMalloc(&d_in, size * sizeof(float)));
            MACA_CHECK(mcMalloc(&d_out, sizeof(float)));
            MACA_CHECK(mcMemcpy(d_in, data.data(), size * sizeof(float), mcMemcpyHostToDevice));
            algorithm.reduce(d_in, d_out, size, init_value);
            float gpu_result;
            MACA_CHECK(mcMemcpy(&gpu_result, d_out, sizeof(float), mcMemcpyDeviceToHost));
            // For inputs containing special values, check that they are handled correctly
            if (std::isfinite(cpu_result) && std::isfinite(gpu_result)) {
                double relative_error = std::abs(gpu_result - cpu_result) / std::abs(cpu_result);
                if (relative_error > REDUCE_ERROR_TOLERANCE) {
                    std::cout << "    FAILED: special values handled incorrectly" << std::endl;
                    allPassed = false;
                } else {
                    std::cout << "    PASSED (special values)" << std::endl;
                }
            } else {
                std::cout << "    PASSED (special-value result)" << std::endl;
            }
            mcFree(d_in);
            mcFree(d_out);
        }
    }
    return allPassed;
}
void benchmarkPerformance() {
    PerformanceDisplay::printReduceSumHeader();
    TestDataGenerator generator;
    PerformanceMeter meter;
    ReduceSumAlgorithm<float, float> algorithm;
    const int WARMUP_ITERATIONS = 5;
    const int BENCHMARK_ITERATIONS = 10;

    // Collected entries for the YAML report
    std::vector<std::map<std::string, std::string>> perf_data;

    for (int i = 0; i < NUM_TEST_SIZES; i++) {
        int size = TEST_SIZES[i];
        // Generate test data
        auto data = generator.generateRandomFloats(size);
        float init_value = 0.0f;
        // Allocate GPU memory
        float *d_in;
        float *d_out;
        MACA_CHECK(mcMalloc(&d_in, size * sizeof(float)));
        MACA_CHECK(mcMalloc(&d_out, sizeof(float)));
        MACA_CHECK(mcMemcpy(d_in, data.data(), size * sizeof(float), mcMemcpyHostToDevice));
        // Warmup runs
        for (int iter = 0; iter < WARMUP_ITERATIONS; iter++) {
            algorithm.reduce(d_in, d_out, size, init_value);
        }
        // Timed runs
        float total_time = 0;
        for (int iter = 0; iter < BENCHMARK_ITERATIONS; iter++) {
            meter.startTiming();
            algorithm.reduce(d_in, d_out, size, init_value);
            total_time += meter.stopTiming();
        }
        float avg_time = total_time / BENCHMARK_ITERATIONS;
        // Compute performance metrics
        auto metrics = PerformanceCalculator::calculateReduceSum(size, avg_time);
        // Print performance data
        PerformanceDisplay::printReduceSumData(size, avg_time, metrics);
        // Collect data for the YAML report
        auto entry = YAMLPerformanceReporter::createEntry();
        entry["data_size"] = std::to_string(size);
        entry["time_ms"] = std::to_string(avg_time);
        entry["throughput_gps"] = std::to_string(metrics.throughput_gps);
        entry["data_type"] = "float";
        perf_data.push_back(entry);
        mcFree(d_in);
        mcFree(d_out);
    }
    // Generate the YAML performance report
    YAMLPerformanceReporter::generateReduceSumYAML(perf_data, "reduce_sum_performance.yaml");
    PerformanceDisplay::printSavedMessage("reduce_sum_performance.yaml");
}
// ============================================================================
// Main entry point
// ============================================================================
int main(int argc, char* argv[]) {
    std::cout << "=== ReduceSum algorithm test ===" << std::endl;
    // Parse the run mode from the command line
    std::string mode = "all";
    if (argc > 1) {
        mode = argv[1];
    }
    bool correctness_passed = true;
    bool performance_completed = true;
    try {
        if (mode == "correctness" || mode == "all") {
            correctness_passed = testCorrectness();
        }
        if (mode == "performance" || mode == "all") {
            if (correctness_passed || mode == "performance") {
                benchmarkPerformance();
            } else {
                std::cout << "Skipping the performance test because the correctness test failed" << std::endl;
                performance_completed = false;
            }
        }
        std::cout << "\n=== Test finished ===" << std::endl;
        std::cout << "Implementation status: " << ReduceSumAlgorithm<float, float>::getImplementationStatus() << std::endl;
        if (mode == "all") {
            std::cout << "Correctness: " << (correctness_passed ? "PASSED" : "FAILED") << std::endl;
            std::cout << "Performance test: " << (performance_completed ? "completed" : "skipped") << std::endl;
        }
        return correctness_passed ? 0 : 1;
    } catch (const std::exception& e) {
        std::cerr << "Test error: " << e.what() << std::endl;
        return 1;
    }
}