2026-03-03

7. 内存访问 —— 矩阵转置思考

矩阵转置时，究竟是 “按行读取、按列写入” 快，还是 “按列读取、按行写入” 快，取决于矩阵大小（见文末的实测结果）

#include <cstdlib>
#include <format>
#include <iostream>
#include <string>
#include <vector>

#include <cuda_runtime.h>

// Evaluates a CUDA runtime call exactly once. If the returned status is not
// cudaSuccess, prints the stringified call, the runtime's error string and the
// source location (file:line of the macro use) to stderr, then terminates the
// process with EXIT_FAILURE.
// The do { ... } while (0) wrapper makes the macro expand to a single statement,
// so it is safe inside unbraced if/else bodies.
// Requires C++20 (std::format).
#define CUDA_CHECK(call)                                                    \
		do {                                                                    \
		cudaError_t const status = (call);                                  \
		if (status != cudaSuccess) {                                        \
		std::cerr << std::format("硬件状态异常诊断:\n  API: {}\n  错误: {}\n  位置: {}:{}\n", \
		#call, cudaGetErrorString(status), __FILE__, __LINE__); \
		std::exit(EXIT_FAILURE);                                        \
		}                                                                   \
		} while (0)

// Scheme A: read by rows (coalesced), write by columns (strided).
//
// Writes B[x][y] = A[y][x], treating A and B as N x N arrays addressed with the
// linear index row * N + col. Each thread handles one element; threads whose
// (x, y) falls outside the N x N range do nothing, so the grid may overshoot N.
// Expects a 2D launch whose x/y extents cover at least N threads each.
// A and B are declared __restrict__, so they must not overlap.
__global__ void transposeReadRowWriteCol(const float* __restrict__ A, float* __restrict__ B, int N) {
		const int x = blockIdx.x * blockDim.x + threadIdx.x;
		const int y = blockIdx.y * blockDim.y + threadIdx.y;
		if (x < N && y < N) {
				// Widen to size_t so the linear index cannot overflow int for large N.
				const size_t n = static_cast<size_t>(N);
				// Read A:  consecutive x -> consecutive addresses -> coalesced load.
				// Write B: consecutive x -> addresses N elements apart -> strided store.
				B[x * n + y] = A[y * n + x];
		}
}
// Scheme B: read by columns (strided), write by rows (coalesced).
//
// Writes B[y][x] = A[x][y], treating A and B as N x N arrays addressed with the
// linear index row * N + col. Each thread handles one element; threads whose
// (x, y) falls outside the N x N range do nothing, so the grid may overshoot N.
// Expects a 2D launch whose x/y extents cover at least N threads each.
// A and B are declared __restrict__, so they must not overlap.
__global__ void transposeReadColWriteRow(const float* __restrict__ A, float* __restrict__ B, int N) {
		const int x = blockIdx.x * blockDim.x + threadIdx.x;
		const int y = blockIdx.y * blockDim.y + threadIdx.y;
		if (x < N && y < N) {
				// Widen to size_t so the linear index cannot overflow int for large N.
				const size_t n = static_cast<size_t>(N);
				// Read A:  consecutive x -> addresses N elements apart -> strided load.
				// Write B: consecutive x -> consecutive addresses -> coalesced store.
				B[y * n + x] = A[x * n + y];
		}
}

		int main() {
		// Benchmark driver: times one launch of each transpose kernel on an N x N
		// float matrix using CUDA events and prints both timings as a small table.
		// Each kernel is launched once untimed first (warm-up), then once timed.

		// Side length of the square test matrix (4096 x 4096 = 16,777,216 elements).
		const int N = 4096;
		// Widen before multiplying so N * N is computed in size_t, not int.
		const size_t elementCount = static_cast<size_t>(N) * N;
		const size_t byteSize = elementCount * sizeof(float);
		// Host input; every element is 1.0f (the values do not affect timing).
		std::vector<float> h_A(elementCount, 1.0f);

		float* d_A = nullptr;
		float* d_B = nullptr;
		CUDA_CHECK(cudaMalloc((void**)&d_A, byteSize));
		CUDA_CHECK(cudaMalloc((void**)&d_B, byteSize));
		CUDA_CHECK(cudaMemcpy(d_A, h_A.data(), byteSize, cudaMemcpyHostToDevice));

		// 32 x 16 threads per block; grid dimensions are rounded up so the whole
		// matrix is covered (the kernels bounds-check the overshoot).
		dim3 blockSize(32, 16);
		dim3 gridSize((N + blockSize.x - 1) / blockSize.x, (N + blockSize.y - 1) / blockSize.y);

		cudaEvent_t start, stop;
		CUDA_CHECK(cudaEventCreate(&start));
		CUDA_CHECK(cudaEventCreate(&stop));
		std::cout << std::format("启动矩阵规模 {} x {} 的硬件性能评估:\n", N, N);
		std::cout << std::string(50, '-') << "\n";

		// --- Scheme A ---
		// Warm-up launch. Kernel launches return no status themselves, so
		// cudaGetLastError() is what reports an invalid launch configuration.
		transposeReadRowWriteCol<<<gridSize, blockSize>>>(d_A, d_B, N);
		CUDA_CHECK(cudaGetLastError());
		CUDA_CHECK(cudaDeviceSynchronize());
		// Timed launch, bracketed by events recorded in the same (default) stream.
		CUDA_CHECK(cudaEventRecord(start));
		transposeReadRowWriteCol<<<gridSize, blockSize>>>(d_A, d_B, N);
		CUDA_CHECK(cudaGetLastError());
		CUDA_CHECK(cudaEventRecord(stop));
		CUDA_CHECK(cudaEventSynchronize(stop));
		float ms_A = 0;
		CUDA_CHECK(cudaEventElapsedTime(&ms_A, start, stop));

		// --- Scheme B ---
		transposeReadColWriteRow<<<gridSize, blockSize>>>(d_A, d_B, N);
		CUDA_CHECK(cudaGetLastError());
		CUDA_CHECK(cudaDeviceSynchronize());
		CUDA_CHECK(cudaEventRecord(start));
		transposeReadColWriteRow<<<gridSize, blockSize>>>(d_A, d_B, N);
		CUDA_CHECK(cudaGetLastError());
		CUDA_CHECK(cudaEventRecord(stop));
		CUDA_CHECK(cudaEventSynchronize(stop));
		float ms_B = 0;
		CUDA_CHECK(cudaEventElapsedTime(&ms_B, start, stop));

		// Print the results table.
		std::cout << std::format("{:<30} | {:<15}\n", "执行方案", "物理耗时 (ms)");
		std::cout << std::string(50, '-') << "\n";
		std::cout << std::format("{:<30} | {:<15.4f}\n", "方案 A (按行读 / 按列写)", ms_A);
		std::cout << std::format("{:<30} | {:<15.4f}\n", "方案 B (按列读 / 按行写)", ms_B);

		// Release events and device buffers.
		CUDA_CHECK(cudaEventDestroy(start));
		CUDA_CHECK(cudaEventDestroy(stop));
		CUDA_CHECK(cudaFree(d_A));
		CUDA_CHECK(cudaFree(d_B));
		return 0;
}

输出结果：

--------------------------------------------------
启动矩阵规模 1024 x 1024 的硬件性能评估:
--------------------------------------------------
执行方案                   | 物理耗时 (ms)
--------------------------------------------------
方案 A (按行读 / 按列写) | 0.0314         
方案 B (按列读 / 按行写) | 0.0136         

--------------------------------------------------
启动矩阵规模 4096 x 4096 的硬件性能评估:
--------------------------------------------------
执行方案                   | 物理耗时 (ms)
--------------------------------------------------
方案 A (按行读 / 按列写) | 0.4261         
方案 B (按列读 / 按行写) | 0.4670

【CUDA学习】07-GPU 内存访问 —— 矩阵转置思考

7. 内存访问 —— 矩阵转置思考

说些什么吧！