Why is my “Second naive” SGEMM kernel faster than the “global memory coalesced” version?

14 hours ago 2
ARTICLE AD BOX

I am benchmarking several very simple CUDA SGEMM kernels on an NVIDIA Hopper GPU (H800, sm_90), and I observed something that I do not fully understand.

I have two kernels that, to my understanding, should have essentially the same global memory access pattern, but the “coalesced” version consistently outperforms the explicitly “second naive” version.

Below is the reproducible codes.

#include <stdio.h> #include <stdlib.h> #include <time.h> #include <math.h> #include <cuda_runtime.h> #define CEIL_DIV(x, y) (((x) + (y) - 1) / (y)) // ==================== First naive version ==================== __global__ void sgemm_naive1(int M, int N, int K, float alpha, const float *A, const float *B, float beta, float *C) { const uint x = blockIdx.x * blockDim.x + threadIdx.x; const uint y = blockIdx.y * blockDim.y + threadIdx.y; if (x < M && y < N) { float tmp = 0.0f; for (int i = 0; i < K; ++i) { tmp += A[x * K + i] * B[i * N + y]; } C[x * N + y] = alpha * tmp + beta * C[x * N + y]; } } void run_sgemm_naive1(int M, int N, int K, float alpha, float *A, float *B, float beta, float *C) { dim3 gridDim(CEIL_DIV(M, 32), CEIL_DIV(N, 32)); dim3 blockDim(32, 32); sgemm_naive1<<<gridDim, blockDim>>>(M, N, K, alpha, A, B, beta, C); } // ==================== Second naive version (better performance) ==================== __global__ void sgemm_naive2(int M, int N, int K, float alpha, const float *A, const float *B, float beta, float *C) { const uint x = blockIdx.x * blockDim.x + threadIdx.x; const uint y = blockIdx.y * blockDim.y + threadIdx.y; if (x < M && y < N) { float tmp = 0.0f; for (int i = 0; i < K; ++i) { tmp += A[y * K + i] * B[i * N + x]; } C[y * N + x] = alpha * tmp + beta * C[y * N + x]; } } void run_sgemm_naive2(int M, int N, int K, float alpha, float *A, float *B, float beta, float *C) { dim3 gridDim(CEIL_DIV(M, 32), CEIL_DIV(N, 32)); dim3 blockDim(32, 32); sgemm_naive2<<<gridDim, blockDim>>>(M, N, K, alpha, A, B, beta, C); } // ==================== Global memory coalesced version ==================== template <const uint BLOCKSIZE> __global__ void sgemm_global_mem_coalesce(int M, int N, int K, float alpha, const float *A, const float *B, float beta, float *C) { const int cRow = blockIdx.x * BLOCKSIZE + (threadIdx.x / BLOCKSIZE); const int cCol = blockIdx.y * BLOCKSIZE + (threadIdx.x % BLOCKSIZE); if (cRow < M && cCol < N) { float tmp = 0.0f; for (int i = 0; i < K; ++i) { tmp += A[cRow * K + i] * B[i * N + cCol]; } C[cRow * N + cCol] = alpha * tmp + beta * C[cRow * N + cCol]; } } void run_sgemm_coalesce(int M, int N, int K, float alpha, float *A, float *B, float beta, float *C) { dim3 gridDim(CEIL_DIV(M, 32), CEIL_DIV(N, 32)); dim3 blockDim(32 * 32); sgemm_global_mem_coalesce<32><<<gridDim, blockDim>>>(M, N, K, alpha, A, B, beta, C); } // ==================== Verification function ==================== void verify_results(float *C1, float *C2, int M, int N, const char *label) { float max_diff = 0.0f; int diff_count = 0; float tolerance = 1e-3f; for (int i = 0; i < M * N; i++) { float diff = fabs(C1[i] - C2[i]); if (diff > max_diff) max_diff = diff; if (diff > tolerance) diff_count++; } printf("%s verification: max difference = %e, mismatch count = %d (tolerance %e)\n", label, max_diff, diff_count, tolerance); } // ==================== Performance benchmark ==================== void benchmark_kernel(void (*kernel_func)(int, int, int, float, float*, float*, float, float*), int M, int N, int K, float *d_A, float *d_B, float *d_C, const char *kernel_name, int warmup_runs = 10, int test_runs = 100) { cudaEvent_t start, stop; cudaEventCreate(&start); cudaEventCreate(&stop); float alpha = 1.0f; float beta = 0.0f; // Warm-up for (int i = 0; i < warmup_runs; i++) { kernel_func(M, N, K, alpha, d_A, d_B, beta, d_C); } cudaDeviceSynchronize(); // Timed runs float total_time = 0.0f; for (int i = 0; i < test_runs; i++) { cudaEventRecord(start); kernel_func(M, N, K, alpha, d_A, d_B, beta, d_C); cudaEventRecord(stop); cudaEventSynchronize(stop); float milliseconds = 0; cudaEventElapsedTime(&milliseconds, start, stop); total_time += milliseconds; } float avg_time = total_time / test_runs; float gflops = 2.0 * M * N * K / (avg_time * 1e6); // 2 FLOPs per MAC printf("Kernel: %s\n", kernel_name); printf(" Average time: %.3f ms\n", avg_time); printf(" Performance: %.2f GFLOPS\n", gflops); printf(" Total runs: %d\n\n", test_runs); cudaEventDestroy(start); cudaEventDestroy(stop); } // ==================== Main ==================== int main() { // Matrix dimensions int M = 1024; int N = 1024; int K = 1024; printf("Matrix multiplication performance test\n"); printf("M = %d, N = %d, K = %d\n", M, N, K); printf("Total computation: %.2f GFLOPs\n\n", 2.0 * M * N * K / 1e9); // Allocate host memory size_t size_A = M * K * sizeof(float); size_t size_B = K * N * sizeof(float); size_t size_C = M * N * sizeof(float); float *h_A = (float*)malloc(size_A); float *h_B = (float*)malloc(size_B); float *h_C1 = (float*)malloc(size_C); float *h_C2 = (float*)malloc(size_C); float *h_C3 = (float*)malloc(size_C); // Initialize matrices srand(time(NULL)); for (int i = 0; i < M * K; i++) h_A[i] = (float)rand() / RAND_MAX; for (int i = 0; i < K * N; i++) h_B[i] = (float)rand() / RAND_MAX; for (int i = 0; i < M * N; i++) h_C1[i] = (float)rand() / RAND_MAX; memcpy(h_C2, h_C1, size_C); memcpy(h_C3, h_C1, size_C); // Allocate device memory float *d_A, *d_B, *d_C1, *d_C2, *d_C3; cudaMalloc(&d_A, size_A); cudaMalloc(&d_B, size_B); cudaMalloc(&d_C1, size_C); cudaMalloc(&d_C2, size_C); cudaMalloc(&d_C3, size_C); // Copy data to device cudaMemcpy(d_A, h_A, size_A, cudaMemcpyHostToDevice); cudaMemcpy(d_B, h_B, size_B, cudaMemcpyHostToDevice); // GPU info cudaDeviceProp prop; cudaGetDeviceProperties(&prop, 0); printf("GPU: %s, Compute Capability: %d.%d\n\n", prop.name, prop.major, prop.minor); // Run benchmarks printf("=== Performance Benchmark ===\n"); cudaMemcpy(d_C1, h_C1, size_C, cudaMemcpyHostToDevice); benchmark_kernel(run_sgemm_naive1, M, N, K, d_A, d_B, d_C1, "Naive Version 1"); cudaMemcpy(d_C2, h_C2, size_C, cudaMemcpyHostToDevice); benchmark_kernel(run_sgemm_naive2, M, N, K, d_A, d_B, d_C2, "Naive Version 2"); cudaMemcpy(d_C3, h_C3, size_C, cudaMemcpyHostToDevice); benchmark_kernel(run_sgemm_coalesce, M, N, K, d_A, d_B, d_C3, "Coalesced Version"); // Verify results printf("=== Result Verification ===\n"); cudaMemcpy(h_C1, d_C1, size_C, cudaMemcpyDeviceToHost); cudaMemcpy(h_C2, d_C2, size_C, cudaMemcpyDeviceToHost); cudaMemcpy(h_C3, d_C3, size_C, cudaMemcpyDeviceToHost); verify_results(h_C1, h_C3, M, N, "Kernel1 vs Kernel3"); verify_results(h_C1, h_C2, M, N, "Kernel1 vs Kernel2"); // Cleanup free(h_A); free(h_B); free(h_C1); free(h_C2); free(h_C3); cudaFree(d_A); cudaFree(d_B); cudaFree(d_C1); cudaFree(d_C2); cudaFree(d_C3); printf("\nTest completed!\n"); return 0; }

The result in my computer is

=== Performance Benchmark === Kernel: Naive Version 1 Average time: 4.429 ms Performance: 484.88 GFLOPS Total runs: 100 Kernel: Naive Version 2 Average time: 0.461 ms Performance: 4661.59 GFLOPS Total runs: 100 Kernel: Coalesced Version Average time: 0.344 ms Performance: 6236.03 GFLOPS Total runs: 100
Read Entire Article