导言
作为现在的高性能计算开发者,Nvidia 的生态是我们绕不开的一关,而 Nvidia 的 CUDA 编程模型虽然基于 SIMT 编写起来较为自然,但是随着 Tensorcore 这一类专用单元的增多,编写高性能的 kernel 的学习曲线变得越来越陡峭。为此,Nvidia 提供了 CUTLASS 这个仓库,辅助大家利用新的每一代的 GPU 架构。哪怕不利用 CUTLASS 编写,学习 CUTLASS 也是编写高性能内核不可或缺的一环。

这里借助本文对 CUTLASS 进行入门的学习。
快速上手
我们需要预先安装好下面的依赖,一般装有较新 GPU 的服务器都有下面的依赖
- NVIDIA CUDA Toolkit (11.4 or later required, 12.0 recommended)
- CMake 3.18+
- host compiler supporting C++17 or greater (minimum g++ 7.5.0)
- Python 3.6+
- cuBLAS
- cuDNN v7.6 or later
这里使用的环境如下
(base) ➜ ~ python --versionPython 3.13.2(base) ➜ ~ module load cuda/12.8Loading cuda version 12.8(base) ➜ ~ module load cudnnLoading nvidia version 9.7.1显卡是

一台四卡 3080ti 的机器,方便编写各种场景下的代码。
接下来配置 IDE,官网给出了非常友好的 CUDA 文件辅助配置 clangd 文件如下:
CompileFlags: Compiler: /usr/local/cuda-12.8/bin/nvcc Add: - --cuda-path=/usr/local/cuda-12.838 collapsed lines
- --cuda-gpu-arch=sm_90a - -I/usr/local/cuda/include - "-xcuda"#report all errors - "-ferror-limit=0" - --cuda-gpu-arch=sm_90a - --std=c++17 - "-D__INTELLISENSE__" - "-D__CLANGD__" - "-DCUDA_12_0_SM90_FEATURES_SUPPORTED" - "-DCUTLASS_ARCH_MMA_SM90_SUPPORTED=1" - "-D_LIBCUDACXX_STD_VER=12" - "-D__CUDACC_VER_MAJOR__=12" - "-D__CUDACC_VER_MINOR__=3" - "-D__CUDA_ARCH__=900" - "-D__CUDA_ARCH_FEAT_SM90_ALL" - "-Wno-invalid-constexpr" Remove:#strip CUDA fatbin args - "-Xfatbin*"#strip CUDA arch flags - "-gencode*" - "--generate-code*"#strip CUDA flags unknown to clang - "-ccbin*" - "--compiler-options*" - "--expt-extended-lambda" - "--expt-relaxed-constexpr" - "-forward-unknown-to-host-compiler" - "-Werror=cross-execution-space-call"Hover: ShowAKA: NoInlayHints: Enabled: YesDiagnostics: Suppress: - "variadic_device_fn" - "attributes_not_allowed"基本可以保证我们的跳转不犯病。
这里我们先最小时间编译构建支持计算能力为 86 的 CUTLASS。根据官网的教程是:
cmake -S . -B build -DCUTLASS_NVCC_ARCHS=86 -DCUTLASS_ENABLE_TESTS=OFF -DCUTLASS_UNITY_BUILD_ENABLED=ON重点是开启了单元构建,方便我们只构建我们要的部分,然后编译 cutlass_profiler
cmake --build build -j12 --target cutlass_profiler接下来等待编译即可。
这个时候我们已经编译成功我们的 cutlass_profiler。
Basic GEMM 学习
int main(int argc, const char *arg[]) {
// // Parse the command line to obtain GEMM dimensions and scalar values. //
// GEMM problem dimensions. int problem[3] = { 128, 128, 128 };
for (int i = 1; i < argc && i < 4; ++i) { std::stringstream ss(arg[i]); ss >> problem[i - 1]; }
// Scalars used for linear scaling the result of the matrix product. float scalars[2] = { 1, 0 };
for (int i = 4; i < argc && i < 6; ++i) { std::stringstream ss(arg[i]); ss >> scalars[i - 4]; }
// // Run the CUTLASS GEMM test. //
cudaError_t result = TestCutlassGemm( problem[0], // GEMM M dimension problem[1], // GEMM N dimension problem[2], // GEMM K dimension scalars[0], // alpha scalars[1] // beta );
if (result == cudaSuccess) { std::cout << "Passed." << std::endl; }
// Exit. return result == cudaSuccess ? 0 : -1;}先看主函数,如上,分析的很清楚主要是封装了一下 cmd 的读取,主要是 size 和 scala 然后调用对应的测试函数,如果是 success 就返回 cudaSuccess。
cudaError_t TestCutlassGemm(int M, int N, int K, float alpha, float beta) {158 collapsed lines
cudaError_t result;
// // Define several matrices to be used as operands to GEMM kernels. //
// Compute leading dimensions for each matrix. int lda = M; int ldb = K; int ldc = M;
// Compute size in bytes of the C matrix. size_t sizeof_C = sizeof(float) * ldc * N;
// Define pointers to matrices in GPU device memory. float *A; float *B; float *C_cutlass; float *C_reference;
// // Allocate matrices in GPU device memory with arbitrary seeds. //
result = AllocateMatrix(&A, M, K, 0);
if (result != cudaSuccess) { return result; }
result = AllocateMatrix(&B, K, N, 17);
if (result != cudaSuccess) { cudaFree(A); return result; }
result = AllocateMatrix(&C_cutlass, M, N, 101);
if (result != cudaSuccess) { cudaFree(A); cudaFree(B); return result; }
result = AllocateMatrix(&C_reference, M, N, 101);
if (result != cudaSuccess) { cudaFree(A); cudaFree(B); cudaFree(C_cutlass); return result; }
result = cudaMemcpy(C_reference, C_cutlass, sizeof_C, cudaMemcpyDeviceToDevice);
if (result != cudaSuccess) { std::cerr << "Failed to copy C_cutlass matrix to C_reference: " << cudaGetErrorString(result) << std::endl;
cudaFree(C_reference); cudaFree(C_cutlass); cudaFree(B); cudaFree(A);
return result; }
// // Launch CUTLASS GEMM. //
result = CutlassSgemmNN(M, N, K, alpha, A, lda, B, ldb, beta, C_cutlass, ldc);
if (result != cudaSuccess) { std::cerr << "CUTLASS GEMM kernel failed: " << cudaGetErrorString(result) << std::endl;
cudaFree(C_reference); cudaFree(C_cutlass); cudaFree(B); cudaFree(A);
return result; }
// // Verify. //
// Launch reference GEMM result = ReferenceGemm(M, N, K, alpha, A, lda, B, ldb, beta, C_reference, ldc);
if (result != cudaSuccess) { std::cerr << "Reference GEMM kernel failed: " << cudaGetErrorString(result) << std::endl;
cudaFree(C_reference); cudaFree(C_cutlass); cudaFree(B); cudaFree(A);
return result; }
// Copy to host and verify equivalence. std::vector<float> host_cutlass(ldc * N, 0); std::vector<float> host_reference(ldc * N, 0);
result = cudaMemcpy(host_cutlass.data(), C_cutlass, sizeof_C, cudaMemcpyDeviceToHost);
if (result != cudaSuccess) { std::cerr << "Failed to copy CUTLASS GEMM results: " << cudaGetErrorString(result) << std::endl;
cudaFree(C_reference); cudaFree(C_cutlass); cudaFree(B); cudaFree(A);
return result; }
result = cudaMemcpy(host_reference.data(), C_reference, sizeof_C, cudaMemcpyDeviceToHost);
if (result != cudaSuccess) { std::cerr << "Failed to copy Reference GEMM results: " << cudaGetErrorString(result) << std::endl;
cudaFree(C_reference); cudaFree(C_cutlass); cudaFree(B); cudaFree(A);
return result; }
// // Free device memory allocations. //
cudaFree(C_reference); cudaFree(C_cutlass); cudaFree(B); cudaFree(A);
// // Test for bit equivalence of results. //
if (host_cutlass != host_reference) { std::cerr << "CUTLASS results incorrect." << std::endl;
return cudaErrorUnknown; }
return cudaSuccess;}这个函数整理还是比较长的,不过注释写的很清楚,我们可以整理如下:
// // Define several matrices to be used as operands to GEMM kernels. //
// Compute leading dimensions for each matrix. int lda = M; int ldb = K; int ldc = M;
// Compute size in bytes of the C matrix. size_t sizeof_C = sizeof(float) * ldc * N;
// Define pointers to matrices in GPU device memory. float *A; float *B; float *C_cutlass; float *C_reference;这段就是去加载对应的参数,这里的 lda 等参数是数据分布相关的,下面的内存分配要用到,不过这里作为 NN 也就是列主序的,可以简单理解成就是矩阵的行。
接下来计算了下 size 和定义了很多指针,还是很常识的。
// // Allocate matrices in GPU device memory with arbitrary seeds. //
result = AllocateMatrix(&A, M, K, 0);
if (result != cudaSuccess) { return result; }
result = AllocateMatrix(&B, K, N, 17);
if (result != cudaSuccess) { cudaFree(A); return result; }
result = AllocateMatrix(&C_cutlass, M, N, 101);
if (result != cudaSuccess) { cudaFree(A); cudaFree(B); return result; }
result = AllocateMatrix(&C_reference, M, N, 101);
if (result != cudaSuccess) { cudaFree(A); cudaFree(B); cudaFree(C_cutlass); return result; }
result = cudaMemcpy(C_reference, C_cutlass, sizeof_C, cudaMemcpyDeviceToDevice);
if (result != cudaSuccess) { std::cerr << "Failed to copy C_cutlass matrix to C_reference: " << cudaGetErrorString(result) << std::endl;
cudaFree(C_reference); cudaFree(C_cutlass); cudaFree(B); cudaFree(A);
return result; }接下来实现了一版矩阵的初始化,用的自定义的随机数生成。
这里的 AllocateMatrix 和 InitializeMatrix 是相当于封装了 cudaMalloc + cudaMemset + ((offset + seed) * 16807 % 16) 生成的随机数。
然后再保证两个 C matrix 的一致性。
// // Launch CUTLASS GEMM. //
result = CutlassSgemmNN(M, N, K, alpha, A, lda, B, ldb, beta, C_cutlass, ldc);
if (result != cudaSuccess) { std::cerr << "CUTLASS GEMM kernel failed: " << cudaGetErrorString(result) << std::endl;
cudaFree(C_reference); cudaFree(C_cutlass); cudaFree(B); cudaFree(A);
return result; }接下来调用了 cutlassSgemmNN 这里的 NN 代表都不转置,然后是 check err。
/////////////////////////////////////////////////////////////////////////////////////////////////////// This function defines a CUTLASS GEMM kernel instantiation, constructs its parameters object,// and launches it on the CUDA device./////////////////////////////////////////////////////////////////////////////////////////////////////
/// Define a CUTLASS GEMM template and launch a GEMM kernel.cudaError_t CutlassSgemmNN( int M, int N, int K, float alpha, float const *A, int lda, float const *B, int ldb, float beta, float *C, int ldc) {
// Define type definition for single-precision CUTLASS GEMM with column-major // input matrices and 128x128x8 threadblock tile size (chosen by default). // // To keep the interface manageable, several helpers are defined for plausible compositions // including the following example for single-precision GEMM. Typical values are used as // default template arguments. See `cutlass/gemm/device/default_gemm_configuration.h` for more details. // // To view the full gemm device API interface, see `cutlass/gemm/device/gemm.h`
using ColumnMajor = cutlass::layout::ColumnMajor;
using CutlassGemm = cutlass::gemm::device::Gemm<float, // Data-type of A matrix ColumnMajor, // Layout of A matrix float, // Data-type of B matrix ColumnMajor, // Layout of B matrix float, // Data-type of C matrix ColumnMajor>; // Layout of C matrix
// Define a CUTLASS GEMM type CutlassGemm gemm_operator;
// Construct the CUTLASS GEMM arguments object. // // One of CUTLASS's design patterns is to define gemm argument objects that are constructible // in host code and passed to kernels by value. These may include pointers, strides, scalars, // and other arguments needed by Gemm and its components. // // The benefits of this pattern are (1.) a structured, composable strategy for passing host-constructible // arguments to kernels and (2.) minimized initialization overhead on kernel entry. // CutlassGemm::Arguments args({M , N, K}, // Gemm Problem dimensions {A, lda}, // Tensor-ref for source matrix A {B, ldb}, // Tensor-ref for source matrix B {C, ldc}, // Tensor-ref for source matrix C {C, ldc}, // Tensor-ref for destination matrix D (may be different memory than source C matrix) {alpha, beta}); // Scalars used in the Epilogue
// // Launch the CUTLASS GEMM kernel. //
cutlass::Status status = gemm_operator(args);
// // Return a cudaError_t if the CUTLASS GEMM operator returned an error code. //
if (status != cutlass::Status::kSuccess) { return cudaErrorUnknown; }
// Return success, if no errors were encountered. return cudaSuccess;}接下来是对 cutlass 简单 gemm 的一个封装,不过要注意的是:
- 用
using CutlassGemm = cutlass::gemm::device::Gemm<float,ColumnMajor,float,ColumnMajor,float,ColumnMajor定义了一下gemm的类型。 - 上面这个类有一个
Arguments的嵌套struct类型,这里面包括A、B、C、D四个matrix的相关参数。
内部类似
struct Arguments {
// // Data members //
GemmCoord problem_size; TensorRef<ElementA const, LayoutA> ref_A; TensorRef<ElementB const, LayoutB> ref_B; TensorRef<ElementC const, LayoutC> ref_C; TensorRef<ElementC, LayoutC> ref_D; typename EpilogueOutputOp::Params epilogue; int split_k_slices; // For gather+scatter operations int *gather_A_indices; int *gather_B_indices; int *scatter_D_indices;
// // Methods //
/// Default ctor CUTLASS_HOST_DEVICE Arguments() { }
/// Constructs an Arguments structure CUTLASS_HOST_DEVICE Arguments( GemmCoord problem_size_, TensorRef<ElementA const, LayoutA> ref_A_, TensorRef<ElementB const, LayoutB> ref_B_, TensorRef<ElementC const, LayoutC> ref_C_, TensorRef<ElementC, LayoutC> ref_D_, typename EpilogueOutputOp::Params epilogue_ = typename EpilogueOutputOp::Params(), int split_k_slices = 1, int *gather_A_indices_ = nullptr, int *gather_B_indices_ = nullptr, int *scatter_D_indices_ = nullptr ): problem_size(problem_size_), ref_A(ref_A_), ref_B(ref_B_), ref_C(ref_C_), ref_D(ref_D_), epilogue(epilogue_), split_k_slices(split_k_slices), gather_A_indices(gather_A_indices_), gather_B_indices(gather_B_indices_), scatter_D_indices(scatter_D_indices_) { } };这里相当于只填写了
1 GemmCoord problem_size; // 问题维度 {M, N, K}2 TensorRef<ElementA const, LayoutA> ref_A; // A矩阵引用 (只读)3 TensorRef<ElementB const, LayoutB> ref_B; // B矩阵引用 (只读)4 TensorRef<ElementC const, LayoutC> ref_C; // 输入C矩阵引用 (只读)5 TensorRef<ElementC, LayoutC> ref_D; // 输出D矩阵引用 (可写)这样就可以成功执行了,这相当于就是一个最简单的 GEMM 实现。