导言#

作为现在的高性能计算开发者，Nvidia 的生态是我们绕不开的一关。Nvidia 的 CUDA 编程模型基于 SIMT，编写起来较为自然，但是随着 Tensor Core 这一类专用单元的增多，编写高性能 kernel 的学习曲线变得越来越陡峭。为此，Nvidia 提供了 CUTLASS 这个仓库，帮助大家充分利用每一代新的 GPU 架构。哪怕不直接使用 CUTLASS 编写代码，学习 CUTLASS 也是编写高性能内核不可或缺的一环。

这里借助本文对 CUTLASS 进行入门的学习。

快速上手#

我们需要预先安装好下面的依赖，一般装有较新 GPU 的服务器都有下面的依赖

NVIDIA CUDA Toolkit (11.4 or later required, 12.0 recommended)
CMake 3.18+
host compiler supporting C++17 or greater (minimum g++ 7.5.0)
Python 3.6+
cuBLAS
cuDNN v7.6 or later

这里使用的环境如下

1
(base) ➜  ~ python --version
2
Python 3.13.2
3
(base) ➜  ~ module load cuda/12.8
4
Loading cuda version 12.8
5
(base) ➜  ~ module load cudnn
6
Loading nvidia version 9.7.1

显卡是

一台四卡 3080ti 的机器，方便编写各种场景下的代码。

接下来配置 IDE。官方文档给出了一份对 CUDA 文件非常友好的 clangd 配置文件（其中架构相关的选项以 sm_90a 为例，可按自己的显卡调整），内容如下：

1
CompileFlags:
2
  Compiler: /usr/local/cuda-12.8/bin/nvcc
3
  Add:
4
    - --cuda-path=/usr/local/cuda-12.8
38 collapsed lines
5
    - --cuda-gpu-arch=sm_90a
6
    - -I/usr/local/cuda/include
7
    - "-xcuda"
8
#report all errors
9
    - "-ferror-limit=0"
10
    - --cuda-gpu-arch=sm_90a
11
    - --std=c++17
12
    - "-D__INTELLISENSE__"
13
    - "-D__CLANGD__"
14
    - "-DCUDA_12_0_SM90_FEATURES_SUPPORTED"
15
    - "-DCUTLASS_ARCH_MMA_SM90_SUPPORTED=1"
16
    - "-D_LIBCUDACXX_STD_VER=12"
17
    - "-D__CUDACC_VER_MAJOR__=12"
18
    - "-D__CUDACC_VER_MINOR__=3"
19
    - "-D__CUDA_ARCH__=900"
20
    - "-D__CUDA_ARCH_FEAT_SM90_ALL"
21
    - "-Wno-invalid-constexpr"
22
  Remove:
23
#strip CUDA fatbin args
24
    - "-Xfatbin*"
25
#strip CUDA arch flags
26
    - "-gencode*"
27
    - "--generate-code*"
28
#strip CUDA flags unknown to clang
29
    - "-ccbin*"
30
    - "--compiler-options*"
31
    - "--expt-extended-lambda"
32
    - "--expt-relaxed-constexpr"
33
    - "-forward-unknown-to-host-compiler"
34
    - "-Werror=cross-execution-space-call"
35
Hover:
36
  ShowAKA: No
37
InlayHints:
38
  Enabled: Yes
39
Diagnostics:
40
  Suppress:
41
    - "variadic_device_fn"
42
    - "attributes_not_allowed"

基本可以保证我们的跳转不犯病。

这里我们先以尽量短的编译时间，构建只支持计算能力 8.6（sm_86）的 CUTLASS。根据官网的教程，命令是：

1
cmake -S . -B build -DCUTLASS_NVCC_ARCHS=86 -DCUTLASS_ENABLE_TESTS=OFF -DCUTLASS_UNITY_BUILD_ENABLED=ON

重点是开启了 Unity Build（CUTLASS_UNITY_BUILD_ENABLED，即把多个源文件合并成更少的编译单元，以缩短编译时间），同时关闭了测试的构建。然后通过 --target 只编译我们需要的 cutlass_profiler：

1
cmake --build build -j12 --target cutlass_profiler

接下来等待编译即可。

这个时候我们已经编译成功我们的 cutlass_profiler。

Basic GEMM 学习#

1
int main(int argc, const char *arg[]) {
2

3
  //
4
  // Parse the command line to obtain GEMM dimensions and scalar values.
5
  //
6

7
  // GEMM problem dimensions.
8
  int problem[3] = { 128, 128, 128 };
9

10
  for (int i = 1; i < argc && i < 4; ++i) {
11
    std::stringstream ss(arg[i]);
12
    ss >> problem[i - 1];
13
  }
14

15
  // Scalars used for linear scaling the result of the matrix product.
16
  float scalars[2] = { 1, 0 };
17

18
  for (int i = 4; i < argc && i < 6; ++i) {
19
    std::stringstream ss(arg[i]);
20
    ss >> scalars[i - 4];
21
  }
22

23
  //
24
  // Run the CUTLASS GEMM test.
25
  //
26

27
  cudaError_t result = TestCutlassGemm(
28
    problem[0],     // GEMM M dimension
29
    problem[1],     // GEMM N dimension
30
    problem[2],     // GEMM K dimension
31
    scalars[0],     // alpha
32
    scalars[1]      // beta
33
  );
34

35
  if (result == cudaSuccess) {
36
    std::cout << "Passed." << std::endl;
37
  }
38

39
  // Exit.
40
  return result == cudaSuccess ? 0 : -1;
41
}

先看主函数，如上。逻辑很清楚：主要是封装了命令行参数的读取，依次是矩阵尺寸 M、N、K 和标量 alpha、beta（默认值分别为 128、128、128 和 1、0），然后调用对应的测试函数；如果测试函数返回 cudaSuccess，就打印 "Passed." 并以 0 作为退出码，否则以 -1 退出。

1
cudaError_t TestCutlassGemm(int M, int N, int K, float alpha, float beta) {
158 collapsed lines
2
  cudaError_t result;
3

4
  //
5
  // Define several matrices to be used as operands to GEMM kernels.
6
  //
7

8
  // Compute leading dimensions for each matrix.
9
  int lda = M;
10
  int ldb = K;
11
  int ldc = M;
12

13
  // Compute size in bytes of the C matrix.
14
  size_t sizeof_C = sizeof(float) * ldc * N;
15

16
  // Define pointers to matrices in GPU device memory.
17
  float *A;
18
  float *B;
19
  float *C_cutlass;
20
  float *C_reference;
21

22
  //
23
  // Allocate matrices in GPU device memory with arbitrary seeds.
24
  //
25

26
  result = AllocateMatrix(&A, M, K, 0);
27

28
  if (result !=  cudaSuccess) {
29
    return result;
30
  }
31

32
  result = AllocateMatrix(&B, K, N, 17);
33

34
  if (result !=  cudaSuccess) {
35
    cudaFree(A);
36
    return result;
37
  }
38

39
  result = AllocateMatrix(&C_cutlass, M, N, 101);
40

41
  if (result != cudaSuccess) {
42
    cudaFree(A);
43
    cudaFree(B);
44
    return result;
45
  }
46

47
  result = AllocateMatrix(&C_reference, M, N, 101);
48

49
  if (result != cudaSuccess) {
50
    cudaFree(A);
51
    cudaFree(B);
52
    cudaFree(C_cutlass);
53
    return result;
54
  }
55

56
  result = cudaMemcpy(C_reference, C_cutlass, sizeof_C, cudaMemcpyDeviceToDevice);
57

58
  if (result != cudaSuccess) {
59
    std::cerr << "Failed to copy C_cutlass matrix to C_reference: "
60
      << cudaGetErrorString(result) << std::endl;
61

62
    cudaFree(C_reference);
63
    cudaFree(C_cutlass);
64
    cudaFree(B);
65
    cudaFree(A);
66

67
    return result;
68
  }
69

70
  //
71
  // Launch CUTLASS GEMM.
72
  //
73

74
  result = CutlassSgemmNN(M, N, K, alpha, A, lda, B, ldb, beta, C_cutlass, ldc);
75

76
  if (result != cudaSuccess) {
77
    std::cerr << "CUTLASS GEMM kernel failed: "
78
      << cudaGetErrorString(result) << std::endl;
79

80
    cudaFree(C_reference);
81
    cudaFree(C_cutlass);
82
    cudaFree(B);
83
    cudaFree(A);
84

85
    return result;
86
  }
87

88
  //
89
  // Verify.
90
  //
91

92
  // Launch reference GEMM
93
  result = ReferenceGemm(M, N, K, alpha, A, lda, B, ldb, beta, C_reference, ldc);
94

95
  if (result != cudaSuccess) {
96
    std::cerr << "Reference GEMM kernel failed: "
97
      << cudaGetErrorString(result) << std::endl;
98

99
    cudaFree(C_reference);
100
    cudaFree(C_cutlass);
101
    cudaFree(B);
102
    cudaFree(A);
103

104
    return result;
105
  }
106

107
  // Copy to host and verify equivalence.
108
  std::vector<float> host_cutlass(ldc * N, 0);
109
  std::vector<float> host_reference(ldc * N, 0);
110

111
  result = cudaMemcpy(host_cutlass.data(), C_cutlass, sizeof_C, cudaMemcpyDeviceToHost);
112

113
  if (result != cudaSuccess) {
114
    std::cerr << "Failed to copy CUTLASS GEMM results: "
115
      << cudaGetErrorString(result) << std::endl;
116

117
    cudaFree(C_reference);
118
    cudaFree(C_cutlass);
119
    cudaFree(B);
120
    cudaFree(A);
121

122
    return result;
123
  }
124

125
  result = cudaMemcpy(host_reference.data(), C_reference, sizeof_C, cudaMemcpyDeviceToHost);
126

127
  if (result != cudaSuccess) {
128
    std::cerr << "Failed to copy Reference GEMM results: "
129
      << cudaGetErrorString(result) << std::endl;
130

131
    cudaFree(C_reference);
132
    cudaFree(C_cutlass);
133
    cudaFree(B);
134
    cudaFree(A);
135

136
    return result;
137
  }
138

139
  //
140
  // Free device memory allocations.
141
  //
142

143
  cudaFree(C_reference);
144
  cudaFree(C_cutlass);
145
  cudaFree(B);
146
  cudaFree(A);
147

148
  //
149
  // Test for bit equivalence of results.
150
  //
151

152
  if (host_cutlass != host_reference) {
153
    std::cerr << "CUTLASS results incorrect." << std::endl;
154

155
    return cudaErrorUnknown;
156
  }
157

158
  return cudaSuccess;
159
}

这个函数整体还是比较长的，不过注释写得很清楚，我们可以整理如下：

1
  //
2
  // Define several matrices to be used as operands to GEMM kernels.
3
  //
4

5
  // Compute leading dimensions for each matrix.
6
  int lda = M;
7
  int ldb = K;
8
  int ldc = M;
9

10
  // Compute size in bytes of the C matrix.
11
  size_t sizeof_C = sizeof(float) * ldc * N;
12

13
  // Define pointers to matrices in GPU device memory.
14
  float *A;
15
  float *B;
16
  float *C_cutlass;
17
  float *C_reference;

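这段是在准备后面要用到的参数。这里的 lda、ldb、ldc 是各矩阵的 leading dimension，与数据布局相关，后面计算 C 的字节数以及调用 GEMM 时都要用到。由于这里的矩阵都是列主序（column-major）存储的，leading dimension 可以简单理解成矩阵的行数：A 是 M×K，所以 lda = M；B 是 K×N，所以 ldb = K；C 是 M×N，所以 ldc = M。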

接下来计算了下 size 和定义了很多指针，还是很常识的。

1
  //
2
  // Allocate matrices in GPU device memory with arbitrary seeds.
3
  //
4

5
  result = AllocateMatrix(&A, M, K, 0);
6

7
  if (result !=  cudaSuccess) {
8
    return result;
9
  }
10

11
  result = AllocateMatrix(&B, K, N, 17);
12

13
  if (result !=  cudaSuccess) {
14
    cudaFree(A);
15
    return result;
16
  }
17

18
  result = AllocateMatrix(&C_cutlass, M, N, 101);
19

20
  if (result != cudaSuccess) {
21
    cudaFree(A);
22
    cudaFree(B);
23
    return result;
24
  }
25

26
  result = AllocateMatrix(&C_reference, M, N, 101);
27

28
  if (result != cudaSuccess) {
29
    cudaFree(A);
30
    cudaFree(B);
31
    cudaFree(C_cutlass);
32
    return result;
33
  }
34

35
  result = cudaMemcpy(C_reference, C_cutlass, sizeof_C, cudaMemcpyDeviceToDevice);
36

37
  if (result != cudaSuccess) {
38
    std::cerr << "Failed to copy C_cutlass matrix to C_reference: "
39
      << cudaGetErrorString(result) << std::endl;
40

41
    cudaFree(C_reference);
42
    cudaFree(C_cutlass);
43
    cudaFree(B);
44
    cudaFree(A);
45

46
    return result;
47
  }

接下来实现了一版矩阵的初始化，用的自定义的随机数生成。

这里的 AllocateMatrix 和 InitializeMatrix 是相当于封装了 cudaMalloc + cudaMemset + ((offset + seed) * 16807 % 16) 生成的随机数。

两个 C 矩阵虽然用了相同的种子 101 来初始化，这里仍然再用一次 cudaMemcpy（DeviceToDevice）把 C_cutlass 拷贝到 C_reference，保证两个 C 矩阵的初始内容完全一致。

1
  //
2
  // Launch CUTLASS GEMM.
3
  //
4

5
  result = CutlassSgemmNN(M, N, K, alpha, A, lda, B, ldb, beta, C_cutlass, ldc);
6

7
  if (result != cudaSuccess) {
8
    std::cerr << "CUTLASS GEMM kernel failed: "
9
      << cudaGetErrorString(result) << std::endl;
10

11
    cudaFree(C_reference);
12
    cudaFree(C_cutlass);
13
    cudaFree(B);
14
    cudaFree(A);
15

16
    return result;
17
  }

接下来调用了 CutlassSgemmNN，这里的 NN 代表 A、B 两个矩阵都不转置，然后检查返回的错误码，出错时释放显存并返回。

1
///////////////////////////////////////////////////////////////////////////////////////////////////
2
//
3
// This function defines a CUTLASS GEMM kernel instantiation, constructs its parameters object,
4
// and launches it on the CUDA device.
5
//
6
///////////////////////////////////////////////////////////////////////////////////////////////////
7

8
/// Define a CUTLASS GEMM template and launch a GEMM kernel.
9
cudaError_t CutlassSgemmNN(
10
  int M,
11
  int N,
12
  int K,
13
  float alpha,
14
  float const *A,
15
  int lda,
16
  float const *B,
17
  int ldb,
18
  float beta,
19
  float *C,
20
  int ldc) {
21

22
  // Define type definition for single-precision CUTLASS GEMM with column-major
23
  // input matrices and 128x128x8 threadblock tile size (chosen by default).
24
  //
25
  // To keep the interface manageable, several helpers are defined for plausible compositions
26
  // including the following example for single-precision GEMM. Typical values are used as
27
  // default template arguments. See `cutlass/gemm/device/default_gemm_configuration.h` for more details.
28
  //
29
  // To view the full gemm device API interface, see `cutlass/gemm/device/gemm.h`
30

31
  using ColumnMajor = cutlass::layout::ColumnMajor;
32

33
  using CutlassGemm = cutlass::gemm::device::Gemm<float,        // Data-type of A matrix
34
                                                  ColumnMajor,  // Layout of A matrix
35
                                                  float,        // Data-type of B matrix
36
                                                  ColumnMajor,  // Layout of B matrix
37
                                                  float,        // Data-type of C matrix
38
                                                  ColumnMajor>; // Layout of C matrix
39

40
  // Define a CUTLASS GEMM type
41
  CutlassGemm gemm_operator;
42

43
  // Construct the CUTLASS GEMM arguments object.
44
  //
45
  // One of CUTLASS's design patterns is to define gemm argument objects that are constructible
46
  // in host code and passed to kernels by value. These may include pointers, strides, scalars,
47
  // and other arguments needed by Gemm and its components.
48
  //
49
  // The benefits of this pattern are (1.) a structured, composable strategy for passing host-constructible
50
  // arguments to kernels and (2.) minimized initialization overhead on kernel entry.
51
  //
52
  CutlassGemm::Arguments args({M , N, K},  // Gemm Problem dimensions
53
                              {A, lda},    // Tensor-ref for source matrix A
54
                              {B, ldb},    // Tensor-ref for source matrix B
55
                              {C, ldc},    // Tensor-ref for source matrix C
56
                              {C, ldc},    // Tensor-ref for destination matrix D (may be different memory than source C matrix)
57
                              {alpha, beta}); // Scalars used in the Epilogue
58

59
  //
60
  // Launch the CUTLASS GEMM kernel.
61
  //
62

63
  cutlass::Status status = gemm_operator(args);
64

65
  //
66
  // Return a cudaError_t if the CUTLASS GEMM operator returned an error code.
67
  //
68

69
  if (status != cutlass::Status::kSuccess) {
70
    return cudaErrorUnknown;
71
  }
72

73
  // Return success, if no errors were encountered.
74
  return cudaSuccess;
75
}

接下来是对 cutlass 简单 gemm 的一个封装，不过要注意的是：

用 using CutlassGemm = cutlass::gemm::device::Gemm<float, ColumnMajor, float, ColumnMajor, float, ColumnMajor>; 定义了一下 gemm 的类型。
上面这个类有一个 Arguments 的嵌套 struct 类型，这里面包括 A、B、C、D 四个 matrix 的相关参数。

内部类似

1
  struct Arguments {
2

3
    //
4
    // Data members
5
    //
6

7
    GemmCoord problem_size;
8
    TensorRef<ElementA const, LayoutA> ref_A;
9
    TensorRef<ElementB const, LayoutB> ref_B;
10
    TensorRef<ElementC const, LayoutC> ref_C;
11
    TensorRef<ElementC, LayoutC> ref_D;
12
    typename EpilogueOutputOp::Params epilogue;
13
    int split_k_slices;
14
    // For gather+scatter operations
15
    int *gather_A_indices;
16
    int *gather_B_indices;
17
    int *scatter_D_indices;
18

19
    //
20
    // Methods
21
    //
22

23
    /// Default ctor
24
    CUTLASS_HOST_DEVICE
25
    Arguments() { }
26

27
    /// Constructs an Arguments structure
28
    CUTLASS_HOST_DEVICE
29
    Arguments(
30
      GemmCoord problem_size_,
31
      TensorRef<ElementA const, LayoutA> ref_A_,
32
      TensorRef<ElementB const, LayoutB> ref_B_,
33
      TensorRef<ElementC const, LayoutC> ref_C_,
34
      TensorRef<ElementC, LayoutC> ref_D_,
35
      typename EpilogueOutputOp::Params epilogue_ =
36
        typename EpilogueOutputOp::Params(),
37
      int split_k_slices = 1,
38
      int *gather_A_indices_ = nullptr,
39
      int *gather_B_indices_ = nullptr,
40
      int *scatter_D_indices_ = nullptr
41
    ):
42
      problem_size(problem_size_),
43
      ref_A(ref_A_),
44
      ref_B(ref_B_),
45
      ref_C(ref_C_),
46
      ref_D(ref_D_),
47
      epilogue(epilogue_),
48
      split_k_slices(split_k_slices),
49
      gather_A_indices(gather_A_indices_),
50
      gather_B_indices(gather_B_indices_),
51
      scatter_D_indices(scatter_D_indices_) { }
52
  };

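这里相当于只填写了下面这几个与问题规模和张量相关的参数，再加上 epilogue 用到的 {alpha, beta}；其余参数（split_k_slices 以及 gather/scatter 的索引指针）都使用构造函数的默认值：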

1
1 GemmCoord problem_size;                    // 问题维度 {M, N, K}
2
2 TensorRef<ElementA const, LayoutA> ref_A;  // A矩阵引用 (只读)
3
3 TensorRef<ElementB const, LayoutB> ref_B;  // B矩阵引用 (只读)
4
4 TensorRef<ElementC const, LayoutC> ref_C;  // 输入C矩阵引用 (只读)
5
5 TensorRef<ElementC, LayoutC> ref_D;        // 输出D矩阵引用 (可写)

这样就可以成功执行了，这相当于就是一个最简单的 GEMM 实现。