专栏名称: NVIDIA企业开发者社区

NVIDIA 英伟达开发者社区是NVIDIA GPU开发者交流平台，通过此平台可第一时间获取NVIDIA GPU 开发相关的新产品、新工具、线上/线下活动的资讯。

技术博客 | 使用 NVIDIA CUDA Toolkit 12.4 编译器创建运行时 Fatbin

NVIDIA企业开发者社区 · 公众号 · · 2024-07-10 18:32

正文

CUDA Toolkit 12.4 引入了一个新的 nvFatbin 库，用于在运行时创建 fatbins。fatbins，也称为 NVIDIA 设备代码 fat 二进制文件 sm_61 和 sm_90 。

到目前为止，要生成 fatbin，必须依赖命令行工具 fatbinary ，这不适合动态代码生成。这使得动态生成 fatbins 变得困难，因为您需要将生成的代码放入一个文件中，然后使用 exec 或类似命令调用 fatbinary ，并处理输出，这显著增加了动态生成 fatbins 的难度，并导致多次尝试通过各种容器模仿 fatbins。

CUDA Toolkit 12.4 引入了 nvFatbin，这是一个新的库，能够通过编程创建 fatbin，从而大大简化了这项任务，不再需要写入文件、调用 exec 、解析命令行输出和从目录中获取输出文件。

新库提供了运行时 fatbin 创建支持

使用 nvFatbin 库类似于任何其他熟悉的库，如 NVRTC 、nvPTXCompiler 和 nvJitLink。nvFatbin 库有静态和动态版本，适用于所有平台，这些平台都随 nvrtc 提供。

经过适当考虑，通过 nvFatbin 库创建的 Fatbin 符合 CUDA 兼容性保证。本文主要涵盖通过 nvFatbin 库的运行时 fatbin 创建，并在适当的时候强调与现有命令行 fatbinary 的差异。我们将通过代码示例、兼容性保证和优点深入了解该功能的细节。作为额外福利，我们还提供了 NVIDIA TensorRT 计划如何以及为什么利用该功能的预览。

图 1：与使用 nvFatbin 库相比，现有的 fatbinary 命令行工具

如何使运行时 fatbin 创建正常工作

创建稍后要引用的句柄，以便将相关的设备代码插入到 fatbinary 中。

nvFatbinCreate(&handle, numOptions, options);

使用取决于输入类型的函数，添加要放入 fatbin 的设备代码。

nvFatbinAddCubin(handle, data, size, arch, name);

nvFatbinAddPTX(handle, data, size, arch, name, ptxOptions);

nvFatbinAddLTOIR(handle, data, size, arch, name, ltoirOptions);

对于 PTX 和 LTO-IR （一种用于 JIT LTO 的中间表示形式），请在此处指定在 JIT 编译期间使用的其他选项。

检索得到的 fatbin。为此，显式分配一个缓冲区。执行此操作时，请确保查询生成的 fatbin 的大小，以确保分配了足够的空间。

nvFatbinSize(linker, &fatbinSize);

void * fatbin = malloc (fatbinSize);

nvFatbinGet(handle, fatbin);

清理句柄

nvFatbinDestroy(&handle);

使用 NVCC 离线生成 fatbins

要使用 NVCC 离线生成一个 fatbin，请添加选项 -fatbin 。例如，给定文件 loader.cu ，以下命令将生成一个 fatbin，其中包含一个用于 sm_90 的条目，该条目包含代码的 LTO-IR 版本，名为 loader.fatbin 。

nvcc -arch lto_90 -fatbin loader.cu

如果指定 -arch=sm_90 ，nvcc 将创建一个 fatbin，该 fatbin 同时包含 PTX 和 CUBIN（SASS）。该对象包含特定于 sm_90 的 SASS 指令和 PTX，以后可以对任何架构 >=90 进行 JIT。

nvcc -arch sm_90 -fatbin loader.cu

要创建具有多个条目的 fatbin，请使用指定多个体系结构 -gencode :

nvcc -gencode arch=compute_80,code=sm_80 -gencode arch=compute_90,code=sm_90 -gencode arch=compute_52,code=compute_52

这将创建一个包含 sm_80 ELF , sm_90 ELF 和 compute_52 PTX 。您可以使用 cuobjdump 查看 fatbin 的内容。

在运行时生成 fatbins

除了前面描述的离线编译和运行时 fatbin 创建模型（图 1）外，还可以在运行时完全构建 fatbin，方法是使用 NVRTC 生成对象代码，然后使用 nvFatbin API 将它们添加到 fatbin。以下代码示例对使用 nvFatbin API 进行了相关修改。

#include

#define NUM_THREADS 128

#define NUM_BLOCKS 32

#define NVRTC_SAFE_CALL(x) \

do { \

nvrtcResult result = x; \

if (result != NVRTC_SUCCESS) { \

std::cerr << "\nerror: " #x " failed with error " \

<< nvrtcGetErrorString(result) << '\n' ; \

exit (1); \

} \

} while (0)

#define CUDA_SAFE_CALL(x) \

do { \

CUresult result = x; \

if (result != CUDA_SUCCESS) { \

const char *msg; \

cuGetErrorName(result, &msg); \

std::cerr << "\nerror: " #x " failed with error " \

<< msg << '\n' ; \

exit (1); \

} \

} while (0)

#define NVFATBIN_SAFE_CALL(x) \

do \

{ \

nvFatbinResult result = x; \

if (result != NVFATBIN_SUCCESS) \

{ \

std::cerr << "\nerror: " #x " failed with error " \

<< nvFatbinGetErrorString(result) << '\n' ;\

exit (1); \

} \

} while (0)

const char *fatbin_saxpy = " \n\

__device__ float compute( float a, float x, float y) { \n\

return a * x + y; \n\

} \n\

\n\

extern \"C\" __global__ \n\

void saxpy( float a, float *x, float *y, float *out, size_t n) \n\

{ \n\

size_t tid = blockIdx.x * blockDim.x + threadIdx.x; \n\

if (tid < n) { \n\

out[tid] = compute(a, x[tid], y[tid]); \n\

} \n\

} \n";

size_t process( const void * input, const char * input_name, void ** output, const char * arch)

{

// Create an instance of nvrtcProgram with the code string.

nvrtcProgram prog;

NVRTC_SAFE_CALL(

nvrtcCreateProgram(&prog, // prog

( const char *) input, // buffer

input_name, // name

0, // numHeaders

NULL, // headers

NULL)); // includeNames

// specify that LTO IR should be generated for LTO operation

const char *opts[1];

opts[0] = arch;

nvrtcResult compileResult = nvrtcCompileProgram(prog, // prog

1, // numOptions

opts); // options

// Obtain compilation log from the program.

size_t logSize;

NVRTC_SAFE_CALL(nvrtcGetProgramLogSize(prog, &logSize));

char * log = new char [logSize];

NVRTC_SAFE_CALL(nvrtcGetProgramLog(prog, log ));

std::cout << log << '\n' ;

delete [] log ;

if (compileResult != NVRTC_SUCCESS) {

exit (1);

}

// Obtain generated CUBIN from the program.

size_t CUBINSize;

NVRTC_SAFE_CALL(nvrtcGetCUBINSize(prog, &CUBINSize));

char *CUBIN = new char [CUBINSize];

NVRTC_SAFE_CALL(nvrtcGetCUBIN(prog, CUBIN));

// Destroy the program.

NVRTC_SAFE_CALL(nvrtcDestroyProgram(&prog));

*output = ( void *) CUBIN;

return CUBINSize;

}

int main( int argc, char *argv[])

{

void * known = NULL;

size_t known_size = process(fatbin_saxpy, "fatbin_saxpy.cu" , &known, "-arch=sm_52" );

CUdevice cuDevice;

CUcontext context;

CUmodule module;

CUfunction kernel;

CUDA_SAFE_CALL(cuInit(0));

CUDA_SAFE_CALL(cuDeviceGet(&cuDevice, 0));

CUDA_SAFE_CALL(cuCtxCreate(&context, 0, cuDevice));

// Dynamically determine the arch to make one of the entries of the fatbin with

int major = 0;

int minor = 0;

CUDA_SAFE_CALL(cuDeviceGetAttribute(&major,

CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cuDevice));

CUDA_SAFE_CALL(cuDeviceGetAttribute(&minor,

CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cuDevice));

int arch = major*10 + minor;

char smbuf[16];

sprintf (smbuf, "-arch=sm_%d" , arch);

void * dynamic = NULL;

size_t dynamic_size = process(fatbin_saxpy, "fatbin_saxpy.cu" , &dynamic, smbuf);

sprintf (smbuf, "%d" , arch);

// Load the dynamic CUBIN and the statically known arch CUBIN

// and put them in a fatbin together.

nvFatbinHandle handle;

const char * fatbin_options[] = { "-cuda" };

NVFATBIN_SAFE_CALL(nvFatbinCreate(&handle, fatbin_options, 1));

NVFATBIN_SAFE_CALL(nvFatbinAddCubin(handle,

( void *)dynamic, dynamic_size, smbuf, "dynamic" ));

NVFATBIN_SAFE_CALL(nvFatbinAddCubin(handle,

( void *)known, known_size, "52" , "known" ));

size_t fatbinSize;

NVFATBIN_SAFE_CALL(nvFatbinSize(handle, &fatbinSize));

void *fatbin = malloc (fatbinSize);

NVFATBIN_SAFE_CALL(nvFatbinGet(handle, fatbin));

NVFATBIN_SAFE_CALL(nvFatbinDestroy(&handle));

CUDA_SAFE_CALL(cuModuleLoadData(&module, fatbin));

CUDA_SAFE_CALL(cuModuleGetFunction(&kernel, module, "saxpy" ));

// Generate input for execution, and create output buffers.

#define NUM_THREADS 128

#define NUM_BLOCKS 32

size_t n = NUM_THREADS * NUM_BLOCKS;

size_t bufferSize = n * sizeof ( float );

技术博客 | 使用 NVIDIA CUDA Toolkit 12.4 编译器创建运行时 Fatbin

正文

请到「今天看啥」查看全文