```python
# Adapted straight from https://triton-lang.org/main/getting-started/tutorials/02-fused-softmax.html
import triton
import triton.language as tl
import torch

# if @triton.jit(interpret=True) does not work, please use the following two lines to enable interpret mode
# import os
# os.environ["TRITON_INTERPRET"] = "1"


@triton.jit
def square_kernel(output_ptr, input_ptr, input_row_stride, output_row_stride, n_cols, BLOCK_SIZE: tl.constexpr):
    # The rows are independent, so we parallelize across those
    row_idx = tl.program_id(0)
    # The stride represents how much we need to increase the pointer to advance 1 row
    row_start_ptr = input_ptr + row_idx * input_row_stride
    # The block size is the next power of two greater than n_cols, so we can fit each
    # row in a single block
    col_offsets = tl.arange(0, BLOCK_SIZE)
    input_ptrs = row_start_ptr + col_offsets
    # Load the row into SRAM, using a mask since BLOCK_SIZE may be > than n_cols
    row = tl.load(input_ptrs, mask=col_offsets < n_cols, other=-float('inf'))
    square_output = row * row
    # Write back output to DRAM
    output_row_start_ptr = output_ptr + row_idx * output_row_stride
    output_ptrs = output_row_start_ptr + col_offsets
    tl.store(output_ptrs, square_output, mask=col_offsets < n_cols)


def square(x):
    n_rows, n_cols = x.shape
    # The block size is the smallest power of two greater than the number of columns in `x`
    BLOCK_SIZE = triton.next_power_of_2(n_cols)
    # Another trick we can use is to ask the compiler to use more threads per row by
    # increasing the number of warps (`num_warps`) over which each row is distributed.
    # You will see in the next tutorial how to auto-tune this value in a more natural
    # way so you don't have to come up with manual heuristics yourself.
    num_warps = 4
    if BLOCK_SIZE >= 2048:
        num_warps = 8
    if BLOCK_SIZE >= 4096:
        num_warps = 16
    # Allocate output
    y = torch.empty_like(x)
    # Enqueue kernel. The 1D launch grid is simple: we have one kernel instance per row
    # of the input matrix
    square_kernel[(n_rows, )](
        y,
        x,
        x.stride(0),
        y.stride(0),
        n_cols,
        num_warps=num_warps,
        BLOCK_SIZE=BLOCK_SIZE,
    )
    return y
```
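To generate the profile discussed in the following sections, the kernel has to be launched at least once under Nsight Compute. Below is a minimal driver sketch; the file name `square.py`, the 1823×781 input shape, and the correctness check are my own assumptions, not part of the original post. It can then be profiled with, for example, `ncu --set full -o square_profile python square.py`, and the resulting `.ncu-rep` report opened in the Nsight Compute GUI.

```python
# Minimal driver sketch (assumed file name: square.py). It runs the Triton square
# kernel once and checks the result against eager PyTorch, so that ncu has a clean
# kernel launch to capture.
import torch

x = torch.randn(1823, 781, device="cuda")   # illustrative shape, not from the original post
y_triton = square(x)                         # `square` is defined above in the same file
y_torch = x * x
assert torch.allclose(y_triton, y_torch), "Triton and torch results differ"
print("max abs diff:", (y_triton - y_torch).abs().max().item())
```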
First up is the GPU Speed Of Light Throughput section, which normally sits at the top of the Details page. It gives a concise picture of how well the GPU's resources are being utilized. In the screenshot below, we can again hover over each metric to see its details, so I won't go through them one by one here.
From this result we can see (see the arithmetic-intensity sketch after this list):

- Memory throughput (83.56%) is far higher than compute throughput (15.55%), which suggests this is a memory-bound workload.
- L1/TEX and L2 cache throughput are comparatively low, so there may be room for optimization there.
- DRAM throughput matches the overall memory throughput, which indicates that the bulk of the memory traffic goes straight to DRAM.
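The memory-bound verdict follows directly from the kernel's arithmetic intensity. The back-of-the-envelope estimate below is a sketch of that reasoning, not part of the original measurement; the 1823×781 shape is an assumed input size.

```python
# Why Speed Of Light says "memory-bound": a quick arithmetic-intensity estimate
# for the square kernel. The matrix shape is an illustrative assumption.
n_rows, n_cols = 1823, 781
elems = n_rows * n_cols
bytes_moved = 2 * elems * 4        # each fp32 element is read once and written once
flops = elems                      # one multiply per element
intensity = flops / bytes_moved    # = 0.125 FLOP/byte
print(f"arithmetic intensity: {intensity:.3f} FLOP/byte")
# Modern GPUs need on the order of tens of FLOPs per byte before a kernel becomes
# compute-bound, so at 0.125 FLOP/byte this kernel sits firmly in the memory-bound
# regime -- consistent with the 83.56% memory vs. 15.55% compute throughput split.
```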
The Memory Workload Analysis section
Let's go through each part from top to bottom:
Top performance metrics
Nsight Compute's own description of this section reads: "Detailed analysis of the memory resources of the GPU. Memory can become a limiting factor for the overall kernel performance when fully utilizing the involved hardware units (Mem Busy), exhausting the available communication bandwidth between those units (Max Bandwidth), or by reaching the maximum throughput of issuing memory instructions (Mem Pipes Busy)." The section also contains a detailed chart of the memory units and detailed tables with data for each memory unit.
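As a sanity check on the bandwidth figures that Memory Workload Analysis reports, the achieved DRAM throughput can also be estimated from the host side. The sketch below is only an illustration under assumptions (the input shape, CUDA-event timing, and the one-read-plus-one-write byte count are all mine), not a replacement for the profiler's numbers.

```python
# Rough host-side cross-check of the memory throughput reported by Nsight Compute:
# time one launch of the square kernel and convert the bytes moved into GB/s.
import torch

x = torch.randn(1823, 781, device="cuda")   # assumed shape
square(x)                                    # warm-up launch so JIT compilation is excluded
torch.cuda.synchronize()

start = torch.cuda.Event(enable_timing=True)
end = torch.cuda.Event(enable_timing=True)
start.record()
square(x)
end.record()
torch.cuda.synchronize()

ms = start.elapsed_time(end)                             # elapsed time in milliseconds
bytes_moved = 2 * x.numel() * x.element_size()           # one read + one write per element
print(f"effective bandwidth: {bytes_moved / (ms * 1e-3) / 1e9:.1f} GB/s")
```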