# We'll use several small numbers that, when added together first, could show a difference
numbers = [1e-20] * 10 + [1e20, -1e20]  # 10 small numbers followed by a large positive and negative number

# Sum the list from left to right
sum_left_to_right_adjusted = sum(numbers)

# Sum the list from right to left
sum_right_to_left_adjusted = sum(reversed(numbers))
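Printing the two sums (the print statements below are our addition, not part of the lecture code) makes the order dependence visible: summing left to right, the ten tiny addends are absorbed as soon as 1e20 is added, and the large pair then cancels to exactly 0.0; summing right to left, the large pair cancels first, so the tiny addends survive.

```python
print(sum_left_to_right_adjusted)   # expected: 0.0
print(sum_right_to_left_adjusted)   # expected: ~1e-19
```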
Another point worth making, already covered in the notes for CUDA-MODE Lecture 7 (Quantization Cuda vs Triton), is that even with INT4/INT8 quantization, the accumulation itself usually runs at higher precision. The reason is that if you accumulate many small values in float16, the large running sum can eventually swallow the small addends. There are two fixes: either use a data type with a larger dynamic range such as bf16, or accumulate at float32 precision. For example, looking at the Triton matrix-multiplication tutorial, we find that its accumulator is generally float32. The code for this example is at: https://github.com/cuda-mode/lectures/blob/main/lecture_009/accuracy.py
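To illustrate the float32-accumulator pattern, here is a minimal sketch in the spirit of the Triton matmul tutorial, not the tutorial code itself; the kernel name, `BLOCK_K`, and the launch parameters are our own illustrative choices:

```python
import torch
import triton
import triton.language as tl

@triton.jit
def dot_fp32_acc_kernel(a_ptr, b_ptr, out_ptr, K: tl.constexpr, BLOCK_K: tl.constexpr):
    # Inputs are float16, but the running sum is kept in float32,
    # mirroring the float32 accumulator in the Triton matmul tutorial.
    acc = tl.zeros((BLOCK_K,), dtype=tl.float32)
    for k in range(0, K, BLOCK_K):
        offs = k + tl.arange(0, BLOCK_K)
        a = tl.load(a_ptr + offs)
        b = tl.load(b_ptr + offs)
        acc += a.to(tl.float32) * b.to(tl.float32)  # accumulate at high precision
    tl.store(out_ptr, tl.sum(acc, axis=0))

a = torch.randn(4096, device="cuda", dtype=torch.float16)
b = torch.randn(4096, device="cuda", dtype=torch.float16)
out = torch.empty(1, device="cuda", dtype=torch.float32)
dot_fp32_acc_kernel[(1,)](a, b, out, K=4096, BLOCK_K=1024)
```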
import torch

large_value = torch.tensor([1000.0], dtype=torch.float32)  # Using float32 for initial value

# Define a smaller value that is significant for float32 but not for float16
small_value = torch.tensor([1e-3], dtype=torch.float32)  # Small value in float32

# Add small value to large value in float32
result_float32 = large_value + small_value

# Convert large value to float16 and add the small value (also converted to float16)
result_float16 = large_value.to(torch.float16) + small_value.to(torch.float16)

# Convert results back to float32 for accurate comparison
result_float32 = result_float32.item()
result_float16_converted = result_float16.to(torch.float32).item()
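Printing the two results (our addition) shows the effect: in float32 the small addend survives, while in float16 it is rounded away, since 1e-3 is far below half the spacing (0.25) between representable float16 values near 1000.

```python
print(result_float32)            # expected: ~1000.001
print(result_float16_converted)  # expected: 1000.0 — the 1e-3 was swallowed
```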
__global__ void FixDivergenceKernel(float* input, float* output) {
    unsigned int i = threadIdx.x; // threads start next to each other
    for (unsigned int stride = blockDim.x; stride >= 1; stride /= 2) { // furthest element is blockDim away
        if (threadIdx.x < stride) { // this keeps the active threads contiguous, avoiding divergence
            input[i] += input[i + stride]; // each thread adds a distant element to its assigned position
        }
        __syncthreads(); // wait for all additions at this stride before moving on
    }
    if (threadIdx.x == 0) { // thread 0 holds the final sum
        *output = input[0];
    }
}
// This is the code from the book but I couldn't get this to run faster even with occupancy calculator
// L1 throughput is dramatically increased though
__global__ void SharedMemoryReduction(float* input, float* output) {
    __shared__ float input_s[BLOCK_DIM];
    unsigned int t = threadIdx.x;
    input_s[t] = input[t] + input[t + BLOCK_DIM]; // each thread loads and adds two elements
    for (unsigned int stride = blockDim.x / 2; stride >= 1; stride /= 2) {
        __syncthreads();
        if (threadIdx.x < stride) {
            input_s[t] += input_s[t + stride];
        }
    }
    if (threadIdx.x == 0) { // write the block's result to global memory
        *output = input_s[0];
    }
}
#include <iostream>
#include <cuda_runtime.h>

#define BLOCK_DIM 1024 // assumed block size; must match the launch configuration below

__global__ void SharedMemoryReduction(float* input, float* output, int n) {
    __shared__ float input_s[BLOCK_DIM];
    unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x; // global index
    unsigned int t = threadIdx.x; // index within a block

    // Load elements into shared memory
    if (idx < n) {
        input_s[t] = input[idx];
    } else {
        input_s[t] = 0.0f;
    }
    __syncthreads();

    // Reduction in shared memory
    for (unsigned int stride = blockDim.x / 2; stride > 0; stride >>= 1) {
        if (t < stride) {
            input_s[t] += input_s[t + stride];
        }
        __syncthreads();
    }

    // Reduction across blocks in global memory
    // needs to be atomic to avoid contention
    if (t == 0) {
        atomicAdd(output, input_s[0]);
    }
}
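The stride pattern is easier to see outside CUDA. Here is a small pure-Python simulation of the in-block tree reduction (our illustration, assuming the element count is a power of two):

```python
def block_reduce(vals):
    """Simulate the shared-memory tree reduction within one block."""
    stride = len(vals) // 2  # len(vals) plays the role of blockDim.x
    while stride > 0:
        for t in range(stride):  # "threads" t < stride are active
            vals[t] += vals[t + stride]
        stride //= 2
    return vals[0]  # like input_s[0], the block's partial sum

print(block_reduce([1.0] * 8))  # 8.0
```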
int main() {
    // Size of the input data
    const int size = 100000;
    const int bytes = size * sizeof(float);

    // Allocate memory for input and output on host
    float* h_input = new float[size];
    float* h_output = new float;

    // Initialize input data on host
    for (int i = 0; i < size; i++) {
        h_input[i] = 1.0f; // Example: Initialize all elements to 1
    }

    // Allocate memory for input and output on device
    float* d_input;
    float* d_output;
    cudaMalloc(&d_input, bytes);
    cudaMalloc(&d_output, sizeof(float));

    // Copy data from host to device
    float zero = 0.0f;
    cudaMemcpy(d_output, &zero, sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(d_input, h_input, bytes, cudaMemcpyHostToDevice);

    // Launch the kernel
    int numBlocks = (size + BLOCK_DIM - 1) / BLOCK_DIM;
    SharedMemoryReduction<<<numBlocks, BLOCK_DIM>>>(d_input, d_output, size);

    // Copy result back to host
    cudaMemcpy(h_output, d_output, sizeof(float), cudaMemcpyDeviceToHost);

    // Print the result and release host and device memory
    std::cout << "Sum: " << *h_output << std::endl;
    cudaFree(d_input);
    cudaFree(d_output);
    delete[] h_input;
    delete h_output;
    return 0;
}
LogicalResult matchAndRewrite(triton::ReduceOp op, OpAdaptor adaptor,
                              ConversionPatternRewriter &rewriter) const override {
  ReduceOpHelper helper(op);
  assert(helper.isSupportedLayout() &&
         "Unexpected srcLayout in ReduceOpConversion");
  Location loc = op->getLoc();

  auto srcValues = unpackInputs(loc, op, adaptor, rewriter);
  std::map<SmallVector<unsigned>, SmallVector<Value>> accs;
  std::map<SmallVector<unsigned>, SmallVector<Value>> indices;
  // First reduce all the values along axis within each thread.
  reduceWithinThreads(helper, srcValues, accs, indices, rewriter);

  // Then reduce across threads within a warp.
  reduceWithinWarps(helper, accs, rewriter);

  if (helper.isWarpSynchronous()) {
    // If all the values to be reduced are within the same warp there is
    // nothing left to do.
    packResults(helper, accs, rewriter);
    return success();
  }

  // Compute a shared memory base per operand.
  auto smemShape = helper.getScratchRepShape();
  SmallVector<Value> smemBases =
      getSmemBases(op, product<unsigned>(smemShape), rewriter);

  // Each warp stores its partial result to shared memory, then all
  // threads synchronize before the cross-warp round.
  storeWarpReduceToSharedMemory(helper, accs, indices, smemBases, rewriter);

  sync(rewriter, loc, op);

  // The second round of shuffle reduction
  //   now the problem size: sizeInterWarps, s1, s2, .. , sn
  //   where sizeInterWarps is 2^m
  //
  // Each thread needs to process:
  //   elemsPerThread = sizeInterWarps * s1 * s2 .. Sn / numThreads
  accumulatePartialReductions(helper, smemBases, rewriter);

  // We could avoid this barrier in some of the layouts, however this is not
  // the general case.
  // TODO: optimize the barrier in case the layouts are accepted.
  sync(rewriter, loc, op);

  // set output values
  loadReductionAndPackResult(helper, smemShape, smemBases, rewriter);

  return success();
}
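On the Python side, a call like `tl.sum` lowers to the `triton::ReduceOp` that this conversion pattern handles: first within each thread, then within warps via shuffles, then across warps through shared memory. A minimal row-sum kernel that exercises this path (our illustration; the kernel name and parameters are made up) looks like this:

```python
import torch
import triton
import triton.language as tl

@triton.jit
def row_sum_kernel(x_ptr, out_ptr, N: tl.constexpr, BLOCK: tl.constexpr):
    row = tl.program_id(0)
    offs = tl.arange(0, BLOCK)
    x = tl.load(x_ptr + row * N + offs, mask=offs < N, other=0.0)
    # tl.sum becomes a triton::ReduceOp, lowered by the
    # matchAndRewrite shown above.
    s = tl.sum(x, axis=0)
    tl.store(out_ptr + row, s)

x = torch.randn(16, 1000, device="cuda")
out = torch.empty(16, device="cuda")
row_sum_kernel[(16,)](x, out, N=1000, BLOCK=1024)
print(torch.allclose(out, x.sum(dim=1), atol=1e-3))  # reduction order differs, so allow tolerance
```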