我们暂定一个线程块的线程数为 256(tile_size),因为双调排序中每次需要进行一次比较来判断是否需要交换,因此每一个线程块处理连续的tile_size x 2个数据(以下称为一个数据段),每个线程负责一次比较操作。因此该核函数线程块大小为tile_size,网格大小为总共的数据段数量。
#define tile_size 256u #define log_tile_size 8u
unsigned int b = n + (tile_size <> log_tile_size + 1, s = tile_size > 1>>>(data, n);
首先先将需要的数据载入共享内存中,由于每一个线程块处理tile_size x 2个数据,所以一个线程块负责的数据段起始为“线程块索引 x tile_size x 2”;每一个线程将相隔tile_size的两个数据载入共享内存中,超过数据规模的部分直接填充浮点数的最大值,这样一个线程束访问一段连续的地址以达到更高的访存效率;还需添加一个线程块内同步函数,确保所有数据都已载入共享内存后再开始排序。
__shared__ double buffer[tile_size < unsigned int d = blockIdx.x buffer[i] = i buffer[i + blockDim.x] = i + blockDim.x __syncthreads();
unsigned int s, t; for (unsigned int k = 2; k <= blockDim.x; k <<= 1) for (unsigned int p = k >> 1; p; p >>= 1) { s = (i & -p) < t = s | p;
if (s & k ? buffer[s] buffer[t]) swap(buffer[s], buffer[t]); __syncthreads(); }
注:排序过程中的顺序与需求相关,若为升序排序则全为升序,反之则全为降序
该过程转化为代码的操作与上述合并过程中的内循环完全一致。
for (unsigned int p = blockDim.x; p; p >>= 1) { s = (i & -p) < t = s | p; if (buffer[s] > buffer[t]) swap(buffer[s], buffer[t]); __syncthreads(); }
最后将该数据段的有效排序结果写回全局内存。
if (i data[i] = buffer[i]; if (i + blockDim.x data[i + blockDim.x] = buffer[i + blockDim.x];
最终分块双调排序的代码如下:
#include
// binotic sort __global__ void binoticsort_gpu(double* data, unsigned int n) { __shared__ double buffer[tile_size < unsigned int d = blockIdx.x < data += d; n -= d; buffer[i] = i buffer[i + blockDim.x] = i + blockDim.x __syncthreads();
unsigned int s, t; for (unsigned int k = 2; k <= blockDim.x; k <<= 1) for (unsigned int p = k >> 1; p; p >>= 1) { s = (i & -p) < t = s | p;
if (s & k ? buffer[s] > buffer[t] : buffer[s] swap(buffer[s], buffer[t]); __syncthreads(); } for (unsigned int p = blockDim.x; p; p >>= 1) { s = (i & -p) < t = s | p; if (buffer[s] > buffer[t]) swap(buffer[s], buffer[t]); __syncthreads(); }
if (i data[i] = buffer[i]; if (i + blockDim.x data[i + blockDim.x] = buffer[i + blockDim.x]; }
经过该分块双调排序后,规模为n的数据被划分为多个长为 tile_size x 2的数据段,每个数据段内的 tile_size x 2个数据已经被成功排序,下面只需要逐步将这些有序数据段合并即可。
// auxiliary functions templatetype>__device__ unsigned int lower_bound(const type* data, unsigned int n, const type& value) { unsigned int len = n, half; const type* start = data, * mid; while (len) { half = len >> 1; mid = data + half; if (*mid { data = mid + 1; len = len - half - 1; } else len = half; } return data - start; } templatetype>__device__ unsigned int upper_bound(const type* data, unsigned int n, const type& value) { unsigned int len = n, half; const type* start = data, *mid; while (len) { half = len >> 1; mid = data + half; if (value len = half; else { data = mid + 1; len = len - half - 1; } } return data - start; }
mergedirect_gpu<<> log_tile_size, std::min(b, 32768u), b + 32767 >> 15), tile_size>>>(data, n, s, tmp);
// merge directly __global__ void mergedirect_gpu(const double* data, unsigned int n, unsigned int s, double* result) { unsigned int d1 = (blockIdx.z * gridDim.y + blockIdx.y) * s < if (i result[d2 if (s + i result[i + upper_bound(data + d1, s, data[s + i])] = data[s + i]; }
然后将这两个函数整合成一个排序函数即可,该函数接受两个参数:指向数据首地址的指针和数据规模,返回指向已排序数据首地址的指针,其中的指针均指向设备端。该函数首先需要申请一块与输入数据等长的临时数组用于存放合并的结果,然后先进行一次双调排序将输入数据分块排序为多个长为 tile_size x 2的有序数据段,再来一个循环将这些有序数据段逐轮合并直至数据段长度大于数据规模,最后释放临时数组空间并返回结果。
double* sortdirect_gpu(double* data, unsigned int n) { double* tmp; cudaMalloc(&tmp, sizeof(double) * n); unsigned int b = n + (tile_size <> log_tile_size + 1, s = tile_size < binoticsort_gpu<<> 1>>>(data, n); for (b = b + 1 >> 1; s > 1) { mergedirect_gpu<<> log_tile_size, std::min(b, 32768u), b + 32767 >> 15), tile_size>>>(data, n, s, tmp); std::swap(data, tmp); } cudaDeviceSynchronize(); cudaFree(tmp);
int main(int argc, char** argv) { cudaSetDevice(0); cudaFree(0);
unsigned int test[] = { 1025u, 65537u, 1048577u, 16777217u, 134217729u, 1073741825u }; const unsigned int runtime = 3;
std::chrono::high_resolution_clock::time_point start, end; unsigned long long elasped[3] = { 0ull };
for (unsigned int t = 0; t test) / sizeof(unsigned int); ++t) { unsigned int n = test[t]; double* original = new double[n]; double* data = new double[n];
elasped[0] = elasped[1] = elasped[2] = 0ull; srand(time(nullptr)); for (int i = 0; i { for (unsigned int i = 0; i original[i] = data[i] = (rand() * 2.0 / RAND_MAX - 1.0) * rand();