🤫Grid-Stride Loops
翻译自:https://developer.nvidia.com/blog/cuda-pro-tip-write-flexible-kernels-grid-stride-loops/
void saxpy(int n, float a, float *x, float *y)
{
for (int i = 0; i < n; ++i)
y[i] = a * x[i] + y[i];
}__global__
void saxpy(int n, float a, float *x, float *y)
{
int i = blockIdx.x * blockDim.x + threadIdx.x;
if (i < n)
y[i] = a * x[i] + y[i];
}// 处理 100 万个元素的 SAXPY
saxpy<<<4096,256>>>(1<<20, 2.0, x, y);__global__
void saxpy(int n, float a, float *x, float *y)
{
for (int i = blockIdx.x * blockDim.x + threadIdx.x;
i < n;
i += blockDim.x * gridDim.x)
{
y[i] = a * x[i] + y[i];
}
}示例代码
Last updated