CUDA是NVIDIA公司推出的基于GPU的通用计算工具。GPU并行计算是利用GPU中的简化计算核心SP(流处理器)来进行计算,由于单个GPU中集成了大量SP,CUDA通过利用这些SP协同计算,实现较高的加速比。由于CUDA会自行将计算线程映射到SP,而且线程越多CUDA的并行效率越高。所以基于GPU的Jacobi并行采取了完全并行,即同一迭代步内,对所有网格的计算都事实并行。另外,对于迭代的结束条件也进行了并行优化。在串行算法中,迭代结束条件即是对前后两次迭代结果的差值求最大值。在GPU并行时,采用了并行的求最值算法,使得收敛判断(即迭代结束条件)也得到了很好的并行。
GPU(CUDA)代码如下:
//迭代计算CUDA代码
__global__ void kernelJacobiIteration(const int n, double *m, double *w)
{
int id = blockIdx.x * blockDim.x +threadIdx.x;
if (id < (n - 2) * (n - 2))
{
int row = id / (n - 2);
int column = id - row * (n - 2);
int location = (row + 1) * n + (column + 1);
w[location] = (m[location - 1] + m[location - n]
+ m[location + 1] + m[location + n]) / 4.0;
}
}
//收敛判断CUDA代码
//kernel of getting epsilon between d_m and d_w
__global__ void kernelGetEpsilon(const int n,
double *m, double *w,
double *ep)
{
int id = blockIdx.x * blockDim.x +threadIdx.x;
if (id < (n - 2) * (n - 2))
{
int row = id / (n - 2);
int column = id - row * (n - 2);
int location = (row + 1) * n + (column + 1);
ep[id] = fabs(m[location] - w[location]);
}
}
//kernel of max of a num group
__global__ void kernelGetMax(const int count, double *ep)
{
int id = blockIdx.x * blockDim.x + threadIdx.x;
if (id < count)
if (ep[id] < ep[id + count]) ep[id] = ep[id + count];
}
__global__ void kernelJacobiIteration(const int n, double *m, double *w)
{
int id = blockIdx.x * blockDim.x +threadIdx.x;
if (id < (n - 2) * (n - 2))
{
int row = id / (n - 2);
int column = id - row * (n - 2);
int location = (row + 1) * n + (column + 1);
w[location] = (m[location - 1] + m[location - n]
+ m[location + 1] + m[location + n]) / 4.0;
}
}
//收敛判断CUDA代码
//kernel of getting epsilon between d_m and d_w
__global__ void kernelGetEpsilon(const int n,
double *m, double *w,
double *ep)
{
int id = blockIdx.x * blockDim.x +threadIdx.x;
if (id < (n - 2) * (n - 2))
{
int row = id / (n - 2);
int column = id - row * (n - 2);
int location = (row + 1) * n + (column + 1);
ep[id] = fabs(m[location] - w[location]);
}
}
//kernel of max of a num group
__global__ void kernelGetMax(const int count, double *ep)
{
int id = blockIdx.x * blockDim.x + threadIdx.x;
if (id < count)
if (ep[id] < ep[id + count]) ep[id] = ep[id + count];
}