This work parallelizes the computation with two approaches, MPI and CUDA, whose parallelization strategies differ slightly. MPI is a parallel computing method suited to compute clusters, and the strategy used in the MPI implementation is as described above. To improve the parallel efficiency of the MPI version and reduce the time lost to data transfer, the implementation uses non-blocking (asynchronous) communication functions. The main iteration loop of the MPI implementation is shown below.
while (totalCount < countGoal)
{
    (*step)++;
    //Synchronize boundary data
    if (nodeIndex % 2 == 0)
    {
        //Next
        if (nodeIndex != nodeCount - 1)
        {
            MPI_Isend(m + jobEnd * n - n, n, MPI_DOUBLE, nodeIndex + 1,
                0, MPI_COMM_WORLD, &requestSendNext);
            MPI_Irecv(m + jobEnd * n, n, MPI_DOUBLE, nodeIndex + 1,
                1, MPI_COMM_WORLD, &requestReceiveNext);
        }
        //Previous
        if (nodeIndex != 0)
        {
            MPI_Isend(m + jobStartingPoint * n, n, MPI_DOUBLE,
                nodeIndex - 1, 1, MPI_COMM_WORLD, &requestSendPrevious);
            MPI_Irecv(m + jobStartingPoint * n - n, n, MPI_DOUBLE,
                nodeIndex - 1, 0, MPI_COMM_WORLD,
                &requestReceivePrevious);
        }
    }
    else
    {
        //Previous
        MPI_Irecv(m + jobStartingPoint * n - n, n, MPI_DOUBLE,
            nodeIndex - 1, 0, MPI_COMM_WORLD,
            &requestReceivePrevious);
        MPI_Isend(m + jobStartingPoint * n, n, MPI_DOUBLE,
            nodeIndex - 1, 1, MPI_COMM_WORLD, &requestSendPrevious);
        //Next
        if (nodeIndex != nodeCount - 1)
        {
            MPI_Irecv(m + jobEnd * n, n, MPI_DOUBLE,
                nodeIndex + 1, 1, MPI_COMM_WORLD, &requestReceiveNext);
            MPI_Isend(m + jobEnd * n - n, n, MPI_DOUBLE,
                nodeIndex + 1, 0, MPI_COMM_WORLD, &requestSendNext);
        }
    }
    //Compute inner data (does not depend on the halo rows, so it
    //overlaps with the outstanding communication)
    localCount = 0;
    for (int i = jobStartingPoint + 1; i < jobEnd - 1; i++)
        rowDataIteration(n, *step, epsilon, i, m, w, &localCount);
    //Compute boundary data
    int rowIndex;
    if (nodeIndex == 0)
    {
        rowIndex = jobStartingPoint;
        rowDataIteration(n, *step, epsilon, rowIndex, m, w, &localCount);
        MPI_Wait(&requestReceiveNext, &statusNext);
        rowIndex = jobEnd - 1;
        rowDataIteration(n, *step, epsilon, rowIndex, m, w, &localCount);
        MPI_Wait(&requestSendNext, &statusNext);
    }
    else if (nodeIndex == nodeCount - 1)
    {
        rowIndex = jobEnd - 1;
        rowDataIteration(n, *step, epsilon, rowIndex, m, w, &localCount);
        MPI_Wait(&requestReceivePrevious, &statusPrevious);
        rowIndex = jobStartingPoint;
        rowDataIteration(n, *step, epsilon, rowIndex, m, w, &localCount);
        MPI_Wait(&requestSendPrevious, &statusPrevious);
    }
    else
    {
        //MPI_Waitany() could also be used here to improve performance
        MPI_Wait(&requestReceivePrevious, &statusPrevious);
        rowIndex = jobStartingPoint;
        rowDataIteration(n, *step, epsilon, rowIndex, m, w, &localCount);
        MPI_Wait(&requestReceiveNext, &statusNext);
        rowIndex = jobEnd - 1;
        rowDataIteration(n, *step, epsilon, rowIndex, m, w, &localCount);
        MPI_Wait(&requestSendPrevious, &statusPrevious);
        MPI_Wait(&requestSendNext, &statusNext);
    }
    //Sum the per-node counts so every node sees the global total
    totalCount = 0;
    MPI_Allreduce(&localCount, &totalCount, 1,
        MPI_INT, MPI_SUM, MPI_COMM_WORLD);
    //Swap the roles of the old and new grids for the next iteration
    temp = m; m = w; w = temp;
}
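The loop above calls rowDataIteration but does not define it. The following is a minimal sketch of what it might look like, not the original implementation: it assumes a five-point Jacobi update that reads row rowIndex from the old grid m, writes the new values to w, and increments *count for every point whose change has already fallen below epsilon (so the loop terminates once totalCount reaches countGoal). The signature is inferred from the call sites; the fixed boundary columns and the unused step parameter are assumptions.

#include <math.h>

/* Hypothetical sketch of rowDataIteration; assumes a five-point Jacobi
   update and that *count accumulates the number of converged points. */
void rowDataIteration(int n, int step, double epsilon, int rowIndex,
                      double *m, double *w, int *count)
{
    for (int j = 1; j < n - 1; j++)
    {
        int idx = rowIndex * n + j;
        //Average of the four neighbours in the old grid
        w[idx] = 0.25 * (m[idx - n] + m[idx + n] + m[idx - 1] + m[idx + 1]);
        //Count the point as converged once its update is below epsilon
        if (fabs(w[idx] - m[idx]) < epsilon)
            (*count)++;
    }
    //Fixed boundary columns are carried over unchanged
    w[rowIndex * n] = m[rowIndex * n];
    w[rowIndex * n + n - 1] = m[rowIndex * n + n - 1];
    (void)step; //unused here; may serve diagnostics in the original
}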
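The comment in the interior-node branch suggests MPI_Waitany() as a further optimization: instead of waiting for the previous and next halo rows in a fixed order, the node can process whichever arrives first. A sketch of that variant is shown below, reusing the identifiers from the listing; it assumes an interior node, so both neighbor exchanges are outstanding.

//Sketch of the MPI_Waitany() variant for interior nodes: process
//whichever halo row arrives first rather than waiting in a fixed order.
MPI_Request receives[2] = { requestReceivePrevious, requestReceiveNext };
int boundaryRows[2] = { jobStartingPoint, jobEnd - 1 };
for (int k = 0; k < 2; k++)
{
    int which;
    MPI_Status status;
    //Blocks until either outstanding receive completes; the completed
    //request is set to MPI_REQUEST_NULL, so two passes handle both
    MPI_Waitany(2, receives, &which, &status);
    rowDataIteration(n, *step, epsilon, boundaryRows[which], m, w, &localCount);
}
MPI_Wait(&requestSendPrevious, &statusPrevious);
MPI_Wait(&requestSendNext, &statusNext);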