正如你所看到的，在很多不同的情景下使用多维索引会更好。另一个例子是处理二维图像：你可以在相同的坐标系统中为每个像素创建一个块，如下图所示。
最后，完整的代码如下：
#include <stdio.h>
/* Side length used by main(): the kernel is launched on a BLOCKS x BLOCKS
   grid of blocks and the result array holds BLOCKS x BLOCKS ints. */
#define BLOCKS 10
/*
 * Writes each block's row-major linear index into the output array.
 *
 * Launch layout: a 2D grid of blocks; only blockIdx is read, so one thread
 * per block is sufficient (extra threads in a block would all store the same
 * value to the same element).
 *
 * hostArray: despite its name this must point to DEVICE memory, with room for
 *            at least gridDim.x * gridDim.y ints.
 *
 * The row stride is taken from gridDim.x rather than a compile-time constant,
 * so the indices stay unique and contiguous for any grid width.
 */
__global__ void generateArray( int *hostArray )
{
    const int blockLinearIndex = blockIdx.x + blockIdx.y * gridDim.x;
    hostArray[blockLinearIndex] = blockLinearIndex;
}
/*
 * Launches generateArray on a BLOCKS x BLOCKS grid with one thread per block,
 * copies the result back to the host and prints it.
 *
 * Each output line lists hostArray[0][i] .. hostArray[BLOCKS-1][i] for one i,
 * i.e. one column of the host array per line.
 *
 * Returns 0 on success, or 1 if any CUDA call fails (the CUDA error string is
 * written to stderr).
 */
int main( void )
{
    int hostArray[BLOCKS][BLOCKS];
    int *deviceArray = NULL;
    const size_t arrayBytes = sizeof(hostArray);
    dim3 multiBlockArray(BLOCKS, BLOCKS);

    cudaError_t status = cudaMalloc( (void**)&deviceArray, arrayBytes );

    if (status == cudaSuccess)
    {
        /* No host-to-device copy is needed: hostArray is still uninitialized
           here and the kernel overwrites every element (one per block). */
        generateArray<<<multiBlockArray,1>>>( deviceArray );
        /* A launch does not return an error code; query it explicitly. */
        status = cudaGetLastError();
    }

    if (status == cudaSuccess)
    {
        /* Blocking copy: it completes only after the kernel has finished. */
        status = cudaMemcpy( hostArray, deviceArray, arrayBytes,
                             cudaMemcpyDeviceToHost );
    }

    /* Always release the buffer; freeing a null pointer is a no-op.
       Keep the first error if one was already recorded. */
    cudaError_t freeStatus = cudaFree( deviceArray );
    if (status == cudaSuccess)
    {
        status = freeStatus;
    }

    if (status != cudaSuccess)
    {
        fprintf( stderr, "CUDA error: %s\n", cudaGetErrorString(status) );
        return 1;
    }

    for (int i = 0; i < BLOCKS; i++)
    {
        printf( "Thread ID running: %d", hostArray[0][i] );
        for (int j = 1; j < BLOCKS; j++)
        {
            printf( " %d", hostArray[j][i] );
        }
        printf( "\n" );
    }

    return 0;
}
/* Side length used by main(): the kernel is launched on a BLOCKS x BLOCKS
   grid of blocks and the result array holds BLOCKS x BLOCKS ints. */
#define BLOCKS 10
/*
 * Stores, for every block of a 2D grid, that block's row-major linear index
 * at the matching position of the output array.
 *
 * Launch layout: 2D grid of blocks; the kernel reads only blockIdx, so a
 * single thread per block is enough (additional threads in a block would
 * redundantly store the same value).
 *
 * hostArray: must be a DEVICE pointer (the name is historical) to at least
 *            gridDim.x * gridDim.y ints.
 *
 * gridDim.x is used as the row stride instead of a fixed constant so the
 * computed index is correct for whatever grid width the launch uses.
 */
__global__ void generateArray( int *hostArray )
{
    const int rowStride = gridDim.x;
    const int linearIndex = blockIdx.y * rowStride + blockIdx.x;
    hostArray[linearIndex] = linearIndex;
}
/*
 * Runs generateArray on a BLOCKS x BLOCKS grid (one thread per block), copies
 * the result to the host and prints it.
 *
 * Output: BLOCKS lines; line i lists hostArray[0][i] .. hostArray[BLOCKS-1][i].
 *
 * Returns 0 on success, or 1 if a CUDA call fails (the CUDA error string is
 * printed to stderr).
 */
int main( void )
{
    int hostArray[BLOCKS][BLOCKS];
    int *deviceArray = NULL;
    const size_t arrayBytes = sizeof(hostArray);
    dim3 multiBlockArray(BLOCKS, BLOCKS);

    cudaError_t status = cudaMalloc( (void**)&deviceArray, arrayBytes );

    if (status == cudaSuccess)
    {
        /* The device buffer is not pre-filled from hostArray: the host array
           is uninitialized at this point and the kernel writes every element. */
        generateArray<<<multiBlockArray,1>>>( deviceArray );
        /* Launch errors are only visible through cudaGetLastError(). */
        status = cudaGetLastError();
    }

    if (status == cudaSuccess)
    {
        /* Blocking copy: returns only once the kernel has completed. */
        status = cudaMemcpy( hostArray, deviceArray, arrayBytes,
                             cudaMemcpyDeviceToHost );
    }

    /* Release the buffer on every path; a null pointer is accepted.
       The first recorded error takes precedence over a free failure. */
    cudaError_t freeStatus = cudaFree( deviceArray );
    if (status == cudaSuccess)
    {
        status = freeStatus;
    }

    if (status != cudaSuccess)
    {
        fprintf( stderr, "CUDA error: %s\n", cudaGetErrorString(status) );
        return 1;
    }

    for (int i = 0; i < BLOCKS; i++)
    {
        printf( "Thread ID running: %d", hostArray[0][i] );
        for (int j = 1; j < BLOCKS; j++)
        {
            printf( " %d", hostArray[j][i] );
        }
        printf( "\n" );
    }

    return 0;
}