实现main()函数
接下来,我们要看看我们的main函数,这里应该没有什么新东西:
/*
 * Host entry point: fills a BLOCKS-element array on the GPU and prints it.
 *
 * Allocates a device buffer, launches generateArray with BLOCKS blocks of one
 * thread each, copies the result back to the host and prints one line per
 * element.
 *
 * Returns 0 on success, or 1 if a CUDA call or the kernel launch fails.
 */
int main( void )
{
    int hostArray[BLOCKS];
    int *deviceArray = 0;
    const size_t arrayBytes = BLOCKS * sizeof(int);

    cudaError_t status = cudaMalloc( (void**)&deviceArray, arrayBytes );

    // No host-to-device copy is needed: hostArray is still uninitialized at
    // this point and generateArray overwrites every element of the buffer.
    if (status == cudaSuccess)
    {
        generateArray<<<BLOCKS,1>>>( deviceArray );
        // A kernel launch returns no status itself; query it explicitly.
        status = cudaGetLastError();
    }

    if (status == cudaSuccess)
    {
        // Blocking copy: it completes only after the kernel has finished, so
        // errors raised while the kernel ran are reported here as well.
        status = cudaMemcpy( hostArray,
                             deviceArray,
                             arrayBytes,
                             cudaMemcpyDeviceToHost );
    }

    // Release the buffer on every path; freeing a null pointer is a no-op.
    cudaFree( deviceArray );

    if (status != cudaSuccess)
    {
        fprintf( stderr, "CUDA error: %s\n", cudaGetErrorString( status ) );
        return 1;
    }

    for (int i = 0; i < BLOCKS; i++)
    {
        // Plain ASCII quotes: the typographic quotes used before do not compile.
        printf( "Thread ID running: %d\n", hostArray[i] );
    }
    return 0;
}
{
    // Body of main(): fills a BLOCKS-element array on the GPU, copies it back
    // and prints it. Returns 0 on success, 1 if any CUDA operation fails.
    int hostArray[BLOCKS];
    int *deviceArray = 0;
    const size_t arrayBytes = BLOCKS * sizeof(int);

    cudaError_t status = cudaMalloc( (void**)&deviceArray, arrayBytes );

    // No host-to-device copy is needed: hostArray is still uninitialized at
    // this point and generateArray overwrites every element of the buffer.
    if (status == cudaSuccess)
    {
        generateArray<<<BLOCKS,1>>>( deviceArray );
        // A kernel launch returns no status itself; query it explicitly.
        status = cudaGetLastError();
    }

    if (status == cudaSuccess)
    {
        // Blocking copy: it completes only after the kernel has finished, so
        // errors raised while the kernel ran are reported here as well.
        status = cudaMemcpy( hostArray,
                             deviceArray,
                             arrayBytes,
                             cudaMemcpyDeviceToHost );
    }

    // Release the buffer on every path; freeing a null pointer is a no-op.
    cudaFree( deviceArray );

    if (status != cudaSuccess)
    {
        fprintf( stderr, "CUDA error: %s\n", cudaGetErrorString( status ) );
        return 1;
    }

    for (int i = 0; i < BLOCKS; i++)
    {
        // Plain ASCII quotes: the typographic quotes used before do not compile.
        printf( "Thread ID running: %d\n", hostArray[i] );
    }
    return 0;
}
首先,我们按BLOCKS大小创建一个数组,在设备上为数组分配空间,并调用:
generateArray<<<BLOCKS,1>>>( deviceArray );
这个内核函数将以BLOCKS个并行线程块(每个线程块只有1个线程)的方式运行,一次调用即可填充整个数组。
这个操作完成后,我们将结果从设备拷贝到主机,并将它打印在屏幕上,释放数组,最后退出。
整个应用程序的源代码如下:
#include <stdio.h>
// Number of array elements; also the number of blocks used for the launch.
#define BLOCKS 25

/*
 * Kernel: each block stores its own block index into the output array.
 *
 * Launch layout: 1D grid with one block per element; this file launches it
 * as <<<BLOCKS,1>>>. Only blockIdx.x is used, so every thread of a block
 * writes the same value to the same slot. No shared memory is used.
 *
 * hostArray - despite its name, this must point to DEVICE memory with room
 *             for at least BLOCKS ints (the kernel dereferences it on the GPU).
 */
__global__ void generateArray( int *hostArray )
{
    int ThreadIndex = blockIdx.x;

    // Guard against launches with more than BLOCKS blocks, which would
    // otherwise write past the end of the buffer.
    if (ThreadIndex < BLOCKS)
    {
        hostArray[ThreadIndex] = ThreadIndex;
    }
}
/*
 * Host entry point: fills a BLOCKS-element array on the GPU and prints it.
 *
 * Allocates a device buffer, launches generateArray with BLOCKS blocks of one
 * thread each, copies the result back to the host and prints one line per
 * element.
 *
 * Returns 0 on success, or 1 if a CUDA call or the kernel launch fails.
 */
int main( void )
{
    int hostArray[BLOCKS];
    int *deviceArray = 0;
    const size_t arrayBytes = BLOCKS * sizeof(int);

    cudaError_t status = cudaMalloc( (void**)&deviceArray, arrayBytes );

    // No host-to-device copy is needed: hostArray is still uninitialized at
    // this point and generateArray overwrites every element of the buffer.
    if (status == cudaSuccess)
    {
        generateArray<<<BLOCKS,1>>>( deviceArray );
        // A kernel launch returns no status itself; query it explicitly.
        status = cudaGetLastError();
    }

    if (status == cudaSuccess)
    {
        // Blocking copy: it completes only after the kernel has finished, so
        // errors raised while the kernel ran are reported here as well.
        status = cudaMemcpy( hostArray,
                             deviceArray,
                             arrayBytes,
                             cudaMemcpyDeviceToHost );
    }

    // Release the buffer on every path; freeing a null pointer is a no-op.
    cudaFree( deviceArray );

    if (status != cudaSuccess)
    {
        fprintf( stderr, "CUDA error: %s\n", cudaGetErrorString( status ) );
        return 1;
    }

    for (int i = 0; i < BLOCKS; i++)
    {
        // Plain ASCII quotes: the typographic quotes used before do not compile.
        printf( "Thread ID running: %d\n", hostArray[i] );
    }
    return 0;
}
// Number of array elements; also the number of blocks used for the launch.
#define BLOCKS 25

/*
 * Kernel: each block stores its own block index into the output array.
 *
 * Launch layout: 1D grid with one block per element; this file launches it
 * as <<<BLOCKS,1>>>. Only blockIdx.x is used, so every thread of a block
 * writes the same value to the same slot. No shared memory is used.
 *
 * hostArray - despite its name, this must point to DEVICE memory with room
 *             for at least BLOCKS ints (the kernel dereferences it on the GPU).
 */
__global__ void generateArray( int *hostArray )
{
    int ThreadIndex = blockIdx.x;

    // Guard against launches with more than BLOCKS blocks, which would
    // otherwise write past the end of the buffer.
    if (ThreadIndex < BLOCKS)
    {
        hostArray[ThreadIndex] = ThreadIndex;
    }
}
/*
 * Host entry point: fills a BLOCKS-element array on the GPU and prints it.
 *
 * Allocates a device buffer, launches generateArray with BLOCKS blocks of one
 * thread each, copies the result back to the host and prints one line per
 * element.
 *
 * Returns 0 on success, or 1 if a CUDA call or the kernel launch fails.
 */
int main( void )
{
    int hostArray[BLOCKS];
    int *deviceArray = 0;
    const size_t arrayBytes = BLOCKS * sizeof(int);

    cudaError_t status = cudaMalloc( (void**)&deviceArray, arrayBytes );

    // No host-to-device copy is needed: hostArray is still uninitialized at
    // this point and generateArray overwrites every element of the buffer.
    if (status == cudaSuccess)
    {
        generateArray<<<BLOCKS,1>>>( deviceArray );
        // A kernel launch returns no status itself; query it explicitly.
        status = cudaGetLastError();
    }

    if (status == cudaSuccess)
    {
        // Blocking copy: it completes only after the kernel has finished, so
        // errors raised while the kernel ran are reported here as well.
        status = cudaMemcpy( hostArray,
                             deviceArray,
                             arrayBytes,
                             cudaMemcpyDeviceToHost );
    }

    // Release the buffer on every path; freeing a null pointer is a no-op.
    cudaFree( deviceArray );

    if (status != cudaSuccess)
    {
        fprintf( stderr, "CUDA error: %s\n", cudaGetErrorString( status ) );
        return 1;
    }

    for (int i = 0; i < BLOCKS; i++)
    {
        // Plain ASCII quotes: the typographic quotes used before do not compile.
        printf( "Thread ID running: %d\n", hostArray[i] );
    }
    return 0;
}
现在编译并运行这段代码,你将会看到像下面这样的输出:
▲图 2 程序运行输出结果
恭喜,你已经使用CUDA成功创建了你的第一个并行应用程序!