下面开始做实验,我把DATA_SIZE改成了32768,程序如下:
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>
#define DATA_SIZE 32768
#define THREAD_NUM 1
// Enumerate CUDA devices, print the properties of the first device with
// compute capability >= 1.x, select it with cudaSetDevice, and return true.
// Returns false (with a message on stderr) when no usable device exists.
bool InitCUDA()
{
    int count = 0;
    // Check the API result as well: on a machine with no driver the call
    // fails and `count` would otherwise be left uninitialized.
    if (cudaGetDeviceCount(&count) != cudaSuccess || count == 0)
    {
        fprintf(stderr, "There is no device.\n");
        return false;
    }
    int i;
    for (i = 0; i < count; i++)
    {
        cudaDeviceProp prop;
        if (cudaGetDeviceProperties(&prop, i) == cudaSuccess)
        {
            if (prop.major >= 1)
            {
                // Enumerate detailed device information.
                // size_t fields are cast to unsigned long: printing them with
                // %d is undefined behavior on 64-bit platforms.
                printf("Identify: %s\n", prop.name);
                printf("Host Memory: %d\n", prop.canMapHostMemory);
                printf("Clock Rate: %d khz\n", prop.clockRate);
                printf("Compute Mode: %d\n", prop.computeMode);
                printf("Device Overlap: %d\n", prop.deviceOverlap);
                printf("Integrated: %d\n", prop.integrated);
                printf("Kernel Exec Timeout Enabled: %d\n", prop.kernelExecTimeoutEnabled);
                printf("Max Grid Size: %d * %d * %d\n", prop.maxGridSize[0], prop.maxGridSize[1], prop.maxGridSize[2]);
                printf("Max Threads Dim: %d * %d * %d\n", prop.maxThreadsDim[0], prop.maxThreadsDim[1], prop.maxThreadsDim[2]);
                printf("Max Threads per Block: %d\n", prop.maxThreadsPerBlock);
                printf("Maximum Pitch: %lu bytes\n", (unsigned long)prop.memPitch);
                printf("Minor Compute Capability: %d\n", prop.minor);
                printf("Number of Multiprocessors: %d\n", prop.multiProcessorCount);
                printf("32bit Registers Availble per Block: %d\n", prop.regsPerBlock);
                printf("Shared Memory Available per Block: %lu bytes\n", (unsigned long)prop.sharedMemPerBlock);
                printf("Alignment Requirement for Textures: %lu\n", (unsigned long)prop.textureAlignment);
                printf("Constant Memory Available: %lu bytes\n", (unsigned long)prop.totalConstMem);
                printf("Global Memory Available: %lu bytes\n", (unsigned long)prop.totalGlobalMem);
                printf("Warp Size: %d threads\n", prop.warpSize);
                printf("===================================\n");
                break; // use the first suitable device
            }
        }
    }
    if (i == count)
    {
        fprintf(stderr, "There is no device supporting CUDA.\n");
        return false;
    }
    cudaSetDevice(i);
    return true;
}
// Fill `numbers` with `size` pseudo-random integers in the range [0, 9].
void GenerateNums(int *numbers, int size)
{
    int idx = 0;
    while (idx < size)
    {
        numbers[idx] = rand() % 10; // single random digit 0-9
        ++idx;
    }
}
// Kernel: each thread sums the squares of a contiguous chunk of `num`
// (DATA_SIZE/THREAD_NUM elements) and writes its partial sum to result[tid].
// Thread 0 additionally records the elapsed device clock cycles into *time.
// Expected launch: <<<1, THREAD_NUM>>> (tid is a plain threadIdx.x).
// NOTE(review): each thread reads a private contiguous chunk, so adjacent
// threads in a warp access addresses DATA_SIZE/THREAD_NUM apart — this is
// NOT coalesced; the interleaved version later in the article fixes it.
// Assumes THREAD_NUM evenly divides DATA_SIZE (true for 32768/512);
// otherwise the tail elements would be skipped.
__global__ static void SumSquares(int *num, int *result, clock_t *time) // sum-of-squares worker kernel
{
int sum=0;
int i;
clock_t start;
const int tid = threadIdx.x;
const int size = DATA_SIZE/THREAD_NUM; // elements handled by each thread
if(tid==0) start = clock(); // device-side cycle counter, sampled by thread 0 only
for(i=tid*size;i<(tid+1)*size;i++)
{
sum += num[i]*num[i];
}
result[tid] = sum; // one partial sum per thread; host adds them up
if(tid==0) *time=clock()-start;
}
// Generate DATA_SIZE random digits, compute the sum of their squares on the
// GPU (one block of THREAD_NUM threads), print the GPU result together with
// the device cycle count and host wall time, then print a CPU reference sum.
static void TestGPU()
{
    // static: DATA_SIZE ints are 128 KB — too large to place on the stack.
    static int data[DATA_SIZE];
    GenerateNums(data, DATA_SIZE);

    int *gpudata = NULL, *result = NULL;
    clock_t *time = NULL;
    cudaMalloc((void**)&gpudata, sizeof(int) * DATA_SIZE);
    cudaMalloc((void**)&result, sizeof(int) * THREAD_NUM);
    cudaMalloc((void**)&time, sizeof(clock_t));
    cudaMemcpy(gpudata, data, sizeof(int) * DATA_SIZE, cudaMemcpyHostToDevice);

    clock_t gpustart = clock();
    SumSquares<<<1, THREAD_NUM, 0>>>(gpudata, result, time);

    int sum[THREAD_NUM];
    clock_t time_used;
    // The blocking cudaMemcpy below also synchronizes with the kernel launch.
    cudaMemcpy(sum, result, sizeof(int) * THREAD_NUM, cudaMemcpyDeviceToHost);
    cudaMemcpy(&time_used, time, sizeof(clock_t), cudaMemcpyDeviceToHost);
    cudaFree(gpudata);
    cudaFree(result);
    cudaFree(time);

    // Reduce the per-thread partial sums on the host.
    int final_sum = 0;
    for (int i = 0; i < THREAD_NUM; i++)
    {
        final_sum += sum[i];
    }
    clock_t gputime = clock() - gpustart;
    // clock_t width is platform-dependent; cast to long instead of using %d.
    printf("GPU sum: %d, cycle used %ld, time used: %ld\n",
           final_sum, (long)time_used, (long)gputime);

    // CPU reference computation for correctness comparison.
    int sum1 = 0;
    for (int i = 0; i < DATA_SIZE; i++)
    {
        sum1 += data[i] * data[i];
    }
    printf("CPU sum: %d\n", sum1);
}
// Program entry point: initialize CUDA, run the GPU/CPU comparison, then
// wait for a key press so the console window stays open.
// `int main` replaces the non-standard `void main`.
int main()
{
    if (!InitCUDA())
    {
        getchar();
        return 1; // signal failure to the OS
    }
    printf("CUDA initialized.\n");
    TestGPU();
    getchar();
    return 0;
}
#include <stdlib.h>
#include <cuda_runtime.h>
#define DATA_SIZE 32768
#define THREAD_NUM 1
// Enumerate CUDA devices, print the properties of the first device with
// compute capability >= 1.x, select it with cudaSetDevice, and return true.
// Returns false (with a message on stderr) when no usable device exists.
bool InitCUDA()
{
    int count = 0;
    // Check the API result as well: on a machine with no driver the call
    // fails and `count` would otherwise be left uninitialized.
    if (cudaGetDeviceCount(&count) != cudaSuccess || count == 0)
    {
        fprintf(stderr, "There is no device.\n");
        return false;
    }
    int i;
    for (i = 0; i < count; i++)
    {
        cudaDeviceProp prop;
        if (cudaGetDeviceProperties(&prop, i) == cudaSuccess)
        {
            if (prop.major >= 1)
            {
                // Enumerate detailed device information.
                // size_t fields are cast to unsigned long: printing them with
                // %d is undefined behavior on 64-bit platforms.
                printf("Identify: %s\n", prop.name);
                printf("Host Memory: %d\n", prop.canMapHostMemory);
                printf("Clock Rate: %d khz\n", prop.clockRate);
                printf("Compute Mode: %d\n", prop.computeMode);
                printf("Device Overlap: %d\n", prop.deviceOverlap);
                printf("Integrated: %d\n", prop.integrated);
                printf("Kernel Exec Timeout Enabled: %d\n", prop.kernelExecTimeoutEnabled);
                printf("Max Grid Size: %d * %d * %d\n", prop.maxGridSize[0], prop.maxGridSize[1], prop.maxGridSize[2]);
                printf("Max Threads Dim: %d * %d * %d\n", prop.maxThreadsDim[0], prop.maxThreadsDim[1], prop.maxThreadsDim[2]);
                printf("Max Threads per Block: %d\n", prop.maxThreadsPerBlock);
                printf("Maximum Pitch: %lu bytes\n", (unsigned long)prop.memPitch);
                printf("Minor Compute Capability: %d\n", prop.minor);
                printf("Number of Multiprocessors: %d\n", prop.multiProcessorCount);
                printf("32bit Registers Availble per Block: %d\n", prop.regsPerBlock);
                printf("Shared Memory Available per Block: %lu bytes\n", (unsigned long)prop.sharedMemPerBlock);
                printf("Alignment Requirement for Textures: %lu\n", (unsigned long)prop.textureAlignment);
                printf("Constant Memory Available: %lu bytes\n", (unsigned long)prop.totalConstMem);
                printf("Global Memory Available: %lu bytes\n", (unsigned long)prop.totalGlobalMem);
                printf("Warp Size: %d threads\n", prop.warpSize);
                printf("===================================\n");
                break; // use the first suitable device
            }
        }
    }
    if (i == count)
    {
        fprintf(stderr, "There is no device supporting CUDA.\n");
        return false;
    }
    cudaSetDevice(i);
    return true;
}
// Fill `numbers` with `size` pseudo-random integers in the range [0, 9].
void GenerateNums(int *numbers, int size)
{
    int idx = 0;
    while (idx < size)
    {
        numbers[idx] = rand() % 10; // single random digit 0-9
        ++idx;
    }
}
// Kernel: each thread sums the squares of a contiguous chunk of `num`
// (DATA_SIZE/THREAD_NUM elements) and writes its partial sum to result[tid].
// Thread 0 additionally records the elapsed device clock cycles into *time.
// Expected launch: <<<1, THREAD_NUM>>> (tid is a plain threadIdx.x).
// NOTE(review): each thread reads a private contiguous chunk, so adjacent
// threads in a warp access addresses DATA_SIZE/THREAD_NUM apart — this is
// NOT coalesced; the interleaved version later in the article fixes it.
// Assumes THREAD_NUM evenly divides DATA_SIZE (true for 32768/512);
// otherwise the tail elements would be skipped.
__global__ static void SumSquares(int *num, int *result, clock_t *time) // sum-of-squares worker kernel
{
int sum=0;
int i;
clock_t start;
const int tid = threadIdx.x;
const int size = DATA_SIZE/THREAD_NUM; // elements handled by each thread
if(tid==0) start = clock(); // device-side cycle counter, sampled by thread 0 only
for(i=tid*size;i<(tid+1)*size;i++)
{
sum += num[i]*num[i];
}
result[tid] = sum; // one partial sum per thread; host adds them up
if(tid==0) *time=clock()-start;
}
// Generate DATA_SIZE random digits, compute the sum of their squares on the
// GPU (one block of THREAD_NUM threads), print the GPU result together with
// the device cycle count and host wall time, then print a CPU reference sum.
static void TestGPU()
{
    // static: DATA_SIZE ints are 128 KB — too large to place on the stack.
    static int data[DATA_SIZE];
    GenerateNums(data, DATA_SIZE);

    int *gpudata = NULL, *result = NULL;
    clock_t *time = NULL;
    cudaMalloc((void**)&gpudata, sizeof(int) * DATA_SIZE);
    cudaMalloc((void**)&result, sizeof(int) * THREAD_NUM);
    cudaMalloc((void**)&time, sizeof(clock_t));
    cudaMemcpy(gpudata, data, sizeof(int) * DATA_SIZE, cudaMemcpyHostToDevice);

    clock_t gpustart = clock();
    SumSquares<<<1, THREAD_NUM, 0>>>(gpudata, result, time);

    int sum[THREAD_NUM];
    clock_t time_used;
    // The blocking cudaMemcpy below also synchronizes with the kernel launch.
    cudaMemcpy(sum, result, sizeof(int) * THREAD_NUM, cudaMemcpyDeviceToHost);
    cudaMemcpy(&time_used, time, sizeof(clock_t), cudaMemcpyDeviceToHost);
    cudaFree(gpudata);
    cudaFree(result);
    cudaFree(time);

    // Reduce the per-thread partial sums on the host.
    int final_sum = 0;
    for (int i = 0; i < THREAD_NUM; i++)
    {
        final_sum += sum[i];
    }
    clock_t gputime = clock() - gpustart;
    // clock_t width is platform-dependent; cast to long instead of using %d.
    printf("GPU sum: %d, cycle used %ld, time used: %ld\n",
           final_sum, (long)time_used, (long)gputime);

    // CPU reference computation for correctness comparison.
    int sum1 = 0;
    for (int i = 0; i < DATA_SIZE; i++)
    {
        sum1 += data[i] * data[i];
    }
    printf("CPU sum: %d\n", sum1);
}
// Program entry point: initialize CUDA, run the GPU/CPU comparison, then
// wait for a key press so the console window stays open.
// `int main` replaces the non-standard `void main`.
int main()
{
    if (!InitCUDA())
    {
        getchar();
        return 1; // signal failure to the OS
    }
    printf("CUDA initialized.\n");
    TestGPU();
    getchar();
    return 0;
}
最开始设定THREAD_NUM为1,这其实就不算并行运算了,效果肯定很差,运行结果如下:
总共耗费了21M个cycle
将THREAD_NUM改为512后,结果如下:
只耗费了98224个cycle
根据原文最后的分析,我们再进行修改,让实际执行结果是连续的存取,应该要让 thread 0 读取第一个数字,thread 1 读取第二个数字…依此类推。所以,将SumSquares函数改成如下:
// Kernel (coalesced version): threads process `num` in an interleaved
// pattern — thread tid handles elements tid, tid+THREAD_NUM, tid+2*THREAD_NUM,
// ... — so within a warp, adjacent threads read adjacent addresses and the
// loads coalesce into wide memory transactions.
// Each thread writes its partial sum to result[tid]; thread 0 also records
// the elapsed device clock cycles into *time.
// Expected launch: <<<1, THREAD_NUM>>> (tid is a plain threadIdx.x).
__global__ static void SumSquares(int *num, int *result, clock_t *time) // sum-of-squares worker kernel
{
int sum=0;
int i;
clock_t start;
const int tid = threadIdx.x;
if(tid==0) start = clock(); // device-side cycle counter, sampled by thread 0 only
for(i=tid;i<DATA_SIZE;i+=THREAD_NUM)
{
sum += num[i]*num[i];
}
result[tid] = sum; // one partial sum per thread; host adds them up
if(tid==0) *time=clock()-start;
}
{
int sum=0;
int i;
clock_t start;
const int tid = threadIdx.x;
if(tid==0) start = clock();
for(i=tid;i<DATA_SIZE;i+=THREAD_NUM)
{
sum += num[i]*num[i];
}
result[tid] = sum;
if(tid==0) *time=clock()-start;
}
运行结果如下:
更多内容请点击:
CUDA专区:http://cuda.it168.com/
CUDA论坛:http://cudabbs.it168.com/