
A First Look at CUDA: Parallel Computing on the GPU

  Now let's start the experiment. I changed DATA_SIZE to 32768; the program is as follows:

#include <stdio.h>
#include <stdlib.h>
#include <time.h>          // clock() / clock_t on the host
#include <cuda_runtime.h>

#define DATA_SIZE 32768
#define THREAD_NUM 1

bool InitCUDA()
{
    int count;
    cudaGetDeviceCount(&count);
    if(count == 0)
    {
        fprintf(stderr, "There is no device.\n");
        return false;
    }

    int i;
    for(i = 0; i < count; i++)
    {
        cudaDeviceProp prop;
        if(cudaGetDeviceProperties(&prop, i) == cudaSuccess)
        {
            if(prop.major >= 1)
            {
                // Print detailed device information
                printf("Identify: %s\n", prop.name);
                printf("Can Map Host Memory: %d\n", prop.canMapHostMemory);
                printf("Clock Rate: %d kHz\n", prop.clockRate);
                printf("Compute Mode: %d\n", prop.computeMode);
                printf("Device Overlap: %d\n", prop.deviceOverlap);
                printf("Integrated: %d\n", prop.integrated);
                printf("Kernel Exec Timeout Enabled: %d\n", prop.kernelExecTimeoutEnabled);
                printf("Max Grid Size: %d * %d * %d\n", prop.maxGridSize[0], prop.maxGridSize[1], prop.maxGridSize[2]);
                printf("Max Threads Dim: %d * %d * %d\n", prop.maxThreadsDim[0], prop.maxThreadsDim[1], prop.maxThreadsDim[2]);
                printf("Max Threads per Block: %d\n", prop.maxThreadsPerBlock);
                printf("Maximum Pitch: %lu bytes\n", (unsigned long)prop.memPitch);
                printf("Minor Compute Capability: %d\n", prop.minor);
                printf("Number of Multiprocessors: %d\n", prop.multiProcessorCount);
                printf("32bit Registers Available per Block: %d\n", prop.regsPerBlock);
                printf("Shared Memory Available per Block: %lu bytes\n", (unsigned long)prop.sharedMemPerBlock);
                printf("Alignment Requirement for Textures: %lu\n", (unsigned long)prop.textureAlignment);
                printf("Constant Memory Available: %lu bytes\n", (unsigned long)prop.totalConstMem);
                printf("Global Memory Available: %lu bytes\n", (unsigned long)prop.totalGlobalMem);
                printf("Warp Size: %d threads\n", prop.warpSize);
                printf("===================================\n");
                break;
            }
        }
    }
    if(i == count)
    {
        fprintf(stderr, "There is no device supporting CUDA.\n");
        return false;
    }
    cudaSetDevice(i);
    return true;
}

void GenerateNums(int *numbers, int size)
{
    for(int i = 0; i < size; i++)
    {
        numbers[i] = rand() % 10;   // generate random numbers in the range 0-9
    }
}

__global__ static void SumSquares(int *num, int *result, clock_t *time)  // kernel: per-thread partial sum of squares
{
    int sum = 0;
    int i;
    clock_t start;
    const int tid = threadIdx.x;
    const int size = DATA_SIZE / THREAD_NUM;
    if(tid == 0) start = clock();
    for(i = tid * size; i < (tid + 1) * size; i++)
    {
        sum += num[i] * num[i];
    }
    result[tid] = sum;
    if(tid == 0) *time = clock() - start;
}
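  In this version each thread processes one contiguous chunk of the data: with THREAD_NUM threads, thread 0 reads elements 0 through DATA_SIZE/THREAD_NUM - 1, thread 1 reads the next chunk, and so on. Keep this access pattern in mind; it becomes important later.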

static void TestGPU()
{
    int data[DATA_SIZE];
    GenerateNums(data, DATA_SIZE);

    int *gpudata, *result;
    clock_t *time;
    cudaMalloc((void**)&gpudata, sizeof(int) * DATA_SIZE);
    cudaMalloc((void**)&result, sizeof(int) * THREAD_NUM);
    cudaMalloc((void**)&time, sizeof(clock_t));
    cudaMemcpy(gpudata, data, sizeof(int) * DATA_SIZE, cudaMemcpyHostToDevice);

    clock_t gpustart = clock();
    SumSquares<<<1, THREAD_NUM, 0>>>(gpudata, result, time);

    int sum[THREAD_NUM];
    clock_t time_used;
    cudaMemcpy(sum, result, sizeof(int) * THREAD_NUM, cudaMemcpyDeviceToHost);
    cudaMemcpy(&time_used, time, sizeof(clock_t), cudaMemcpyDeviceToHost);
    cudaFree(gpudata);
    cudaFree(result);
    cudaFree(time);

    // Combine the per-thread partial sums on the CPU
    int final_sum = 0;
    for(int i = 0; i < THREAD_NUM; i++)
    {
        final_sum += sum[i];
    }
    clock_t gputime = clock() - gpustart;
    printf("GPU sum: %d, cycle used %ld, time used: %ld\n", final_sum, (long)time_used, (long)gputime);

    // Reference result computed on the CPU
    int sum1 = 0;
    for(int i = 0; i < DATA_SIZE; i++)
    {
        sum1 += data[i] * data[i];
    }
    printf("CPU sum: %d\n", sum1);
}

int main()
{
    if(!InitCUDA())
    {
        getchar();
        return 1;
    }
    printf("CUDA initialized.\n");

    TestGPU();

    getchar();
    return 0;
}
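  To build and run the program, something like the following should work (a minimal sketch; the file name sum_squares.cu is an assumption, and you may need an architecture flag such as -arch for your card):

nvcc sum_squares.cu -o sum_squares
./sum_squares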

  Initially THREAD_NUM is set to 1. This is not really parallel computation at all, so the result is bound to be poor. The run output is as follows:

[Screenshot: run output with THREAD_NUM = 1]

  A total of about 21M cycles were used.

  After changing THREAD_NUM to 512, the result is as follows:

[Screenshot: run output with THREAD_NUM = 512]

  Only 98,224 cycles were used.
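  That is roughly 21,000,000 / 98,224 ≈ 214 times faster. The gain comes largely from latency hiding: while some threads wait on global memory, other threads can keep working.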

  Following the analysis at the end of the original article, we make one more change so that the accesses that actually get executed are contiguous: thread 0 should read the first number, thread 1 the second number, and so on. SumSquares is therefore modified as follows:

__global__ static void SumSquares(int *num, int *result, clock_t *time)  // kernel: interleaved (coalesced) access pattern
{
    int sum = 0;
    int i;
    clock_t start;
    const int tid = threadIdx.x;
    if(tid == 0) start = clock();
    for(i = tid; i < DATA_SIZE; i += THREAD_NUM)
    {
        sum += num[i] * num[i];
    }
    result[tid] = sum;
    if(tid == 0) *time = clock() - start;
}
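  To see the difference, compare the elements each thread touches on its first loop iteration (a rough sketch, assuming THREAD_NUM = 512, so each chunk in the earlier version holds 64 elements):

// Chunked version:     thread 0 -> num[0], thread 1 -> num[64], thread 2 -> num[128], ...
// Interleaved version: thread 0 -> num[0], thread 1 -> num[1],  thread 2 -> num[2],  ...
// In the interleaved version neighbouring threads read neighbouring addresses,
// so the hardware can coalesce these reads into far fewer memory transactions.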

  The run output is as follows:

[Screenshot: run output with coalesced memory access]

For more content, see:

CUDA zone: http://cuda.it168.com/

CUDA forum: http://cudabbs.it168.com/

 
