技术开发 频道

GPU高性能开发技术:CUDA实战

  如果你想把它作为一个独立程序(而不是生成dll)来运行,请在“项目属性”→“配置属性”→“常规”中,将“配置类型”修改为“应用程序(.exe)”,这样就可以使用CUDA Profiler(位于“C:\CUDA\cudaprof\bin\”)。

  文件dllmain.cu中还包含一个可以从dll外部访问的变量:

// Exported timing state and its accessor. Both have C linkage so the
// exported symbol names are not C++-mangled.
extern "C"
{
      // Exported variable holding an execution time; starts at -1 and is
      // overwritten by the timing code elsewhere in this file.
      __declspec(dllexport) float sExecutionTime = -1.0f;

      // Exported getter: returns the current value of sExecutionTime, so that
      // callers can use a function call instead of importing the variable.
      __declspec(dllexport) float __stdcall GetExecutionTime()
      {
            return sExecutionTime;
      }
}

  虽然可以直接访问变量,但使用包装函数方法更简单(就像get/set存取器)。

  最终的内核函数示例:

// cuda kernel (internal)
//
// Applies the update  x <- x * x * 0.1 - x - 10  M times to each of the
// first N elements of `a`, in place. The repetition exists only to extend
// the processor's time so the run is long enough to measure.
//
// Parameters:
//   a - array of floats, read and written in place; one element per thread
//   N - number of elements to process; threads whose index is >= N do nothing
//   M - number of times the update is applied to each element
//
// Indexing uses only the x components of blockIdx/blockDim/threadIdx, so the
// launch must provide at least N threads along x for every element to be
// processed.
__global__ void some_calculations(float *a, unsigned int N, unsigned int M)
{
      unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;

      if (idx < N)
      {
            // note1: no need for shared memory here
            // note2: global memory access is coalesced
            //        (no structs, float only used)

            // Work on a local copy: one load and one store per thread
            // instead of touching a[idx] on every iteration.
            float value = a[idx];

            // do computations M times on each thread
            // to extend processor's time
            for (unsigned int i = 0; i < M; i++)
            {
                  // some easy arithmetics; the literals carry the 'f' suffix
                  // so the expression stays in single precision (a bare 0.1
                  // is a double and would promote the whole expression)
                  value = value * value * 0.1f - value - 10.0f;
            }

            a[idx] = value;
      }
}

 

 

  用于对比执行时间的CPU版本:

// CPU version of the computation, exported from the dll with C linkage.
//
// Applies the update  x <- x * x * 0.1 - x - 10  M times to each of the
// first N elements of a_h, in place, and stores the time measured by the
// cutil timer around the loops in sExecutionTime.
//
// Parameters:
//   a_h - array of at least N floats, read and written in place
//   N   - number of elements to process
//   M   - number of times the update is applied to each element
extern "C" void __declspec(dllexport) __stdcall SomeCalculationsCPU
      (
      float *a_h,
      const unsigned int N,
      const unsigned int M
      )
{
      unsigned int timer = 0;
      cutCreateTimer(&timer);
      cutStartTimer(timer);

      for (unsigned int i = 0; i < N; i++)
      {
            for (unsigned int j = 0; j < M; j++)
            {
                  // 0.1 is a double literal, so the right-hand side is
                  // evaluated in double and converted back to float on
                  // the store.
                  a_h[i] = a_h[i] * a_h[i] * 0.1 - a_h[i] - 10;
            }
      }

      cutStopTimer(timer);
      sExecutionTime = cutGetTimerValue(timer);

      // Release the timer so repeated calls do not leak timer handles.
      cutDeleteTimer(timer);
}
0
相关文章