如果你想把它作为一个应用程序(而不是生成dll)来运行,请在“项目属性”→“配置属性”→“常规”部分,将“配置类型”修改为“应用程序(.exe)”,这样就可以使用CUDA Profiler(位于“C:\CUDA\cudaprof\bin\”)。
文件dllmain.cu中还有可以从dll外部访问的变量:
// Exported variable example: a global with C linkage that is visible to
// clients of the DLL. It starts at -1; SomeCalculationsCPU overwrites it
// with its timer reading.
extern "C"
{
    __declspec(dllexport) float sExecutionTime = -1.0f;
}
// Accessor wrapper (getter): lets DLL clients read sExecutionTime through an
// exported function call instead of importing the variable itself.
// Returns the current value of sExecutionTime.
extern "C" float __declspec(dllexport) __stdcall GetExecutionTime()
{
    const float lastExecutionTime = sExecutionTime;
    return lastExecutionTime;
}
// Exported variable example: a global with C linkage that is visible to
// clients of the DLL. It starts at -1; SomeCalculationsCPU overwrites it
// with its timer reading.
extern "C"
{
    __declspec(dllexport) float sExecutionTime = -1.0f;
}
// Accessor wrapper (getter): lets DLL clients read sExecutionTime through an
// exported function call instead of importing the variable itself.
// Returns the current value of sExecutionTime.
extern "C" float __declspec(dllexport) __stdcall GetExecutionTime()
{
    const float lastExecutionTime = sExecutionTime;
    return lastExecutionTime;
}
虽然可以直接访问变量,但使用包装函数方法更简单(就像get/set存取器)。
最终的内核函数示例:
// CUDA kernel (internal): for each of the first N elements of `a`, applies
//     x = x * x * 0.1 - x - 10
// M times in a row and stores the result back in place.
//
// Parameters:
//   a - array updated in place (dereferenced in device code, so it must be
//       device-accessible memory); each thread touches only element idx
//   N - number of elements to process; threads with idx >= N do nothing
//   M - number of times the update is applied to each element
//
// Launch layout: 1D indexing (blockIdx.x * blockDim.x + threadIdx.x), one
// element per thread, so the launch must supply at least N threads in x for
// every element to be processed. No shared memory is used.
__global__ void some_calculations(float *a, unsigned int N, unsigned int M)
{
    unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < N)
    {
        // note1: no need for shared memory here
        // note2: global memory access is coalesced
        //        (no structs, float only used)

        // Work on a local copy so the element is loaded from global memory
        // once and stored once, instead of on every iteration.
        float value = a[idx];

        // do the computation M times on each thread to extend the
        // processing time
        for (unsigned int i = 0; i < M; i++)
        {
            // Some easy arithmetic. The float literals keep the expression in
            // single precision; a bare 0.1 is a double literal and would
            // promote the whole expression to double.
            value = value * value * 0.1f - value - 10.0f;
        }

        a[idx] = value;
    }
}
// CUDA kernel (internal): for each of the first N elements of `a`, applies
//     x = x * x * 0.1 - x - 10
// M times in a row and stores the result back in place.
//
// Parameters:
//   a - array updated in place (dereferenced in device code, so it must be
//       device-accessible memory); each thread touches only element idx
//   N - number of elements to process; threads with idx >= N do nothing
//   M - number of times the update is applied to each element
//
// Launch layout: 1D indexing (blockIdx.x * blockDim.x + threadIdx.x), one
// element per thread, so the launch must supply at least N threads in x for
// every element to be processed. No shared memory is used.
__global__ void some_calculations(float *a, unsigned int N, unsigned int M)
{
    unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < N)
    {
        // note1: no need for shared memory here
        // note2: global memory access is coalesced
        //        (no structs, float only used)

        // Work on a local copy so the element is loaded from global memory
        // once and stored once, instead of on every iteration.
        float value = a[idx];

        // do the computation M times on each thread to extend the
        // processing time
        for (unsigned int i = 0; i < M; i++)
        {
            // Some easy arithmetic. The float literals keep the expression in
            // single precision; a bare 0.1 is a double literal and would
            // promote the whole expression to double.
            value = value * value * 0.1f - value - 10.0f;
        }

        a[idx] = value;
    }
}
用于对比执行时间的CPU版本:
// CPU version of the computation, exported from the DLL so its execution
// time can be compared with the GPU version. For each of the N elements of
// a_h it applies
//     x = x * x * 0.1 - x - 10
// M times in place, and stores the measured timer value in sExecutionTime.
//
// Parameters:
//   a_h - host array updated in place; elements [0, N) are read and written
//   N   - number of elements to process
//   M   - number of times the update is applied to each element
//
// NOTE(review): the unit of the stored time is whatever cutGetTimerValue
// reports (presumably milliseconds -- confirm against cutil).
extern "C" void __declspec(dllexport) __stdcall SomeCalculationsCPU
(
    float *a_h,
    const unsigned int N,
    const unsigned int M
)
{
    unsigned int timer = 0;
    cutCreateTimer(&timer);
    cutStartTimer(timer);

    for (unsigned int i = 0; i < N; i++)
    {
        for (unsigned int j = 0; j < M; j++)
        {
            // 0.1 is a double literal, so this is evaluated in double and
            // rounded to float on the store (kept exactly as in the original
            // listing).
            a_h[i] = a_h[i] * a_h[i] * 0.1 - a_h[i] - 10;
        }
    }

    cutStopTimer(timer);
    sExecutionTime = cutGetTimerValue(timer);

    // Release the timer; without this every call leaves one timer allocated.
    cutDeleteTimer(timer);
}
// NOTE(review): this listing has no function-name line; it appears to repeat
// the parameter list and body of SomeCalculationsCPU -- confirm against the
// original article.
(
// array updated in place; elements [0, N) are read and written
float *a_h,
// number of elements to process
const unsigned int N,
// number of times the update is applied to each element
const unsigned int M
)
// NOTE(review): no opening brace follows the parameter list although a
// closing brace ends the listing -- it looks like it was lost; confirm.
unsigned int timer = 0;
// create and start a timer around the computation
cutCreateTimer(&timer);
cutStartTimer(timer);
// for each of the N elements, apply the update M times in place
for(unsigned int i = 0; i < N; i++)
for(unsigned int j = 0; j < M; j++)
// 0.1 is a double literal, so the right-hand side is evaluated in double
// before being stored back as float
*(a_h + i) = *(a_h + i) * *(a_h + i) * 0.1 - *(a_h + i) - 10;
// stop the timer and publish its reading through sExecutionTime
cutStopTimer(timer);
sExecutionTime = cutGetTimerValue(timer);
}