技术开发 频道

CUDA SDK 2.3のconvolutionSeparable

  下面再来看这段kernel代码,顺风顺水了是不是。。

// Row convolution filter

#define   ROWS_BLOCKDIM_X
16

#define   ROWS_BLOCKDIM_Y
4

#define ROWS_RESULT_STEPS
4

#define   ROWS_HALO_STEPS
1



__global__
void convolutionRowsKernel(

    
float *d_Dst,

    
float *d_Src,

    
int imageW,

    
int imageH,

    
int pitch

){

    __shared__
float s_Data[ROWS_BLOCKDIM_Y][(ROWS_RESULT_STEPS + 2 * ROWS_HALO_STEPS) * ROWS_BLOCKDIM_X];



    
//Offset to the left halo edge

    
const int baseX = (blockIdx.x * ROWS_RESULT_STEPS - ROWS_HALO_STEPS) * ROWS_BLOCKDIM_X + threadIdx.x;

    
const int baseY = blockIdx.y * ROWS_BLOCKDIM_Y + threadIdx.y;



    d_Src
+= baseY * pitch + baseX;

    d_Dst
+= baseY * pitch + baseX;



    
//Main data

    #pragma unroll

    
for(int i = ROWS_HALO_STEPS; i < ROWS_HALO_STEPS + ROWS_RESULT_STEPS; i++)//i=1,i<5

        s_Data[threadIdx.y][threadIdx.x
+ i * ROWS_BLOCKDIM_X] = d_Src[i * ROWS_BLOCKDIM_X];



    
//Left halo

    
for(int i = 0; i < ROWS_HALO_STEPS; i++){//i=0,i<1

        s_Data[threadIdx.y][threadIdx.x
+ i * ROWS_BLOCKDIM_X] =

            (baseX
>= -i * ROWS_BLOCKDIM_X ) ? d_Src[i * ROWS_BLOCKDIM_X] : 0;

    }



    
//Right halo

    
for(int i = ROWS_HALO_STEPS + ROWS_RESULT_STEPS; i < ROWS_HALO_STEPS + ROWS_RESULT_STEPS + ROWS_HALO_STEPS; i++){

        s_Data[threadIdx.y][threadIdx.x
+ i * ROWS_BLOCKDIM_X] =

            (imageW
- baseX > i * ROWS_BLOCKDIM_X) ? d_Src[i * ROWS_BLOCKDIM_X] : 0;

    }



    
//Compute and store results

    __syncthreads();

    #pragma unroll

    
for(int i = ROWS_HALO_STEPS; i < ROWS_HALO_STEPS + ROWS_RESULT_STEPS; i++){

        
float sum = 0;



        #pragma unroll

        
for(int j = -KERNEL_RADIUS; j <= KERNEL_RADIUS; j++)

            sum
+= c_Kernel[KERNEL_RADIUS - j] * s_Data[threadIdx.y][threadIdx.x + i * ROWS_BLOCKDIM_X + j];



        d_Dst[i
* ROWS_BLOCKDIM_X] = sum;

    }

}



extern
"C" void convolutionRowsGPU(

    
float *d_Dst,

    
float *d_Src,

    
int imageW,

    
int imageH

){

    
assert( ROWS_BLOCKDIM_X * ROWS_HALO_STEPS >= KERNEL_RADIUS );

    
assert( imageW % (ROWS_RESULT_STEPS * ROWS_BLOCKDIM_X) == 0 );

    
assert( imageH % ROWS_BLOCKDIM_Y == 0 );



    dim3 blocks(imageW
/ (ROWS_RESULT_STEPS * ROWS_BLOCKDIM_X), imageH / ROWS_BLOCKDIM_Y);

    dim3 threads(ROWS_BLOCKDIM_X, ROWS_BLOCKDIM_Y);



    convolutionRowsKernel
<<<blocks, threads>>>(

        d_Dst, d_Src, imageW, imageH, imageW

    );

    cutilCheckMsg(
"convolutionRowsKernel() execution failed\n");

}
0
相关文章