下面再来看这段kernel代码,顺风顺水了是不是。。
// Row convolution filter
#define ROWS_BLOCKDIM_X 16
#define ROWS_BLOCKDIM_Y 4
#define ROWS_RESULT_STEPS 4
#define ROWS_HALO_STEPS 1
__global__ void convolutionRowsKernel(
float *d_Dst,
float *d_Src,
int imageW,
int imageH,
int pitch
){
__shared__ float s_Data[ROWS_BLOCKDIM_Y][(ROWS_RESULT_STEPS + 2 * ROWS_HALO_STEPS) * ROWS_BLOCKDIM_X];
//Offset to the left halo edge
const int baseX = (blockIdx.x * ROWS_RESULT_STEPS - ROWS_HALO_STEPS) * ROWS_BLOCKDIM_X + threadIdx.x;
const int baseY = blockIdx.y * ROWS_BLOCKDIM_Y + threadIdx.y;
d_Src += baseY * pitch + baseX;
d_Dst += baseY * pitch + baseX;
//Main data
#pragma unroll
for(int i = ROWS_HALO_STEPS; i < ROWS_HALO_STEPS + ROWS_RESULT_STEPS; i++)//i=1,i<5
s_Data[threadIdx.y][threadIdx.x + i * ROWS_BLOCKDIM_X] = d_Src[i * ROWS_BLOCKDIM_X];
//Left halo
for(int i = 0; i < ROWS_HALO_STEPS; i++){//i=0,i<1
s_Data[threadIdx.y][threadIdx.x + i * ROWS_BLOCKDIM_X] =
(baseX >= -i * ROWS_BLOCKDIM_X ) ? d_Src[i * ROWS_BLOCKDIM_X] : 0;
}
//Right halo
for(int i = ROWS_HALO_STEPS + ROWS_RESULT_STEPS; i < ROWS_HALO_STEPS + ROWS_RESULT_STEPS + ROWS_HALO_STEPS; i++){
s_Data[threadIdx.y][threadIdx.x + i * ROWS_BLOCKDIM_X] =
(imageW - baseX > i * ROWS_BLOCKDIM_X) ? d_Src[i * ROWS_BLOCKDIM_X] : 0;
}
//Compute and store results
__syncthreads();
#pragma unroll
for(int i = ROWS_HALO_STEPS; i < ROWS_HALO_STEPS + ROWS_RESULT_STEPS; i++){
float sum = 0;
#pragma unroll
for(int j = -KERNEL_RADIUS; j <= KERNEL_RADIUS; j++)
sum += c_Kernel[KERNEL_RADIUS - j] * s_Data[threadIdx.y][threadIdx.x + i * ROWS_BLOCKDIM_X + j];
d_Dst[i * ROWS_BLOCKDIM_X] = sum;
}
}
extern "C" void convolutionRowsGPU(
float *d_Dst,
float *d_Src,
int imageW,
int imageH
){
assert( ROWS_BLOCKDIM_X * ROWS_HALO_STEPS >= KERNEL_RADIUS );
assert( imageW % (ROWS_RESULT_STEPS * ROWS_BLOCKDIM_X) == 0 );
assert( imageH % ROWS_BLOCKDIM_Y == 0 );
dim3 blocks(imageW / (ROWS_RESULT_STEPS * ROWS_BLOCKDIM_X), imageH / ROWS_BLOCKDIM_Y);
dim3 threads(ROWS_BLOCKDIM_X, ROWS_BLOCKDIM_Y);
convolutionRowsKernel<<<blocks, threads>>>(
d_Dst, d_Src, imageW, imageH, imageW
);
cutilCheckMsg("convolutionRowsKernel() execution failed\n");
}
#define ROWS_BLOCKDIM_X 16
#define ROWS_BLOCKDIM_Y 4
#define ROWS_RESULT_STEPS 4
#define ROWS_HALO_STEPS 1
__global__ void convolutionRowsKernel(
float *d_Dst,
float *d_Src,
int imageW,
int imageH,
int pitch
){
__shared__ float s_Data[ROWS_BLOCKDIM_Y][(ROWS_RESULT_STEPS + 2 * ROWS_HALO_STEPS) * ROWS_BLOCKDIM_X];
//Offset to the left halo edge
const int baseX = (blockIdx.x * ROWS_RESULT_STEPS - ROWS_HALO_STEPS) * ROWS_BLOCKDIM_X + threadIdx.x;
const int baseY = blockIdx.y * ROWS_BLOCKDIM_Y + threadIdx.y;
d_Src += baseY * pitch + baseX;
d_Dst += baseY * pitch + baseX;
//Main data
#pragma unroll
for(int i = ROWS_HALO_STEPS; i < ROWS_HALO_STEPS + ROWS_RESULT_STEPS; i++)//i=1,i<5
s_Data[threadIdx.y][threadIdx.x + i * ROWS_BLOCKDIM_X] = d_Src[i * ROWS_BLOCKDIM_X];
//Left halo
for(int i = 0; i < ROWS_HALO_STEPS; i++){//i=0,i<1
s_Data[threadIdx.y][threadIdx.x + i * ROWS_BLOCKDIM_X] =
(baseX >= -i * ROWS_BLOCKDIM_X ) ? d_Src[i * ROWS_BLOCKDIM_X] : 0;
}
//Right halo
for(int i = ROWS_HALO_STEPS + ROWS_RESULT_STEPS; i < ROWS_HALO_STEPS + ROWS_RESULT_STEPS + ROWS_HALO_STEPS; i++){
s_Data[threadIdx.y][threadIdx.x + i * ROWS_BLOCKDIM_X] =
(imageW - baseX > i * ROWS_BLOCKDIM_X) ? d_Src[i * ROWS_BLOCKDIM_X] : 0;
}
//Compute and store results
__syncthreads();
#pragma unroll
for(int i = ROWS_HALO_STEPS; i < ROWS_HALO_STEPS + ROWS_RESULT_STEPS; i++){
float sum = 0;
#pragma unroll
for(int j = -KERNEL_RADIUS; j <= KERNEL_RADIUS; j++)
sum += c_Kernel[KERNEL_RADIUS - j] * s_Data[threadIdx.y][threadIdx.x + i * ROWS_BLOCKDIM_X + j];
d_Dst[i * ROWS_BLOCKDIM_X] = sum;
}
}
extern "C" void convolutionRowsGPU(
float *d_Dst,
float *d_Src,
int imageW,
int imageH
){
assert( ROWS_BLOCKDIM_X * ROWS_HALO_STEPS >= KERNEL_RADIUS );
assert( imageW % (ROWS_RESULT_STEPS * ROWS_BLOCKDIM_X) == 0 );
assert( imageH % ROWS_BLOCKDIM_Y == 0 );
dim3 blocks(imageW / (ROWS_RESULT_STEPS * ROWS_BLOCKDIM_X), imageH / ROWS_BLOCKDIM_Y);
dim3 threads(ROWS_BLOCKDIM_X, ROWS_BLOCKDIM_Y);
convolutionRowsKernel<<<blocks, threads>>>(
d_Dst, d_Src, imageW, imageH, imageW
);
cutilCheckMsg("convolutionRowsKernel() execution failed\n");
}