[IT168 documentation] This is a classic example of using shared memory to speed up global-memory reads and writes. The CUDA SDK includes a transpose_naive sample that takes the straightforward approach of reading one element from each column and assembling them into a row; that approach is, unsurprisingly, slow, running more than ten times slower than the shared-memory kernel shown below.
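For comparison, the naive kernel looks roughly like the following sketch (a reconstruction, not the exact SDK source): each thread copies a single element straight from idata to its transposed position in odata, so while the read is coalesced, the write is strided by height and cannot be coalesced.

__global__ void transpose_naive(float *odata, float *idata, int width, int height)
{
    unsigned int xIndex = blockIdx.x * blockDim.x + threadIdx.x;
    unsigned int yIndex = blockIdx.y * blockDim.y + threadIdx.y;
    if (xIndex < width && yIndex < height)
    {
        unsigned int index_in  = yIndex * width + xIndex;   // coalesced read
        unsigned int index_out = xIndex * height + yIndex;  // strided write, not coalesced
        odata[index_out] = idata[index_in];
    }
}

The optimized shared-memory kernel is the following.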
#define BLOCK_DIM 16

__global__ void transpose(float *odata, float *idata, int width, int height)
{
    // The tile is padded to BLOCK_DIM+1 columns so that reading it column-wise
    // later does not cause shared-memory bank conflicts.
    __shared__ float block[BLOCK_DIM][BLOCK_DIM+1];

    // Load a BLOCK_DIM x BLOCK_DIM tile from idata; consecutive threads read
    // consecutive addresses, so the load is coalesced.
    unsigned int xIndex = blockIdx.x * BLOCK_DIM + threadIdx.x;
    unsigned int yIndex = blockIdx.y * BLOCK_DIM + threadIdx.y;
    if((xIndex < width) && (yIndex < height))
    {
        unsigned int index_in = yIndex * width + xIndex;
        block[threadIdx.y][threadIdx.x] = idata[index_in];
    }

    __syncthreads();

    // Write the transposed tile to odata; the block indices are swapped so the
    // store is also coalesced, and the transpose itself happens in shared memory.
    xIndex = blockIdx.y * BLOCK_DIM + threadIdx.x;
    yIndex = blockIdx.x * BLOCK_DIM + threadIdx.y;
    if((xIndex < height) && (yIndex < width))
    {
        unsigned int index_out = yIndex * height + xIndex;
        odata[index_out] = block[threadIdx.x][threadIdx.y];
    }
}
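A minimal host-side launch might look like the sketch below (illustrative only; d_odata and d_idata are assumed to be device buffers that have already been allocated and filled):

dim3 threads(BLOCK_DIM, BLOCK_DIM);
dim3 grid((width  + BLOCK_DIM - 1) / BLOCK_DIM,
          (height + BLOCK_DIM - 1) / BLOCK_DIM);   // one 16x16 block per 16x16 tile
transpose<<<grid, threads>>>(d_odata, d_idata, width, height);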
In my view there are two reasons for using a 16x16 block size for this computation:
1) On the x86 platform the cache line achieves a similar effect, but a GPU spends its die area on more execution units rather than on large caches. Looking at NVIDIA GPUs, the basic size of each memory transaction is 256 or 512 bits; assuming 512 bits, one transaction moves 16 single-precision floats (512 / 32 = 16), so each row of the tile should also fetch at least 16 single-precision floats at a time in order to make full use of the memory bandwidth.
2) With 8x8 blocks there are 64 threads/block. An SM accepts at most 768 threads, which would mean 12 blocks; but an SM also accepts at most 8 blocks, so in practice only 512 threads are resident. With 16x16 blocks there are 256 threads/block; 768 threads per SM means 3 blocks, so as long as the other resources (registers, shared memory) permit, the SM runs at full load, and an SM at full load is the most efficient. (See the sketch after this list for the arithmetic.)
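The occupancy arithmetic in point 2 can be spelled out as a small sketch. The 768-thread and 8-block limits are the ones quoted above for that GPU generation (they are larger on later hardware), and only those two limits are assumed to constrain occupancy here:

#include <stdio.h>

#define MAX_THREADS_PER_SM 768
#define MAX_BLOCKS_PER_SM  8

// Resident threads per SM for a given block size, considering only the
// thread-count and block-count limits quoted above.
static int resident_threads(int threads_per_block)
{
    int blocks = MAX_THREADS_PER_SM / threads_per_block;  // limited by thread count
    if (blocks > MAX_BLOCKS_PER_SM)                       // limited by block count
        blocks = MAX_BLOCKS_PER_SM;
    return blocks * threads_per_block;
}

int main(void)
{
    printf("8x8   blocks: %d resident threads\n", resident_threads(8 * 8));    // 512
    printf("16x16 blocks: %d resident threads\n", resident_threads(16 * 16));  // 768
    return 0;
}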