线性存储器也可以通过cudaMallocPitch()和cudaMalloc3D()分配。在分配2D或3D数组的时候,推荐使用,因为这些分配增加了合适的填充以满足5.3.2.1节的对齐要求,在按行访问时或者在二维数组和设备存储器的其它区域间复制(用cudaMemcpy2D()和cudaMemcpy3D()函数)时,保证了非常好的性能。返回的步长(pitch,stride)必须用于访问数组元素。下面的代码分配了一个尺寸为width*height的二维浮点数组,同时演示了怎样在设备代码中遍历数组元素。
// Host code
float* devPtr; int pitch;
cudaMallocPitch((void**)&devPtr, &pitch,
width * sizeof(float), height);
MyKernel<<<100, 512>>>(devPtr, pitch);
// Device code
__global__ void MyKernel(float* devPtr, int pitch) {
for (int r = 0; r < height; ++r) {
float* row = (float*)((char*)devPtr + r * pitch);
for (int c = 0; c < width; ++c) {
float element = row[c];
float* devPtr; int pitch;
cudaMallocPitch((void**)&devPtr, &pitch,
width * sizeof(float), height);
MyKernel<<<100, 512>>>(devPtr, pitch);
// Device code
__global__ void MyKernel(float* devPtr, int pitch) {
for (int r = 0; r < height; ++r) {
float* row = (float*)((char*)devPtr + r * pitch);
for (int c = 0; c < width; ++c) {
float element = row[c];
演示
// Host code
cudaPitchedPtr devPitchedPtr;
cudaExtent extent = make_cudaExtent(64, 64, 64);
cudaMalloc3D(&devPitchedPtr, extent);
MyKernel<<<100, 512>>>(devPitchedPtr, extent);
// Device code
__global__ void MyKernel(cudaPitchedPtr devPitchedPtr, cudaExtent extent) {
char* devPtr = devPitchedPtr.ptr;
size_t pitch = devPitchedPtr.pitch;
size_t slicePitch = pitch * extent.height;
for (int z = 0; z < extent.depth; ++z) {
char* slice = devPtr + z * slicePitch;
for (int y = 0; y < extent.height; ++y) {
float* row = (float*)(slice + y * pitch);
for (int x = 0; x < extent.width; ++x) {
cudaPitchedPtr devPitchedPtr;
cudaExtent extent = make_cudaExtent(64, 64, 64);
cudaMalloc3D(&devPitchedPtr, extent);
MyKernel<<<100, 512>>>(devPitchedPtr, extent);
// Device code
__global__ void MyKernel(cudaPitchedPtr devPitchedPtr, cudaExtent extent) {
char* devPtr = devPitchedPtr.ptr;
size_t pitch = devPitchedPtr.pitch;
size_t slicePitch = pitch * extent.height;
for (int z = 0; z < extent.depth; ++z) {
char* slice = devPtr + z * slicePitch;
for (int y = 0; y < extent.height; ++y) {
float* row = (float*)(slice + y * pitch);
for (int x = 0; x < extent.width; ++x) {
参考手册列出了在cudaMalloc()分配的线性存储器,cudaMallocPitch()或cudaMalloc3D()分配的线性存储器,CUDA数组和为声明在全局存储器和常量存储器空间分配的存储器之间拷贝的所有各种函数。
下面的例子代码复制了一些主机存储器数组到常量存储器中:
__constant__ float constData[256];
float data[256];
cudaMemcpyToSymbol(constData, data, sizeof(data));
float data[256];
cudaMemcpyToSymbol(constData, data, sizeof(data));
为声明在全局存储器空间的变量分配的存储器的地址,可以使用cudaGetSymbolAddress()函数检索到。分配的存储器的尺寸可以通过cudaGetSymbolSize()函数获得。