技术开发 频道

向量乘以矩阵

  【IT168 文档】/********************************************************************

  created: 2009/09/19

  created: 19:9:2009 12:00

  filename: vector_matrix_multiplication.cu

  file base: vector_matrix_multiplication

  file ext: CUDA

  author: zhao.kaiyong(at)gmail.com

  purpose: vector matrix multiplication

  copyright: everyone can use this code, please specify source.

  任意使用,请注明出处;

  http://www.hpctech.com

  http://openhero.net

  *********************************************************************/

  #include

  #include

  #include

  #include

  /************************************************************************/

  /* Init CUDA */

  /************************************************************************/

#if __DEVICE_EMULATION__

bool InitCUDA(
void){return true;}

#
else
bool InitCUDA(
void)
{
    
int count = 0;
    
int i = 0;

    cudaGetDeviceCount(
&count);
    
if(count == 0) {
        fprintf(stderr,
"There is no device.\n");
        
return false;
    }

    
for(i = 0; i < count; i++) {
        cudaDeviceProp prop;
        
if(cudaGetDeviceProperties(&prop, i) == cudaSuccess) {
            
if(prop.major >= 1) {
                
break;
            }
        }
    }
    
if(i == count) {
        fprintf(stderr,
"There is no device supporting CUDA.\n");
        
return false;
    }
    cudaSetDevice(i);

    printf(
"CUDA initialized.\n");
    
return true;
}

#endif
/************************************************************************/
/* Example                                                              */
/************************************************************************/
__global__
static void vector_matrix_mult_kernel(float* A, long wA, float* B, long wB, float* C)
{
    __shared__
float subA[64];

    A
= A + threadIdx.x;
    B
= B + blockIdx.x * 64 + threadIdx.x;
    C
= C + blockIdx.x * 64 + threadIdx.x;

    
float subC = 0.0;

    
for (int i = 0; i < wA; i+=64)
    {
        subA[threadIdx.x]
= A[i];
        __syncthreads();

#pragma unroll
        
for (int j = 0; j < 64; j++, B += wB)
        {
            subC
+= subA[j] * B[0];
        }
        __syncthreads();
    }

    C[
0] = subC;
}

//__global__ static void vector_matrix_mult_kernel_32T(float* A, long wA, float* B, long wB, float* C)
//{

//}

#define  RUN_TEST

#define  WA
32
#define  WB
64
/************************************************************************/
/* HelloCUDA                                                            */
/************************************************************************/
int main(int argc, char* argv[])
{

    
if(!InitCUDA()) {
        
return 0;
    }

    srand(
2009);

    
long wA = 64* WA;
    
long wB = 64* WB;

    
long size_A = wA;
    
long size_B = wA*wB;
    
long size_C = wB;

    
float    *hA = (float*)malloc(sizeof(float) * size_A);
    
float    *hB = (float*)malloc(sizeof(float) * size_B);
    
float    *hC = (float*)malloc(sizeof(float) * size_C);
    
float    *testhC = (float*)malloc(sizeof(float) * size_C);

    
for (int i = 0; i < size_A; i++)
    {
        hA[i]
= (float)rand()/(float)RAND_MAX;
    }

    
for (int i = 0; i < size_B; i++)
    {
        hB[i]
= (float)rand()/(float)RAND_MAX;
    }

    
float    *dA    = 0;
    
float    *dB = 0;
    
float    *dC = 0;

    CUDA_SAFE_CALL( cudaMalloc((
void**) &dA, sizeof(float) * size_A));
    CUDA_SAFE_CALL( cudaMalloc((
void**) &dB, sizeof(float) * size_B));
    CUDA_SAFE_CALL( cudaMalloc((
void**) &dC, sizeof(float) * size_C));

    CUDA_SAFE_CALL( cudaMemcpy(dA, hA, sizeof(
float)*size_A, cudaMemcpyHostToDevice));
    CUDA_SAFE_CALL( cudaMemcpy(dB, hB, sizeof(
float)*size_B, cudaMemcpyHostToDevice));

    unsigned
int timer = 0;
    CUT_SAFE_CALL( cutCreateTimer(
&timer));
    CUT_SAFE_CALL( cutStartTimer( timer));

    dim3 threads
= 64;
    dim3 blocks
= wB/64;
    vector_matrix_mult_kernel
<<>>(dA, wA, dB, wB, dC);

    CUT_CHECK_ERROR(
"Kernel execution failed\n");

    CUDA_SAFE_CALL( cudaMemcpy(hC, dC, sizeof(
float) * size_C, cudaMemcpyDeviceToHost));
    CUT_SAFE_CALL( cutStopTimer( timer));
    printf(
"Processing time: %f (ms)\n", cutGetTimerValue( timer));
    CUT_SAFE_CALL( cutResetTimer( timer));

    
for (int i = 0; i < wB; i++)
    {
        
float subC = 0.0;
        
for (int j = 0; j         {
            subC
+= hA[j] * hB[j*wB + i];
        }
        testhC[i]
= subC;
    }

    CUT_SAFE_CALL( cutStopTimer( timer));
    printf(
"Processing time: %f (ms)\n", cutGetTimerValue( timer));

    CUT_SAFE_CALL( cutDeleteTimer( timer));

#ifdef RUN_TEST

    CUTBoolean res
= cutCompareL2fe(testhC, hC, size_C, 1e-6f);
    printf(
"Test %s \n", (1 == res) ? "PASSED":"FAILED");
#endif

    CUDA_SAFE_CALL( cudaFree(dA));
    CUDA_SAFE_CALL( cudaFree(dB));
    CUDA_SAFE_CALL( cudaFree(dC));
    free(hA);
    free(hB);
    free(hC);
    CUT_EXIT(argc, argv);

    
return 0;
}
0
相关文章