向量乘以矩阵-技术开发专区

向量乘以矩阵

作者：openhero 编辑：覃里 2009-11-27 17:27 来源：openher

　　【IT168 文档】/********************************************************************

　　created: 2009/09/19

　　created: 19:9:2009 12:00

　　filename: vector_matrix_multiplication.cu

　　file base: vector_matrix_multiplication

　　file ext: CUDA

　　author: zhao.kaiyong(at)gmail.com

　　purpose: vector matrix multiplication

　　任意使用，请注明出处;

　　http://www.hpctech.com

　　http://openhero.net

　　*********************************************************************/

　　#include

　　/************************************************************************/

　　/* Init CUDA */

　　/************************************************************************/

#if __DEVICE_EMULATION__

bool InitCUDA(void){return true;}

#else
bool InitCUDA(void)
{
    int count = 0;
    int i = 0;

    cudaGetDeviceCount(&count);
    if(count == 0) {
        fprintf(stderr, "There is no device.\n");
        return false;
    }

    for(i = 0; i < count; i++) {
        cudaDeviceProp prop;
        if(cudaGetDeviceProperties(&prop, i) == cudaSuccess) {
            if(prop.major >= 1) {
                break;
            }
        }
    }
    if(i == count) {
        fprintf(stderr, "There is no device supporting CUDA.\n");
        return false;
    }
    cudaSetDevice(i);

    printf("CUDA initialized.\n");
    return true;
}

#endif
/************************************************************************/
/* Example                                                              */
/************************************************************************/
__global__ static void vector_matrix_mult_kernel(float* A, long wA, float* B, long wB, float* C)
{
    __shared__ float subA[64];

    A = A + threadIdx.x;
    B = B + blockIdx.x * 64 + threadIdx.x;
    C = C + blockIdx.x * 64 + threadIdx.x;

    float subC = 0.0;

    for (int i = 0; i < wA; i+=64)
    {
        subA[threadIdx.x] = A[i];
        __syncthreads();

#pragma unroll
        for (int j = 0; j < 64; j++, B += wB)
        {
            subC += subA[j] * B[0];
        }
        __syncthreads();
    }

    C[0] = subC;
}

//__global__ static void vector_matrix_mult_kernel_32T(float* A, long wA, float* B, long wB, float* C)
//{

//}

#define  RUN_TEST

#define  WA 32
#define  WB 64
/************************************************************************/
/* HelloCUDA                                                            */
/************************************************************************/
int main(int argc, char* argv[])
{

    if(!InitCUDA()) {
        return 0;
    }

    srand(2009);

    long wA = 64* WA;
    long wB = 64* WB;

    long size_A = wA;
    long size_B = wA*wB;
    long size_C = wB;

    float    *hA = (float*)malloc(sizeof(float) * size_A);
    float    *hB = (float*)malloc(sizeof(float) * size_B);
    float    *hC = (float*)malloc(sizeof(float) * size_C);
    float    *testhC = (float*)malloc(sizeof(float) * size_C);

    for (int i = 0; i < size_A; i++)
    {
        hA[i] = (float)rand()/(float)RAND_MAX;
    }

    for (int i = 0; i < size_B; i++)
    {
        hB[i] = (float)rand()/(float)RAND_MAX;
    }

    float    *dA    = 0;
    float    *dB = 0;
    float    *dC = 0;

    CUDA_SAFE_CALL( cudaMalloc((void**) &dA, sizeof(float) * size_A));
    CUDA_SAFE_CALL( cudaMalloc((void**) &dB, sizeof(float) * size_B));
    CUDA_SAFE_CALL( cudaMalloc((void**) &dC, sizeof(float) * size_C));

    CUDA_SAFE_CALL( cudaMemcpy(dA, hA, sizeof(float)*size_A, cudaMemcpyHostToDevice));
    CUDA_SAFE_CALL( cudaMemcpy(dB, hB, sizeof(float)*size_B, cudaMemcpyHostToDevice));

    unsigned int timer = 0;
    CUT_SAFE_CALL( cutCreateTimer( &timer));
    CUT_SAFE_CALL( cutStartTimer( timer));

    dim3 threads = 64;
    dim3 blocks = wB/64;
    vector_matrix_mult_kernel<<>>(dA, wA, dB, wB, dC);

    CUT_CHECK_ERROR("Kernel execution failed\n");

    CUDA_SAFE_CALL( cudaMemcpy(hC, dC, sizeof(float) * size_C, cudaMemcpyDeviceToHost));
    CUT_SAFE_CALL( cutStopTimer( timer));
    printf("Processing time: %f (ms)\n", cutGetTimerValue( timer));
    CUT_SAFE_CALL( cutResetTimer( timer));

    for (int i = 0; i < wB; i++)
    {
        float subC = 0.0;
        for (int j = 0; j         {
            subC += hA[j] * hB[j*wB + i];
        }
        testhC[i] = subC;
    }

    CUT_SAFE_CALL( cutStopTimer( timer));
    printf("Processing time: %f (ms)\n", cutGetTimerValue( timer));

    CUT_SAFE_CALL( cutDeleteTimer( timer));

#ifdef RUN_TEST

    CUTBoolean res = cutCompareL2fe(testhC, hC, size_C, 1e-6f);
    printf("Test %s \n", (1 == res) ? "PASSED":"FAILED");
#endif

    CUDA_SAFE_CALL( cudaFree(dA));
    CUDA_SAFE_CALL( cudaFree(dB));
    CUDA_SAFE_CALL( cudaFree(dC));
    free(hA);
    free(hB);
    free(hC);
    CUT_EXIT(argc, argv);

    return 0;
}

关注我们