用OpenCL解决图像半透明算法的代码讲解-技术开发专区

用OpenCL解决图像半透明算法的代码讲解

作者：zenny_chen 编辑：景保玉 2010-09-27 10:31 来源：cocoach

　　【IT168技术】这个用 OpenCL 实现图像半透明的代码例子由 CocoaChina 版主 “zenny_chen” 提供，请先阅读代码，然后看版主的分析。

/*
*  hello.c
*  OpenCL_init
*
*  Created by Zenny Chen on 9/1/10.
*  Copyright 2010 GreenGames Studio. All rights reserved.
*
*/


#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>
#include <unistd.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <OpenCL/opencl.h>


////////////////////////////////////////////////////////////////////////////////

// Fixed problem size and blend weights, kept static for simplicity.
// IMAGE_X_PIXELS x IMAGE_Y_PIXELS is the number of float elements per image;
// the two LIMPID values are the weights applied to image1 and image2.
// NOTE: IMAGE_Y_PIXELS and both weights are repeated inside the KernelSource
// string, so the two sets of definitions must be kept in sync by hand.
//
#define IMAGE_X_PIXELS          176
#define IMAGE_Y_PIXELS          144
#define IMAGE_SOURCE1_LIMPID    0.5f
#define IMAGE_SOURCE2_LIMPID    0.5f


////////////////////////////////////////////////////////////////////////////////

// OpenCL C source for the "Limpid" kernel. Each work-item reads its two
// global ids (x, y), blends image1[x][y] and image2[x][y] with the two
// LIMPID weights, and stores the sum in output[x][y].
// The macros inside the string repeat the host-side #defines of the same
// names; the kernel is compiled separately, so they must be kept in sync.
//
const char *KernelSource = "\n" \
"#define IMAGE_Y_PIXELS   144                                           \n" \
"#define IMAGE_SOURCE1_LIMPID   0.5f                                    \n" \
"#define IMAGE_SOURCE2_LIMPID   0.5f                                    \n" \
"                                                                       \n" \
"__kernel void Limpid(                                                  \n" \
"   __global float image1[][IMAGE_Y_PIXELS],                            \n" \
"   __global float image2[][IMAGE_Y_PIXELS],                            \n" \
"   __global float output[][IMAGE_Y_PIXELS])                            \n" \
"{                                                                      \n" \
"   int x = get_global_id(0);                                           \n" \
"   int y = get_global_id(1);                                           \n" \
"   output[x][y] = image1[x][y] * IMAGE_SOURCE1_LIMPID + image2[x][y] * IMAGE_SOURCE2_LIMPID; \n" \
"}                                                                      \n" \
"\n";

////////////////////////////////////////////////////////////////////////////////

/*
 * Prints the complete build log of `program` for `device` to stdout.
 * The log length is queried first so that logs of any size are printed
 * in full; nothing is printed if the log cannot be retrieved.
 */
static void print_build_log(cl_program program, cl_device_id device)
{
    size_t len = 0;
    char *log;

    if (clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, 0, NULL, &len) != CL_SUCCESS || len == 0)
        return;

    log = malloc(len + 1);
    if (!log)
        return;

    if (clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, len, log, NULL) == CL_SUCCESS)
    {
        log[len] = '\0';    // guarantee termination regardless of what the driver wrote
        printf("%s\n", log);
    }

    free(log);
}

/*
 * Blends two randomly generated float images on an OpenCL GPU device with
 * the "Limpid" kernel from KernelSource, reads the result back, recomputes
 * the blend on the host and prints how many values match.
 *
 * argc/argv are unused.
 * Returns 0 when the whole pipeline ran (regardless of the match count) and
 * EXIT_FAILURE if a host allocation or any OpenCL call fails. All host
 * buffers and OpenCL objects are released on every path.
 */
int main(int argc, char** argv)
{
    int status = EXIT_FAILURE;          // process exit code; set to 0 only on full success
    int err;                            // error code returned from api calls

    float *image1 = NULL;               // original data set given to device
    float *image2 = NULL;
    float *results = NULL;              // results returned from device
    unsigned int correct = 0;           // number of correct results returned

    const unsigned int count = IMAGE_X_PIXELS * IMAGE_Y_PIXELS;
    const size_t bytes = sizeof(float) * count;

    // 2D launch geometry: one work-item per pixel, split into 22x12 groups
    // (176 / 22 = 8 and 144 / 12 = 12, so both dimensions divide evenly).
    const size_t global_size[2] = {IMAGE_X_PIXELS, IMAGE_Y_PIXELS};
    const size_t group_size[2] = {22, 12};
    const size_t *local_size = group_size;
    size_t local = 0;                   // max work-group size for this kernel/device

    cl_platform_id platform_id = NULL;  // added by zenny_chen
    cl_device_id device_id = NULL;      // compute device id
    cl_context context = NULL;          // compute context
    cl_command_queue commands = NULL;   // compute command queue
    cl_program program = NULL;          // compute program
    cl_kernel kernel = NULL;            // compute kernel

    cl_mem input1 = NULL;               // device memory used for the input arrays
    cl_mem input2 = NULL;
    cl_mem output = NULL;               // device memory used for the output array

    (void)argc;
    (void)argv;

    // Initialize the original data buffers and the result buffer
    image1 = malloc(count * sizeof *image1);
    image2 = malloc(count * sizeof *image2);
    results = malloc(count * sizeof *results);
    if (!image1 || !image2 || !results)
    {
        printf("Error: Failed to allocate host memory!\n");
        goto cleanup;
    }

    // Automatically generate 2 images
    for (unsigned int i = 0; i < count; i++)
    {
        image1[i] = rand() / (float)RAND_MAX;
        image2[i] = rand() / (float)RAND_MAX;
    }

    // Create a platform
    err = clGetPlatformIDs(1, &platform_id, NULL);
    if (err != CL_SUCCESS)
    {
        printf("Error: Failed to create a platform!\n");
        goto cleanup;
    }

    // Connect to a compute device
    err = clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_GPU, 1, &device_id, NULL);
    if (err != CL_SUCCESS)
    {
        printf("Error: Failed to create a device group!\n");
        goto cleanup;
    }

    // Create a compute context
    context = clCreateContext((cl_context_properties[]){(cl_context_properties)CL_CONTEXT_PLATFORM, (cl_context_properties)platform_id, 0}, 1, &device_id, NULL, NULL, &err);
    if (!context)
    {
        printf("Error: Failed to create a compute context!\n");
        goto cleanup;
    }

    // Create a command queue
    commands = clCreateCommandQueue(context, device_id, 0, &err);
    if (!commands)
    {
        printf("Error: Failed to create a command commands!\n");
        goto cleanup;
    }

    // Create the compute program from the source buffer
    program = clCreateProgramWithSource(context, 1, (const char **) & KernelSource, NULL, &err);
    if (!program)
    {
        printf("Error: Failed to create compute program!\n");
        goto cleanup;
    }

    // Build the program executable
    err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL);
    if (err != CL_SUCCESS)
    {
        printf("Error: Failed to build program executable!\n");
        print_build_log(program, device_id);
        goto cleanup;
    }

    // Create the compute kernel in the program we wish to run
    kernel = clCreateKernel(program, "Limpid", &err);
    if (!kernel || err != CL_SUCCESS)
    {
        printf("Error: Failed to create compute kernel!\n");
        goto cleanup;
    }

    // Create the input and output arrays in device memory for our calculation
    input1 = clCreateBuffer(context, CL_MEM_READ_ONLY, bytes, NULL, NULL);
    input2 = clCreateBuffer(context, CL_MEM_READ_ONLY, bytes, NULL, NULL);
    output = clCreateBuffer(context, CL_MEM_WRITE_ONLY, bytes, NULL, NULL);
    if (!input1 || !input2 || !output)
    {
        printf("Error: Failed to allocate device memory!\n");
        goto cleanup;
    }

    // Write our data set into the input arrays in device memory.
    // Calls are chained rather than OR-ed so that err stays a real error code.
    err = clEnqueueWriteBuffer(commands, input1, CL_TRUE, 0, bytes, image1, 0, NULL, NULL);
    if (err == CL_SUCCESS)
        err = clEnqueueWriteBuffer(commands, input2, CL_TRUE, 0, bytes, image2, 0, NULL, NULL);
    if (err != CL_SUCCESS)
    {
        printf("Error: Failed to write to source array!\n");
        goto cleanup;
    }

    // Set the arguments to our compute kernel
    err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &input1);
    if (err == CL_SUCCESS)
        err = clSetKernelArg(kernel, 1, sizeof(cl_mem), &input2);
    if (err == CL_SUCCESS)
        err = clSetKernelArg(kernel, 2, sizeof(cl_mem), &output);
    if (err != CL_SUCCESS)
    {
        printf("Error: Failed to set kernel arguments! %d\n", err);
        goto cleanup;
    }

    // Get the maximum work group size for executing the kernel on the device
    err = clGetKernelWorkGroupInfo(kernel, device_id, CL_KERNEL_WORK_GROUP_SIZE, sizeof(local), &local, NULL);
    if (err != CL_SUCCESS)
    {
        printf("Error: Failed to retrieve kernel work group info! %d\n", err);
        goto cleanup;
    }
    printf("The number of work items in a work group is: %zu\r\n", local);

    // The preferred 22x12 group needs 264 work-items. If the device cannot
    // run that many per group, pass NULL so the OpenCL implementation picks
    // a work-group size itself instead of failing the launch.
    if (group_size[0] * group_size[1] > local)
        local_size = NULL;

    // Execute the kernel over the entire 2D range of the image
    err = clEnqueueNDRangeKernel(commands, kernel, 2, NULL, global_size, local_size, 0, NULL, NULL);
    if (err != CL_SUCCESS)
    {
        printf("Error: Failed to execute kernel!\n");
        goto cleanup;
    }

    // Wait for the command queue to get serviced before reading back results
    err = clFinish(commands);
    if (err != CL_SUCCESS)
    {
        printf("Error: Failed to finish command queue! %d\n", err);
        goto cleanup;
    }

    // Read back the results from the device to verify the output
    err = clEnqueueReadBuffer(commands, output, CL_TRUE, 0, bytes, results, 0, NULL, NULL);
    if (err != CL_SUCCESS)
    {
        printf("Error: Failed to read output array! %d\n", err);
        goto cleanup;
    }

    // Validate our results against the same blend computed on the host.
    // NOTE(review): this is an exact float comparison; it relies on host and
    // device rounding identically for these weights - confirm before
    // changing the weights, or switch to a tolerance.
    for (unsigned int i = 0; i < count; i++)
    {
        if (results[i] == image1[i] * IMAGE_SOURCE1_LIMPID + image2[i] * IMAGE_SOURCE2_LIMPID)
            correct++;
    }

    // Print a brief summary detailing the results
    printf("Computed '%u/%u' correct values!\n", correct, count);

    status = 0;

cleanup:
    // Shutdown and cleanup; handles that were never created are skipped
    if (input1)
        clReleaseMemObject(input1);
    if (input2)
        clReleaseMemObject(input2);
    if (output)
        clReleaseMemObject(output);
    if (kernel)
        clReleaseKernel(kernel);
    if (program)
        clReleaseProgram(program);
    if (commands)
        clReleaseCommandQueue(commands);
    if (context)
        clReleaseContext(context);

    free(image1);
    free(image2);
    free(results);

    return status;
}

　　分析：

　　这里输入有两个缓存数组，并且用了二维工作维度。我们首先看一下Limpid内核计算函数。

　　OpenCL中的C语言是ISO/IEC9899:1999标准的子集，因此兼容大多数C99的语法特性。我们在声明一个二维数组形参的时候必须要指定列数，行数可以忽略。这是C语言的一个特性。

　　这里的Limpid内核函数有两个输入数组，分别用来存放两个图像数据。我们分别对两个图像相应的每对像素做插值混合计算，将结果像素数据送给输出缓存。这里的像素是用一个单精度浮点来表示的;当然，我们也可以看作为是由4个单精度浮点组成。

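　　我们通过调用get_global_id(i)来获得当前工作项第i维的全局id，对于二维工作组，每个二维ID(id_x, id_y)用于唯一标识一个工作项线程所对应的存储单元。对于二维工作组而言，按照通常的约定，第0维表示数据的列索引;而第1维则表示数据的行索引。如果我们将二维数组用一维索引来表示的话就是id_x + id_y * x_length;x_length表示一行有多少列。需要注意的是，上面的内核写的是output[x][y]，数组第二维的长度为IMAGE_Y_PIXELS，因此它实际访问的一维下标是x * IMAGE_Y_PIXELS + y;由于混合是逐元素运算，并且三个缓存使用同一种下标映射，所以计算结果不受影响。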

　　我们分别获得当前行、列索引就能确定唯一的存储单元。我们可以对此进行操作。

　　然后我们再看main函数中对clEnqueueNDRangeKernel函数的调用。这里我们指定工作维度是2。然后，全局工作项的个数就分别是图像长度和高度;而一个工作组的大小这里是需要好好讲一下的。

　　由于我们之前通过clGetKernelWorkGroupInfo函数获得的工作组最大大小是512，而如果我们只用一个工作组的话就需要176 * 144个工作项，这个乘积已经大大超过512了。因此，我们可以对176 * 144进行划分，使得每个工作组的实际大小小于512。我们这里对一个工作组的行取了22，列取了12。那么对于二维工作组而言，工作组的个数就被划分为 (8, 12)个。如果将整个计算资源看作为一个大的容器的话，那么它就有8 * 12个工作组;每个工作组有22 * 12个工作项。

　　如果各位有CUDA编程的一些概念的话，那么上面所说的工作组其实对应于Block;而上面所说的容器，其实就是一个Grid。这里再要提醒一下的是全局工作项的每个维度的个数都必须能够被所划分后的一个工作组的每个维度的工作项个数整除。

　　我们在主机端做数据组织的话可以不用考虑维度，都用一维的也没问题。包括数据创建以及读写。

关注我们