【IT168 技术】使用.NET平台调用(P/Invoke)来调用非托管函数是一件容易的事情,但有一件事需要注意——访问DLL导出的变量。因为我们不能对变量使用DllImport属性,所以必须先找到变量的地址,然后再封送(marshal)数据。
using System.Runtime.InteropServices;
#region hard way to import variable from unmanaged dll
/// <summary>
/// P/Invoke declaration for <c>GetProcAddress</c> in kernel32.dll: looks up the
/// address of an exported symbol in an already loaded module.
/// The export name is marshaled as an ANSI string (<c>CharSet.Ansi</c>).
/// </summary>
/// <param name="hModule">Module handle, e.g. the value returned by <see cref="LoadLibrary"/>.</param>
/// <param name="procName">Name of the exported symbol to look up.</param>
/// <returns>Address of the export; callers in this file treat <c>IntPtr.Zero</c> as "not found".</returns>
[DllImport("kernel32.dll", SetLastError = true, CharSet = CharSet.Ansi)]
internal static extern IntPtr GetProcAddress(IntPtr hModule, string procName);
/// <summary>
/// P/Invoke declaration for <c>LoadLibrary</c> in kernel32.dll: loads (or adds a
/// reference to) the named module and returns its handle.
/// </summary>
/// <param name="lpszLib">File name or path of the library to load.</param>
/// <returns>Module handle; callers in this file treat <c>IntPtr.Zero</c> as failure.</returns>
[DllImport("kernel32.dll", SetLastError = true, CharSet = CharSet.Ansi)]
internal static extern IntPtr LoadLibrary(string lpszLib);
/// <summary>
/// P/Invoke declaration for <c>FreeLibrary</c> in kernel32.dll: releases one
/// reference to a module previously obtained from <c>LoadLibrary</c>.
/// </summary>
[DllImport("kernel32.dll", SetLastError = true)]
[return: MarshalAs(UnmanagedType.Bool)]
private static extern bool FreeLibrary(IntPtr hModule);

/// <summary>
/// Reads the <c>float</c> variable exported as <c>sExecutionTime</c> from cudalib.dll.
/// Variables cannot be imported with <c>DllImport</c>, so the export address is
/// resolved manually and the value is copied into managed memory.
/// </summary>
/// <returns>
/// The current value of the exported variable, or 0 when the library cannot be
/// loaded or the export cannot be found.
/// </returns>
static float ReadsExecutionTime()
{
    IntPtr hdl = LoadLibrary("cudalib.dll");
    if (hdl == IntPtr.Zero)
    {
        return 0;
    }

    try
    {
        IntPtr addr = GetProcAddress(hdl, "sExecutionTime");
        if (addr == IntPtr.Zero)
        {
            return 0;
        }

        // For an exported integer a single call is enough:
        //int value = Marshal.ReadInt32(addr); // for integer types
        // For other types, copy through a one-element managed array.
        float[] managedArray = new float[1]; // single value
        Marshal.Copy(addr, managedArray, 0, 1);
        return managedArray[0];
    }
    finally
    {
        // Every LoadLibrary call adds a module reference; release it so repeated
        // reads do not leak references. The value has already been copied out.
        FreeLibrary(hdl);
    }
}
#endregion
// Easy way to read a variable from an unmanaged DLL: if you can change the native
// code, export a wrapper function and import it like any other function.
// NOTE(review): presumably GetExecutionTime returns the value of the exported
// sExecutionTime variable - confirm against the native source.
// The calling convention declared here (StdCall) must match the native export.
[DllImport("cudalib.dll", CharSet = CharSet.Ansi, SetLastError = true, CallingConvention = CallingConvention.StdCall)]
public static extern float GetExecutionTime();
#region hard way to import variable from unmanaged dll
/// <summary>
/// P/Invoke declaration for <c>GetProcAddress</c> in kernel32.dll: looks up the
/// address of an exported symbol in an already loaded module.
/// The export name is marshaled as an ANSI string (<c>CharSet.Ansi</c>).
/// </summary>
/// <param name="hModule">Module handle, e.g. the value returned by <see cref="LoadLibrary"/>.</param>
/// <param name="procName">Name of the exported symbol to look up.</param>
/// <returns>Address of the export; callers in this file treat <c>IntPtr.Zero</c> as "not found".</returns>
[DllImport("kernel32.dll", SetLastError = true, CharSet = CharSet.Ansi)]
internal static extern IntPtr GetProcAddress(IntPtr hModule, string procName);
/// <summary>
/// P/Invoke declaration for <c>LoadLibrary</c> in kernel32.dll: loads (or adds a
/// reference to) the named module and returns its handle.
/// </summary>
/// <param name="lpszLib">File name or path of the library to load.</param>
/// <returns>Module handle; callers in this file treat <c>IntPtr.Zero</c> as failure.</returns>
[DllImport("kernel32.dll", SetLastError = true, CharSet = CharSet.Ansi)]
internal static extern IntPtr LoadLibrary(string lpszLib);
/// <summary>
/// P/Invoke declaration for <c>FreeLibrary</c> in kernel32.dll: releases one
/// reference to a module previously obtained from <c>LoadLibrary</c>.
/// </summary>
[DllImport("kernel32.dll", SetLastError = true)]
[return: MarshalAs(UnmanagedType.Bool)]
private static extern bool FreeLibrary(IntPtr hModule);

/// <summary>
/// Reads the <c>float</c> variable exported as <c>sExecutionTime</c> from cudalib.dll.
/// Variables cannot be imported with <c>DllImport</c>, so the export address is
/// resolved manually and the value is copied into managed memory.
/// </summary>
/// <returns>
/// The current value of the exported variable, or 0 when the library cannot be
/// loaded or the export cannot be found.
/// </returns>
static float ReadsExecutionTime()
{
    IntPtr hdl = LoadLibrary("cudalib.dll");
    if (hdl == IntPtr.Zero)
    {
        return 0;
    }

    try
    {
        IntPtr addr = GetProcAddress(hdl, "sExecutionTime");
        if (addr == IntPtr.Zero)
        {
            return 0;
        }

        // For an exported integer a single call is enough:
        //int value = Marshal.ReadInt32(addr); // for integer types
        // For other types, copy through a one-element managed array.
        float[] managedArray = new float[1]; // single value
        Marshal.Copy(addr, managedArray, 0, 1);
        return managedArray[0];
    }
    finally
    {
        // Every LoadLibrary call adds a module reference; release it so repeated
        // reads do not leak references. The value has already been copied out.
        FreeLibrary(hdl);
    }
}
#endregion
// Easy way to read a variable from an unmanaged DLL: if you can change the native
// code, export a wrapper function and import it like any other function.
// NOTE(review): presumably GetExecutionTime returns the value of the exported
// sExecutionTime variable - confirm against the native source.
// The calling convention declared here (StdCall) must match the native export.
[DllImport("cudalib.dll", CharSet = CharSet.Ansi, SetLastError = true, CallingConvention = CallingConvention.StdCall)]
public static extern float GetExecutionTime();
如果变量是整型,事情会变得更简单,因为你可以像上面注释中那样直接使用Marshal类(例如Marshal.ReadInt32)。对于其它类型,Marshal.Copy()方法是个可行的解决方案。我使用只有一个元素的浮点数组来取回我的变量;Marshal.Copy()也可以把数据从托管源拷贝到非托管目标(反之亦然)。
至于dll中的其它函数,还包括CUDA计算函数:
/// <summary>
/// P/Invoke declaration for the <c>SomeCalculationsCU</c> export of cudalib.dll,
/// bound with the StdCall calling convention.
/// </summary>
/// <remarks>
/// NOTE(review): presumably <paramref name="a_h"/> is the host array processed in
/// place, <paramref name="N"/> the element count and <paramref name="M"/> the
/// number of passes (that is how ParallelForVersion uses them), while
/// <paramref name="cuBlockSize"/> and <paramref name="showErrors"/> control the
/// CUDA launch and error reporting - confirm against the native source, including
/// the meaning of the returned int.
/// </remarks>
[DllImport("cudalib.dll", CharSet = CharSet.Ansi, SetLastError = true, CallingConvention = CallingConvention.StdCall)]
public static extern int SomeCalculationsCU(float[] a_h, uint N, uint M, int cuBlockSize, int showErrors);
/// <summary>
/// P/Invoke declaration for the <c>SomeCalculationsCPU</c> export of cudalib.dll,
/// bound with the StdCall calling convention.
/// </summary>
/// <remarks>
/// NOTE(review): presumably the native CPU counterpart of SomeCalculationsCU with
/// the same meaning for <paramref name="a_h"/>, <paramref name="N"/> and
/// <paramref name="M"/> - confirm against the native source.
/// </remarks>
[DllImport("cudalib.dll", CharSet = CharSet.Ansi, SetLastError = true, CallingConvention = CallingConvention.StdCall)]
public static extern void SomeCalculationsCPU(float[] a_h, uint N, uint M);
/// <summary>
/// P/Invoke declaration for the <c>SomeCalculationsCU</c> export of cudalib.dll,
/// bound with the StdCall calling convention.
/// </summary>
/// <remarks>
/// An extern method needs a <c>DllImport</c> attribute to be bound to native code;
/// the attribute below is the same one used for the other cudalib.dll imports.
/// NOTE(review): presumably <paramref name="a_h"/> is the host array processed in
/// place, <paramref name="N"/> the element count and <paramref name="M"/> the
/// number of passes - confirm against the native source.
/// </remarks>
[DllImport("cudalib.dll", CharSet = CharSet.Ansi, SetLastError = true, CallingConvention = CallingConvention.StdCall)]
public static extern int SomeCalculationsCU(float[] a_h, uint N, uint M, int cuBlockSize, int showErrors);
/// <summary>
/// P/Invoke declaration for the <c>SomeCalculationsCPU</c> export of cudalib.dll,
/// bound with the StdCall calling convention.
/// </summary>
/// <remarks>
/// NOTE(review): presumably the native CPU counterpart of SomeCalculationsCU -
/// confirm against the native source.
/// </remarks>
[DllImport("cudalib.dll", CharSet = CharSet.Ansi, SetLastError = true, CallingConvention = CallingConvention.StdCall)]
public static extern void SomeCalculationsCPU(float[] a_h, uint N, uint M);
再提醒一次,记住正确设置调用约定。
示例代码也展示了如何使用Parallel.For()方法,它属于.NET 4的功能。
using System.Threading.Tasks;
using System.Diagnostics;
/// <summary>
/// Runs the sample calculation on the first <paramref name="N"/> elements of
/// <paramref name="farr3"/> with <c>Parallel.For</c> and measures how long it takes.
/// Each element is updated in place <paramref name="M"/> times with
/// <c>x = x * x * 0.1f - x - 10</c>.
/// </summary>
/// <param name="farr3">Array that is read and overwritten in place.</param>
/// <param name="N">Number of leading elements to process.</param>
/// <param name="M">Number of times the formula is applied to each element.</param>
/// <returns>Elapsed wall-clock time of the parallel loop, in milliseconds.</returns>
private static double ParallelForVersion(float[] farr3, uint N, uint M)
{
    Stopwatch timer = Stopwatch.StartNew();

    // N is a uint, so the bounds bind to the (long, long, Action<long>) overload.
    Parallel.For(0L, (long)N, index =>
    {
        uint pass = 0;
        while (pass < M)
        {
            farr3[index] = farr3[index] * farr3[index] * 0.1f - farr3[index] - 10;
            pass++;
        }
    });

    timer.Stop();
    return timer.Elapsed.TotalMilliseconds;
}
using System.Diagnostics;
/// <summary>
/// Runs the sample calculation on the first <paramref name="N"/> elements of
/// <paramref name="farr3"/> with <c>Parallel.For</c> and measures how long it takes.
/// Each element is updated in place <paramref name="M"/> times with
/// <c>x = x * x * 0.1f - x - 10</c>.
/// </summary>
/// <param name="farr3">Array that is read and overwritten in place.</param>
/// <param name="N">Number of leading elements to process.</param>
/// <param name="M">Number of times the formula is applied to each element.</param>
/// <returns>Elapsed wall-clock time of the parallel loop, in milliseconds.</returns>
private static double ParallelForVersion(float[] farr3, uint N, uint M)
{
    Stopwatch timer = Stopwatch.StartNew();

    // N is a uint, so the bounds bind to the (long, long, Action<long>) overload.
    Parallel.For(0L, (long)N, index =>
    {
        uint pass = 0;
        while (pass < M)
        {
            farr3[index] = farr3[index] * farr3[index] * 0.1f - farr3[index] - 10;
            pass++;
        }
    });

    timer.Stop();
    return timer.Elapsed.TotalMilliseconds;
}
这段代码与CUDA版本做完全一样的计算,但是你会看到两者的计算结果并不完全一样,因为CPU和GPU的计算误差范围不同。此外,这里还给nvcc编译器使用了-use_fast_math参数,强制它使用精度较低但速度更快的函数,例如用__sinf(x)代替sinf(x);不过并不是每个函数都有前缀为“__”的对应版本。