技术开发 频道

基于点集的Balance kd-tree的并行构建

  最后贴上内核源代码:

  1 CODE:
// Block-wide prefix-sum (scan) primitive over a shared-memory array of
// blockDim.x uints. Algorithm: each warp computes its own inclusive scan
// (warpOp), lane 31 of every warp deposits the warp total into
// smem[warpid], the warp totals are combined, and each thread finally
// adds the total of all preceding warps.
//
// NOTE(review): this is pre-Volta warp-synchronous code. warpOp() has no
// __syncwarp() between its strided read-modify-write steps, and the
// inter-warp combine loop in inclusive()/exclusive() has no
// __syncthreads() between iterations, so under independent thread
// scheduling (SM70+) both are data races. Confirm the target compute
// capability, or replace with __shfl_up_sync()/cub::BlockScan.
class prefixOperator
{
public:
    // Cache this thread's lane id (0..31) and its warp index in the block.
    inline __device__ void init()
    {
        lane=threadIdx.x&31u;
        warpid=threadIdx.x>>5;
    }

    // Inclusive scan: returns the sum of smem[0..threadIdx.x] as loaded on
    // entry. Destroys the contents of smem. Must be reached by every
    // thread of the block (contains __syncthreads()).
    inline __device__ uint inclusive( uint* smem ) const
    {
        // Per-warp inclusive scan; pref = this thread's warp-local prefix.
        uint pref=warpOp( smem ); __syncthreads();

        // Lane 31 publishes the warp total into slot [warpid].
        if( lane==31 ){
            smem[ warpid ]=pref;
        } __syncthreads();

        // Combine the warp totals (only the first blockDim.x/32 slots matter).
        // NOTE(review): no barrier between iterations — racy on SM70+.
        for( uint n=1; n<( blockDim.x>>5 ); n<<=1 ){
            if( threadIdx.x>=n ){
                smem[ threadIdx.x ]+=smem[ threadIdx.x-n ];
            }
        } __syncthreads();

        // Add the combined total of all preceding warps.
        if( warpid>0 ){
            pref+=smem[ warpid-1 ];
        } __syncthreads();

        return pref;
    }

    // Exclusive scan: on return smem[i] holds the sum of the first i input
    // elements (smem[0] == 0). Same synchronization caveats as inclusive().
    inline __device__ void exclusive( uint* smem )
    {
        uint pref=warpOp( smem ); __syncthreads();

        if( lane==31 ){
            smem[ warpid ]=pref;
        } __syncthreads();

        for( uint n=1; n<( blockDim.x>>5 ); n<<=1 )
        {
            if( threadIdx.x>=n ){
                smem[ threadIdx.x ]+=smem[ threadIdx.x-n ];
            }
        } __syncthreads();

        if( warpid>0 ){
            pref+=smem[ warpid-1 ];
        } __syncthreads();

        // Shift the inclusive result right by one slot; the last thread
        // zeroes smem[0] instead of writing past the end of the array.
        ( threadIdx.x<( blockDim.x-1 ) )?( smem[ threadIdx.x+1 ]=pref ):( smem[ 0 ]=0 );
        __syncthreads();
    }

private:
    // Warp-local inclusive scan (Hillis-Steele over 32 lanes), in place in
    // smem; returns this lane's inclusive prefix.
    // NOTE(review): assumes implicit warp-synchronous execution — needs
    // __syncwarp() between steps on SM70+.
    inline __device__ uint warpOp( uint* smem ) const
    {
        if( lane>= 1 ){ smem[ threadIdx.x ]+=smem[ threadIdx.x- 1 ]; }
        if( lane>= 2 ){ smem[ threadIdx.x ]+=smem[ threadIdx.x- 2 ]; }
        if( lane>= 4 ){ smem[ threadIdx.x ]+=smem[ threadIdx.x- 4 ]; }
        if( lane>= 8 ){ smem[ threadIdx.x ]+=smem[ threadIdx.x- 8 ]; }
        if( lane>=16 ){ smem[ threadIdx.x ]+=smem[ threadIdx.x-16 ]; }
        return smem[ threadIdx.x ];
    }

    uint lane;   // lane index within the warp (threadIdx.x & 31)
    uint warpid; // warp index within the block (threadIdx.x >> 5)
};
68
69 extern "C"
70 {
71
72 //从这个内核到kernel_scatter是排序用的内核,采用基数排序
// Per-tile inclusive-scan pass of a multi-block prefix sum (used by the
// radix sort). Each CTA scans one MAX_CTA_DIM-sized tile of `list`,
// writes the tile-local inclusive prefixes to `prefix`, and emits the
// tile total into `block_suffix` for the later cross-tile correction.
// Grid: x = tile index within a slice, y = slice index.
__global__
void kernel_scan_blocks( uint* prefix, uint* block_suffix, const uint* list, uint slice_size )
{
    __shared__ uint smem[ MAX_CTA_DIM ];

    const uint slice_pos = IM( MAX_CTA_DIM, blockIdx.x ) + threadIdx.x;
    const uint in_range  = slice_pos < slice_size;
    const uint gidx      = IM( slice_size, blockIdx.y ) + slice_pos;

    // Lanes past the end of the slice contribute the identity element.
    smem[ threadIdx.x ] = in_range ? list[ gidx ] : 0;

    prefixOperator scan;
    scan.init();

    const uint running = scan.inclusive( smem );

    if( in_range )
        prefix[ gidx ] = running;

    // The last thread of the CTA holds the tile total.
    if( threadIdx.x == blockDim.x - 1 )
        block_suffix[ IM( gridDim.x, blockIdx.y ) + blockIdx.x ] = running;
}
97
// Single-CTA inclusive scan: one block (blockIdx.y selects the slice)
// scans an entire slice that fits within one CTA. Dynamic shared memory
// must provide blockDim.x uints.
__global__
void kernel_scan_block( uint* prefix, const uint* list, uint slice_size )
{
    extern __shared__ uint smem[];

    const uint in_range = threadIdx.x < slice_size;
    const uint gidx     = IM( slice_size, blockIdx.y ) + threadIdx.x;

    // Identity element for lanes past the end of the slice.
    smem[ threadIdx.x ] = in_range ? list[ gidx ] : 0;

    prefixOperator scan;
    scan.init();

    const uint running = scan.inclusive( smem );

    if( in_range )
        prefix[ gidx ] = running;
}
117
// Cross-tile correction pass: adds the scanned total of all preceding
// tiles (read from block_suffix, which has itself already been scanned)
// to every element of this tile, completing the multi-block prefix sum.
__global__
void kernel_correct( uint* prefix, const uint* block_suffix, uint slice_size )
{
    __shared__ uint s_offset;

    // One thread fetches the running total of all earlier tiles.
    if( threadIdx.x == 0 ){
        s_offset = ( blockIdx.x > 0 )
                 ? block_suffix[ IM( gridDim.x, blockIdx.y ) + blockIdx.x - 1 ]
                 : 0;
    } __syncthreads();

    const uint slice_pos = IM( MAX_CTA_DIM, blockIdx.x ) + threadIdx.x;

    if( slice_pos < slice_size ){
        prefix[ IM( slice_size, blockIdx.y ) + slice_pos ] += s_offset;
    }
}
133
// Radix-sort split step for one CTA-sized tile: stable-partitions the
// tile of `track`/`slice` by bit `shift` of the current sort key (keys
// with bit 0 compact to the front), writes the partitioned tile back,
// and records the tile's 0-bit count in `split`.
//
// NOTE(review): the compaction below reads slice[gidx+stride] /
// track[gidx+...] and writes slice[idx+stride] / track[idx+...] into
// the SAME arrays with no barrier between the cross-thread reads and
// writes; this is only safe if the launch guarantees the read and write
// ranges cannot overlap across threads — verify against the host code.
__global__
void kernel_split( uint* slice,
                   uint* track,
                   uint* split,
                   uint  level,
                   uint  shift,
                   uint  slice_size )
{
    __shared__ uint s_split;                // number of 0-bit keys in this tile
    __shared__ uint s_mask[ MAX_CTA_DIM ];  // 0-bit flags, then their exclusive scan

    uint const base=IM( MAX_CTA_DIM, blockIdx.x );      // tile start within the slice
    uint const tidx=base+threadIdx.x;                   // element position within the slice
    uint const active=tidx<slice_size;
    uint const area=IM( slice_size, blockIdx.y );       // slice base in the global arrays
    uint const gidx=area+tidx;
    uint const d=slice_size-base;                       // elements remaining from tile start
    uint const mask=( d>=blockDim.x )?~0u:0u;
    uint const last=( mask&blockDim.x )+( ( ~mask )&d );   // branchless min(blockDim.x, d)
    uint const axis=level%3;                            // split dimension cycles per level
    uint const level_size=IM( 1u<<level, slice_size );  // elements per coordinate plane at this level
    uint stride=IM( axis, level_size );                 // offset of the active coordinate plane

    // Key for this element; inactive lanes use ~0u so their bit is 1 and
    // they contribute nothing to the 0-bit count.
    // bit() is a project helper — presumably (k >> shift) & 1.
    uint const k=active?track[ gidx+stride ]:~0u;
    uint const b=bit( k, shift );
    s_mask[ threadIdx.x ]=1u^b;                         // 1 when the key's bit is 0

    prefixOperator scanner;
    scanner.init();

    // s_mask[i] becomes the number of 0-bit keys before thread i.
    scanner.exclusive( s_mask );

    // The last valid thread derives the tile's total 0-bit count.
    if( threadIdx.x==( last-1 ) ){
        s_split=s_mask[ threadIdx.x ]+( 1u^b );
    } __syncthreads();

    if( active ){

        // Stable partition: 0-bit keys keep their relative order at the
        // front, 1-bit keys follow starting at s_split.
        uint idx=threadIdx.x-s_mask[ threadIdx.x ]+s_split;
        idx=b?idx:s_mask[ threadIdx.x ];
        idx+=( base+area );

        // Move the element plus all three coordinate planes of `track`.
        slice[ idx+stride ]=slice[ gidx+stride ];
        track[ idx+stride ]=k;
        stride=IM( ( axis+1 )%3, level_size );
        track[ idx+stride ]=track[ gidx+stride ];
        stride=IM( ( axis+2 )%3, level_size );
        track[ idx+stride ]=track[ gidx+stride ];
    }

    if( threadIdx.x==0 ){
        split[ IM( gridDim.x, blockIdx.y )+blockIdx.x ]=s_split;
    }
}
188
// Radix-sort global scatter: moves one tile from the source buffers to
// its globally partitioned position in the target buffers, using the
// per-tile 0-bit counts (local_split) and their inclusive scan
// (global_split). Copies the slice array plus all three coordinate
// planes of track (planes are s_level_size apart).
__global__
void kernel_scatter( uint* target_slice,
                     uint* target_track,
                     const uint* source_slice,
                     const uint* source_track,
                     const uint* local_split,
                     const uint* global_split,
                     uint   level,
                     uint   slice_size )
{
    __shared__ uint s_local_split;   // 0-bit count inside this tile
    __shared__ uint s_global_split;  // scanned 0-bit count up to and including this tile
    __shared__ uint s_start;         // scanned 0-bit count of all earlier tiles
    __shared__ uint s_prefix;        // total 0-bit count of the whole slice
    __shared__ uint s_level_size;    // stride between coordinate planes of track

    if( threadIdx.x == 0 ){
        const uint cta = IM( gridDim.x, blockIdx.y ) + blockIdx.x;

        s_local_split  = local_split[ cta ];
        s_global_split = global_split[ cta ];
        s_start        = ( blockIdx.x > 0 ) ? global_split[ cta - 1 ] : 0;
        s_prefix       = global_split[ IM( gridDim.x, blockIdx.y + 1 ) - 1 ];
        s_level_size   = IM( 1u << level, slice_size );
    } __syncthreads();

    const uint src = IM( blockDim.x, blockIdx.x ) + threadIdx.x;

    if( src < slice_size ){
        // 0-bit keys (first s_local_split in the tile) land right after
        // the 0-bit keys of earlier tiles; 1-bit keys land past all the
        // slice's 0-bit keys, preserving order within each group.
        uint dst = ( threadIdx.x < s_local_split )
                 ? s_start + threadIdx.x
                 : s_prefix + src - s_global_split;
        dst += IM( slice_size, blockIdx.y );

        const uint plane = s_level_size;

        target_slice[ dst ] = source_slice[ src ];
        target_track[ dst              ] = source_track[ src              ];
        target_track[ dst +      plane ] = source_track[ src +      plane ];
        target_track[ dst + 2u * plane ] = source_track[ src + 2u * plane ];
    }
}
228
229 //分裂slice
230
// Slice subdivision for the next tree level: each source slice of
// source_slice_size elements becomes two child slices of
// target_slice_size, the right child starting at the parent's median
// (hence the slot - b offset). Copies the slice array plus the three
// coordinate planes of track.
__global__ void kernel_divide( uint* target_slice,
                               uint* target_track,
                               const uint* source_slice,
                               const uint* source_track,
                               uint  target_slice_size,
                               uint  target_level_size,
                               uint  source_slice_size,
                               uint  source_level_size )
{
    const uint dst = IM( blockDim.x, blockIdx.x ) + threadIdx.x;

    if( dst < target_level_size ){

        const uint child    = dst / target_slice_size;   // child slice index at the new level
        const uint is_right = child & 1u;                // 1 for right children
        const uint slot     = dst % target_slice_size;   // position inside the child slice

        // Left child starts at the parent's base; the right child starts
        // at ceil(parent/2) - 1, i.e. it re-includes the parent's median.
        const uint src = IM( source_slice_size, child >> 1 )
                       + IM( is_right, ( source_slice_size + 1 ) >> 1 )
                       + slot - is_right;

        target_slice[ dst ] = source_slice[ src ];
        target_track[ dst                          ] = source_track[ src                          ];
        target_track[ dst +      target_level_size ] = source_track[ src +      source_level_size ];
        target_track[ dst + 2u * target_level_size ] = source_track[ src + 2u * source_level_size ];
    }
}
256
257 //编码当前层的节点,
258 //2个32位的元素
259 //对于第一个元素:
260 //kdinnerNode :
261 //最低位表示是否是叶节点
262 //第2:1位表示该层的split plane的维度(0,1,或者2)
263 //第31:3的29个位表示节点在索引数组中的偏移
264 //第二个元素是分割面在其所属层的维度上的坐标
265 //kdleafNode:
266 //对于第一个元素:
267 //最低位表示是否是叶节点
268 //31:1位存储了该节点中包含的元素数量信息
269 //第二个元素是元素的索引
270
// Encodes the inner nodes of one tree level: one thread per slice reads
// the slice's median coordinate along the level's axis from `track` and
// writes the packed node into the implicit-heap array `kdnode`.
// NOTE(review): no bounds guard on the thread index — the launch must
// supply exactly 2^level threads.
__global__
void kernel_code_level( uint2* kdnode,
                        const uint*  track,
                        uint   level,
                        uint   slice_size,
                        uint   level_size )
{
    // One thread per slice on this level.
    const uint elem = IM( blockDim.x, blockIdx.x ) + threadIdx.x;

    // Median element of this thread's slice, on the split axis' plane.
    const uint median_pos = IM( slice_size, elem ) + ( slice_size >> 1 );
    const uint axis       = level % 3;
    const uint split_key  = track[ IM( axis, level_size ) + median_pos ];

    // Nodes of level L start at offset 2^L - 1 in the heap layout.
    const uint node_idx = elem + ( ( 1u << level ) - 1 );

    uint2 node;
    node.x = axis | ( 1u << 31 ) | ( ( ( node_idx << 1 ) + 1 ) << 2 );
    node.y = split_key;

    kdnode[ node_idx ] = node;
}
293
294 //当每个slice中的(由于我们采用的划分策略,所以保证每层中的所有slice的尺寸都是相同的)
295 //元素数量小于等于最大block尺寸减去一个小于等于最大block尺寸的一半的一个数量后(其实
296 //并不一定需要如前面所说的那样必须在小于或等于最大block尺寸的一半时,这样说只是为了简单
297 //至于应该在什么时候确定剩下的层构建可以在一次内核调用中单独完成,可以像前面计算最大存储空间
298 //的扩展长度那样确定,对slice的分裂和编码操作可以在一个内核中单独完成,而不用在主机端的循环中计算。
299
// Shift table consumed by kernel_code_last_levels: at inner iteration i,
// threadIdx.x >> c_shift[i] appears to yield the thread's local
// sub-slice id (sub-slice widths halve each level). The leading 9 entry
// is only needed when blocks can be 1024 threads wide.
__constant__ uint c_shift[]={
#if MAX_CTA_DIM==1024
9,
#endif
8, 7, 6, 5 };
305
// Builds the remaining bottom levels of the kd-tree entirely in shared
// memory once a whole slice fits in one CTA: each iteration sorts every
// sub-slice along the cycling axis (bitnotic_sort — external helper,
// presumably a bitonic sort), emits the inner node at its median, and
// halves the sub-slices until they reach warp width; the leftovers are
// then written out as leaf nodes plus the reordered point indices.
//
// NOTE(review): temp_i reads s_index[idx] even when `active` is false —
// confirm idx stays within the MAX_CTA_DIM shared array in that case.
// Also `active_size=((active_size+1)>>1)+(active_size&1u)` adds the odd
// bit on top of ceil(active_size/2); verify this matches the median
// duplication rule used by kernel_divide.
__global__
void kernel_code_last_levels( uint2* kdnode,
                              uint*  index,
                              const uint*  track,
                              uint   start_level,
                              uint   slice_size )
{
    __shared__ uint  s_index[ MAX_CTA_DIM ];       // point indices, reordered per level
    __shared__ float s_coord[ 3 ][ MAX_CTA_DIM ];  // x/y/z coordinates, bit-cast from track

    uint idx=IM( slice_size, blockIdx.x )+threadIdx.x;
    uint active=threadIdx.x<slice_size;
    uint level=start_level;
    uint stride=IM( 1u<<level, slice_size );       // distance between track's coordinate planes

    if( active ){
        s_index[ threadIdx.x ]=index[ idx ];
    }
    // Inactive lanes load sentinel coordinates so they sort to the end.
    s_coord[ 0 ][ threadIdx.x ]=active?__uint_as_float( track[ idx ] ):MAX_IFLOAT; idx+=stride;
    s_coord[ 1 ][ threadIdx.x ]=active?__uint_as_float( track[ idx ] ):MAX_IFLOAT; idx+=stride;
    s_coord[ 2 ][ threadIdx.x ]=active?__uint_as_float( track[ idx ] ):MAX_IFLOAT;

    uint const tidx=IM( blockDim.x, blockIdx.x )+threadIdx.x;
    uint active_size=slice_size;       // valid elements per current sub-slice
    uint fiber_id=threadIdx.x;         // thread's rank within its sub-slice
    uint iBit=0;                       // index into c_shift for the current layout
    uint layout=blockDim.x;            // threads per sub-slice (halves each level)
    uint slice_local_id=0;             // sub-slice id within the CTA
    stride=( 1u<<level )-1;            // heap offset of the first node of `level`

    while( layout>32 ){ __syncthreads();

        uint const axis=level%3;
        uint const addr=IM( slice_local_id, layout );   // sub-slice base in shared memory
        bitnotic_sort( s_index, s_coord, addr, fiber_id, layout, axis );

        // Thread 0 of each sub-slice encodes the inner node at the median.
        if( fiber_id==0 ){
            uint2 node;
            uint const hloc=addr+( active_size>>1 );
            uint const iOut=stride+( tidx>>c_shift[ iBit ] );
            node.x=axis|( 1u<<31 )|( ( ( ( iOut<<1 )+1 )<<2 )&0x7ffffffcU );
            node.y=__float_as_uint( s_coord[ axis ][ hloc ] );
            kdnode[ iOut ]=node;
        }

        // Descend one level: each sub-slice splits into two halves, the
        // right half offset so it re-includes the median (cf. kernel_divide).
        layout>>=1; ++iBit;
        slice_local_id=threadIdx.x>>c_shift[ iBit ];
        uint const b=slice_local_id&1u;
        uint const e=active_size&1u;
        active_size=( ( active_size+1 )>>1 )+( active_size&1u );
        fiber_id=threadIdx.x&( layout-1 );
        idx=addr+IM( slice_local_id&1u, active_size )+fiber_id-IM( b, e );
        active=fiber_id<active_size;

        // Gather into registers, then barrier before overwriting shared mem.
        uint  const temp_i=s_index[ idx ];
        float const temp_x=active?s_coord[ 0 ][ idx ]:MAX_IFLOAT;
        float const temp_y=active?s_coord[ 1 ][ idx ]:MAX_IFLOAT;
        float const temp_z=active?s_coord[ 2 ][ idx ]:MAX_IFLOAT;
        __syncthreads();

        s_index       [ threadIdx.x ]=temp_i;
        s_coord[ 0 ][ threadIdx.x ]=temp_x;
        s_coord[ 1 ][ threadIdx.x ]=temp_y;
        s_coord[ 2 ][ threadIdx.x ]=temp_z;

        stride+=( 1<<level );   // advance to the next level's node range
        ++level;

    } __syncthreads();

    // Emit leaves: each remaining sub-slice becomes one leaf node plus a
    // contiguous run of point indices written back to `index`.
    if( fiber_id<active_size ){
        const uint slice_id=tidx>>c_shift[ iBit ];

        if( fiber_id==0 ){
            uint2 leaf;
            // NOTE(review): the layout comment earlier in this file says the
            // element count occupies bits 31:1 with bit 0 as the leaf flag;
            // here active_size&0x7fffffff keeps the count in the low bits —
            // confirm the decoder agrees with this packing.
            leaf.x=active_size&0x7fffffffU;
            leaf.y=IM( slice_id, active_size );
            kdnode[ stride+slice_id ]=leaf;
        }
        const uint iOut=IM( slice_id, active_size )+fiber_id;
        index[ iOut ]=s_index[ threadIdx.x ];
    }
}
389 }
0
相关文章