最后贴上内核源代码:
1 CODE:
// Block-wide prefix-sum helper operating on a shared-memory array holding one
// uint per thread. Strategy: Kogge-Stone scan inside each warp, a second scan
// over the per-warp totals, then each thread adds the total of the preceding
// warps. Preconditions: blockDim.x is a multiple of 32, blockDim.x/32 <= 32,
// smem has at least blockDim.x entries, and ALL threads of the block call
// every method (the methods contain __syncthreads()).
class prefixOperator
{
public:
    // Cache this thread's lane index and warp index.
    inline __device__ void init()
    {
        lane=threadIdx.x&31u;
        warpid=threadIdx.x>>5;
    }

    // In-place inclusive scan; returns this thread's inclusive prefix sum.
    inline __device__ uint inclusive( uint* smem ) const
    {
        uint pref=warpOp( smem ); __syncthreads();

        // Lane 31 publishes its warp's total into slot [warpid].
        if( lane==31 ){
            smem[ warpid ]=pref;
        } __syncthreads();

        // Scan the per-warp totals held in smem[0..numWarps-1].
        blockOp( smem );

        // Add the combined total of all preceding warps.
        if( warpid>0 ){
            pref+=smem[ warpid-1 ];
        } __syncthreads();

        return pref;
    }

    // In-place exclusive scan: on return smem[i] holds the sum of all
    // elements before i (smem[0]==0).
    inline __device__ void exclusive( uint* smem )
    {
        uint pref=warpOp( smem ); __syncthreads();

        if( lane==31 ){
            smem[ warpid ]=pref;
        } __syncthreads();

        blockOp( smem );

        if( warpid>0 ){
            pref+=smem[ warpid-1 ];
        } __syncthreads();

        // Shift the inclusive result right by one slot to make it exclusive.
        ( threadIdx.x<( blockDim.x-1 ) )?( smem[ threadIdx.x+1 ]=pref ):( smem[ 0 ]=0 );
        __syncthreads();
    }

private:
    // Inclusive Kogge-Stone scan of this warp's 32-entry segment of smem.
    // FIX: the original unrolled version updated smem with no barrier between
    // reading smem[i-n] and writing smem[i], relying on implicit warp-lockstep
    // execution. That is a data race and is not valid under the independent
    // thread scheduling of Volta and later GPUs. Read / barrier / write /
    // barrier makes every step race free on all architectures. (All threads
    // reach the barriers: the divergent work is inside the ifs, not around
    // the __syncthreads().)
    inline __device__ uint warpOp( uint* smem ) const
    {
        for( uint n=1; n<32u; n<<=1 ){
            const uint v=( lane>=n )?smem[ threadIdx.x-n ]:0u;
            __syncthreads();
            if( lane>=n ){ smem[ threadIdx.x ]+=v; }
            __syncthreads();
        }
        return smem[ threadIdx.x ];
    }

    // Inclusive scan of the per-warp totals stored in smem[0..numWarps-1].
    // FIX: the original loop had the same missing-barrier race as warpOp and
    // additionally let every thread of the block update its own smem slot,
    // corrupting entries past the warp-total region (those values were never
    // consumed afterwards, but the unsynchronized writes raced with reads of
    // slots that were). Restricting the update to the first blockDim.x/32
    // slots and inserting barriers removes both problems without changing any
    // consumed value.
    inline __device__ void blockOp( uint* smem ) const
    {
        const uint nwarp=blockDim.x>>5;
        for( uint n=1; n<nwarp; n<<=1 ){
            const bool doit=( threadIdx.x>=n )&&( threadIdx.x<nwarp );
            const uint v=doit?smem[ threadIdx.x-n ]:0u;
            __syncthreads();
            if( doit ){ smem[ threadIdx.x ]+=v; }
            __syncthreads();
        }
        __syncthreads();   // matches the original post-loop barrier
    }

    uint lane;    // threadIdx.x % 32
    uint warpid;  // threadIdx.x / 32
};
68
69 extern "C"
70 {
71
72 //从这个内核到kernel_scatter是排序用的内核,采用基数排序
// Multi-block inclusive scan, pass 1: each block scans its MAX_CTA_DIM-wide
// chunk of `list` into `prefix` and stores its chunk total in `block_suffix`
// for the later correction pass. Grid: x = chunks per slice, y = slice index.
__global__
void kernel_scan_blocks( uint* prefix, uint* block_suffix, const uint* list, uint slice_size )
{
    __shared__ uint smem[ MAX_CTA_DIM ];

    const uint lidx=IM( MAX_CTA_DIM, blockIdx.x )+threadIdx.x;   // position within the slice
    const uint inRange=lidx<slice_size;
    const uint gidx=IM( slice_size, blockIdx.y )+lidx;           // position within the level

    // Tail threads past the slice end contribute a neutral 0.
    smem[ threadIdx.x ]=inRange?list[ gidx ]:0;

    prefixOperator op;
    op.init();

    const uint sum=op.inclusive( smem );

    if( inRange ){
        prefix[ gidx ]=sum;
    }

    // The last thread holds the block total.
    if( threadIdx.x==( blockDim.x-1 ) ){
        block_suffix[ IM( gridDim.x, blockIdx.y )+blockIdx.x ]=sum;
    }
}
97
// Single-block inclusive scan for slices that fit in one block.
// Dynamic shared memory: blockDim.x uints.
__global__
void kernel_scan_block( uint* prefix, const uint* list, uint slice_size )
{
    extern __shared__ uint smem[];

    const uint inRange=threadIdx.x<slice_size;
    const uint gidx=IM( slice_size, blockIdx.y )+threadIdx.x;

    // Pad the tail with zeros so the scan result is unaffected.
    smem[ threadIdx.x ]=inRange?list[ gidx ]:0;

    prefixOperator op;
    op.init();

    const uint sum=op.inclusive( smem );

    if( inRange ){
        prefix[ gidx ]=sum;
    }
}
117
// Multi-block scan, pass 2: adds the scanned total of all preceding blocks
// (read from block_suffix) to each element's partial prefix.
__global__
void kernel_correct( uint* prefix, const uint* block_suffix, uint slice_size )
{
    __shared__ uint s_pref;

    // One thread fetches the preceding blocks' total; block 0 adds nothing.
    if( threadIdx.x==0 ){
        s_pref=( blockIdx.x>0 )?block_suffix[ IM( gridDim.x, blockIdx.y )+blockIdx.x-1 ]:0;
    } __syncthreads();

    const uint lidx=IM( MAX_CTA_DIM, blockIdx.x )+threadIdx.x;

    if( lidx<slice_size ){
        prefix[ IM( slice_size, blockIdx.y )+lidx ]+=s_pref;
    }
}
133
// Radix-sort split step: within each block, partitions the elements of the
// current slice by one key bit (`shift`), compacting 0-bit elements to the
// front, and records the per-block count of 0-bit elements in `split`.
// Grid: x = blocks covering one slice, y = slice index within the level.
__global__
void kernel_split( uint* slice,
                   uint* track,
                   uint* split,
                   uint level,
                   uint shift,
                   uint slice_size )
{
    __shared__ uint s_split;                 // count of 0-bit keys in this block
    __shared__ uint s_mask[ MAX_CTA_DIM ];   // per-thread flag, then its exclusive scan

    uint const base=IM( MAX_CTA_DIM, blockIdx.x );       // first slice element of this block
    uint const tidx=base+threadIdx.x;
    uint const active=tidx<slice_size;
    uint const area=IM( slice_size, blockIdx.y );        // offset of this slice in the level
    uint const gidx=area+tidx;
    uint const d=slice_size-base;                        // elements left from base to slice end
    uint const mask=( d>=blockDim.x )?~0u:0u;
    uint const last=( mask&blockDim.x )+( ( ~mask )&d ); // = min(blockDim.x, d), branch-free
    uint const axis=level%3;                             // split axis cycles per tree level
    uint const level_size=IM( 1u<<level, slice_size );   // elements per coordinate plane
    uint stride=IM( axis, level_size );                  // offset of this axis' plane in track

    // Inactive threads take key ~0u so their bit is 1 and they sort to the back.
    uint const k=active?track[ gidx+stride ]:~0u;
    uint const b=bit( k, shift );
    s_mask[ threadIdx.x ]=1u^b;                          // flag = 1 for a 0 bit

    prefixOperator scanner;
    scanner.init();

    scanner.exclusive( s_mask );                         // s_mask[i] = # of 0-bit keys before i

    // The last valid thread knows the block's total 0-bit count.
    if( threadIdx.x==( last-1 ) ){
        s_split=s_mask[ threadIdx.x ]+( 1u^b );
    } __syncthreads();

    if( active ){

        // 1-bit keys go after the 0-bit group (rank among 1-bits + s_split);
        // 0-bit keys take their exclusive-scan rank.
        uint idx=threadIdx.x-s_mask[ threadIdx.x ]+s_split;
        idx=b?idx:s_mask[ threadIdx.x ];
        idx+=( base+area );

        // Move the element id and all three coordinate planes of its track
        // entry to the partitioned position.
        // NOTE(review): this scatters in place — a thread writes slice/track
        // at idx while other threads of the same block still read gidx
        // positions in the same range, with no staging or barrier; looks like
        // a read/write race unless the host guarantees otherwise — verify.
        slice[ idx+stride ]=slice[ gidx+stride ];
        track[ idx+stride ]=k;
        stride=IM( ( axis+1 )%3, level_size );
        track[ idx+stride ]=track[ gidx+stride ];
        stride=IM( ( axis+2 )%3, level_size );
        track[ idx+stride ]=track[ gidx+stride ];
    }

    if( threadIdx.x==0 ){
        split[ IM( gridDim.x, blockIdx.y )+blockIdx.x ]=s_split;
    }
}
188
// Radix-sort global scatter: combines each block's 0-bit count (local_split)
// with the scanned counts across blocks (global_split) to compute every
// element's final position for this bit, then copies the element id and all
// three coordinate planes from source to target buffers.
__global__
void kernel_scatter( uint* target_slice,
                     uint* target_track,
                     const uint* source_slice,
                     const uint* source_track,
                     const uint* local_split,
                     const uint* global_split,
                     uint level,
                     uint slice_size )
{
    __shared__ uint s_local_split;   // # of 0-bit keys in this block
    __shared__ uint s_global_split;  // inclusive scan of 0-bit counts through this block
    __shared__ uint s_start;         // exclusive scan: start of this block's 0-bit group
    __shared__ uint s_prefix;        // total 0-bit keys in the slice (start of 1-bit region)
    __shared__ uint s_level_size;    // elements per coordinate plane

    if( threadIdx.x==0 ){
        const uint ctaid=IM( gridDim.x, blockIdx.y )+blockIdx.x;

        s_local_split=local_split[ ctaid ];
        s_global_split=global_split[ ctaid ];
        s_start=( blockIdx.x>0 )?global_split[ ctaid-1 ]:0;
        s_prefix=global_split[ IM( gridDim.x, blockIdx.y+1 )-1 ];
        s_level_size=IM( 1u<<level, slice_size );

    } __syncthreads();

    uint iidx=IM( blockDim.x, blockIdx.x )+threadIdx.x;

    // NOTE(review): the source index carries no IM( slice_size, blockIdx.y )
    // term while the target index does; that is only consistent if the host
    // offsets the source pointers per slice or launches with gridDim.y==1 —
    // verify against the host code.
    if( iidx<slice_size ){
        // The first s_local_split elements of the block (0-bit keys, already
        // compacted by kernel_split) go to the scanned start; 1-bit keys keep
        // their relative order after all 0-bit keys of the slice.
        uint oidx=( threadIdx.x<s_local_split )?( s_start+threadIdx.x ):( s_prefix+iidx-s_global_split );
        oidx+=IM( slice_size, blockIdx.y );

        target_slice[ oidx ]=source_slice[ iidx ];
        target_track[ oidx ]=source_track[ iidx ]; oidx+=s_level_size; iidx+=s_level_size;
        target_track[ oidx ]=source_track[ iidx ]; oidx+=s_level_size; iidx+=s_level_size;
        target_track[ oidx ]=source_track[ iidx ];
    }
}
228
229 //分裂slice
230
// Splits every source slice into two child slices (the median element is
// shared by both children), copying element ids and the three coordinate
// planes of the track array.
__global__ void kernel_divide( uint* target_slice,
                               uint* target_track,
                               const uint* source_slice,
                               const uint* source_track,
                               uint target_slice_size,
                               uint target_level_size,
                               uint source_slice_size,
                               uint source_level_size )
{
    uint out=IM( blockDim.x, blockIdx.x )+threadIdx.x;

    if( out>=target_level_size ){
        return;
    }

    const uint child=out/target_slice_size;    // child-slice index in the target level
    const uint upper=child&1u;                 // 1 = upper half of its parent slice
    const uint offset=out%target_slice_size;   // position within the child slice
    // Upper children start one before the parent's upper half (the -upper
    // term), so the parent's median element lands in both children.
    uint in=IM( source_slice_size, child>>1 )+IM( upper, ( source_slice_size+1 )>>1 )+offset-upper;

    target_slice[ out ]=source_slice[ in ];

    // Copy the three coordinate planes of the track array.
    target_track[ out ]=source_track[ in ];
    for( uint plane=0; plane<2; ++plane ){
        out+=target_level_size; in+=source_level_size;
        target_track[ out ]=source_track[ in ];
    }
}
256
257 //编码当前层的节点,
258 //2个32位的元素
259 //对于第一个元素:
260 //kdinnerNode :
261 //最低位表示是否是叶节点
262 //第2:1位表示该层的split plane(分割面)的维度(0,1,或者2)
263 //第31:3的29个位表示节点在索引数组中的偏移
264 //第二个元素是分割面在其所属层的维度上的坐标
265 //kdleafNode:
266 //对于第一个元素:
267 //最低位表示是否是叶节点
268 //31:1位存储了该节点中包含的元素数量信息
269 //第二个元素是元素的索引
270
// Encodes every inner node of one tree level: each node stores the split axis
// (low bits), a flag in bit 31, a child offset, and the median element's
// coordinate on the split axis.
// NOTE(review): there is no guard against tidx running past this level's node
// count — assumes the launch covers it exactly; verify against the host code.
__global__
void kernel_code_level( uint2* kdnode,
                        const uint* track,
                        uint level,
                        uint slice_size,
                        uint level_size )
{
    uint tidx=IM( blockDim.x, blockIdx.x )+threadIdx.x;
    uint const hPos=IM( slice_size, tidx )+( slice_size>>1 );   // median of this node's slice
    uint const axis=level%3;                                    // split axis cycles per level
    uint const idx=IM( axis, level_size )+hPos;                 // median coord on that axis
    uint const split=track[ idx ];

    // Node slot in the implicit binary-heap layout: 2^level - 1 nodes precede
    // this level.
    tidx+=( ( 1u<<level )-1 );

    uint2 node;
    node.y=split;
    node.x=axis|( 1u<<31 )|( ( ( tidx<<1 )+1 )<<2 );

    kdnode[ tidx ]=node;
}
293
294 //当每个slice中的(由于我们采用的划分策略,所以保证每层中的所有slice的尺寸都是相同的)
295 //元素数量小于等于最大block尺寸减去一个小于等于最大block尺寸的一半的一个数量后(其实
296 //并不一定需要如前面所说的那样必须在小于或等于最大block尺寸的一半时,这样说只是为了简单
297 //至于应该在什么时候确定剩下的层构建可以在一次内核调用中单独完成,可以像前面计算最大存储空间
298 //的扩展长度那样确定,对slice的分裂和编码操作可以在一个内核中单独完成,而不用在主机端的循环中计算。
299
// Shift amounts (log2 of the sub-slice width) indexed by split iteration in
// kernel_code_last_levels: tidx>>c_shift[iBit] yields the sub-slice id as the
// layout halves down toward 32.
// NOTE(review): for MAX_CTA_DIM==1024 the first entry is 9, not 10 — confirm
// this matches the initial layout the host launches with.
__constant__ uint c_shift[]={
#if MAX_CTA_DIM==1024
9,
#endif
8, 7, 6, 5 };
305
// Builds the remaining lower tree levels entirely inside one block once a
// slice fits into shared memory: each iteration sorts every sub-slice along
// the cycling axis, emits an inner node at the sub-slice median, then halves
// the sub-slices (the median element shared between both halves). Finally the
// leaves are encoded and the permuted element indices written back.
// Shared memory: MAX_CTA_DIM indices plus 3*MAX_CTA_DIM coordinates.
__global__
void kernel_code_last_levels( uint2* kdnode,
                              uint* index,
                              const uint* track,
                              uint start_level,
                              uint slice_size )
{
    __shared__ uint s_index[ MAX_CTA_DIM ];          // element indices, permuted in place
    __shared__ float s_coord[ 3 ][ MAX_CTA_DIM ];    // per-element x/y/z coordinates

    // Load this block's slice; inactive tail threads get MAX_IFLOAT padding
    // so they sort to the end.
    uint idx=IM( slice_size, blockIdx.x )+threadIdx.x;
    uint active=threadIdx.x<slice_size;
    uint level=start_level;
    uint stride=IM( 1u<<level, slice_size );         // distance between track's coordinate planes

    if( active ){
        s_index[ threadIdx.x ]=index[ idx ];
    }
    s_coord[ 0 ][ threadIdx.x ]=active?__uint_as_float( track[ idx ] ):MAX_IFLOAT; idx+=stride;
    s_coord[ 1 ][ threadIdx.x ]=active?__uint_as_float( track[ idx ] ):MAX_IFLOAT; idx+=stride;
    s_coord[ 2 ][ threadIdx.x ]=active?__uint_as_float( track[ idx ] ):MAX_IFLOAT;

    uint const tidx=IM( blockDim.x, blockIdx.x )+threadIdx.x;
    uint active_size=slice_size;       // valid elements per sub-slice
    uint fiber_id=threadIdx.x;         // this thread's rank within its sub-slice
    uint iBit=0;                       // index into c_shift for the current layout
    uint layout=blockDim.x;            // threads per sub-slice
    uint slice_local_id=0;             // sub-slice id within the block
    stride=( 1u<<level )-1;            // heap offset of the first node of this level

    // Stop splitting once a sub-slice fits in a single warp.
    while( layout>32 ){ __syncthreads();

        uint const axis=level%3;
        uint const addr=IM( slice_local_id, layout );   // base of this sub-slice in shared mem
        bitnotic_sort( s_index, s_coord, addr, fiber_id, layout, axis );

        // One thread per sub-slice emits the inner node at the median.
        if( fiber_id==0 ){
            uint2 node;
            uint const hloc=addr+( active_size>>1 );
            uint const iOut=stride+( tidx>>c_shift[ iBit ] );
            node.x=axis|( 1u<<31 )|( ( ( ( iOut<<1 )+1 )<<2 )&0x7ffffffcU );
            node.y=__float_as_uint( s_coord[ axis ][ hloc ] );
            kdnode[ iOut ]=node;
        }

        // Halve the layout and compute this thread's source element inside
        // the compacted child slices (the upper child starts at the parent's
        // median, hence the -IM( b, e ) correction).
        layout>>=1; ++iBit;
        slice_local_id=threadIdx.x>>c_shift[ iBit ];
        uint const b=slice_local_id&1u;
        uint const e=active_size&1u;
        // NOTE(review): new size = ceil(old/2) + (old&1) — the extra parity
        // term looks intentional for the shared-median layout, but confirm it
        // against the host-side size bookkeeping.
        active_size=( ( active_size+1 )>>1 )+( active_size&1u );
        fiber_id=threadIdx.x&( layout-1 );
        idx=addr+IM( slice_local_id&1u, active_size )+fiber_id-IM( b, e );
        active=fiber_id<active_size;

        // Stage the regathered element in registers, then publish after a
        // barrier so reads and writes of the shared arrays never overlap.
        uint const temp_i=s_index[ idx ];
        float const temp_x=active?s_coord[ 0 ][ idx ]:MAX_IFLOAT;
        float const temp_y=active?s_coord[ 1 ][ idx ]:MAX_IFLOAT;
        float const temp_z=active?s_coord[ 2 ][ idx ]:MAX_IFLOAT;
        __syncthreads();

        s_index [ threadIdx.x ]=temp_i;
        s_coord[ 0 ][ threadIdx.x ]=temp_x;
        s_coord[ 1 ][ threadIdx.x ]=temp_y;
        s_coord[ 2 ][ threadIdx.x ]=temp_z;

        stride+=( 1<<level );   // advance to the next level's heap offset
        ++level;

    } __syncthreads();

    // Emit leaves and write the permuted indices back to global memory.
    // NOTE(review): leaves are distinguished from inner nodes by bit 31 being
    // clear (inner nodes set it above); this differs from the prose comment
    // earlier in the file, which places the leaf flag in bit 0 — verify the
    // traversal code's expectation.
    if( fiber_id<active_size ){
        const uint slice_id=tidx>>c_shift[ iBit ];

        if( fiber_id==0 ){
            uint2 leaf;
            leaf.x=active_size&0x7fffffffU;
            leaf.y=IM( slice_id, active_size );
            kdnode[ stride+slice_id ]=leaf;
        }
        const uint iOut=IM( slice_id, active_size )+fiber_id;
        index[ iOut ]=s_index[ threadIdx.x ];
    }
}
389 }
// Block-wide prefix-sum helper operating on a shared-memory array holding one
// uint per thread. Strategy: Kogge-Stone scan inside each warp, a second scan
// over the per-warp totals, then each thread adds the total of the preceding
// warps. Preconditions: blockDim.x is a multiple of 32, blockDim.x/32 <= 32,
// smem has at least blockDim.x entries, and ALL threads of the block call
// every method (the methods contain __syncthreads()).
class prefixOperator
{
public:
    // Cache this thread's lane index and warp index.
    inline __device__ void init()
    {
        lane=threadIdx.x&31u;
        warpid=threadIdx.x>>5;
    }

    // In-place inclusive scan; returns this thread's inclusive prefix sum.
    inline __device__ uint inclusive( uint* smem ) const
    {
        uint pref=warpOp( smem ); __syncthreads();

        // Lane 31 publishes its warp's total into slot [warpid].
        if( lane==31 ){
            smem[ warpid ]=pref;
        } __syncthreads();

        // Scan the per-warp totals held in smem[0..numWarps-1].
        blockOp( smem );

        // Add the combined total of all preceding warps.
        if( warpid>0 ){
            pref+=smem[ warpid-1 ];
        } __syncthreads();

        return pref;
    }

    // In-place exclusive scan: on return smem[i] holds the sum of all
    // elements before i (smem[0]==0).
    inline __device__ void exclusive( uint* smem )
    {
        uint pref=warpOp( smem ); __syncthreads();

        if( lane==31 ){
            smem[ warpid ]=pref;
        } __syncthreads();

        blockOp( smem );

        if( warpid>0 ){
            pref+=smem[ warpid-1 ];
        } __syncthreads();

        // Shift the inclusive result right by one slot to make it exclusive.
        ( threadIdx.x<( blockDim.x-1 ) )?( smem[ threadIdx.x+1 ]=pref ):( smem[ 0 ]=0 );
        __syncthreads();
    }

private:
    // Inclusive Kogge-Stone scan of this warp's 32-entry segment of smem.
    // FIX: the original unrolled version updated smem with no barrier between
    // reading smem[i-n] and writing smem[i], relying on implicit warp-lockstep
    // execution. That is a data race and is not valid under the independent
    // thread scheduling of Volta and later GPUs. Read / barrier / write /
    // barrier makes every step race free on all architectures. (All threads
    // reach the barriers: the divergent work is inside the ifs, not around
    // the __syncthreads().)
    inline __device__ uint warpOp( uint* smem ) const
    {
        for( uint n=1; n<32u; n<<=1 ){
            const uint v=( lane>=n )?smem[ threadIdx.x-n ]:0u;
            __syncthreads();
            if( lane>=n ){ smem[ threadIdx.x ]+=v; }
            __syncthreads();
        }
        return smem[ threadIdx.x ];
    }

    // Inclusive scan of the per-warp totals stored in smem[0..numWarps-1].
    // FIX: the original loop had the same missing-barrier race as warpOp and
    // additionally let every thread of the block update its own smem slot,
    // corrupting entries past the warp-total region (those values were never
    // consumed afterwards, but the unsynchronized writes raced with reads of
    // slots that were). Restricting the update to the first blockDim.x/32
    // slots and inserting barriers removes both problems without changing any
    // consumed value.
    inline __device__ void blockOp( uint* smem ) const
    {
        const uint nwarp=blockDim.x>>5;
        for( uint n=1; n<nwarp; n<<=1 ){
            const bool doit=( threadIdx.x>=n )&&( threadIdx.x<nwarp );
            const uint v=doit?smem[ threadIdx.x-n ]:0u;
            __syncthreads();
            if( doit ){ smem[ threadIdx.x ]+=v; }
            __syncthreads();
        }
        __syncthreads();   // matches the original post-loop barrier
    }

    uint lane;    // threadIdx.x % 32
    uint warpid;  // threadIdx.x / 32
};
68
69 extern "C"
70 {
71
72 //从这个内核到kernel_scatter是排序用的内核,采用基数排序
// Multi-block inclusive scan, pass 1: each block scans its MAX_CTA_DIM-wide
// chunk of `list` into `prefix` and stores its chunk total in `block_suffix`
// for the later correction pass. Grid: x = chunks per slice, y = slice index.
__global__
void kernel_scan_blocks( uint* prefix, uint* block_suffix, const uint* list, uint slice_size )
{
    __shared__ uint smem[ MAX_CTA_DIM ];

    const uint lidx=IM( MAX_CTA_DIM, blockIdx.x )+threadIdx.x;   // position within the slice
    const uint inRange=lidx<slice_size;
    const uint gidx=IM( slice_size, blockIdx.y )+lidx;           // position within the level

    // Tail threads past the slice end contribute a neutral 0.
    smem[ threadIdx.x ]=inRange?list[ gidx ]:0;

    prefixOperator op;
    op.init();

    const uint sum=op.inclusive( smem );

    if( inRange ){
        prefix[ gidx ]=sum;
    }

    // The last thread holds the block total.
    if( threadIdx.x==( blockDim.x-1 ) ){
        block_suffix[ IM( gridDim.x, blockIdx.y )+blockIdx.x ]=sum;
    }
}
97
// Single-block inclusive scan for slices that fit in one block.
// Dynamic shared memory: blockDim.x uints.
__global__
void kernel_scan_block( uint* prefix, const uint* list, uint slice_size )
{
    extern __shared__ uint smem[];

    const uint inRange=threadIdx.x<slice_size;
    const uint gidx=IM( slice_size, blockIdx.y )+threadIdx.x;

    // Pad the tail with zeros so the scan result is unaffected.
    smem[ threadIdx.x ]=inRange?list[ gidx ]:0;

    prefixOperator op;
    op.init();

    const uint sum=op.inclusive( smem );

    if( inRange ){
        prefix[ gidx ]=sum;
    }
}
117
// Multi-block scan, pass 2: adds the scanned total of all preceding blocks
// (read from block_suffix) to each element's partial prefix.
__global__
void kernel_correct( uint* prefix, const uint* block_suffix, uint slice_size )
{
    __shared__ uint s_pref;

    // One thread fetches the preceding blocks' total; block 0 adds nothing.
    if( threadIdx.x==0 ){
        s_pref=( blockIdx.x>0 )?block_suffix[ IM( gridDim.x, blockIdx.y )+blockIdx.x-1 ]:0;
    } __syncthreads();

    const uint lidx=IM( MAX_CTA_DIM, blockIdx.x )+threadIdx.x;

    if( lidx<slice_size ){
        prefix[ IM( slice_size, blockIdx.y )+lidx ]+=s_pref;
    }
}
133
// Radix-sort split step: within each block, partitions the elements of the
// current slice by one key bit (`shift`), compacting 0-bit elements to the
// front, and records the per-block count of 0-bit elements in `split`.
// Grid: x = blocks covering one slice, y = slice index within the level.
__global__
void kernel_split( uint* slice,
                   uint* track,
                   uint* split,
                   uint level,
                   uint shift,
                   uint slice_size )
{
    __shared__ uint s_split;                 // count of 0-bit keys in this block
    __shared__ uint s_mask[ MAX_CTA_DIM ];   // per-thread flag, then its exclusive scan

    uint const base=IM( MAX_CTA_DIM, blockIdx.x );       // first slice element of this block
    uint const tidx=base+threadIdx.x;
    uint const active=tidx<slice_size;
    uint const area=IM( slice_size, blockIdx.y );        // offset of this slice in the level
    uint const gidx=area+tidx;
    uint const d=slice_size-base;                        // elements left from base to slice end
    uint const mask=( d>=blockDim.x )?~0u:0u;
    uint const last=( mask&blockDim.x )+( ( ~mask )&d ); // = min(blockDim.x, d), branch-free
    uint const axis=level%3;                             // split axis cycles per tree level
    uint const level_size=IM( 1u<<level, slice_size );   // elements per coordinate plane
    uint stride=IM( axis, level_size );                  // offset of this axis' plane in track

    // Inactive threads take key ~0u so their bit is 1 and they sort to the back.
    uint const k=active?track[ gidx+stride ]:~0u;
    uint const b=bit( k, shift );
    s_mask[ threadIdx.x ]=1u^b;                          // flag = 1 for a 0 bit

    prefixOperator scanner;
    scanner.init();

    scanner.exclusive( s_mask );                         // s_mask[i] = # of 0-bit keys before i

    // The last valid thread knows the block's total 0-bit count.
    if( threadIdx.x==( last-1 ) ){
        s_split=s_mask[ threadIdx.x ]+( 1u^b );
    } __syncthreads();

    if( active ){

        // 1-bit keys go after the 0-bit group (rank among 1-bits + s_split);
        // 0-bit keys take their exclusive-scan rank.
        uint idx=threadIdx.x-s_mask[ threadIdx.x ]+s_split;
        idx=b?idx:s_mask[ threadIdx.x ];
        idx+=( base+area );

        // Move the element id and all three coordinate planes of its track
        // entry to the partitioned position.
        // NOTE(review): this scatters in place — a thread writes slice/track
        // at idx while other threads of the same block still read gidx
        // positions in the same range, with no staging or barrier; looks like
        // a read/write race unless the host guarantees otherwise — verify.
        slice[ idx+stride ]=slice[ gidx+stride ];
        track[ idx+stride ]=k;
        stride=IM( ( axis+1 )%3, level_size );
        track[ idx+stride ]=track[ gidx+stride ];
        stride=IM( ( axis+2 )%3, level_size );
        track[ idx+stride ]=track[ gidx+stride ];
    }

    if( threadIdx.x==0 ){
        split[ IM( gridDim.x, blockIdx.y )+blockIdx.x ]=s_split;
    }
}
188
// Radix-sort global scatter: combines each block's 0-bit count (local_split)
// with the scanned counts across blocks (global_split) to compute every
// element's final position for this bit, then copies the element id and all
// three coordinate planes from source to target buffers.
__global__
void kernel_scatter( uint* target_slice,
                     uint* target_track,
                     const uint* source_slice,
                     const uint* source_track,
                     const uint* local_split,
                     const uint* global_split,
                     uint level,
                     uint slice_size )
{
    __shared__ uint s_local_split;   // # of 0-bit keys in this block
    __shared__ uint s_global_split;  // inclusive scan of 0-bit counts through this block
    __shared__ uint s_start;         // exclusive scan: start of this block's 0-bit group
    __shared__ uint s_prefix;        // total 0-bit keys in the slice (start of 1-bit region)
    __shared__ uint s_level_size;    // elements per coordinate plane

    if( threadIdx.x==0 ){
        const uint ctaid=IM( gridDim.x, blockIdx.y )+blockIdx.x;

        s_local_split=local_split[ ctaid ];
        s_global_split=global_split[ ctaid ];
        s_start=( blockIdx.x>0 )?global_split[ ctaid-1 ]:0;
        s_prefix=global_split[ IM( gridDim.x, blockIdx.y+1 )-1 ];
        s_level_size=IM( 1u<<level, slice_size );

    } __syncthreads();

    uint iidx=IM( blockDim.x, blockIdx.x )+threadIdx.x;

    // NOTE(review): the source index carries no IM( slice_size, blockIdx.y )
    // term while the target index does; that is only consistent if the host
    // offsets the source pointers per slice or launches with gridDim.y==1 —
    // verify against the host code.
    if( iidx<slice_size ){
        // The first s_local_split elements of the block (0-bit keys, already
        // compacted by kernel_split) go to the scanned start; 1-bit keys keep
        // their relative order after all 0-bit keys of the slice.
        uint oidx=( threadIdx.x<s_local_split )?( s_start+threadIdx.x ):( s_prefix+iidx-s_global_split );
        oidx+=IM( slice_size, blockIdx.y );

        target_slice[ oidx ]=source_slice[ iidx ];
        target_track[ oidx ]=source_track[ iidx ]; oidx+=s_level_size; iidx+=s_level_size;
        target_track[ oidx ]=source_track[ iidx ]; oidx+=s_level_size; iidx+=s_level_size;
        target_track[ oidx ]=source_track[ iidx ];
    }
}
228
229 //分裂slice
230
// Splits every source slice into two child slices (the median element is
// shared by both children), copying element ids and the three coordinate
// planes of the track array.
__global__ void kernel_divide( uint* target_slice,
                               uint* target_track,
                               const uint* source_slice,
                               const uint* source_track,
                               uint target_slice_size,
                               uint target_level_size,
                               uint source_slice_size,
                               uint source_level_size )
{
    uint out=IM( blockDim.x, blockIdx.x )+threadIdx.x;

    if( out>=target_level_size ){
        return;
    }

    const uint child=out/target_slice_size;    // child-slice index in the target level
    const uint upper=child&1u;                 // 1 = upper half of its parent slice
    const uint offset=out%target_slice_size;   // position within the child slice
    // Upper children start one before the parent's upper half (the -upper
    // term), so the parent's median element lands in both children.
    uint in=IM( source_slice_size, child>>1 )+IM( upper, ( source_slice_size+1 )>>1 )+offset-upper;

    target_slice[ out ]=source_slice[ in ];

    // Copy the three coordinate planes of the track array.
    target_track[ out ]=source_track[ in ];
    for( uint plane=0; plane<2; ++plane ){
        out+=target_level_size; in+=source_level_size;
        target_track[ out ]=source_track[ in ];
    }
}
256
257 //编码当前层的节点,
258 //2个32位的元素
259 //对于第一个元素:
260 //kdinnerNode :
261 //最低位表示是否是叶节点
262 //第2:1位表示该层的split plane(分割面)的维度(0,1,或者2)
263 //第31:3的29个位表示节点在索引数组中的偏移
264 //第二个元素是分割面在其所属层的维度上的坐标
265 //kdleafNode:
266 //对于第一个元素:
267 //最低位表示是否是叶节点
268 //31:1位存储了该节点中包含的元素数量信息
269 //第二个元素是元素的索引
270
// Encodes every inner node of one tree level: each node stores the split axis
// (low bits), a flag in bit 31, a child offset, and the median element's
// coordinate on the split axis.
// NOTE(review): there is no guard against tidx running past this level's node
// count — assumes the launch covers it exactly; verify against the host code.
__global__
void kernel_code_level( uint2* kdnode,
                        const uint* track,
                        uint level,
                        uint slice_size,
                        uint level_size )
{
    uint tidx=IM( blockDim.x, blockIdx.x )+threadIdx.x;
    uint const hPos=IM( slice_size, tidx )+( slice_size>>1 );   // median of this node's slice
    uint const axis=level%3;                                    // split axis cycles per level
    uint const idx=IM( axis, level_size )+hPos;                 // median coord on that axis
    uint const split=track[ idx ];

    // Node slot in the implicit binary-heap layout: 2^level - 1 nodes precede
    // this level.
    tidx+=( ( 1u<<level )-1 );

    uint2 node;
    node.y=split;
    node.x=axis|( 1u<<31 )|( ( ( tidx<<1 )+1 )<<2 );

    kdnode[ tidx ]=node;
}
293
294 //当每个slice中的(由于我们采用的划分策略,所以保证每层中的所有slice的尺寸都是相同的)
295 //元素数量小于等于最大block尺寸减去一个小于等于最大block尺寸的一半的一个数量后(其实
296 //并不一定需要如前面所说的那样必须在小于或等于最大block尺寸的一半时,这样说只是为了简单
297 //至于应该在什么时候确定剩下的层构建可以在一次内核调用中单独完成,可以像前面计算最大存储空间
298 //的扩展长度那样确定,对slice的分裂和编码操作可以在一个内核中单独完成,而不用在主机端的循环中计算。
299
// Shift amounts (log2 of the sub-slice width) indexed by split iteration in
// kernel_code_last_levels: tidx>>c_shift[iBit] yields the sub-slice id as the
// layout halves down toward 32.
// NOTE(review): for MAX_CTA_DIM==1024 the first entry is 9, not 10 — confirm
// this matches the initial layout the host launches with.
__constant__ uint c_shift[]={
#if MAX_CTA_DIM==1024
9,
#endif
8, 7, 6, 5 };
305
// Builds the remaining lower tree levels entirely inside one block once a
// slice fits into shared memory: each iteration sorts every sub-slice along
// the cycling axis, emits an inner node at the sub-slice median, then halves
// the sub-slices (the median element shared between both halves). Finally the
// leaves are encoded and the permuted element indices written back.
// Shared memory: MAX_CTA_DIM indices plus 3*MAX_CTA_DIM coordinates.
__global__
void kernel_code_last_levels( uint2* kdnode,
                              uint* index,
                              const uint* track,
                              uint start_level,
                              uint slice_size )
{
    __shared__ uint s_index[ MAX_CTA_DIM ];          // element indices, permuted in place
    __shared__ float s_coord[ 3 ][ MAX_CTA_DIM ];    // per-element x/y/z coordinates

    // Load this block's slice; inactive tail threads get MAX_IFLOAT padding
    // so they sort to the end.
    uint idx=IM( slice_size, blockIdx.x )+threadIdx.x;
    uint active=threadIdx.x<slice_size;
    uint level=start_level;
    uint stride=IM( 1u<<level, slice_size );         // distance between track's coordinate planes

    if( active ){
        s_index[ threadIdx.x ]=index[ idx ];
    }
    s_coord[ 0 ][ threadIdx.x ]=active?__uint_as_float( track[ idx ] ):MAX_IFLOAT; idx+=stride;
    s_coord[ 1 ][ threadIdx.x ]=active?__uint_as_float( track[ idx ] ):MAX_IFLOAT; idx+=stride;
    s_coord[ 2 ][ threadIdx.x ]=active?__uint_as_float( track[ idx ] ):MAX_IFLOAT;

    uint const tidx=IM( blockDim.x, blockIdx.x )+threadIdx.x;
    uint active_size=slice_size;       // valid elements per sub-slice
    uint fiber_id=threadIdx.x;         // this thread's rank within its sub-slice
    uint iBit=0;                       // index into c_shift for the current layout
    uint layout=blockDim.x;            // threads per sub-slice
    uint slice_local_id=0;             // sub-slice id within the block
    stride=( 1u<<level )-1;            // heap offset of the first node of this level

    // Stop splitting once a sub-slice fits in a single warp.
    while( layout>32 ){ __syncthreads();

        uint const axis=level%3;
        uint const addr=IM( slice_local_id, layout );   // base of this sub-slice in shared mem
        bitnotic_sort( s_index, s_coord, addr, fiber_id, layout, axis );

        // One thread per sub-slice emits the inner node at the median.
        if( fiber_id==0 ){
            uint2 node;
            uint const hloc=addr+( active_size>>1 );
            uint const iOut=stride+( tidx>>c_shift[ iBit ] );
            node.x=axis|( 1u<<31 )|( ( ( ( iOut<<1 )+1 )<<2 )&0x7ffffffcU );
            node.y=__float_as_uint( s_coord[ axis ][ hloc ] );
            kdnode[ iOut ]=node;
        }

        // Halve the layout and compute this thread's source element inside
        // the compacted child slices (the upper child starts at the parent's
        // median, hence the -IM( b, e ) correction).
        layout>>=1; ++iBit;
        slice_local_id=threadIdx.x>>c_shift[ iBit ];
        uint const b=slice_local_id&1u;
        uint const e=active_size&1u;
        // NOTE(review): new size = ceil(old/2) + (old&1) — the extra parity
        // term looks intentional for the shared-median layout, but confirm it
        // against the host-side size bookkeeping.
        active_size=( ( active_size+1 )>>1 )+( active_size&1u );
        fiber_id=threadIdx.x&( layout-1 );
        idx=addr+IM( slice_local_id&1u, active_size )+fiber_id-IM( b, e );
        active=fiber_id<active_size;

        // Stage the regathered element in registers, then publish after a
        // barrier so reads and writes of the shared arrays never overlap.
        uint const temp_i=s_index[ idx ];
        float const temp_x=active?s_coord[ 0 ][ idx ]:MAX_IFLOAT;
        float const temp_y=active?s_coord[ 1 ][ idx ]:MAX_IFLOAT;
        float const temp_z=active?s_coord[ 2 ][ idx ]:MAX_IFLOAT;
        __syncthreads();

        s_index [ threadIdx.x ]=temp_i;
        s_coord[ 0 ][ threadIdx.x ]=temp_x;
        s_coord[ 1 ][ threadIdx.x ]=temp_y;
        s_coord[ 2 ][ threadIdx.x ]=temp_z;

        stride+=( 1<<level );   // advance to the next level's heap offset
        ++level;

    } __syncthreads();

    // Emit leaves and write the permuted indices back to global memory.
    // NOTE(review): leaves are distinguished from inner nodes by bit 31 being
    // clear (inner nodes set it above); this differs from the prose comment
    // earlier in the file, which places the leaf flag in bit 0 — verify the
    // traversal code's expectation.
    if( fiber_id<active_size ){
        const uint slice_id=tidx>>c_shift[ iBit ];

        if( fiber_id==0 ){
            uint2 leaf;
            leaf.x=active_size&0x7fffffffU;
            leaf.y=IM( slice_id, active_size );
            kdnode[ stride+slice_id ]=leaf;
        }
        const uint iOut=IM( slice_id, active_size )+fiber_id;
        index[ iOut ]=s_index[ threadIdx.x ];
    }
}
389 }