PeriDyno 1.0.0
Loading...
Searching...
No Matches
Reduction.cu
Go to the documentation of this file.
1#include "Reduction.h"
2#include <cassert>
3#include <cfloat>
4#include "SharedMemory.h"
5#include "Functional.h"
6
7namespace dyno {
8
9#define REDUCTION_BLOCK 128
10
11 template<typename T>
12 Reduction<T>::Reduction()
13 : m_num(0)
14 , m_aux(NULL)
15 {
16
17 }
18
19
20 template<typename T>
21 Reduction<T>::Reduction(const uint num)
22 : m_num(num)
23 , m_aux(NULL)
24 {
25 allocAuxiliaryArray(m_num);
26 }
27
28 template<typename T>
29 Reduction<T>::~Reduction()
30 {
31 if(m_aux != nullptr)
32 cudaFree(m_aux);
33 }
34
35 template<typename T>
36 Reduction<T>* Reduction<T>::Create(const uint n)
37 {
38 return new Reduction<T>(n);
39 }
40
41
	// Number of T elements needed for the ping-pong scratch buffer used by
	// Reduce(): roughly ceil(n/B) partials for the first pass plus space for
	// the second pass, where B == REDUCTION_BLOCK. The "+ 1" and
	// "+ REDUCTION_BLOCK" terms over-allocate to absorb integer-division
	// truncation. NOTE(review): this sizing looks conservative rather than
	// exact — re-verify against Reduce()'s aux1/aux2 layout if
	// REDUCTION_BLOCK ever changes.
	template<typename T>
	uint Reduction<T>::getAuxiliaryArraySize(const uint n)
	{
		return (n / REDUCTION_BLOCK + 1) + (n / (REDUCTION_BLOCK*REDUCTION_BLOCK) + REDUCTION_BLOCK);
	}
47
48 /*!
49 * \brief Reduction using maximum of float values in shared memory for a warp.
50 */
51 template <typename T,
52 unsigned blockSize,
53 typename Function>
54 __device__ void KerReduceWarp(volatile T* pData, unsigned tid, Function func)
55 {
56 if (blockSize >= 64)pData[tid] = func(pData[tid], pData[tid + 32]);
57 if (blockSize >= 32)pData[tid] = func(pData[tid], pData[tid + 16]);
58 if (blockSize >= 16)pData[tid] = func(pData[tid], pData[tid + 8]);
59 if (blockSize >= 8)pData[tid] = func(pData[tid], pData[tid + 4]);
60 if (blockSize >= 4)pData[tid] = func(pData[tid], pData[tid + 2]);
61 if (blockSize >= 2)pData[tid] = func(pData[tid], pData[tid + 1]);
62 }
63
	/*!
	 * \brief One pass of a block-wise tree reduction: combines up to
	 * blockSize consecutive values of pData[] with func, writing one partial
	 * result per block into pAux[blockIdx.x].
	 *
	 * Launch with blockDim.x == blockSize (a power of two, at most 512 given
	 * the unrolled steps below) and blockSize * sizeof(T) bytes of dynamic
	 * shared memory.
	 * \param val identity element used to pad the last, partially full block.
	 */
	template <typename T,
		unsigned blockSize,
		typename Function>
	__global__ void KerReduce(const T *pData, uint n, T *pAux, Function func, T val)
	{
		//extern __shared__ T sharedMem[];

		// Typed view of this block's dynamic shared-memory region.
		SharedMemory<T> smem;
		T* sharedMem = smem.getPointer();

		uint tid = threadIdx.x;
		uint id = blockIdx.x*blockDim.x + threadIdx.x;
		// Threads past the end load the identity so they don't affect the result.
		sharedMem[tid] = (id < n ? pData[id] : val);
		__syncthreads();
		// Halve the active range each step; block barriers order writes/reads.
		if (blockSize >= 512) { if (tid < 256)sharedMem[tid] = func(sharedMem[tid], sharedMem[tid + 256]); __syncthreads(); }
		if (blockSize >= 256) { if (tid < 128)sharedMem[tid] = func(sharedMem[tid], sharedMem[tid + 128]); __syncthreads(); }
		if (blockSize >= 128) { if (tid < 64) sharedMem[tid] = func(sharedMem[tid], sharedMem[tid + 64]); __syncthreads(); }
		// The last 32 values are reduced warp-synchronously by the first warp.
		if (tid < 32)KerReduceWarp<T, blockSize>(sharedMem, tid, func);
		if (tid == 0)pAux[blockIdx.x] = sharedMem[0];
	}
89
90 template<typename T, typename Function>
91 T Reduce(const T* pData, uint num, T* pAux, Function func, T v0)
92 {
93 uint n = num;
94 uint sharedMemSize = REDUCTION_BLOCK * sizeof(T);
95 uint blockNum = cudaGridSize(num, REDUCTION_BLOCK);
96 const T* subData = pData;
97 T* aux1 = pAux;
98 T* aux2 = pAux + blockNum;
99 T* subAux = aux1;
100 while (n > 1) {
101 KerReduce<T, REDUCTION_BLOCK, Function> << <blockNum, REDUCTION_BLOCK, sharedMemSize >> > (subData, n, subAux, func, v0);
102 n = blockNum;
103 blockNum = cudaGridSize(n, REDUCTION_BLOCK);
104 if (n > 1) {
105 subData = subAux; subAux = (subData == aux1 ? aux2 : aux1);
106 }
107 }
108
109 T val;
110 if (num > 1)
111 cudaMemcpyAsync(&val, subAux, sizeof(T), cudaMemcpyDeviceToHost);
112 else
113 cudaMemcpyAsync(&val, pData, sizeof(T), cudaMemcpyDeviceToHost);
114
115 return val;
116 }
117
118 template<typename T>
119 T Reduction<T>::accumulate(const T* val, const uint num)
120 {
121 if (num != m_num)
122 allocAuxiliaryArray(num);
123
124 return Reduce(val, num, m_aux, PlusFunc<T>(), (T)0);
125 }
126
127 template<typename T>
128 T Reduction<T>::maximum(const T* val, const uint num)
129 {
130 if (num != m_num)
131 allocAuxiliaryArray(num);
132
133 return Reduce(val, num, m_aux, MaximumFunc<T>(), -(T)REAL_MAX);
134 }
135
136 template<typename T>
137 T Reduction<T>::minimum(const T* val, const uint num)
138 {
139 if (num != m_num)
140 allocAuxiliaryArray(num);
141
142 return Reduce(val, num, m_aux, MinimumFunc<T>(), (T)REAL_MAX);
143 }
144
145 template<typename T>
146 T Reduction<T>::average(const T* val, const uint num)
147 {
148 if (num != m_num)
149 allocAuxiliaryArray(num);
150
151 return Reduce(val, num, m_aux, PlusFunc<T>(), (T)0) / num;
152 }
153
154 template<typename T>
155 void Reduction<T>::allocAuxiliaryArray(const uint num)
156 {
157 if (m_aux != nullptr)
158 {
159 cudaFree(m_aux);
160 }
161
162 m_num = num;
163
164 m_auxNum = getAuxiliaryArraySize(num);
165 cudaMalloc((void**)&m_aux, m_auxNum * sizeof(T));
166 }
167
	// Explicit instantiations for the scalar element types used elsewhere in
	// the engine; keeps the template definitions private to this .cu file.
	template class Reduction<int>;
	template class Reduction<float>;
	template class Reduction<double>;
	template class Reduction<uint>;
172
173 Reduction<Vec3f>::Reduction()
174 : m_num(0)
175 , m_aux(NULL)
176 {
177
178 }
179
180 Reduction<Vec3f>::~Reduction()
181 {
182 if (m_aux != nullptr)
183 cudaFree(m_aux);
184 }
185
186 __global__ void R_SetupComponent(float* comp, const Vec3f* raw, size_t num, size_t comp_id)
187 {
188 int tId = threadIdx.x + (blockIdx.x * blockDim.x);
189 if (tId >= num) return;
190
191 comp[tId] = raw[tId][comp_id];
192 }
193
194 Vec3f Reduction<Vec3f>::accumulate(const Vec3f * val, const uint num)
195 {
196 Vec3f ret;
197
198 if (num != m_num)
199 allocAuxiliaryArray(num);
200
201 cuExecute(num,
202 R_SetupComponent,
203 m_aux,
204 val,
205 num,
206 0);
207
208 ret[0] = m_reduce_float.accumulate(m_aux, num);
209
210 cuExecute(num,
211 R_SetupComponent,
212 m_aux,
213 val,
214 num,
215 1);
216
217 ret[1] = m_reduce_float.accumulate(m_aux, num);
218
219 cuExecute(num,
220 R_SetupComponent,
221 m_aux,
222 val,
223 num,
224 2);
225
226 ret[2] = m_reduce_float.accumulate(m_aux, num);
227
228 return ret;
229 }
230
231
232 Vec3f Reduction<Vec3f>::maximum(const Vec3f* val, const uint num)
233 {
234 Vec3f ret;
235
236 if (num != m_num)
237 allocAuxiliaryArray(num);
238
239 cuExecute(num,
240 R_SetupComponent,
241 m_aux,
242 val,
243 num,
244 0);
245
246 ret[0] = m_reduce_float.maximum(m_aux, num);
247
248 cuExecute(num,
249 R_SetupComponent,
250 m_aux,
251 val,
252 num,
253 1);
254
255 ret[1] = m_reduce_float.maximum(m_aux, num);
256
257 cuExecute(num,
258 R_SetupComponent,
259 m_aux,
260 val,
261 num,
262 2);
263
264 ret[2] = m_reduce_float.maximum(m_aux, num);
265
266 return ret;
267 }
268
269 Vec3f Reduction<Vec3f>::minimum(const Vec3f* val, const uint num)
270 {
271 Vec3f ret;
272
273 if (num != m_num)
274 allocAuxiliaryArray(num);
275
276 cuExecute(num,
277 R_SetupComponent,
278 m_aux,
279 val,
280 num,
281 0);
282
283 ret[0] = m_reduce_float.minimum(m_aux, num);
284
285 cuExecute(num,
286 R_SetupComponent,
287 m_aux,
288 val,
289 num,
290 1);
291
292 ret[1] = m_reduce_float.minimum(m_aux, num);
293
294 cuExecute(num,
295 R_SetupComponent,
296 m_aux,
297 val,
298 num,
299 2);
300
301 ret[2] = m_reduce_float.minimum(m_aux, num);
302
303 return ret;
304 }
305
306
307 Vec3f Reduction<Vec3f>::average(const Vec3f* val, const uint num)
308 {
309 Vec3f ret;
310
311 if (num != m_num)
312 allocAuxiliaryArray(num);
313
314 cuExecute(num,
315 R_SetupComponent,
316 m_aux,
317 val,
318 num,
319 0);
320
321 ret[0] = m_reduce_float.average(m_aux, num);
322
323 cuExecute(num,
324 R_SetupComponent,
325 m_aux,
326 val,
327 num,
328 1);
329
330 ret[1] = m_reduce_float.average(m_aux, num);
331
332 cuExecute(num,
333 R_SetupComponent,
334 m_aux,
335 val,
336 num,
337 2);
338
339 ret[2] = m_reduce_float.average(m_aux, num);
340
341
342 return ret;
343 }
344
345 void Reduction<Vec3f>::allocAuxiliaryArray(const uint num)
346 {
347 if (m_aux != nullptr)
348 {
349 cudaFree(m_aux);
350 }
351
352 m_num = num;
353 cudaMalloc((void**)&m_aux, m_num * sizeof(float));
354 }
355
356
357 Reduction<Vec3d>::Reduction()
358 : m_num(0)
359 , m_aux(NULL)
360 {
361
362 }
363
364 Reduction<Vec3d>::~Reduction()
365 {
366 if (m_aux != nullptr)
367 cudaFree(m_aux);
368 }
369
370 __global__ void R_SetupComponent(double* comp, const Vec3d* raw, size_t num, size_t comp_id)
371 {
372 int tId = threadIdx.x + (blockIdx.x * blockDim.x);
373 if (tId >= num) return;
374
375 comp[tId] = raw[tId][comp_id];
376 }
377
378 Vec3d Reduction<Vec3d>::accumulate(const Vec3d * val, uint num)
379 {
380 Vec3d ret;
381
382 if (num != m_num)
383 allocAuxiliaryArray(num);
384
385 cuExecute(num,
386 R_SetupComponent,
387 m_aux,
388 val,
389 num,
390 0);
391
392 ret[0] = m_reduce_double.accumulate(m_aux, num);
393
394 cuExecute(num,
395 R_SetupComponent,
396 m_aux,
397 val,
398 num,
399 1);
400
401 ret[1] = m_reduce_double.accumulate(m_aux, num);
402
403 cuExecute(num,
404 R_SetupComponent,
405 m_aux,
406 val,
407 num,
408 2);
409
410 ret[2] = m_reduce_double.accumulate(m_aux, num);
411
412 return ret;
413 }
414
415
416 Vec3d Reduction<Vec3d>::maximum(const Vec3d* val, const uint num)
417 {
418 Vec3d ret;
419
420 if (num != m_num)
421 allocAuxiliaryArray(num);
422
423 cuExecute(num,
424 R_SetupComponent,
425 m_aux,
426 val,
427 num,
428 0);
429
430 ret[0] = m_reduce_double.maximum(m_aux, num);
431
432 cuExecute(num,
433 R_SetupComponent,
434 m_aux,
435 val,
436 num,
437 1);
438
439 ret[1] = m_reduce_double.maximum(m_aux, num);
440
441 cuExecute(num,
442 R_SetupComponent,
443 m_aux,
444 val,
445 num,
446 2);
447
448 ret[2] = m_reduce_double.maximum(m_aux, num);
449
450 return ret;
451 }
452
453 Vec3d Reduction<Vec3d>::minimum(const Vec3d* val, const uint num)
454 {
455 Vec3d ret;
456
457 cuExecute(num,
458 R_SetupComponent,
459 m_aux,
460 val,
461 num,
462 0);
463
464 ret[0] = m_reduce_double.minimum(m_aux, num);
465
466 cuExecute(num,
467 R_SetupComponent,
468 m_aux,
469 val,
470 num,
471 1);
472
473 ret[1] = m_reduce_double.minimum(m_aux, num);
474
475 cuExecute(num,
476 R_SetupComponent,
477 m_aux,
478 val,
479 num,
480 2);
481
482 ret[2] = m_reduce_double.minimum(m_aux, num);
483
484 return ret;
485 }
486
487
488 Vec3d Reduction<Vec3d>::average(const Vec3d* val, const uint num)
489 {
490 Vec3d ret;
491
492 cuExecute(num,
493 R_SetupComponent,
494 m_aux,
495 val,
496 num,
497 0);
498
499 ret[0] = m_reduce_double.average(m_aux, num);
500
501 cuExecute(num,
502 R_SetupComponent,
503 m_aux,
504 val,
505 num,
506 1);
507
508 ret[1] = m_reduce_double.average(m_aux, num);
509
510 cuExecute(num,
511 R_SetupComponent,
512 m_aux,
513 val,
514 num,
515 2);
516
517 ret[2] = m_reduce_double.average(m_aux, num);
518
519
520 return ret;
521 }
522
523 void Reduction<Vec3d>::allocAuxiliaryArray(const uint num)
524 {
525 if (m_aux != nullptr)
526 {
527 cudaFree(m_aux);
528 }
529
530 m_num = num;
531 cudaMalloc((void**)&m_aux, m_num * sizeof(double));
532 }
533}