#include "SharedMemory.h"

#define REDUCTION_BLOCK 128		// threads per block used by every reduction pass

template<typename T>
Reduction<T>::Reduction()

template<typename T>
Reduction<T>::Reduction(const uint num)
	: m_num(num)
{
	allocAuxiliaryArray(m_num);
}

template<typename T>
Reduction<T>::~Reduction()

template<typename T>
Reduction<T>* Reduction<T>::Create(const uint n)
{
	return new Reduction<T>(n);
}

template<typename T>
uint Reduction<T>::getAuxiliaryArraySize(const uint n)
{
	return (n / REDUCTION_BLOCK + 1) + (n / (REDUCTION_BLOCK*REDUCTION_BLOCK) + REDUCTION_BLOCK);
}
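
// Example (illustrative numbers): with REDUCTION_BLOCK = 128 and n = 100000 the
// formula reserves (781 + 1) + (6 + 128) = 916 elements: 782 slots for the
// first-pass per-block partial results, and a padded 134 slots for the later
// passes, which need at most ceil(782 / 128) = 7 values.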

/**
 * \brief Warp-level reduction of values held in shared memory; neighbouring entries are combined pairwise with the supplied functor.
 */
template<typename T, uint blockSize, typename Function>
__device__ void KerReduceWarp(volatile T* pData, unsigned tid, Function func)
{
	if (blockSize >= 64) pData[tid] = func(pData[tid], pData[tid + 32]);
	if (blockSize >= 32) pData[tid] = func(pData[tid], pData[tid + 16]);
	if (blockSize >= 16) pData[tid] = func(pData[tid], pData[tid + 8]);
	if (blockSize >= 8)  pData[tid] = func(pData[tid], pData[tid + 4]);
	if (blockSize >= 4)  pData[tid] = func(pData[tid], pData[tid + 2]);
	if (blockSize >= 2)  pData[tid] = func(pData[tid], pData[tid + 1]);
}
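
// The warp stage above relies on implicit warp-synchronous execution over
// volatile shared memory. For reference, a hypothetical shuffle-based variant
// (an illustrative sketch, not used by the code below) that also stays correct
// under the independent thread scheduling of Volta and newer could look like this:
template<typename T, typename Function>
__device__ T KerReduceWarpShfl(T v, Function func)
{
	// Each step folds the upper half of the active lanes onto the lower half;
	// after the final step, lane 0 holds the reduction of the whole warp.
	for (int offset = 16; offset > 0; offset >>= 1)
		v = func(v, __shfl_down_sync(0xffffffff, v, offset));
	return v;
}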

/**
 * \brief Reduces n values of the array pData[] with the functor func.
 * Each thread block stores its partial result in pAux[blockIdx.x], so the
 * host driver below can repeat the pass until a single value remains.
 * Threads beyond the end of the array contribute the neutral element val.
 */
template<typename T, uint blockSize, typename Function>
__global__ void KerReduce(const T *pData, uint n, T *pAux, Function func, T val)
{
	//extern __shared__ T sharedMem[];
	SharedMemory<T> smem;	// helper from SharedMemory.h; templated kernels cannot declare extern __shared__ arrays of type T directly
	T* sharedMem = smem.getPointer();

	uint tid = threadIdx.x;
	uint id = blockIdx.x*blockDim.x + threadIdx.x;
	sharedMem[tid] = (id < n ? pData[id] : val);
	__syncthreads();

	if (blockSize >= 512) { if (tid < 256) sharedMem[tid] = func(sharedMem[tid], sharedMem[tid + 256]); __syncthreads(); }
	if (blockSize >= 256) { if (tid < 128) sharedMem[tid] = func(sharedMem[tid], sharedMem[tid + 128]); __syncthreads(); }
	if (blockSize >= 128) { if (tid < 64)  sharedMem[tid] = func(sharedMem[tid], sharedMem[tid + 64]);  __syncthreads(); }
	if (tid < 32) KerReduceWarp<T, blockSize>(sharedMem, tid, func);
	if (tid == 0) pAux[blockIdx.x] = sharedMem[0];
}

template<typename T, typename Function>
T Reduce(const T* pData, uint num, T* pAux, Function func, T v0)
{
	T val;
	uint sharedMemSize = REDUCTION_BLOCK * sizeof(T);
	uint blockNum = cudaGridSize(num, REDUCTION_BLOCK);
	const T* subData = pData;
	T* aux1 = pAux;
	T* aux2 = pAux + blockNum;
	T* subAux = aux1;
	uint n = num;
	while (n > 1)
	{
		KerReduce<T, REDUCTION_BLOCK, Function> << <blockNum, REDUCTION_BLOCK, sharedMemSize >> > (subData, n, subAux, func, v0);
		n = blockNum;
		blockNum = cudaGridSize(n, REDUCTION_BLOCK);
		subData = subAux; subAux = (subData == aux1 ? aux2 : aux1);	// ping-pong between the two halves of the auxiliary buffer
	}
	if (num > 1)
		cudaMemcpyAsync(&val, subData, sizeof(T), cudaMemcpyDeviceToHost);	// after the last swap, the output of the final pass is subData
	else
		cudaMemcpyAsync(&val, pData, sizeof(T), cudaMemcpyDeviceToHost);	// a single element needs no kernel pass
	return val;
}
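
// Note: because val is pageable host memory, the cudaMemcpyAsync calls above are
// synchronous with respect to the host, so the result is ready when Reduce()
// returns; with pinned host memory an explicit synchronization would be needed
// before reading val.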

template<typename T>
T Reduction<T>::accumulate(const T* val, const uint num)
{
	allocAuxiliaryArray(num);

	return Reduce(val, num, m_aux, PlusFunc<T>(), (T)0);
}

template<typename T>
T Reduction<T>::maximum(const T* val, const uint num)
{
	allocAuxiliaryArray(num);

	return Reduce(val, num, m_aux, MaximumFunc<T>(), -(T)REAL_MAX);
}

template<typename T>
T Reduction<T>::minimum(const T* val, const uint num)
{
	allocAuxiliaryArray(num);

	return Reduce(val, num, m_aux, MinimumFunc<T>(), (T)REAL_MAX);
}

template<typename T>
T Reduction<T>::average(const T* val, const uint num)
{
	allocAuxiliaryArray(num);

	return Reduce(val, num, m_aux, PlusFunc<T>(), (T)0) / num;	// integer instantiations perform an integer division here
}

template<typename T>
void Reduction<T>::allocAuxiliaryArray(const uint num)
{
	if (m_aux != nullptr)
		cudaFree(m_aux);

	m_auxNum = getAuxiliaryArraySize(num);
	cudaMalloc((void**)&m_aux, m_auxNum * sizeof(T));
}

template class Reduction<int>;
template class Reduction<float>;
template class Reduction<double>;
template class Reduction<uint>;
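
// Typical host-side usage (an illustrative sketch: d_data and n are placeholder
// names, and error checking is omitted):
//
//     float* d_data = nullptr;
//     uint   n = 1024;
//     cudaMalloc((void**)&d_data, n * sizeof(float));
//     // ... fill d_data on the device ...
//
//     Reduction<float>* reduce = Reduction<float>::Create(n);
//     float sum = reduce->accumulate(d_data, n);
//     float top = reduce->maximum(d_data, n);
//
//     delete reduce;
//     cudaFree(d_data);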

Reduction<Vec3f>::Reduction()

Reduction<Vec3f>::~Reduction()
{
	if (m_aux != nullptr)
		cudaFree(m_aux);
}

// Copies component comp_id of each Vec3f in raw into the scalar buffer comp.
__global__ void R_SetupComponent(float* comp, const Vec3f* raw, size_t num, size_t comp_id)
{
	int tId = threadIdx.x + (blockIdx.x * blockDim.x);
	if (tId >= num) return;

	comp[tId] = raw[tId][comp_id];
}
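
// The Vec3f (and Vec3d) reductions below follow a per-component pattern: copy one
// component of the vector array into the scalar buffer m_aux with R_SetupComponent,
// then reduce m_aux with the scalar Reduction member. A sketch of a single step
// (the launch configuration shown here is an assumption):
//
//     uint pDims = cudaGridSize(num, REDUCTION_BLOCK);
//     R_SetupComponent << <pDims, REDUCTION_BLOCK >> > (m_aux, val, num, 0);
//     ret[0] = m_reduce_float.accumulate(m_aux, num);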

Vec3f Reduction<Vec3f>::accumulate(const Vec3f* val, const uint num)
	allocAuxiliaryArray(num);

	ret[0] = m_reduce_float.accumulate(m_aux, num);

	ret[1] = m_reduce_float.accumulate(m_aux, num);

	ret[2] = m_reduce_float.accumulate(m_aux, num);

Vec3f Reduction<Vec3f>::maximum(const Vec3f* val, const uint num)
	allocAuxiliaryArray(num);

	ret[0] = m_reduce_float.maximum(m_aux, num);

	ret[1] = m_reduce_float.maximum(m_aux, num);

	ret[2] = m_reduce_float.maximum(m_aux, num);

Vec3f Reduction<Vec3f>::minimum(const Vec3f* val, const uint num)
	allocAuxiliaryArray(num);

	ret[0] = m_reduce_float.minimum(m_aux, num);

	ret[1] = m_reduce_float.minimum(m_aux, num);

	ret[2] = m_reduce_float.minimum(m_aux, num);

Vec3f Reduction<Vec3f>::average(const Vec3f* val, const uint num)
	allocAuxiliaryArray(num);

	ret[0] = m_reduce_float.average(m_aux, num);

	ret[1] = m_reduce_float.average(m_aux, num);

	ret[2] = m_reduce_float.average(m_aux, num);

void Reduction<Vec3f>::allocAuxiliaryArray(const uint num)
{
	if (m_aux != nullptr)
		cudaFree(m_aux);

	m_num = num;
	cudaMalloc((void**)&m_aux, m_num * sizeof(float));
}

Reduction<Vec3d>::Reduction()

Reduction<Vec3d>::~Reduction()
{
	if (m_aux != nullptr)
		cudaFree(m_aux);
}

// Copies component comp_id of each Vec3d in raw into the scalar buffer comp.
__global__ void R_SetupComponent(double* comp, const Vec3d* raw, size_t num, size_t comp_id)
{
	int tId = threadIdx.x + (blockIdx.x * blockDim.x);
	if (tId >= num) return;

	comp[tId] = raw[tId][comp_id];
}

Vec3d Reduction<Vec3d>::accumulate(const Vec3d* val, uint num)
	allocAuxiliaryArray(num);

	ret[0] = m_reduce_double.accumulate(m_aux, num);

	ret[1] = m_reduce_double.accumulate(m_aux, num);

	ret[2] = m_reduce_double.accumulate(m_aux, num);

Vec3d Reduction<Vec3d>::maximum(const Vec3d* val, const uint num)
	allocAuxiliaryArray(num);

	ret[0] = m_reduce_double.maximum(m_aux, num);

	ret[1] = m_reduce_double.maximum(m_aux, num);

	ret[2] = m_reduce_double.maximum(m_aux, num);

Vec3d Reduction<Vec3d>::minimum(const Vec3d* val, const uint num)
	ret[0] = m_reduce_double.minimum(m_aux, num);

	ret[1] = m_reduce_double.minimum(m_aux, num);

	ret[2] = m_reduce_double.minimum(m_aux, num);

Vec3d Reduction<Vec3d>::average(const Vec3d* val, const uint num)
	ret[0] = m_reduce_double.average(m_aux, num);

	ret[1] = m_reduce_double.average(m_aux, num);

	ret[2] = m_reduce_double.average(m_aux, num);

void Reduction<Vec3d>::allocAuxiliaryArray(const uint num)
{
	if (m_aux != nullptr)
		cudaFree(m_aux);

	m_num = num;
	cudaMalloc((void**)&m_aux, m_num * sizeof(double));
}