#include "SharedMemory.h"

#define SCAN_SHARED_MEMORY_BANKS 32
#define SCAN_LOG_MEM_BANKS 5
#define SCAN_CONFLICT_FREE_OFFSET(n) ((n) >> SCAN_LOG_MEM_BANKS)

uint SCAN_THREADS_PER_BLOCK = 512;
uint SCAN_ELEMENTS_PER_BLOCK = SCAN_THREADS_PER_BLOCK * 2;
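// Note on the bank-conflict macros: shared memory has 32 banks, and for 4-byte
// elements address i maps to bank (i % 32). Padding an index by
// SCAN_CONFLICT_FREE_OFFSET(i) = i / 32 turns indices 32, 64, 96, ... into
// 33, 66, 99, ..., so the strided tree accesses in the kernels below hit distinct
// banks instead of serializing on one.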
template <typename T>
Scan<T>::Scan()
{
}

template <typename T>
Scan<T>::~Scan()
{
    // release the scratch buffer and the preallocated per-level arrays
    m_buffer.clear();

    for (int i = 0; i < SCAN_LEVEL; i++)
    {
        m_sums[i].clear();
        m_incr[i].clear();
    }
}
template <typename T>
void Scan<T>::exclusive(DArray<T>& output, DArray<T>& input, bool bcao)
{
    assert(input.size() == output.size());

    if (input.size() > SCAN_ELEMENTS_PER_BLOCK) {
        scanLargeDeviceArray(output.begin(), input.begin(), input.size(), bcao, 0);
    }
    else {
        scanSmallDeviceArray(output.begin(), input.begin(), input.size(), bcao);
    }
}
template <typename T>
void Scan<T>::exclusive(DArray<T>& data, bool bcao /*= true*/)
{
    if (m_buffer.size() != data.size())
    {
        m_buffer.resize(data.size());
    }

    // copy the input aside so the scan can write back into data in place
    m_buffer.assign(data);
    this->exclusive(data, m_buffer, bcao);
}
template <typename T>
void Scan<T>::exclusive(T* data, size_t length, bool bcao /*= true*/)
{
    if (m_buffer.size() != length)
    {
        m_buffer.resize(length);
    }

    cudaMemcpy(m_buffer.begin(), data, length * sizeof(T), cudaMemcpyDeviceToDevice);
    this->exclusive(data, m_buffer.begin(), length, bcao);
}
template <typename T>
void Scan<T>::exclusive(T* output, const T* input, size_t length, bool bcao /*= true*/)
{
    if (length > SCAN_ELEMENTS_PER_BLOCK) {
        scanLargeDeviceArray(output, input, length, bcao, 0);
    }
    else {
        scanSmallDeviceArray(output, input, length, bcao);
    }
}
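// Usage sketch (illustrative only; "scanExample" is a hypothetical helper, not part
// of this file's API). It assumes DArray<T>::resize()/begin() behave as they are
// used elsewhere in this file. The exclusive scan of {3, 1, 7, 0, 4, 1, 6, 3} is
// {0, 3, 4, 11, 11, 15, 16, 22}.
static void scanExample()
{
    int host[8] = { 3, 1, 7, 0, 4, 1, 6, 3 };

    DArray<int> input;
    DArray<int> output;
    input.resize(8);
    output.resize(8);
    cudaMemcpy(input.begin(), host, 8 * sizeof(int), cudaMemcpyHostToDevice);

    Scan<int> scan;
    scan.exclusive(output, input, true); // bcao = true selects the bank-conflict-free kernels
}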
template <typename T>
__global__ void k_prescan_arbitrary(T *output, const T *input, size_t n, int powerOfTwo)
{
    //extern __shared__ T temp[];// allocated on invocation
    SharedMemory<T> shared_mem;
    T* temp = shared_mem.getPointer();

    int threadID = threadIdx.x;

    int ai = threadID;
    int bi = threadID + (n / 2);
    int bankOffsetA = SCAN_CONFLICT_FREE_OFFSET(ai);
    int bankOffsetB = SCAN_CONFLICT_FREE_OFFSET(bi);

    if (threadID < n) {
        temp[ai + bankOffsetA] = input[ai];
        temp[bi + bankOffsetB] = input[bi];
    }
    else {
        temp[ai + bankOffsetA] = 0;
        temp[bi + bankOffsetB] = 0;
    }

    int offset = 1;
    for (int d = powerOfTwo >> 1; d > 0; d >>= 1) // build sum in place up the tree
    {
        __syncthreads();
        if (threadID < d)
        {
            int ai = offset * (2 * threadID + 1) - 1;
            int bi = offset * (2 * threadID + 2) - 1;
            ai += SCAN_CONFLICT_FREE_OFFSET(ai);
            bi += SCAN_CONFLICT_FREE_OFFSET(bi);

            temp[bi] += temp[ai];
        }
        offset *= 2;
    }

    if (threadID == 0) {
        temp[powerOfTwo - 1 + SCAN_CONFLICT_FREE_OFFSET(powerOfTwo - 1)] = 0; // clear the last element
    }

    for (int d = 1; d < powerOfTwo; d *= 2) // traverse down tree & build scan
    {
        offset >>= 1;
        __syncthreads();
        if (threadID < d)
        {
            int ai = offset * (2 * threadID + 1) - 1;
            int bi = offset * (2 * threadID + 2) - 1;
            ai += SCAN_CONFLICT_FREE_OFFSET(ai);
            bi += SCAN_CONFLICT_FREE_OFFSET(bi);

            T t = temp[ai]; // swap: left child takes the parent's prefix, right child accumulates
            temp[ai] = temp[bi];
            temp[bi] += t;
        }
    }
    __syncthreads();

    if (threadID < n) {
        output[ai] = temp[ai + bankOffsetA];
        output[bi] = temp[bi + bankOffsetB];
    }
}
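// Worked example of the two phases above for n = powerOfTwo = 8 (bank padding
// omitted for clarity):
//   load:       [3, 1, 7, 0, 4, 1, 6, 3]
//   up-sweep:   [3, 4, 7, 11, 4, 5, 6, 25]  (each parent holds the sum of its subtree)
//   clear last: [3, 4, 7, 11, 4, 5, 6, 0]
//   down-sweep: [0, 3, 4, 11, 11, 15, 16, 22]  (the exclusive prefix sums)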
template <typename T>
__global__ void k_prescan_arbitrary_unoptimized(T *output, const T *input, size_t n, int powerOfTwo) {
    //extern __shared__ T temp[];// allocated on invocation
    SharedMemory<T> shared_mem;
    T* temp = shared_mem.getPointer();

    int threadID = threadIdx.x;

    if (threadID < n) {
        temp[2 * threadID] = input[2 * threadID]; // load input into shared memory
        temp[2 * threadID + 1] = input[2 * threadID + 1];
    }
    else {
        temp[2 * threadID] = 0;
        temp[2 * threadID + 1] = 0;
    }

    int offset = 1;
    for (int d = powerOfTwo >> 1; d > 0; d >>= 1) // build sum in place up the tree
    {
        __syncthreads();
        if (threadID < d)
        {
            int ai = offset * (2 * threadID + 1) - 1;
            int bi = offset * (2 * threadID + 2) - 1;
            temp[bi] += temp[ai];
        }
        offset *= 2;
    }

    if (threadID == 0) { temp[powerOfTwo - 1] = 0; } // clear the last element

    for (int d = 1; d < powerOfTwo; d *= 2) // traverse down tree & build scan
    {
        offset >>= 1;
        __syncthreads();
        if (threadID < d)
        {
            int ai = offset * (2 * threadID + 1) - 1;
            int bi = offset * (2 * threadID + 2) - 1;

            T t = temp[ai];
            temp[ai] = temp[bi];
            temp[bi] += t;
        }
    }
    __syncthreads();

    if (threadID < n) {
        output[2 * threadID] = temp[2 * threadID]; // write results to device memory
        output[2 * threadID + 1] = temp[2 * threadID + 1];
    }
}
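// This variant differs from k_prescan_arbitrary only in skipping the
// SCAN_CONFLICT_FREE_OFFSET padding. At large strides the active threads then
// address the same shared-memory bank (e.g. with offset = 16 and 4-byte elements,
// the left children sit at indices 15, 47, 79, ..., all in bank 15) and those
// accesses serialize.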
template <typename T>
__global__ void k_add(T *output, size_t length, const T *n) {
    int blockID = blockIdx.x;
    int threadID = threadIdx.x;
    int blockOffset = blockID * length;

    output[blockOffset + threadID] += n[blockID];
}

template <typename T>
__global__ void k_add(T *output, size_t length, const T *n1, const T *n2) {
    int blockID = blockIdx.x;
    int threadID = threadIdx.x;
    int blockOffset = blockID * length;

    output[blockOffset + threadID] += n1[blockID] + n2[blockID];
}
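// Both k_add overloads add a per-block increment to every element of that block's
// chunk: the single-addend form applies the scanned block sums (see
// scanLargeEvenDeviceArray below), while the two-addend form is used on the
// remainder in scanLargeDeviceArray to fold in the running total of the preceding
// elements in a single launch.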
template <typename T>
void Scan<T>::scanLargeDeviceArray(T *d_out, const T *d_in, size_t length, bool bcao, size_t level)
{
    size_t remainder = length % SCAN_ELEMENTS_PER_BLOCK;
    if (remainder == 0) {
        scanLargeEvenDeviceArray(d_out, d_in, length, bcao, level);
    }
    else {
        // perform a large scan on a compatible multiple of elements
        size_t lengthMultiple = length - remainder;
        scanLargeEvenDeviceArray(d_out, d_in, lengthMultiple, bcao, level);

        // scan the remaining elements and add the (inclusive) last element of the large scan to this
        T *startOfOutputArray = &(d_out[lengthMultiple]);
        scanSmallDeviceArray(startOfOutputArray, &(d_in[lengthMultiple]), remainder, bcao);

        k_add << <1, (unsigned int)remainder >> > (startOfOutputArray, remainder, &(d_in[lengthMultiple - 1]), &(d_out[lengthMultiple - 1]));
    }
}
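// Why the k_add call above passes two addends: scanSmallDeviceArray leaves an
// exclusive scan of the remainder, so every remainder element still needs the
// total of the first lengthMultiple elements. That total is the exclusive prefix
// d_out[lengthMultiple - 1] plus the element d_in[lengthMultiple - 1] itself,
// i.e. the inclusive last value of the large scan.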
template <typename T>
void Scan<T>::scanSmallDeviceArray(T *d_out, const T *d_in, size_t length, bool bcao)
{
    size_t powerOfTwo = nextPowerOfTwo(length);

    if (bcao) {
        k_prescan_arbitrary << <1, (unsigned int)(length + 1) / 2, 2 * powerOfTwo * sizeof(T) >> > (d_out, d_in, length, powerOfTwo);
    }
    else {
        k_prescan_arbitrary_unoptimized << <1, (unsigned int)(length + 1) / 2, 2 * powerOfTwo * sizeof(T) >> > (d_out, d_in, length, powerOfTwo);
    }
}
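// Launch shape above: each thread owns two elements, hence (length + 1) / 2
// threads in a single block, and the dynamic shared allocation of 2 * powerOfTwo
// elements leaves room for the power-of-two tree plus the bank-conflict padding.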
template <typename T>
__global__ void k_prescan_large(T *output, const T *input, int n, T *sums) {
    //extern __shared__ T temp[];
    SharedMemory<T> shared_mem;
    T* temp = shared_mem.getPointer();

    int blockID = blockIdx.x;
    int threadID = threadIdx.x;
    int blockOffset = blockID * n;

    int ai = threadID;
    int bi = threadID + (n / 2);
    int bankOffsetA = SCAN_CONFLICT_FREE_OFFSET(ai);
    int bankOffsetB = SCAN_CONFLICT_FREE_OFFSET(bi);
    temp[ai + bankOffsetA] = input[blockOffset + ai];
    temp[bi + bankOffsetB] = input[blockOffset + bi];

    int offset = 1;
    for (int d = n >> 1; d > 0; d >>= 1) // build sum in place up the tree
    {
        __syncthreads();
        if (threadID < d)
        {
            int ai = offset * (2 * threadID + 1) - 1;
            int bi = offset * (2 * threadID + 2) - 1;
            ai += SCAN_CONFLICT_FREE_OFFSET(ai);
            bi += SCAN_CONFLICT_FREE_OFFSET(bi);

            temp[bi] += temp[ai];
        }
        offset *= 2;
    }
    __syncthreads();

    if (threadID == 0) {
        sums[blockID] = temp[n - 1 + SCAN_CONFLICT_FREE_OFFSET(n - 1)]; // save the block total before clearing
        temp[n - 1 + SCAN_CONFLICT_FREE_OFFSET(n - 1)] = 0;
    }

    for (int d = 1; d < n; d *= 2) // traverse down tree & build scan
    {
        offset >>= 1;
        __syncthreads();
        if (threadID < d)
        {
            int ai = offset * (2 * threadID + 1) - 1;
            int bi = offset * (2 * threadID + 2) - 1;
            ai += SCAN_CONFLICT_FREE_OFFSET(ai);
            bi += SCAN_CONFLICT_FREE_OFFSET(bi);

            T t = temp[ai];
            temp[ai] = temp[bi];
            temp[bi] += t;
        }
    }
    __syncthreads();

    output[blockOffset + ai] = temp[ai + bankOffsetA];
    output[blockOffset + bi] = temp[bi + bankOffsetB];
}
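// Each block scans its own n = SCAN_ELEMENTS_PER_BLOCK chunk independently and
// records the chunk total in sums[blockID]. Scanning sums[] (recursively for very
// large inputs) and adding the result back per block with k_add turns the
// per-block scans into one global exclusive scan.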
template <typename T>
__global__ void k_prescan_large_unoptimized(T *output, const T *input, int n, T *sums) {
    int blockID = blockIdx.x;
    int threadID = threadIdx.x;
    int blockOffset = blockID * n;

    //extern __shared__ T temp[];
    SharedMemory<T> shared_mem;
    T* temp = shared_mem.getPointer();

    temp[2 * threadID] = input[blockOffset + (2 * threadID)];
    temp[2 * threadID + 1] = input[blockOffset + (2 * threadID) + 1];

    int offset = 1;
    for (int d = n >> 1; d > 0; d >>= 1) // build sum in place up the tree
    {
        __syncthreads();
        if (threadID < d)
        {
            int ai = offset * (2 * threadID + 1) - 1;
            int bi = offset * (2 * threadID + 2) - 1;
            temp[bi] += temp[ai];
        }
        offset *= 2;
    }
    __syncthreads();

    if (threadID == 0) {
        sums[blockID] = temp[n - 1]; // save the block total before clearing
        temp[n - 1] = 0;
    }

    for (int d = 1; d < n; d *= 2) // traverse down tree & build scan
    {
        offset >>= 1;
        __syncthreads();
        if (threadID < d)
        {
            int ai = offset * (2 * threadID + 1) - 1;
            int bi = offset * (2 * threadID + 2) - 1;

            T t = temp[ai];
            temp[ai] = temp[bi];
            temp[bi] += t;
        }
    }
    __syncthreads();

    output[blockOffset + (2 * threadID)] = temp[2 * threadID];
    output[blockOffset + (2 * threadID) + 1] = temp[2 * threadID + 1];
}
template <typename T>
void Scan<T>::scanLargeEvenDeviceArray(T *output, const T *input, size_t length, bool bcao, size_t level)
{
    const int blocks = length / SCAN_ELEMENTS_PER_BLOCK;
    const int sharedMemArraySize = SCAN_ELEMENTS_PER_BLOCK * sizeof(T);

    // the preallocated per-level buffers avoid allocating GPU memory on each call
    if (level < SCAN_LEVEL)
    {
        if (m_sums[level].size() != blocks)
        {
            m_sums[level].resize(blocks);
            m_incr[level].resize(blocks);
        }

        if (bcao) {
            k_prescan_large << <blocks, SCAN_THREADS_PER_BLOCK, 2 * sharedMemArraySize >> > (output, input, SCAN_ELEMENTS_PER_BLOCK, m_sums[level].begin());
        }
        else {
            k_prescan_large_unoptimized << <blocks, SCAN_THREADS_PER_BLOCK, 2 * sharedMemArraySize >> > (output, input, SCAN_ELEMENTS_PER_BLOCK, m_sums[level].begin());
        }

        const int sumsArrThreadsNeeded = (blocks + 1) / 2;
        if (sumsArrThreadsNeeded > SCAN_THREADS_PER_BLOCK) {
            // perform a large scan on the sums arr
            scanLargeDeviceArray(m_incr[level].begin(), m_sums[level].begin(), blocks, bcao, level + 1);
        }
        else {
            // only one block is needed to scan the sums arr, so use the small scan
            scanSmallDeviceArray(m_incr[level].begin(), m_sums[level].begin(), blocks, bcao);
        }

        k_add << <blocks, SCAN_ELEMENTS_PER_BLOCK >> > (output, SCAN_ELEMENTS_PER_BLOCK, m_incr[level].begin());
    }
    else
    {
        // beyond the preallocated recursion depth, fall back to temporary device buffers
        T *d_sums, *d_incr;
        cudaMalloc((void **)&d_sums, blocks * sizeof(T));
        cudaMalloc((void **)&d_incr, blocks * sizeof(T));

        if (bcao) {
            k_prescan_large << <blocks, SCAN_THREADS_PER_BLOCK, 2 * sharedMemArraySize >> > (output, input, SCAN_ELEMENTS_PER_BLOCK, d_sums);
        }
        else {
            k_prescan_large_unoptimized << <blocks, SCAN_THREADS_PER_BLOCK, 2 * sharedMemArraySize >> > (output, input, SCAN_ELEMENTS_PER_BLOCK, d_sums);
        }

        const int sumsArrThreadsNeeded = (blocks + 1) / 2;
        if (sumsArrThreadsNeeded > SCAN_THREADS_PER_BLOCK) {
            // perform a large scan on the sums arr
            scanLargeDeviceArray(d_incr, d_sums, blocks, bcao, level + 1);
        }
        else {
            // only one block is needed to scan the sums arr, so use the small scan
            scanSmallDeviceArray(d_incr, d_sums, blocks, bcao);
        }

        k_add << <blocks, SCAN_ELEMENTS_PER_BLOCK >> > (output, SCAN_ELEMENTS_PER_BLOCK, d_incr);

        cudaFree(d_sums);
        cudaFree(d_incr);
    }
}
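// Sizing example (with the defaults above): for length = 4096 and
// SCAN_ELEMENTS_PER_BLOCK = 1024, four blocks scan 1024 elements each;
// (4 + 1) / 2 = 2 threads suffice for the sums array, so the small scan handles
// it, and k_add then shifts each block by its scanned block total.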
template <typename T>
bool Scan<T>::isPowerOfTwo(size_t x)
{
    return x && !(x & (x - 1));
}

template <typename T>
size_t Scan<T>::nextPowerOfTwo(size_t x)
{
    size_t power = 1;
    while (power < x) {
        power *= 2;
    }
    return power;
}
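// e.g. nextPowerOfTwo(5) == 8 and nextPowerOfTwo(8) == 8; scanSmallDeviceArray
// relies on this to size the shared-memory tree when length is not a power of two.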
template class Scan<int>;
template class Scan<uint>;