PeriDyno 1.0.0
Scan.cu
#include "Scan.h"
#include "SharedMemory.h"

namespace dyno
{
#define SCAN_SHARED_MEMORY_BANKS 32
#define SCAN_LOG_MEM_BANKS 5
#define SCAN_CONFLICT_FREE_OFFSET(n) ((n) >> SCAN_LOG_MEM_BANKS)
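// Example of the remapping: with 32 banks (log2 = 5), shared-memory index 32
// would hit the same bank as index 0; the offset moves it to 32 + (32 >> 5) = 33,
// while index 0 stays at 0 + (0 >> 5) = 0, so the two accesses no longer conflict.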

    uint SCAN_THREADS_PER_BLOCK = 512;
    uint SCAN_ELEMENTS_PER_BLOCK = SCAN_THREADS_PER_BLOCK * 2;
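    // Each thread scans two elements, so one block of 512 threads covers 1024
    // elements; anything longer takes the multi-block path further below.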

    template<typename T>
    Scan<T>::Scan()
    {
    }

    template<typename T>
    Scan<T>::~Scan()
    {
        m_buffer.clear();

        for (int i = 0; i < SCAN_LEVEL; i++)
        {
            m_sums[i].clear();
            m_incr[i].clear();
        }
    }

    // Out-of-place exclusive scan: output[i] = sum of input[0..i-1].
    template<typename T>
    void Scan<T>::exclusive(DArray<T>& output, DArray<T>& input, bool bcao)
    {
        assert(input.size() == output.size());

        if (input.size() > SCAN_ELEMENTS_PER_BLOCK) {
            scanLargeDeviceArray(output.begin(), input.begin(), input.size(), bcao, 0);
        }
        else {
            scanSmallDeviceArray(output.begin(), input.begin(), input.size(), bcao);
        }
    }

    // In-place exclusive scan; the input is staged in an internal buffer first.
    template<typename T>
    void Scan<T>::exclusive(DArray<T>& data, bool bcao /*= true*/)
    {
        if (m_buffer.size() != data.size())
        {
            m_buffer.resize(data.size());
        }

        m_buffer.assign(data);
        this->exclusive(data, m_buffer, bcao);
    }

    // Raw-pointer variant of the in-place scan; data must reside on the device.
    template<typename T>
    void Scan<T>::exclusive(T* data, size_t length, bool bcao /*= true*/)
    {
        if (m_buffer.size() != length)
        {
            m_buffer.resize(length);
        }

        cudaMemcpy(m_buffer.begin(), data, length * sizeof(T), cudaMemcpyDeviceToDevice);

        this->exclusive(data, m_buffer.begin(), length, bcao);
    }

    template<typename T>
    void Scan<T>::exclusive(T* output, const T* input, size_t length, bool bcao /*= true*/)
    {
        if (length > SCAN_ELEMENTS_PER_BLOCK) {
            scanLargeDeviceArray(output, input, length, bcao, 0);
        }
        else {
            scanSmallDeviceArray(output, input, length, bcao);
        }
    }
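
    // Usage sketch (illustrative, not part of the library): computing stream-
    // compaction offsets from an array of 0/1 flags, assuming a DArray<int>
    // filled on the device as in the callers elsewhere in PeriDyno:
    //
    //     DArray<int> flags;                    // e.g. {1, 0, 1, 1, 0}
    //     DArray<int> offsets;
    //     offsets.resize(flags.size());
    //
    //     Scan<int> scan;
    //     scan.exclusive(offsets, flags, true); // offsets = {0, 1, 1, 2, 3}
    //     // element i is written to slot offsets[i] whenever flags[i] == 1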

    template<typename T>
    __global__ void k_prescan_arbitrary(T *output, const T *input, size_t n, int powerOfTwo)
    {
        // dynamic shared memory, allocated on invocation
        SharedMemory<T> shared_mem;
        T* temp = shared_mem.getPointer();

        int threadID = threadIdx.x;

        int ai = threadID;
        int bi = threadID + (n / 2);
        int bankOffsetA = SCAN_CONFLICT_FREE_OFFSET(ai);
        int bankOffsetB = SCAN_CONFLICT_FREE_OFFSET(bi);

        if (threadID < n) {
            // load input into shared memory, padded to avoid bank conflicts
            temp[ai + bankOffsetA] = input[ai];
            temp[bi + bankOffsetB] = input[bi];
        }
        else {
            temp[ai + bankOffsetA] = 0;
            temp[bi + bankOffsetB] = 0;
        }

        int offset = 1;
        for (int d = powerOfTwo >> 1; d > 0; d >>= 1) // build sum in place up the tree
        {
            __syncthreads();
            if (threadID < d)
            {
                int ai = offset * (2 * threadID + 1) - 1;
                int bi = offset * (2 * threadID + 2) - 1;
                ai += SCAN_CONFLICT_FREE_OFFSET(ai);
                bi += SCAN_CONFLICT_FREE_OFFSET(bi);

                temp[bi] += temp[ai];
            }
            offset *= 2;
        }

        if (threadID == 0) {
            temp[powerOfTwo - 1 + SCAN_CONFLICT_FREE_OFFSET(powerOfTwo - 1)] = 0; // clear the last element
        }

        for (int d = 1; d < powerOfTwo; d *= 2) // traverse down tree & build scan
        {
            offset >>= 1;
            __syncthreads();
            if (threadID < d)
            {
                int ai = offset * (2 * threadID + 1) - 1;
                int bi = offset * (2 * threadID + 2) - 1;
                ai += SCAN_CONFLICT_FREE_OFFSET(ai);
                bi += SCAN_CONFLICT_FREE_OFFSET(bi);

                T t = temp[ai];
                temp[ai] = temp[bi];
                temp[bi] += t;
            }
        }
        __syncthreads();

        if (threadID < n) {
            output[ai] = temp[ai + bankOffsetA];
            output[bi] = temp[bi + bankOffsetB];
        }
    }
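
    // Worked example of the two phases above, an exclusive scan of the
    // 8-element block {3, 1, 7, 0, 4, 1, 6, 3}:
    //
    //     up-sweep (reduce):  {3, 4, 7,  7, 4,  5, 6,  9}
    //                         {3, 4, 7, 11, 4,  5, 6, 14}
    //                         {3, 4, 7, 11, 4,  5, 6, 25}
    //     clear last element: {3, 4, 7, 11, 4,  5, 6,  0}
    //     down-sweep:         {3, 4, 7,  0, 4,  5, 6, 11}
    //                         {3, 0, 7,  4, 4, 11, 6, 16}
    //                         {0, 3, 4, 11, 11, 15, 16, 22}
    //
    // The final row is the exclusive prefix sum of the input.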

    template<typename T>
    __global__ void k_prescan_arbitrary_unoptimized(T *output, const T *input, size_t n, int powerOfTwo) {
        // dynamic shared memory, allocated on invocation
        SharedMemory<T> shared_mem;
        T* temp = shared_mem.getPointer();

        int threadID = threadIdx.x;

        if (threadID < n) {
            temp[2 * threadID] = input[2 * threadID]; // load input into shared memory
            temp[2 * threadID + 1] = input[2 * threadID + 1];
        }
        else {
            temp[2 * threadID] = 0;
            temp[2 * threadID + 1] = 0;
        }

        int offset = 1;
        for (int d = powerOfTwo >> 1; d > 0; d >>= 1) // build sum in place up the tree
        {
            __syncthreads();
            if (threadID < d)
            {
                int ai = offset * (2 * threadID + 1) - 1;
                int bi = offset * (2 * threadID + 2) - 1;
                temp[bi] += temp[ai];
            }
            offset *= 2;
        }

        if (threadID == 0) { temp[powerOfTwo - 1] = 0; } // clear the last element

        for (int d = 1; d < powerOfTwo; d *= 2) // traverse down tree & build scan
        {
            offset >>= 1;
            __syncthreads();
            if (threadID < d)
            {
                int ai = offset * (2 * threadID + 1) - 1;
                int bi = offset * (2 * threadID + 2) - 1;
                T t = temp[ai];
                temp[ai] = temp[bi];
                temp[bi] += t;
            }
        }
        __syncthreads();

        if (threadID < n) {
            output[2 * threadID] = temp[2 * threadID]; // write results to device memory
            output[2 * threadID + 1] = temp[2 * threadID + 1];
        }
    }

    // Adds a single per-block increment to every element of each block.
    template<typename T>
    __global__ void k_add(T *output, size_t length, const T *n) {
        int blockID = blockIdx.x;
        int threadID = threadIdx.x;
        int blockOffset = blockID * length;

        output[blockOffset + threadID] += n[blockID];
    }

    // Variant taking two per-block addends, used to splice a remainder scan
    // onto the end of an even-sized scan in a single launch.
    template<typename T>
    __global__ void k_add(T *output, size_t length, const T *n1, const T *n2) {
        int blockID = blockIdx.x;
        int threadID = threadIdx.x;
        int blockOffset = blockID * length;

        output[blockOffset + threadID] += n1[blockID] + n2[blockID];
    }

    template<typename T>
    void Scan<T>::scanLargeDeviceArray(T *d_out, const T *d_in, size_t length, bool bcao, size_t level)
    {
        size_t remainder = length % (SCAN_ELEMENTS_PER_BLOCK);
        if (remainder == 0) {
            scanLargeEvenDeviceArray(d_out, d_in, length, bcao, level);
        }
        else {
            // perform a large scan on a compatible multiple of elements
            size_t lengthMultiple = length - remainder;
            scanLargeEvenDeviceArray(d_out, d_in, lengthMultiple, bcao, level);

            // scan the remaining elements and add the (inclusive) last element of the large scan to this
            T *startOfOutputArray = &(d_out[lengthMultiple]);
            scanSmallDeviceArray(startOfOutputArray, &(d_in[lengthMultiple]), remainder, bcao);

            k_add << <1, (unsigned int)remainder >> > (startOfOutputArray, remainder, &(d_in[lengthMultiple - 1]), &(d_out[lengthMultiple - 1]));
            cuSynchronize();
        }
    }
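
    // Example of the splice above: with length = 2500 and 1024 elements per
    // block, lengthMultiple = 2048 and remainder = 452. The tail scan starts
    // from zero, so k_add shifts every tail element by
    // d_out[2047] + d_in[2047], i.e. the exclusive-scan value of the last
    // element of the even part plus that element itself, which is exactly the
    // inclusive total of the first 2048 elements.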

    template<typename T>
    void Scan<T>::scanSmallDeviceArray(T *d_out, const T *d_in, size_t length, bool bcao)
    {
        size_t powerOfTwo = nextPowerOfTwo(length);

        if (bcao) {
            k_prescan_arbitrary << <1, (unsigned int)((length + 1) / 2), 2 * powerOfTwo * sizeof(T) >> > (d_out, d_in, length, powerOfTwo);
            cuSynchronize();
        }
        else {
            k_prescan_arbitrary_unoptimized << <1, (unsigned int)((length + 1) / 2), 2 * powerOfTwo * sizeof(T) >> > (d_out, d_in, length, powerOfTwo);
            cuSynchronize();
        }
    }
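
    // Launch arithmetic for the small scan: for length = 1000, powerOfTwo = 1024,
    // so a single block runs with (1000 + 1) / 2 = 500 threads and
    // 2 * 1024 * sizeof(T) bytes of dynamic shared memory, the factor of two
    // leaving headroom for the bank-conflict padding.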

    template<typename T>
    __global__ void k_prescan_large(T *output, const T *input, int n, T *sums) {
        // dynamic shared memory, allocated on invocation
        SharedMemory<T> shared_mem;
        T* temp = shared_mem.getPointer();

        int blockID = blockIdx.x;
        int threadID = threadIdx.x;
        int blockOffset = blockID * n;

        int ai = threadID;
        int bi = threadID + (n / 2);
        int bankOffsetA = SCAN_CONFLICT_FREE_OFFSET(ai);
        int bankOffsetB = SCAN_CONFLICT_FREE_OFFSET(bi);
        temp[ai + bankOffsetA] = input[blockOffset + ai];
        temp[bi + bankOffsetB] = input[blockOffset + bi];

        int offset = 1;
        for (int d = n >> 1; d > 0; d >>= 1) // build sum in place up the tree
        {
            __syncthreads();
            if (threadID < d)
            {
                int ai = offset * (2 * threadID + 1) - 1;
                int bi = offset * (2 * threadID + 2) - 1;
                ai += SCAN_CONFLICT_FREE_OFFSET(ai);
                bi += SCAN_CONFLICT_FREE_OFFSET(bi);

                temp[bi] += temp[ai];
            }
            offset *= 2;
        }
        __syncthreads();

        if (threadID == 0) {
            sums[blockID] = temp[n - 1 + SCAN_CONFLICT_FREE_OFFSET(n - 1)];
            temp[n - 1 + SCAN_CONFLICT_FREE_OFFSET(n - 1)] = 0;
        }

        for (int d = 1; d < n; d *= 2) // traverse down tree & build scan
        {
            offset >>= 1;
            __syncthreads();
            if (threadID < d)
            {
                int ai = offset * (2 * threadID + 1) - 1;
                int bi = offset * (2 * threadID + 2) - 1;
                ai += SCAN_CONFLICT_FREE_OFFSET(ai);
                bi += SCAN_CONFLICT_FREE_OFFSET(bi);

                T t = temp[ai];
                temp[ai] = temp[bi];
                temp[bi] += t;
            }
        }
        __syncthreads();

        output[blockOffset + ai] = temp[ai + bankOffsetA];
        output[blockOffset + bi] = temp[bi + bankOffsetB];
    }
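
    // Before zeroing the last slot, thread 0 saves the block's inclusive total
    // into sums[blockID]. Scanning that array afterwards yields the per-block
    // increments that k_add folds back in, stitching the independent block
    // scans into one global scan.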

    template<typename T>
    __global__ void k_prescan_large_unoptimized(T *output, const T *input, int n, T *sums) {
        int blockID = blockIdx.x;
        int threadID = threadIdx.x;
        int blockOffset = blockID * n;

        // dynamic shared memory, allocated on invocation
        SharedMemory<T> shared_mem;
        T* temp = shared_mem.getPointer();

        temp[2 * threadID] = input[blockOffset + (2 * threadID)];
        temp[2 * threadID + 1] = input[blockOffset + (2 * threadID) + 1];

        int offset = 1;
        for (int d = n >> 1; d > 0; d >>= 1) // build sum in place up the tree
        {
            __syncthreads();
            if (threadID < d)
            {
                int ai = offset * (2 * threadID + 1) - 1;
                int bi = offset * (2 * threadID + 2) - 1;
                temp[bi] += temp[ai];
            }
            offset *= 2;
        }
        __syncthreads();

        if (threadID == 0) {
            sums[blockID] = temp[n - 1];
            temp[n - 1] = 0;
        }

        for (int d = 1; d < n; d *= 2) // traverse down tree & build scan
        {
            offset >>= 1;
            __syncthreads();
            if (threadID < d)
            {
                int ai = offset * (2 * threadID + 1) - 1;
                int bi = offset * (2 * threadID + 2) - 1;
                T t = temp[ai];
                temp[ai] = temp[bi];
                temp[bi] += t;
            }
        }
        __syncthreads();

        output[blockOffset + (2 * threadID)] = temp[2 * threadID];
        output[blockOffset + (2 * threadID) + 1] = temp[2 * threadID + 1];
    }

    template<typename T>
    void Scan<T>::scanLargeEvenDeviceArray(T *output, const T *input, size_t length, bool bcao, size_t level)
    {
        const int blocks = length / SCAN_ELEMENTS_PER_BLOCK;
        const int sharedMemArraySize = SCAN_ELEMENTS_PER_BLOCK * sizeof(T);

        // For the first SCAN_LEVEL recursion levels, reuse preallocated buffers
        // to avoid allocating GPU memory on every call.
        if (level < SCAN_LEVEL)
        {
            if (m_sums[level].size() != blocks)
            {
                m_sums[level].resize(blocks);
                m_incr[level].resize(blocks);
            }

            if (bcao) {
                k_prescan_large << <blocks, SCAN_THREADS_PER_BLOCK, 2 * sharedMemArraySize >> > (output, input, SCAN_ELEMENTS_PER_BLOCK, m_sums[level].begin());
                cuSynchronize();
            }
            else {
                k_prescan_large_unoptimized << <blocks, SCAN_THREADS_PER_BLOCK, 2 * sharedMemArraySize >> > (output, input, SCAN_ELEMENTS_PER_BLOCK, m_sums[level].begin());
                cuSynchronize();
            }

            const int sumsArrThreadsNeeded = (blocks + 1) / 2;
            if (sumsArrThreadsNeeded > SCAN_THREADS_PER_BLOCK) {
                // perform a large scan on the sums arr
                scanLargeDeviceArray(m_incr[level].begin(), m_sums[level].begin(), blocks, bcao, level + 1);
            }
            else {
                // only need one block to scan sums arr so can use small scan
                scanSmallDeviceArray(m_incr[level].begin(), m_sums[level].begin(), blocks, bcao);
            }

            k_add << <blocks, SCAN_ELEMENTS_PER_BLOCK >> > (output, SCAN_ELEMENTS_PER_BLOCK, m_incr[level].begin());
            cuSynchronize();
        }
        else
        {
            // Deeper recursion levels fall back to on-demand allocation.
            T *d_sums, *d_incr;
            cudaMalloc((void **)&d_sums, blocks * sizeof(T));
            cudaMalloc((void **)&d_incr, blocks * sizeof(T));

            if (bcao) {
                k_prescan_large << <blocks, SCAN_THREADS_PER_BLOCK, 2 * sharedMemArraySize >> > (output, input, SCAN_ELEMENTS_PER_BLOCK, d_sums);
                cuSynchronize();
            }
            else {
                k_prescan_large_unoptimized << <blocks, SCAN_THREADS_PER_BLOCK, 2 * sharedMemArraySize >> > (output, input, SCAN_ELEMENTS_PER_BLOCK, d_sums);
                cuSynchronize();
            }

            const int sumsArrThreadsNeeded = (blocks + 1) / 2;
            if (sumsArrThreadsNeeded > SCAN_THREADS_PER_BLOCK) {
                // perform a large scan on the sums arr
                scanLargeDeviceArray(d_incr, d_sums, blocks, bcao, level + 1);
            }
            else {
                // only need one block to scan sums arr so can use small scan
                scanSmallDeviceArray(d_incr, d_sums, blocks, bcao);
            }

            k_add << <blocks, SCAN_ELEMENTS_PER_BLOCK >> > (output, SCAN_ELEMENTS_PER_BLOCK, d_incr);
            cuSynchronize();

            cudaFree(d_sums);
            cudaFree(d_incr);
        }
    }

    template<typename T>
    bool Scan<T>::isPowerOfTwo(size_t x)
    {
        return x && !(x & (x - 1));
    }
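
    // Example: 8 is 1000b and 7 is 0111b, so 8 & 7 == 0 and the test passes;
    // 6 is 110b and 5 is 101b, so 6 & 5 == 100b, which is nonzero, so 6 fails.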

    template<typename T>
    size_t Scan<T>::nextPowerOfTwo(size_t x)
    {
        size_t power = 1;
        while (power < x) {
            power *= 2;
        }
        return power;
    }

    template class Scan<int>;
    template class Scan<uint>;
}
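
// Verification sketch (illustrative; not compiled into the library): a quick
// sanity check of the device scan against a serial reference, assuming the
// input has been uploaded and scanned with the raw-pointer overload above:
//
//     std::vector<int> h(n);                      // host copy of the input
//     // ... fill h, upload to d_data, run Scan<int>().exclusive(d_data, n) ...
//     std::vector<int> expect(n, 0);
//     for (size_t i = 1; i < n; i++)
//         expect[i] = expect[i - 1] + h[i - 1];   // serial exclusive scan
//     // download d_data and compare element-wise against expect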