PeriDyno 1.0.0
Loading...
Searching...
No Matches
VkFFT_Base.h
Go to the documentation of this file.
1// This file is part of VkFFT, a Vulkan Fast Fourier Transform library
2//
3// Copyright (C) 2020 - present Dmitrii Tolmachev <dtolm96@gmail.com>
4//
5// Permission is hereby granted, free of charge, to any person obtaining a copy
6// of this software and associated documentation files (the "Software"), to deal
7// in the Software without restriction, including without limitation the rights
8// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9// copies of the Software, and to permit persons to whom the Software is
10// furnished to do so, subject to the following conditions:
11//
12// The above copyright notice and this permission notice shall be included in
13// all copies or substantial portions of the Software.
14//
15// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21// THE SOFTWARE.
22
23#ifndef VKFFT_H
24#define VKFFT_H
25
26#include <memory.h>
27#include <math.h>
28#include <stdio.h>
29#include <stdlib.h>
30#include <string.h>
31#ifndef __STDC_FORMAT_MACROS
32#define __STDC_FORMAT_MACROS
33#endif
34#define VKFFT_BACKEND 0
35#include <inttypes.h>
36#if(VKFFT_BACKEND==0)
37#if defined(VK_USE_PLATFORM_ANDROID_KHR)
38#include <android/native_activity.h>
39#include <android/asset_manager.h>
40#include <android_native_app_glue.h>
41#include <sys/system_properties.h>
42#include "VulkanAndroid.h"
43#endif
44#include "vulkan/vulkan.h"
45#include "glslang_c_interface.h"
46#elif(VKFFT_BACKEND==1)
47#include <nvrtc.h>
48#include <cuda.h>
49#include <cuda_runtime.h>
50#include <cuda_runtime_api.h>
51#include <cuComplex.h>
52#elif(VKFFT_BACKEND==2)
53#include <hip/hiprtc.h>
54#include <hip/hip_runtime.h>
55#include <hip/hip_runtime_api.h>
56#include <hip/hip_complex.h>
57#elif(VKFFT_BACKEND==3)
58#ifndef CL_USE_DEPRECATED_OPENCL_1_2_APIS
59#define CL_USE_DEPRECATED_OPENCL_1_2_APIS
60#endif
61#ifdef __APPLE__
62#include <OpenCL/opencl.h>
63#else
64#include <CL/cl.h>
65#endif
66#endif
67
68#include "VkFFT_Defs.h"
69
// NOTE(review): the function header and at least one statement (listing lines
// 70, 72 and 73) are missing from this extract; presumably this is the body of
// VkAppendLine, which the emitters below call after filling sc->tempStr - confirm.
// The visible part copies sc->tempStr into sc->output at offset sc->currentLen
// and advances sc->currentLen by the number of characters written.
// NOTE(review): no capacity check against the output buffer is visible here
// (the sibling below checks sc->maxCodeLength) - verify one exists in the
// missing lines.
71 //appends code line stored in tempStr to generated code
74 sc->currentLen += sprintf(sc->output + sc->currentLen, "%s", sc->tempStr);
75 return VKFFT_SUCCESS;
76};
// NOTE(review): the function header (listing line 77) is missing from this
// extract; the body appends the caller-supplied string `in` rather than
// sc->tempStr, so the comment inherited from the sibling above looks stale.
78 //appends code line stored in tempStr to generated code
// Refuse to append when the text would not fit into the generated-code buffer.
79 if (sc->currentLen + (int64_t)strlen(in) > sc->maxCodeLength) return VKFFT_ERROR_INSUFFICIENT_CODE_BUFFER;
// Copy `in` at the current write offset and advance the offset by the number of characters written.
80 sc->currentLen += sprintf(sc->output + sc->currentLen, "%s", in);
81 return VKFFT_SUCCESS;
82};
// NOTE(review): the function header and the declaration of `res` (listing
// lines 83-84) are missing from this extract. The visible body formats the
// MIT license text as a block of `//` comments into sc->tempStr and appends
// it to the generated code through VkAppendLine.
85 sc->tempLen = sprintf(sc->tempStr, "\
86// This file is part of VkFFT, a Vulkan Fast Fourier Transform library\n\
87//\n\
88// Copyright (C) 2020 - present Dmitrii Tolmachev <dtolm96@gmail.com>\n\
89//\n\
90// Permission is hereby granted, free of charge, to any person obtaining a copy\n\
91// of this software and associated documentation files (the \"Software\"), to deal\n\
92// in the Software without restriction, including without limitation the rights\n\
93// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell\n\
94// copies of the Software, and to permit persons to whom the Software is\n\
95// furnished to do so, subject to the following conditions:\n\
96//\n\
97// The above copyright notice and this permission notice shall be included in\n\
98// all copies or substantial portions of the Software.\n\
99//\n\
100// THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n\
101// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n\
102// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\n\
103// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n\
104// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\n\
105// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN\n\
106// THE SOFTWARE.\n");
// Append the formatted license text; the result of the append is the function's result.
107 res = VkAppendLine(sc);
108 if (res != VKFFT_SUCCESS) return res;
109 return res;
110}
111static inline VkFFTResult VkMovComplex(VkFFTSpecializationConstantsLayout* sc, const char* out, const char* in) {
113 sc->tempLen = sprintf(sc->tempStr, "\
114 %s = %s;\n", out, in);
115 res = VkAppendLine(sc);
116 if (res != VKFFT_SUCCESS) return res;
117 return res;
118};
119static inline VkFFTResult VkMovReal(VkFFTSpecializationConstantsLayout* sc, const char* out, const char* in) {
121 sc->tempLen = sprintf(sc->tempStr, "\
122 %s = %s;\n", out, in);
123 res = VkAppendLine(sc);
124 if (res != VKFFT_SUCCESS) return res;
125 return res;
126};
127static inline VkFFTResult VkSharedStore(VkFFTSpecializationConstantsLayout* sc, const char* id, const char* in) {
129 sc->tempLen = sprintf(sc->tempStr, "\
130 sdata[%s] = %s;\n", id, in);
131 res = VkAppendLine(sc);
132 if (res != VKFFT_SUCCESS) return res;
133 return res;
134};
135static inline VkFFTResult VkSharedLoad(VkFFTSpecializationConstantsLayout* sc, const char* out, const char* id) {
137 sc->tempLen = sprintf(sc->tempStr, "\
138 %s = sdata[%s];\n", out, id);
139 res = VkAppendLine(sc);
140 if (res != VKFFT_SUCCESS) return res;
141 return res;
142};
143static inline VkFFTResult VkAddReal(VkFFTSpecializationConstantsLayout* sc, const char* out, const char* in_1, const char* in_2) {
145 sc->tempLen = sprintf(sc->tempStr, "\
146 %s = %s + %s;\n", out, in_1, in_2);
147 res = VkAppendLine(sc);
148 if (res != VKFFT_SUCCESS) return res;
149 return res;
150};
151static inline VkFFTResult VkAddComplex(VkFFTSpecializationConstantsLayout* sc, const char* out, const char* in_1, const char* in_2) {
153 sc->tempLen = sprintf(sc->tempStr, "\
154 %s.x = %s.x + %s.x;\n\
155 %s.y = %s.y + %s.y;\n", out, in_1, in_2, out, in_1, in_2);
156 res = VkAppendLine(sc);
157 if (res != VKFFT_SUCCESS) return res;
158 return res;
159};
160static inline VkFFTResult VkAddComplexInv(VkFFTSpecializationConstantsLayout* sc, const char* out, const char* in_1, const char* in_2) {
162 sc->tempLen = sprintf(sc->tempStr, "\
163 %s.x = - %s.x - %s.x;\n\
164 %s.y = - %s.y - %s.y;\n", out, in_1, in_2, out, in_1, in_2);
165 res = VkAppendLine(sc);
166 if (res != VKFFT_SUCCESS) return res;
167 return res;
168};
169static inline VkFFTResult VkSubComplex(VkFFTSpecializationConstantsLayout* sc, const char* out, const char* in_1, const char* in_2) {
171 sc->tempLen = sprintf(sc->tempStr, "\
172 %s.x = %s.x - %s.x;\n\
173 %s.y = %s.y - %s.y;\n", out, in_1, in_2, out, in_1, in_2);
174 res = VkAppendLine(sc);
175 if (res != VKFFT_SUCCESS) return res;
176 return res;
177};
178static inline VkFFTResult VkSubReal(VkFFTSpecializationConstantsLayout* sc, const char* out, const char* in_1, const char* in_2) {
180 sc->tempLen = sprintf(sc->tempStr, "\
181 %s = %s - %s;\n", out, in_1, in_2);
182 res = VkAppendLine(sc);
183 if (res != VKFFT_SUCCESS) return res;
184 return res;
185};
186static inline VkFFTResult VkFMAComplex(VkFFTSpecializationConstantsLayout* sc, const char* out, const char* in_1, const char* in_num, const char* in_2) {
188 sc->tempLen = sprintf(sc->tempStr, "\
189 %s.x = fma(%s.x, %s, %s.x);\n\
190 %s.y = fma(%s.y, %s, %s.y);\n", out, in_1, in_num, in_2, out, in_1, in_num, in_2);
191 res = VkAppendLine(sc);
192 if (res != VKFFT_SUCCESS) return res;
193 return res;
194};
195static inline VkFFTResult VkFMAReal(VkFFTSpecializationConstantsLayout* sc, const char* out, const char* in_1, const char* in_num, const char* in_2) {
197 sc->tempLen = sprintf(sc->tempStr, "\
198 %s = fma(%s, %s, %s);\n", out, in_1, in_num, in_2);
199 res = VkAppendLine(sc);
200 if (res != VKFFT_SUCCESS) return res;
201 return res;
202};
203static inline VkFFTResult VkMulComplex(VkFFTSpecializationConstantsLayout* sc, const char* out, const char* in_1, const char* in_2, const char* temp) {
205 if (strcmp(out, in_1) && strcmp(out, in_2)) {
206 sc->tempLen = sprintf(sc->tempStr, "\
207 %s.x = %s.x * %s.x - %s.y * %s.y;\n\
208 %s.y = %s.y * %s.x + %s.x * %s.y;\n", out, in_1, in_2, in_1, in_2, out, in_1, in_2, in_1, in_2);
209 }
210 else {
211 if (temp) {
212 sc->tempLen = sprintf(sc->tempStr, "\
213 %s.x = %s.x * %s.x - %s.y * %s.y;\n\
214 %s.y = %s.y * %s.x + %s.x * %s.y;\n\
215 %s = %s;\n", temp, in_1, in_2, in_1, in_2, temp, in_1, in_2, in_1, in_2, out, temp);
216 }
217 else
219 }
220 res = VkAppendLine(sc);
221 if (res != VKFFT_SUCCESS) return res;
222 return res;
223};
224static inline VkFFTResult VkMulComplexConj(VkFFTSpecializationConstantsLayout* sc, const char* out, const char* in_1, const char* in_2, const char* temp) {
226 if (strcmp(out, in_1) && strcmp(out, in_2)) {
227 sc->tempLen = sprintf(sc->tempStr, "\
228 %s.x = %s.x * %s.x + %s.y * %s.y;\n\
229 %s.y = %s.y * %s.x - %s.x * %s.y;\n", out, in_1, in_2, in_1, in_2, out, in_1, in_2, in_1, in_2);
230 }
231 else {
232 if (temp) {
233 sc->tempLen = sprintf(sc->tempStr, "\
234 %s.x = %s.x * %s.x + %s.y * %s.y;\n\
235 %s.y = %s.y * %s.x - %s.x * %s.y;\n\
236 %s = %s;\n", temp, in_1, in_2, in_1, in_2, temp, in_1, in_2, in_1, in_2, out, temp);
237 }
238 else
240 }
241 res = VkAppendLine(sc);
242 if (res != VKFFT_SUCCESS) return res;
243 return res;
244};
245static inline VkFFTResult VkMulComplexNumber(VkFFTSpecializationConstantsLayout* sc, const char* out, const char* in_1, const char* in_num) {
247 sc->tempLen = sprintf(sc->tempStr, "\
248 %s.x = %s.x * %s;\n\
249 %s.y = %s.y * %s;\n", out, in_1, in_num, out, in_1, in_num);
250 res = VkAppendLine(sc);
251 if (res != VKFFT_SUCCESS) return res;
252 return res;
253};
254static inline VkFFTResult VkMulComplexNumberImag(VkFFTSpecializationConstantsLayout* sc, const char* out, const char* in_1, const char* in_num, const char* temp) {
256 if (strcmp(out, in_1)) {
257 sc->tempLen = sprintf(sc->tempStr, "\
258 %s.x = - %s.y * %s;\n\
259 %s.y = %s.x * %s;\n", out, in_1, in_num, out, in_1, in_num);
260 }
261 else {
262 if (temp) {
263 sc->tempLen = sprintf(sc->tempStr, "\
264 %s.x = - %s.y * %s;\n\
265 %s.y = %s.x * %s;\n\
266 %s = %s;\n", temp, in_1, in_num, temp, in_1, in_num, out, temp);
267 }
268 else
270 }
271 res = VkAppendLine(sc);
272 if (res != VKFFT_SUCCESS) return res;
273 return res;
274};
275static inline VkFFTResult VkDivComplexNumber(VkFFTSpecializationConstantsLayout* sc, const char* out, const char* in_1, const char* in_num) {
277 sc->tempLen = sprintf(sc->tempStr, "\
278 %s.x = %s.x / %s;\n\
279 %s.y = %s.y / %s;\n", out, in_1, in_num, out, in_1, in_num);
280 res = VkAppendLine(sc);
281 if (res != VKFFT_SUCCESS) return res;
282 return res;
283};
284
285static inline VkFFTResult VkMulReal(VkFFTSpecializationConstantsLayout* sc, const char* out, const char* in_1, const char* in_2) {
287 sc->tempLen = sprintf(sc->tempStr, "\
288 %s = %s * %s;\n", out, in_1, in_2);
289 res = VkAppendLine(sc);
290 if (res != VKFFT_SUCCESS) return res;
291 return res;
292};
293
294static inline VkFFTResult VkShuffleComplex(VkFFTSpecializationConstantsLayout* sc, const char* out, const char* in_1, const char* in_2, const char* temp) {
296 if (strcmp(out, in_2)) {
297 sc->tempLen = sprintf(sc->tempStr, "\
298 %s.x = %s.x - %s.y;\n\
299 %s.y = %s.y + %s.x;\n", out, in_1, in_2, out, in_1, in_2);
300 }
301 else {
302 if (temp) {
303 sc->tempLen = sprintf(sc->tempStr, "\
304 %s.x = %s.x - %s.y;\n\
305 %s.y = %s.x + %s.y;\n\
306 %s = %s;\n", temp, in_1, in_2, temp, in_1, in_2, out, temp);
307 }
308 else
310 }
311 res = VkAppendLine(sc);
312 if (res != VKFFT_SUCCESS) return res;
313 return res;
314};
315static inline VkFFTResult VkShuffleComplexInv(VkFFTSpecializationConstantsLayout* sc, const char* out, const char* in_1, const char* in_2, const char* temp) {
317 if (strcmp(out, in_2)) {
318 sc->tempLen = sprintf(sc->tempStr, "\
319 %s.x = %s.x + %s.y;\n\
320 %s.y = %s.y - %s.x;\n", out, in_1, in_2, out, in_1, in_2);
321 }
322 else {
323 if (temp) {
324 sc->tempLen = sprintf(sc->tempStr, "\
325 %s.x = %s.x + %s.y;\n\
326 %s.y = %s.x - %s.y;\n\
327 %s = %s;\n", temp, in_1, in_2, temp, in_1, in_2, out, temp);
328 }
329 else
331 }
332 res = VkAppendLine(sc);
333 if (res != VKFFT_SUCCESS) return res;
334 return res;
335};
336static inline VkFFTResult VkModReal(VkFFTSpecializationConstantsLayout* sc, const char* out, const char* in_1, const char* in_num) {
338 sc->tempLen = sprintf(sc->tempStr, "\
339 %s = %s %% %s;\n", out, in_1, in_num);
340 res = VkAppendLine(sc);
341 if (res != VKFFT_SUCCESS) return res;
342 return res;
343};
344static inline VkFFTResult VkDivReal(VkFFTSpecializationConstantsLayout* sc, const char* out, const char* in_1, const char* in_num) {
346 sc->tempLen = sprintf(sc->tempStr, "\
347 %s = %s / %s;\n", out, in_1, in_num);
348 res = VkAppendLine(sc);
349 if (res != VKFFT_SUCCESS) return res;
350 return res;
351};
352static inline VkFFTResult VkPermute(VkFFTSpecializationConstantsLayout* sc, const uint64_t* permute, const uint64_t num_elem, const uint64_t type, char** regIDs) {
354 char temp_ID[13][20];
355 if (type == 0) {
356 for (uint64_t i = 0; i < num_elem; i++)
357 sprintf(temp_ID[i], "%s", sc->locID[i]);
358 for (uint64_t i = 0; i < num_elem; i++)
359 sprintf(sc->locID[i], "%s", temp_ID[permute[i]]);
360 }
361 if (type == 1) {
362 for (uint64_t i = 0; i < num_elem; i++)
363 sprintf(temp_ID[i], "%s", regIDs[i]);
364 for (uint64_t i = 0; i < num_elem; i++)
365 sprintf(regIDs[i], "%s", temp_ID[permute[i]]);
366 }
367 return res;
368};
369
370static inline VkFFTResult initializeVkFFT(VkFFTApplication* app, VkFFTConfiguration inputLaunchConfiguration);
371static inline VkFFTResult VkFFTAppend(VkFFTApplication* app, int inverse, VkFFTLaunchParams* launchParams);
372
// NOTE(review): the function header and the declaration of `res` (listing
// lines 373-374) are missing from this extract. The visible body emits the
// line "#version 450" into the generated code for backend 0 only; for every
// other backend nothing is appended.
375#if(VKFFT_BACKEND==0)
376 sc->tempLen = sprintf(sc->tempStr, "#version 450\n\n");
377 res = VkAppendLine(sc);
378 if (res != VKFFT_SUCCESS) return res;
379#endif
380 return res;
381}
382static inline VkFFTResult appendExtensions(VkFFTSpecializationConstantsLayout* sc, const char* floatType, const char* floatTypeInputMemory, const char* floatTypeOutputMemory, const char* floatTypeKernelMemory) {
384#if(VKFFT_BACKEND==0)
385 //sc->tempLen = sprintf(sc->tempStr, "#extension GL_EXT_debug_printf : require\n\n");
386 //res = VkAppendLine(sc);
387 //if (res != VKFFT_SUCCESS) return res;
388
389 if ((!strcmp(floatType, "double")) || (sc->useUint64)) {
390 sc->tempLen = sprintf(sc->tempStr, "\
391#extension GL_ARB_gpu_shader_fp64 : enable\n\
392#extension GL_ARB_gpu_shader_int64 : enable\n\n");
393 res = VkAppendLine(sc);
394 if (res != VKFFT_SUCCESS) return res;
395 }
396 if ((!strcmp(floatTypeInputMemory, "half")) || (!strcmp(floatTypeOutputMemory, "half")) || (!strcmp(floatTypeKernelMemory, "half"))) {
397 sc->tempLen = sprintf(sc->tempStr, "#extension GL_EXT_shader_16bit_storage : require\n\n");
398 res = VkAppendLine(sc);
399 if (res != VKFFT_SUCCESS) return res;
400 }
401#elif(VKFFT_BACKEND==1)
402#elif(VKFFT_BACKEND==2)
403#ifdef VKFFT_OLD_ROCM
404 sc->tempLen = sprintf(sc->tempStr, "\
405#include <hip/hip_runtime.h>\n");
406 res = VkAppendLine(sc);
407 if (res != VKFFT_SUCCESS) return res;
408#endif
409#elif(VKFFT_BACKEND==3)
410 if ((!strcmp(floatType, "double")) || (sc->useUint64)) {
411 sc->tempLen = sprintf(sc->tempStr, "\
412#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n\
413#pragma OPENCL EXTENSION cl_khr_int64 : enable\n\n");
414 res = VkAppendLine(sc);
415 if (res != VKFFT_SUCCESS) return res;
416 }
417#endif
418 return res;
419}
// NOTE(review): the function header and the declaration of `res` (listing
// lines 420-421) are missing from this extract. The visible body emits, for
// backend 0 only, the `layout (local_size_x = ..., local_size_y = ...,
// local_size_z = ...) in;` declaration using sc->localSize[0..2]; the other
// backend branches are empty.
422#if(VKFFT_BACKEND==0)
423 sc->tempLen = sprintf(sc->tempStr, "layout (local_size_x = %" PRIu64 ", local_size_y = %" PRIu64 ", local_size_z = %" PRIu64 ") in;\n", sc->localSize[0], sc->localSize[1], sc->localSize[2]);
424 res = VkAppendLine(sc);
425 if (res != VKFFT_SUCCESS) return res;
426#elif(VKFFT_BACKEND==1)
427#elif(VKFFT_BACKEND==2)
428#elif(VKFFT_BACKEND==3)
429#endif
430 return res;
431}
432static inline VkFFTResult appendConstant(VkFFTSpecializationConstantsLayout* sc, const char* type, const char* name, const char* defaultVal, const char* LFending) {
434#if(VKFFT_BACKEND==3)
435 sc->tempLen = sprintf(sc->tempStr, "__constant %s %s = %s%s;\n", type, name, defaultVal, LFending);
436 res = VkAppendLine(sc);
437 if (res != VKFFT_SUCCESS) return res;
438#else
439 sc->tempLen = sprintf(sc->tempStr, "const %s %s = %s%s;\n", type, name, defaultVal, LFending);
440 res = VkAppendLine(sc);
441 if (res != VKFFT_SUCCESS) return res;
442#endif
443 return res;
444}
445static inline VkFFTResult appendPushConstant(VkFFTSpecializationConstantsLayout* sc, const char* type, const char* name) {
447 sc->tempLen = sprintf(sc->tempStr, " %s %s;\n", type, name);
448 res = VkAppendLine(sc);
449 if (res != VKFFT_SUCCESS) return res;
450 return res;
451}
// NOTE(review): the function header and the declaration of `res` (listing
// lines 452-453) are missing from this extract; `numTab` is presumably a
// parameter. The visible body emits one work-group barrier statement in the
// syntax of the selected backend, prefixed by the contents of `tabs`.
454 char tabs[100];
// NOTE(review): every iteration rewrites `tabs` from its first character, so
// the prefix is the same single-character string for any numTab >= 1, and
// `tabs` is left uninitialized (then read through "%s" below) when
// numTab == 0. If the intent is numTab levels of indentation, this needs to
// append instead of overwrite and start from an empty string - confirm intent.
455 for (uint64_t i = 0; i < numTab; i++)
456 sprintf(tabs, " ");
457#if(VKFFT_BACKEND==0)
458 sc->tempLen = sprintf(sc->tempStr, "%sbarrier();\n\n", tabs);
459 res = VkAppendLine(sc);
460 if (res != VKFFT_SUCCESS) return res;
461#elif(VKFFT_BACKEND==1)
462 sc->tempLen = sprintf(sc->tempStr, "%s__syncthreads();\n\n", tabs);
463 res = VkAppendLine(sc);
464 if (res != VKFFT_SUCCESS) return res;
465#elif(VKFFT_BACKEND==2)
466 sc->tempLen = sprintf(sc->tempStr, "%s__syncthreads();\n\n", tabs);
467 res = VkAppendLine(sc);
468 if (res != VKFFT_SUCCESS) return res;
469#elif(VKFFT_BACKEND==3)
470 sc->tempLen = sprintf(sc->tempStr, "%sbarrier(CLK_LOCAL_MEM_FENCE);\n\n", tabs);
471 res = VkAppendLine(sc);
472 if (res != VKFFT_SUCCESS) return res;
473#endif
474 return res;
475}
476static inline VkFFTResult appendPushConstantsVkFFT(VkFFTSpecializationConstantsLayout* sc, const char* floatType, const char* uintType) {
478#if(VKFFT_BACKEND==0)
479 sc->tempLen = sprintf(sc->tempStr, "layout(push_constant) uniform PushConsts\n{\n");
480 res = VkAppendLine(sc);
481 if (res != VKFFT_SUCCESS) return res;
482 res = appendPushConstant(sc, uintType, "workGroupShiftX");
483 if (res != VKFFT_SUCCESS) return res;
484 res = appendPushConstant(sc, uintType, "workGroupShiftY");
485 if (res != VKFFT_SUCCESS) return res;
486 res = appendPushConstant(sc, uintType, "workGroupShiftZ");
487 if (res != VKFFT_SUCCESS) return res;
488 sc->tempLen = sprintf(sc->tempStr, "} consts;\n\n");
489 res = VkAppendLine(sc);
490 if (res != VKFFT_SUCCESS) return res;
491#elif(VKFFT_BACKEND==1)
492 sc->tempLen = sprintf(sc->tempStr, " typedef struct {\n");
493 res = VkAppendLine(sc);
494 if (res != VKFFT_SUCCESS) return res;
495 res = appendPushConstant(sc, uintType, "workGroupShiftX");
496 if (res != VKFFT_SUCCESS) return res;
497 res = appendPushConstant(sc, uintType, "workGroupShiftY");
498 if (res != VKFFT_SUCCESS) return res;
499 res = appendPushConstant(sc, uintType, "workGroupShiftZ");
500 if (res != VKFFT_SUCCESS) return res;
501 sc->tempLen = sprintf(sc->tempStr, " }PushConsts;\n");
502 res = VkAppendLine(sc);
503 if (res != VKFFT_SUCCESS) return res;
504 sc->tempLen = sprintf(sc->tempStr, " __constant__ PushConsts consts;\n");
505 res = VkAppendLine(sc);
506 if (res != VKFFT_SUCCESS) return res;
507#elif(VKFFT_BACKEND==2)
508 sc->tempLen = sprintf(sc->tempStr, " typedef struct {\n");
509 res = VkAppendLine(sc);
510 if (res != VKFFT_SUCCESS) return res;
511 res = appendPushConstant(sc, uintType, "workGroupShiftX");
512 if (res != VKFFT_SUCCESS) return res;
513 res = appendPushConstant(sc, uintType, "workGroupShiftY");
514 if (res != VKFFT_SUCCESS) return res;
515 res = appendPushConstant(sc, uintType, "workGroupShiftZ");
516 if (res != VKFFT_SUCCESS) return res;
517 sc->tempLen = sprintf(sc->tempStr, " }PushConsts;\n");
518 res = VkAppendLine(sc);
519 if (res != VKFFT_SUCCESS) return res;
520 sc->tempLen = sprintf(sc->tempStr, " __constant__ PushConsts consts;\n");
521 res = VkAppendLine(sc);
522 if (res != VKFFT_SUCCESS) return res;
523#elif(VKFFT_BACKEND==3)
524 sc->tempLen = sprintf(sc->tempStr, " typedef struct {\n");
525 res = VkAppendLine(sc);
526 if (res != VKFFT_SUCCESS) return res;
527 res = appendPushConstant(sc, uintType, "workGroupShiftX");
528 if (res != VKFFT_SUCCESS) return res;
529 res = appendPushConstant(sc, uintType, "workGroupShiftY");
530 if (res != VKFFT_SUCCESS) return res;
531 res = appendPushConstant(sc, uintType, "workGroupShiftZ");
532 if (res != VKFFT_SUCCESS) return res;
533 sc->tempLen = sprintf(sc->tempStr, " }PushConsts;\n");
534 res = VkAppendLine(sc);
535 if (res != VKFFT_SUCCESS) return res;
536 //VkAppendLine(sc, " __constant PushConsts consts;\n");
537#endif
538 return res;
539}
540static inline VkFFTResult appendConstantsVkFFT(VkFFTSpecializationConstantsLayout* sc, const char* floatType, const char* uintType) {
542 char LFending[4] = "";
543 if (!strcmp(floatType, "float")) sprintf(LFending, "f");
544#if(VKFFT_BACKEND==0)
545 if (!strcmp(floatType, "double")) sprintf(LFending, "LF");
546#elif(VKFFT_BACKEND==1)
547 if (!strcmp(floatType, "double")) sprintf(LFending, "l");
548#elif(VKFFT_BACKEND==2)
549 if (!strcmp(floatType, "double")) sprintf(LFending, "l");
550#elif(VKFFT_BACKEND==3)
551 //if (!strcmp(floatType, "double")) sprintf(LFending, "l");
552#endif
553 res = appendConstant(sc, floatType, "loc_PI", "3.1415926535897932384626433832795", LFending);
554 if (res != VKFFT_SUCCESS) return res;
555 res = appendConstant(sc, floatType, "loc_SQRT1_2", "0.70710678118654752440084436210485", LFending);
556 if (res != VKFFT_SUCCESS) return res;
557 return res;
558}
559static inline VkFFTResult appendSinCos20(VkFFTSpecializationConstantsLayout* sc, const char* floatType, const char* uintType) {
561 char functionDefinitions[100] = "";
562 char vecType[30];
563 char LFending[4] = "";
564 if (!strcmp(floatType, "float")) sprintf(LFending, "f");
565#if(VKFFT_BACKEND==0)
566 if (!strcmp(floatType, "half")) sprintf(vecType, "f16vec2");
567 if (!strcmp(floatType, "float")) sprintf(vecType, "vec2");
568 if (!strcmp(floatType, "double")) sprintf(vecType, "dvec2");
569 if (!strcmp(floatType, "double")) sprintf(LFending, "LF");
570#elif(VKFFT_BACKEND==1)
571 if (!strcmp(floatType, "half")) sprintf(vecType, "f16vec2");
572 if (!strcmp(floatType, "float")) sprintf(vecType, "float2");
573 if (!strcmp(floatType, "double")) sprintf(vecType, "double2");
574 if (!strcmp(floatType, "double")) sprintf(LFending, "l");
575 sprintf(functionDefinitions, "__device__ static __inline__ ");
576#elif(VKFFT_BACKEND==2)
577 if (!strcmp(floatType, "half")) sprintf(vecType, "f16vec2");
578 if (!strcmp(floatType, "float")) sprintf(vecType, "float2");
579 if (!strcmp(floatType, "double")) sprintf(vecType, "double2");
580 if (!strcmp(floatType, "double")) sprintf(LFending, "l");
581 sprintf(functionDefinitions, "__device__ static __inline__ ");
582#elif(VKFFT_BACKEND==3)
583 if (!strcmp(floatType, "half")) sprintf(vecType, "f16vec2");
584 if (!strcmp(floatType, "float")) sprintf(vecType, "float2");
585 if (!strcmp(floatType, "double")) sprintf(vecType, "double2");
586 //if (!strcmp(floatType, "double")) sprintf(LFending, "l");
587 sprintf(functionDefinitions, "static __inline__ ");
588#endif
589 res = appendConstant(sc, floatType, "loc_2_PI", "0.63661977236758134307553505349006", LFending);
590 if (res != VKFFT_SUCCESS) return res;
591 res = appendConstant(sc, floatType, "loc_PI_2", "1.5707963267948966192313216916398", LFending);
592 if (res != VKFFT_SUCCESS) return res;
593 res = appendConstant(sc, floatType, "a1", "0.99999999999999999999962122687403772", LFending);
594 if (res != VKFFT_SUCCESS) return res;
595 res = appendConstant(sc, floatType, "a3", "-0.166666666666666666637194166219637268", LFending);
596 if (res != VKFFT_SUCCESS) return res;
597 res = appendConstant(sc, floatType, "a5", "0.00833333333333333295212653322266277182", LFending);
598 if (res != VKFFT_SUCCESS) return res;
599 res = appendConstant(sc, floatType, "a7", "-0.000198412698412696489459896530659927773", LFending);
600 if (res != VKFFT_SUCCESS) return res;
601 res = appendConstant(sc, floatType, "a9", "2.75573192239364018847578909205399262e-6", LFending);
602 if (res != VKFFT_SUCCESS) return res;
603 res = appendConstant(sc, floatType, "a11", "-2.50521083781017605729370231280411712e-8", LFending);
604 if (res != VKFFT_SUCCESS) return res;
605 res = appendConstant(sc, floatType, "a13", "1.60590431721336942356660057796782021e-10", LFending);
606 if (res != VKFFT_SUCCESS) return res;
607 res = appendConstant(sc, floatType, "a15", "-7.64712637907716970380859898835680587e-13", LFending);
608 if (res != VKFFT_SUCCESS) return res;
609 res = appendConstant(sc, floatType, "a17", "2.81018528153898622636194976499656274e-15", LFending);
610 if (res != VKFFT_SUCCESS) return res;
611 res = appendConstant(sc, floatType, "ab", "-7.97989713648499642889739108679114937e-18", LFending);
612 if (res != VKFFT_SUCCESS) return res;
613 sc->tempLen = sprintf(sc->tempStr, "\
614%s%s sincos_20(double x)\n\
615{\n\
616 //minimax coefs for sin for 0..pi/2 range\n\
617 double y = abs(x * loc_2_PI);\n\
618 double q = floor(y);\n\
619 int quadrant = int(q);\n\
620 double t = (quadrant & 1) != 0 ? 1 - y + q : y - q;\n\
621 t *= loc_PI_2;\n\
622 double t2 = t * t;\n\
623 double r = fma(fma(fma(fma(fma(fma(fma(fma(fma(ab, t2, a17), t2, a15), t2, a13), t2, a11), t2, a9), t2, a7), t2, a5), t2, a3), t2 * t, t);\n\
624 %s cos_sin;\n\
625 cos_sin.x = ((quadrant == 0) || (quadrant == 3)) ? sqrt(1 - r * r) : -sqrt(1 - r * r);\n\
626 r = x < 0 ? -r : r;\n\
627 cos_sin.y = (quadrant & 2) != 0 ? -r : r;\n\
628 return cos_sin;\n\
629}\n\n", functionDefinitions, vecType, vecType);
630 res = VkAppendLine(sc);
631 if (res != VKFFT_SUCCESS) return res;
632 return res;
633}
634static inline VkFFTResult appendConversion(VkFFTSpecializationConstantsLayout* sc, const char* floatType, const char* floatTypeDifferent) {
636#if(VKFFT_BACKEND!=0)
637 char functionDefinitions[100] = "";
638 char vecType[30];
639 char vecTypeDifferent[30];
640#endif
641#if(VKFFT_BACKEND==0)
642#elif(VKFFT_BACKEND==1)
643 sprintf(functionDefinitions, "__device__ static __inline__ ");
644#elif(VKFFT_BACKEND==2)
645 sprintf(functionDefinitions, "__device__ static __inline__ ");
646#elif(VKFFT_BACKEND==3)
647 sprintf(functionDefinitions, "static __inline__ ");
648#endif
649#if(VKFFT_BACKEND!=0)
650 if (!strcmp(floatType, "half")) sprintf(vecType, "f16vec2");
651 if (!strcmp(floatType, "float")) sprintf(vecType, "float2");
652 if (!strcmp(floatType, "double")) sprintf(vecType, "double2");
653 if (!strcmp(floatTypeDifferent, "half")) sprintf(vecTypeDifferent, "f16vec2");
654 if (!strcmp(floatTypeDifferent, "float")) sprintf(vecTypeDifferent, "float2");
655 if (!strcmp(floatTypeDifferent, "double")) sprintf(vecTypeDifferent, "double2");
656 sc->tempLen = sprintf(sc->tempStr, "\
657%s%s conv_%s(%s input)\n\
658{\n\
659 %s ret_val;\n\
660 ret_val.x = (%s) input.x;\n\
661 ret_val.y = (%s) input.y;\n\
662 return ret_val;\n\
663}\n\n", functionDefinitions, vecType, vecType, vecTypeDifferent, vecType, floatType, floatType);
664 res = VkAppendLine(sc);
665 if (res != VKFFT_SUCCESS) return res;
666 sc->tempLen = sprintf(sc->tempStr, "\
667%s%s conv_%s(%s input)\n\
668{\n\
669 %s ret_val;\n\
670 ret_val.x = (%s) input.x;\n\
671 ret_val.y = (%s) input.y;\n\
672 return ret_val;\n\
673}\n\n", functionDefinitions, vecTypeDifferent, vecTypeDifferent, vecType, vecTypeDifferent, floatTypeDifferent, floatTypeDifferent);
674 res = VkAppendLine(sc);
675 if (res != VKFFT_SUCCESS) return res;
676#endif
677 return res;
678}
679static inline VkFFTResult appendInputLayoutVkFFT(VkFFTSpecializationConstantsLayout* sc, uint64_t id, const char* floatTypeMemory, uint64_t inputType) {
681 char vecType[30];
682 switch (inputType) {
683 case 0: case 1: case 2: case 3: case 4: case 6: {
684#if(VKFFT_BACKEND==0)
685 if (!strcmp(floatTypeMemory, "half")) {
686 sc->inputNumberByteSize = 2 * 2;
687 sprintf(vecType, "f16vec2");
688 }
689 if (!strcmp(floatTypeMemory, "float")) {
690 sc->inputNumberByteSize = 2 * sizeof(float);
691 sprintf(vecType, "vec2");
692 }
693 if (!strcmp(floatTypeMemory, "double")) {
694 sc->inputNumberByteSize = 2 * sizeof(double);
695 sprintf(vecType, "dvec2");
696 }
697 if (sc->inputBufferBlockNum == 1) {
698 sc->tempLen = sprintf(sc->tempStr, "\
699layout(std430, binding = %" PRIu64 ") buffer DataIn{\n\
700 %s inputs[%" PRIu64 "];\n\
701};\n\n", id, vecType, sc->inputBufferBlockSize);
702 res = VkAppendLine(sc);
703 if (res != VKFFT_SUCCESS) return res;
704 }
705 else {
706 sc->tempLen = sprintf(sc->tempStr, "\
707layout(std430, binding = %" PRIu64 ") buffer DataIn{\n\
708 %s inputs[%" PRIu64 "];\n\
709} inputBlocks[%" PRIu64 "];\n\n", id, vecType, sc->inputBufferBlockSize, sc->inputBufferBlockNum);
710 res = VkAppendLine(sc);
711 if (res != VKFFT_SUCCESS) return res;
712 }
713#elif(VKFFT_BACKEND==1)
714 if (!strcmp(floatTypeMemory, "half")) {
715 sc->inputNumberByteSize = 2 * 2;
716 sprintf(vecType, "f16vec2");
717 }
718 if (!strcmp(floatTypeMemory, "float")) {
719 sc->inputNumberByteSize = 2 * sizeof(float);
720 sprintf(vecType, "float2");
721 }
722 if (!strcmp(floatTypeMemory, "double")) {
723 sc->inputNumberByteSize = 2 * sizeof(double);
724 sprintf(vecType, "double2");
725 }
726#elif(VKFFT_BACKEND==2)
727 if (!strcmp(floatTypeMemory, "half")) {
728 sc->inputNumberByteSize = 2 * 2;
729 sprintf(vecType, "f16vec2");
730 }
731 if (!strcmp(floatTypeMemory, "float")) {
732 sc->inputNumberByteSize = 2 * sizeof(float);
733 sprintf(vecType, "float2");
734 }
735 if (!strcmp(floatTypeMemory, "double")) {
736 sc->inputNumberByteSize = 2 * sizeof(double);
737 sprintf(vecType, "double2");
738 }
739#elif(VKFFT_BACKEND==3)
740 if (!strcmp(floatTypeMemory, "half")) {
741 sc->inputNumberByteSize = 2 * 2;
742 sprintf(vecType, "f16vec2");
743 }
744 if (!strcmp(floatTypeMemory, "float")) {
745 sc->inputNumberByteSize = 2 * sizeof(float);
746 sprintf(vecType, "float2");
747 }
748 if (!strcmp(floatTypeMemory, "double")) {
749 sc->inputNumberByteSize = 2 * sizeof(double);
750 sprintf(vecType, "double2");
751 }
752#endif
753 break;
754 }
755 case 5: case 110: case 111: case 120: case 121: case 130: case 131: case 140: case 141: case 142: case 143: case 144: case 145:
756 {
757 if (!strcmp(floatTypeMemory, "half")) {
758 sc->inputNumberByteSize = 2;
759 sprintf(vecType, "float16_t");
760 }
761 if (!strcmp(floatTypeMemory, "float")) {
762 sc->inputNumberByteSize = sizeof(float);
763 sprintf(vecType, "float");
764 }
765 if (!strcmp(floatTypeMemory, "double")) {
766 sc->inputNumberByteSize = sizeof(double);
767 sprintf(vecType, "double");
768 }
769#if(VKFFT_BACKEND==0)
770 if (sc->inputBufferBlockNum == 1) {
771 sc->tempLen = sprintf(sc->tempStr, "\
772layout(std430, binding = %" PRIu64 ") buffer DataIn{\n\
773 %s inputs[%" PRIu64 "];\n\
774};\n\n", id, vecType, 2 * sc->inputBufferBlockSize);
775 res = VkAppendLine(sc);
776 if (res != VKFFT_SUCCESS) return res;
777 }
778 else {
779 sc->tempLen = sprintf(sc->tempStr, "\
780layout(std430, binding = %" PRIu64 ") buffer DataIn{\n\
781 %s inputs[%" PRIu64 "];\n\
782} inputBlocks[%" PRIu64 "];\n\n", id, vecType, 2 * sc->inputBufferBlockSize, sc->inputBufferBlockNum);
783 res = VkAppendLine(sc);
784 if (res != VKFFT_SUCCESS) return res;
785 }
786#endif
787 break;
788 }
789 }
790 return res;
791}
792static inline VkFFTResult appendOutputLayoutVkFFT(VkFFTSpecializationConstantsLayout* sc, uint64_t id, const char* floatTypeMemory, uint64_t outputType) {
794 char vecType[30];
795 switch (outputType) {
796 case 0: case 1: case 2: case 3: case 4: case 5: {
797#if(VKFFT_BACKEND==0)
798 if (!strcmp(floatTypeMemory, "half")) {
799 sc->outputNumberByteSize = 2 * 2;
800 sprintf(vecType, "f16vec2");
801 }
802 if (!strcmp(floatTypeMemory, "float")) {
803 sc->outputNumberByteSize = 2 * sizeof(float);
804 sprintf(vecType, "vec2");
805 }
806 if (!strcmp(floatTypeMemory, "double")) {
807 sc->outputNumberByteSize = 2 * sizeof(double);
808 sprintf(vecType, "dvec2");
809 }
810 if (sc->outputBufferBlockNum == 1) {
811 sc->tempLen = sprintf(sc->tempStr, "\
812layout(std430, binding = %" PRIu64 ") buffer DataOut{\n\
813 %s outputs[%" PRIu64 "];\n\
814};\n\n", id, vecType, sc->outputBufferBlockSize);
815 res = VkAppendLine(sc);
816 if (res != VKFFT_SUCCESS) return res;
817 }
818 else {
819 sc->tempLen = sprintf(sc->tempStr, "\
820layout(std430, binding = %" PRIu64 ") buffer DataOut{\n\
821 %s outputs[%" PRIu64 "];\n\
822} outputBlocks[%" PRIu64 "];\n\n", id, vecType, sc->outputBufferBlockSize, sc->outputBufferBlockNum);
823 res = VkAppendLine(sc);
824 if (res != VKFFT_SUCCESS) return res;
825 }
826#elif(VKFFT_BACKEND==1)
827 if (!strcmp(floatTypeMemory, "half")) {
828 sc->outputNumberByteSize = 2 * 2;
829 sprintf(vecType, "f16vec2");
830 }
831 if (!strcmp(floatTypeMemory, "float")) {
832 sc->outputNumberByteSize = 2 * sizeof(float);
833 sprintf(vecType, "float2");
834 }
835 if (!strcmp(floatTypeMemory, "double")) {
836 sc->outputNumberByteSize = 2 * sizeof(double);
837 sprintf(vecType, "double2");
838 }
839#elif(VKFFT_BACKEND==2)
840 if (!strcmp(floatTypeMemory, "half")) {
841 sc->outputNumberByteSize = 2 * 2;
842 sprintf(vecType, "f16vec2");
843 }
844 if (!strcmp(floatTypeMemory, "float")) {
845 sc->outputNumberByteSize = 2 * sizeof(float);
846 sprintf(vecType, "float2");
847 }
848 if (!strcmp(floatTypeMemory, "double")) {
849 sc->outputNumberByteSize = 2 * sizeof(double);
850 sprintf(vecType, "double2");
851 }
852#elif(VKFFT_BACKEND==3)
853 if (!strcmp(floatTypeMemory, "half")) {
854 sc->outputNumberByteSize = 2 * 2;
855 sprintf(vecType, "f16vec2");
856 }
857 if (!strcmp(floatTypeMemory, "float")) {
858 sc->outputNumberByteSize = 2 * sizeof(float);
859 sprintf(vecType, "float2");
860 }
861 if (!strcmp(floatTypeMemory, "double")) {
862 sc->outputNumberByteSize = 2 * sizeof(double);
863 sprintf(vecType, "double2");
864 }
865#endif
866 break;
867 }
868 case 6: case 110: case 111: case 120: case 121: case 130: case 131: case 140: case 141: case 142: case 143: case 144: case 145:
869 {
870 if (!strcmp(floatTypeMemory, "half")) {
871 sc->outputNumberByteSize = 2;
872 sprintf(vecType, "float16_t");
873 }
874 if (!strcmp(floatTypeMemory, "float")) {
875 sc->outputNumberByteSize = sizeof(float);
876 sprintf(vecType, "float");
877 }
878 if (!strcmp(floatTypeMemory, "double")) {
879 sc->outputNumberByteSize = sizeof(double);
880 sprintf(vecType, "double");
881 }
882#if(VKFFT_BACKEND==0)
883 if (sc->outputBufferBlockNum == 1) {
884 sc->tempLen = sprintf(sc->tempStr, "\
885layout(std430, binding = %" PRIu64 ") buffer DataOut{\n\
886 %s outputs[%" PRIu64 "];\n\
887};\n\n", id, vecType, 2 * sc->outputBufferBlockSize);
888 res = VkAppendLine(sc);
889 if (res != VKFFT_SUCCESS) return res;
890 }
891 else {
892 sc->tempLen = sprintf(sc->tempStr, "\
893layout(std430, binding = %" PRIu64 ") buffer DataOut{\n\
894 %s outputs[%" PRIu64 "];\n\
895} outputBlocks[%" PRIu64 "];\n\n", id, vecType, 2 * sc->outputBufferBlockSize, sc->outputBufferBlockNum);
896 res = VkAppendLine(sc);
897 if (res != VKFFT_SUCCESS) return res;
898 }
899#endif
900 break;
901 }
902 }
903 return res;
904}
905static inline VkFFTResult appendKernelLayoutVkFFT(VkFFTSpecializationConstantsLayout* sc, uint64_t id, const char* floatTypeMemory) {
907 char vecType[30];
908#if(VKFFT_BACKEND==0)
909 if (!strcmp(floatTypeMemory, "half")) {
910 sc->kernelNumberByteSize = 2 * 2;
911 sprintf(vecType, "f16vec2");
912 }
913 if (!strcmp(floatTypeMemory, "float")) {
914 sc->kernelNumberByteSize = 2 * sizeof(float);
915 sprintf(vecType, "vec2");
916 }
917 if (!strcmp(floatTypeMemory, "double")) {
918 sc->kernelNumberByteSize = 2 * sizeof(double);
919 sprintf(vecType, "dvec2");
920 }
921 if (sc->kernelBlockNum == 1) {
922 sc->tempLen = sprintf(sc->tempStr, "\
923layout(std430, binding = %" PRIu64 ") buffer Kernel_FFT{\n\
924 %s kernel_obj[%" PRIu64 "];\n\
925};\n\n", id, vecType, sc->kernelBlockSize);
926 res = VkAppendLine(sc);
927 if (res != VKFFT_SUCCESS) return res;
928 }
929 else {
930 sc->tempLen = sprintf(sc->tempStr, "\
931layout(std430, binding = %" PRIu64 ") buffer Kernel_FFT{\n\
932 %s kernel_obj[%" PRIu64 "];\n\
933} kernelBlocks[%" PRIu64 "];\n\n", id, vecType, sc->kernelBlockSize, sc->kernelBlockNum);
934 res = VkAppendLine(sc);
935 if (res != VKFFT_SUCCESS) return res;
936 }
937#elif(VKFFT_BACKEND==1)
938 if (!strcmp(floatTypeMemory, "half")) {
939 sc->kernelNumberByteSize = 2 * 2;
940 sprintf(vecType, "f16vec2");
941 }
942 if (!strcmp(floatTypeMemory, "float")) {
943 sc->kernelNumberByteSize = 2 * sizeof(float);
944 sprintf(vecType, "float2");
945 }
946 if (!strcmp(floatTypeMemory, "double")) {
947 sc->kernelNumberByteSize = 2 * sizeof(double);
948 sprintf(vecType, "double2");
949 }
950#elif(VKFFT_BACKEND==2)
951 if (!strcmp(floatTypeMemory, "half")) {
952 sc->kernelNumberByteSize = 2 * 2;
953 sprintf(vecType, "f16vec2");
954 }
955 if (!strcmp(floatTypeMemory, "float")) {
956 sc->kernelNumberByteSize = 2 * sizeof(float);
957 sprintf(vecType, "float2");
958 }
959 if (!strcmp(floatTypeMemory, "double")) {
960 sc->kernelNumberByteSize = 2 * sizeof(double);
961 sprintf(vecType, "double2");
962 }
963#elif(VKFFT_BACKEND==3)
964 if (!strcmp(floatTypeMemory, "half")) {
965 sc->kernelNumberByteSize = 2 * 2;
966 sprintf(vecType, "f16vec2");
967 }
968 if (!strcmp(floatTypeMemory, "float")) {
969 sc->kernelNumberByteSize = 2 * sizeof(float);
970 sprintf(vecType, "float2");
971 }
972 if (!strcmp(floatTypeMemory, "double")) {
973 sc->kernelNumberByteSize = 2 * sizeof(double);
974 sprintf(vecType, "double2");
975 }
976#endif
977 return res;
978}
979static inline VkFFTResult appendLUTLayoutVkFFT(VkFFTSpecializationConstantsLayout* sc, uint64_t id, const char* floatType) {
981 char vecType[30];
982#if(VKFFT_BACKEND==0)
983 if (!strcmp(floatType, "float")) sprintf(vecType, "vec2");
984 if (!strcmp(floatType, "double")) sprintf(vecType, "dvec2");
985 sc->tempLen = sprintf(sc->tempStr, "\
986layout(std430, binding = %" PRIu64 ") readonly buffer DataLUT {\n\
987%s twiddleLUT[];\n\
988};\n", id, vecType);
989 res = VkAppendLine(sc);
990 if (res != VKFFT_SUCCESS) return res;
991#elif(VKFFT_BACKEND==1)
992 if (!strcmp(floatType, "float")) sprintf(vecType, "float2");
993 if (!strcmp(floatType, "double")) sprintf(vecType, "double2");
994#elif(VKFFT_BACKEND==2)
995 if (!strcmp(floatType, "float")) sprintf(vecType, "float2");
996 if (!strcmp(floatType, "double")) sprintf(vecType, "double2");
997#elif(VKFFT_BACKEND==3)
998 if (!strcmp(floatType, "float")) sprintf(vecType, "float2");
999 if (!strcmp(floatType, "double")) sprintf(vecType, "double2");
1000#endif
1001 return res;
1002}
// For VKFFT_BACKEND==0, formats the declarations of the read-only Bluestein
// storage buffers (DataBluesteinConvolutionKernel, DataBluesteinMultiplication)
// into sc->tempStr and hands each one to VkAppendLine. Binding indices start at
// `id` and are advanced by one after every declaration that is emitted.
// For backends 1-3 only the local type name is selected.
//
// floatType is matched against "float" and "double" only; for any other string
// vecType is never written before it is formatted with %s.
//
// Returns `res`: the first non-success code from VkAppendLine, otherwise its
// value on entry to the return statement.
// NOTE(review): `res` is used below but no declaration is visible in this
// listing (the embedded numbering skips 1004) - presumably
// `VkFFTResult res = VKFFT_SUCCESS;`; confirm against the original header.
1003static inline VkFFTResult appendBluesteinLayoutVkFFT(VkFFTSpecializationConstantsLayout* sc, uint64_t id, const char* floatType) {
1005 char vecType[30];
// Running binding index; incremented after each emitted buffer declaration.
1006 uint64_t loc_id = id;
1007#if(VKFFT_BACKEND==0)
1008 if (!strcmp(floatType, "float")) sprintf(vecType, "vec2");
1009 if (!strcmp(floatType, "double")) sprintf(vecType, "dvec2");
// The convolution-kernel buffer is declared only when BluesteinConvolutionStep is set.
1010 if (sc->BluesteinConvolutionStep) {
1011 sc->tempLen = sprintf(sc->tempStr, "\
1012layout(std430, binding = %" PRIu64 ") readonly buffer DataBluesteinConvolutionKernel {\n\
1013%s BluesteinConvolutionKernel[];\n\
1014};\n", loc_id, vecType);
1015 res = VkAppendLine(sc);
1016 if (res != VKFFT_SUCCESS) return res;
1017 loc_id++;
1018 }
// NOTE(review): the closing brace after the following `loc_id++` has no visible
// opening statement (the embedded numbering skips 1019). A conditional guarding
// the DataBluesteinMultiplication declaration appears to be missing from this
// listing - confirm the condition against the original header.
1020 sc->tempLen = sprintf(sc->tempStr, "\
1021layout(std430, binding = %" PRIu64 ") readonly buffer DataBluesteinMultiplication {\n\
1022%s BluesteinMultiplication[];\n\
1023};\n", loc_id, vecType);
1024 res = VkAppendLine(sc);
1025 if (res != VKFFT_SUCCESS) return res;
1026 loc_id++;
1027 }
// Backends 1-3: identical type-name selection; no buffer declaration is emitted.
1028#elif(VKFFT_BACKEND==1)
1029 if (!strcmp(floatType, "float")) sprintf(vecType, "float2");
1030 if (!strcmp(floatType, "double")) sprintf(vecType, "double2");
1031#elif(VKFFT_BACKEND==2)
1032 if (!strcmp(floatType, "float")) sprintf(vecType, "float2");
1033 if (!strcmp(floatType, "double")) sprintf(vecType, "double2");
1034#elif(VKFFT_BACKEND==3)
1035 if (!strcmp(floatType, "float")) sprintf(vecType, "float2");
1036 if (!strcmp(floatType, "double")) sprintf(vecType, "double2");
1037#endif
1038 return res;
1039}
1040static inline VkFFTResult indexInputVkFFT(VkFFTSpecializationConstantsLayout* sc, const char* uintType, uint64_t inputType, const char* index_x, const char* index_y, const char* coordinate, const char* batchID) {
1042 switch (inputType) {
1043 case 0: case 2: case 3: case 4:case 5: case 6: case 110: case 120: case 130: case 140: case 142: case 144: {//single_c2c + single_c2c_strided
1044 char inputOffset[30] = "";
1045 if (sc->inputOffset > 0)
1046 sprintf(inputOffset, "%" PRIu64 " + ", sc->inputOffset / sc->inputNumberByteSize);
1047 char shiftX[500] = "";
1048 if (sc->inputStride[0] == 1)
1049 sprintf(shiftX, "(%s)", index_x);
1050 else
1051 sprintf(shiftX, "(%s) * %" PRIu64 "", index_x, sc->inputStride[0]);
1052 char shiftY[500] = "";
1053 uint64_t mult = (sc->mergeSequencesR2C) ? 2 : 1;
1054 if (sc->size[1] > 1) {
1055 if (sc->numAxisUploads == 1) {
1056 if (sc->axisSwapped) {
1057 if (sc->performWorkGroupShift[1])
1058 sprintf(shiftY, " + (%s + consts.workGroupShiftY) * %" PRIu64 "", sc->gl_WorkGroupID_y, mult * sc->localSize[0] * sc->inputStride[1]);
1059 else
1060 sprintf(shiftY, " + %s * %" PRIu64 "", sc->gl_WorkGroupID_y, mult * sc->localSize[0] * sc->inputStride[1]);
1061 }
1062 else {
1063 if (sc->performWorkGroupShift[1])
1064 sprintf(shiftY, " + (%s + consts.workGroupShiftY) * %" PRIu64 "", sc->gl_WorkGroupID_y, mult * sc->localSize[1] * sc->inputStride[1]);
1065 else
1066 sprintf(shiftY, " + %s * %" PRIu64 "", sc->gl_WorkGroupID_y, mult * sc->localSize[1] * sc->inputStride[1]);
1067 }
1068 }
1069 else {
1070 if (sc->performWorkGroupShift[1])
1071 sprintf(shiftY, " + (%s + consts.workGroupShiftY) * %" PRIu64 "", sc->gl_WorkGroupID_y, sc->inputStride[1]);
1072 else
1073 sprintf(shiftY, " + %s * %" PRIu64 "", sc->gl_WorkGroupID_y, sc->inputStride[1]);
1074 }
1075 }
1076 char shiftZ[500] = "";
1077 if (sc->size[2] > 1) {
1078 if (sc->numCoordinates * sc->matrixConvolution * sc->numBatches > 1) {
1079 if (sc->performWorkGroupShift[2])
1080 sprintf(shiftZ, " + ((%s + consts.workGroupShiftZ * %s) %% %" PRIu64 ") * %" PRIu64 "", sc->gl_GlobalInvocationID_z, sc->gl_WorkGroupSize_z, sc->dispatchZactualFFTSize, sc->inputStride[2]);
1081 else
1082 sprintf(shiftZ, " + (%s %% %" PRIu64 ") * %" PRIu64 "", sc->gl_GlobalInvocationID_z, sc->dispatchZactualFFTSize, sc->inputStride[2]);
1083 }
1084 else {
1085 if (sc->performWorkGroupShift[2])
1086 sprintf(shiftZ, " + (%s + consts.workGroupShiftZ * %s) * %" PRIu64 "", sc->gl_GlobalInvocationID_z, sc->gl_WorkGroupSize_z, sc->inputStride[2]);
1087 else
1088 sprintf(shiftZ, " + %s * %" PRIu64 "", sc->gl_GlobalInvocationID_z, sc->inputStride[2]);
1089 }
1090 }
1091 char shiftCoordinate[500] = "";
1092 uint64_t maxCoordinate = sc->numCoordinates * sc->matrixConvolution;
1093 if (sc->numCoordinates * sc->matrixConvolution > 1) {
1094 sprintf(shiftCoordinate, " + ((%s / %" PRIu64 ") %% %" PRIu64 ") * %" PRIu64 "", sc->gl_GlobalInvocationID_z, sc->dispatchZactualFFTSize, maxCoordinate, sc->inputStride[3]);
1095 }
1096 if ((sc->matrixConvolution > 1) && (sc->convolutionStep)) {
1097 maxCoordinate = 1;
1098 sprintf(shiftCoordinate, " + %s * %" PRIu64 "", coordinate, sc->inputStride[3]);
1099 }
1100 char shiftBatch[500] = "";
1101 if ((sc->numBatches > 1) || (sc->numKernels > 1)) {
1102 if (sc->convolutionStep && (sc->numKernels > 1)) {
1103 sprintf(shiftBatch, " + %s * %" PRIu64 "", batchID, sc->inputStride[4]);
1104 }
1105 else
1106 sprintf(shiftBatch, " + (%s / %" PRIu64 ") * %" PRIu64 "", sc->gl_GlobalInvocationID_z, sc->dispatchZactualFFTSize * maxCoordinate, sc->inputStride[4]);
1107 }
1108 sc->tempLen = sprintf(sc->tempStr, "%s%s%s%s%s%s", inputOffset, shiftX, shiftY, shiftZ, shiftCoordinate, shiftBatch);
1109 res = VkAppendLine(sc);
1110 if (res != VKFFT_SUCCESS) return res;
1111 break;
1112 }
1113 case 1: case 111: case 121: case 131: case 141: case 143: case 145: {//grouped_c2c
1114 char inputOffset[30] = "";
1115 if (sc->inputOffset > 0)
1116 sprintf(inputOffset, "%" PRIu64 " + ", sc->inputOffset / sc->inputNumberByteSize);
1117 char shiftX[500] = "";
1118 if (sc->inputStride[0] == 1)
1119 sprintf(shiftX, "(%s)", index_x);
1120 else
1121 sprintf(shiftX, "(%s) * %" PRIu64 "", index_x, sc->inputStride[0]);
1122
1123 char shiftY[500] = "";
1124 if (index_y)
1125 sprintf(shiftY, " + (%s) * %" PRIu64 "", index_y, sc->inputStride[1]);
1126
1127 char shiftZ[500] = "";
1128 if (sc->size[2] > 1) {
1129 if (sc->numCoordinates * sc->matrixConvolution * sc->numBatches > 1) {
1130 if (sc->performWorkGroupShift[2])
1131 sprintf(shiftZ, " + ((%s + consts.workGroupShiftZ * %s) %% %" PRIu64 ") * %" PRIu64 "", sc->gl_GlobalInvocationID_z, sc->gl_WorkGroupSize_z, sc->dispatchZactualFFTSize, sc->inputStride[2]);
1132 else
1133 sprintf(shiftZ, " + (%s %% %" PRIu64 ") * %" PRIu64 "", sc->gl_GlobalInvocationID_z, sc->dispatchZactualFFTSize, sc->inputStride[2]);
1134 }
1135 else {
1136 if (sc->performWorkGroupShift[2])
1137 sprintf(shiftZ, " + (%s + consts.workGroupShiftZ * %s) * %" PRIu64 "", sc->gl_GlobalInvocationID_z, sc->gl_WorkGroupSize_z, sc->inputStride[2]);
1138 else
1139 sprintf(shiftZ, " + %s * %" PRIu64 "", sc->gl_GlobalInvocationID_z, sc->inputStride[2]);
1140 }
1141 }
1142 char shiftCoordinate[500] = "";
1143 uint64_t maxCoordinate = sc->numCoordinates * sc->matrixConvolution;
1144 if (sc->numCoordinates * sc->matrixConvolution > 1) {
1145 sprintf(shiftCoordinate, " + ((%s / %" PRIu64 ") %% %" PRIu64 ") * %" PRIu64 "", sc->gl_GlobalInvocationID_z, sc->dispatchZactualFFTSize, maxCoordinate, sc->inputStride[3]);
1146 }
1147 if ((sc->matrixConvolution > 1) && (sc->convolutionStep)) {
1148 maxCoordinate = 1;
1149 sprintf(shiftCoordinate, " + %s * %" PRIu64 "", coordinate, sc->inputStride[3]);
1150 }
1151 char shiftBatch[500] = "";
1152 if ((sc->numBatches > 1) || (sc->numKernels > 1)) {
1153 if (sc->convolutionStep && (sc->numKernels > 1)) {
1154 sprintf(shiftBatch, " + %s * %" PRIu64 "", batchID, sc->inputStride[4]);
1155 }
1156 else
1157 sprintf(shiftBatch, " + (%s / %" PRIu64 ") * %" PRIu64 "", sc->gl_GlobalInvocationID_z, sc->dispatchZactualFFTSize * maxCoordinate, sc->inputStride[4]);
1158 }
1159 sc->tempLen = sprintf(sc->tempStr, "%s%s%s%s%s%s", inputOffset, shiftX, shiftY, shiftZ, shiftCoordinate, shiftBatch);
1160 res = VkAppendLine(sc);
1161 if (res != VKFFT_SUCCESS) return res;
1162 break;
1163 }
1164 }
1165 return res;
1166}
1167static inline VkFFTResult indexOutputVkFFT(VkFFTSpecializationConstantsLayout* sc, const char* uintType, uint64_t outputType, const char* index_x, const char* index_y, const char* coordinate, const char* batchID) {
1169 switch (outputType) {//single_c2c + single_c2c_strided
1170 case 0: case 2: case 3: case 4: case 5: case 6: case 110: case 120: case 130: case 140: case 142: case 144: {
1171 char outputOffset[30] = "";
1172 if (sc->outputOffset > 0)
1173 sprintf(outputOffset, "%" PRIu64 " + ", sc->outputOffset / sc->outputNumberByteSize);
1174 char shiftX[500] = "";
1175 if (sc->numAxisUploads == 1)
1176 sprintf(shiftX, "(%s)", index_x);
1177 else
1178 sprintf(shiftX, "(%s) * %" PRIu64 "", index_x, sc->outputStride[0]);
1179 char shiftY[500] = "";
1180 uint64_t mult = (sc->mergeSequencesR2C) ? 2 : 1;
1181 if (sc->size[1] > 1) {
1182 if (sc->numAxisUploads == 1) {
1183 if (sc->axisSwapped) {
1184 if (sc->performWorkGroupShift[1])
1185 sprintf(shiftY, " + (%s + consts.workGroupShiftY) * %" PRIu64 "", sc->gl_WorkGroupID_y, mult * sc->localSize[0] * sc->outputStride[1]);
1186 else
1187 sprintf(shiftY, " + %s * %" PRIu64 "", sc->gl_WorkGroupID_y, mult * sc->localSize[0] * sc->outputStride[1]);
1188 }
1189 else {
1190 if (sc->performWorkGroupShift[1])
1191 sprintf(shiftY, " + (%s + consts.workGroupShiftY) * %" PRIu64 "", sc->gl_WorkGroupID_y, mult * sc->localSize[1] * sc->outputStride[1]);
1192 else
1193 sprintf(shiftY, " + %s * %" PRIu64 "", sc->gl_WorkGroupID_y, mult * sc->localSize[1] * sc->outputStride[1]);
1194 }
1195 }
1196 else {
1197 if (sc->performWorkGroupShift[1])
1198 sprintf(shiftY, " + (%s + consts.workGroupShiftY) * %" PRIu64 "", sc->gl_WorkGroupID_y, sc->outputStride[1]);
1199 else
1200 sprintf(shiftY, " + %s * %" PRIu64 "", sc->gl_WorkGroupID_y, sc->outputStride[1]);
1201 }
1202 }
1203 char shiftZ[500] = "";
1204 if (sc->size[2] > 1) {
1205 if (sc->numCoordinates * sc->matrixConvolution * sc->numBatches > 1) {
1206 if (sc->performWorkGroupShift[2])
1207 sprintf(shiftZ, " + ((%s + consts.workGroupShiftZ * %s) %% %" PRIu64 ") * %" PRIu64 "", sc->gl_GlobalInvocationID_z, sc->gl_WorkGroupSize_z, sc->dispatchZactualFFTSize, sc->outputStride[2]);
1208 else
1209 sprintf(shiftZ, " + (%s %% %" PRIu64 ") * %" PRIu64 "", sc->gl_GlobalInvocationID_z, sc->dispatchZactualFFTSize, sc->outputStride[2]);
1210 }
1211 else {
1212 if (sc->performWorkGroupShift[2])
1213 sprintf(shiftZ, " + (%s + consts.workGroupShiftZ * %s) * %" PRIu64 "", sc->gl_GlobalInvocationID_z, sc->gl_WorkGroupSize_z, sc->outputStride[2]);
1214 else
1215 sprintf(shiftZ, " + %s * %" PRIu64 "", sc->gl_GlobalInvocationID_z, sc->outputStride[2]);
1216 }
1217 }
1218 char shiftCoordinate[500] = "";
1219 uint64_t maxCoordinate = sc->numCoordinates * sc->matrixConvolution;
1220 if (sc->numCoordinates * sc->matrixConvolution > 1) {
1221 sprintf(shiftCoordinate, " + ((%s / %" PRIu64 ") %% %" PRIu64 ") * %" PRIu64 "", sc->gl_GlobalInvocationID_z, sc->dispatchZactualFFTSize, maxCoordinate, sc->outputStride[3]);
1222 }
1223 if ((sc->matrixConvolution > 1) && (sc->convolutionStep)) {
1224 maxCoordinate = 1;
1225 sprintf(shiftCoordinate, " + %s * %" PRIu64 "", coordinate, sc->outputStride[3]);
1226 }
1227 char shiftBatch[500] = "";
1228 if ((sc->numBatches > 1) || (sc->numKernels > 1)) {
1229 if (sc->convolutionStep && (sc->numKernels > 1)) {
1230 sprintf(shiftBatch, " + %s * %" PRIu64 "", batchID, sc->outputStride[4]);
1231 }
1232 else
1233 sprintf(shiftBatch, " + (%s / %" PRIu64 ") * %" PRIu64 "", sc->gl_GlobalInvocationID_z, sc->dispatchZactualFFTSize * maxCoordinate, sc->outputStride[4]);
1234 }
1235 sc->tempLen = sprintf(sc->tempStr, "%s%s%s%s%s%s", outputOffset, shiftX, shiftY, shiftZ, shiftCoordinate, shiftBatch);
1236 res = VkAppendLine(sc);
1237 if (res != VKFFT_SUCCESS) return res;
1238 break;
1239 }
1240 case 1: case 111: case 121: case 131: case 141: case 143: case 145: {//grouped_c2c
1241 char outputOffset[30] = "";
1242 if (sc->outputOffset > 0)
1243 sprintf(outputOffset, "%" PRIu64 " + ", sc->outputOffset / sc->outputNumberByteSize);
1244 char shiftX[500] = "";
1245 if (sc->numAxisUploads == 1)
1246 sprintf(shiftX, "(%s)", index_x);
1247 else
1248 sprintf(shiftX, "(%s) * %" PRIu64 "", index_x, sc->outputStride[0]);
1249 char shiftY[500] = "";
1250 if (index_y)
1251 sprintf(shiftY, " + (%s) * %" PRIu64 "", index_y, sc->outputStride[1]);
1252 char shiftZ[500] = "";
1253 if (sc->size[2] > 1) {
1254 if (sc->numCoordinates * sc->matrixConvolution * sc->numBatches > 1) {
1255 if (sc->performWorkGroupShift[2])
1256 sprintf(shiftZ, " + ((%s + consts.workGroupShiftZ * %s) %% %" PRIu64 ") * %" PRIu64 "", sc->gl_GlobalInvocationID_z, sc->gl_WorkGroupSize_z, sc->dispatchZactualFFTSize, sc->outputStride[2]);
1257 else
1258 sprintf(shiftZ, " + (%s %% %" PRIu64 ") * %" PRIu64 "", sc->gl_GlobalInvocationID_z, sc->dispatchZactualFFTSize, sc->outputStride[2]);
1259 }
1260 else {
1261 if (sc->performWorkGroupShift[2])
1262 sprintf(shiftZ, " + (%s + consts.workGroupShiftZ * %s) * %" PRIu64 "", sc->gl_GlobalInvocationID_z, sc->gl_WorkGroupSize_z, sc->outputStride[2]);
1263 else
1264 sprintf(shiftZ, " + %s * %" PRIu64 "", sc->gl_GlobalInvocationID_z, sc->outputStride[2]);
1265 }
1266 }
1267 char shiftCoordinate[500] = "";
1268 uint64_t maxCoordinate = sc->numCoordinates * sc->matrixConvolution;
1269 if (sc->numCoordinates * sc->matrixConvolution > 1) {
1270 sprintf(shiftCoordinate, " + ((%s / %" PRIu64 ") %% %" PRIu64 ") * %" PRIu64 "", sc->gl_GlobalInvocationID_z, sc->dispatchZactualFFTSize, maxCoordinate, sc->outputStride[3]);
1271 }
1272 if ((sc->matrixConvolution > 1) && (sc->convolutionStep)) {
1273 maxCoordinate = 1;
1274 sprintf(shiftCoordinate, " + %s * %" PRIu64 "", coordinate, sc->outputStride[3]);
1275 }
1276 char shiftBatch[500] = "";
1277 if ((sc->numBatches > 1) || (sc->numKernels > 1)) {
1278 if (sc->convolutionStep && (sc->numKernels > 1)) {
1279 sprintf(shiftBatch, " + %s * %" PRIu64 "", batchID, sc->outputStride[4]);
1280 }
1281 else
1282 sprintf(shiftBatch, " + (%s / %" PRIu64 ") * %" PRIu64 "", sc->gl_GlobalInvocationID_z, sc->dispatchZactualFFTSize * maxCoordinate, sc->outputStride[4]);
1283 }
1284 sc->tempLen = sprintf(sc->tempStr, "%s%s%s%s%s%s", outputOffset, shiftX, shiftY, shiftZ, shiftCoordinate, shiftBatch);
1285 res = VkAppendLine(sc);
1286 if (res != VKFFT_SUCCESS) return res;
1287 break;
1288
1289 }
1290 }
1291 return res;
1292}
1293
1294static inline VkFFTResult inlineRadixKernelVkFFT(VkFFTSpecializationConstantsLayout* sc, const char* floatType, const char* uintType, uint64_t radix, uint64_t stageSize, double stageAngle, char** regID) {
1296 char vecType[30];
1297 char LFending[4] = "";
1298 if (!strcmp(floatType, "float")) sprintf(LFending, "f");
1299#if(VKFFT_BACKEND==0)
1300 if (!strcmp(floatType, "float")) sprintf(vecType, "vec2");
1301 if (!strcmp(floatType, "double")) sprintf(vecType, "dvec2");
1302 char cosDef[20] = "cos";
1303 char sinDef[20] = "sin";
1304 if (!strcmp(floatType, "double")) sprintf(LFending, "LF");
1305#elif(VKFFT_BACKEND==1)
1306 if (!strcmp(floatType, "float")) sprintf(vecType, "float2");
1307 if (!strcmp(floatType, "double")) sprintf(vecType, "double2");
1308 char cosDef[20] = "__cosf";
1309 char sinDef[20] = "__sinf";
1310 if (!strcmp(floatType, "double")) sprintf(LFending, "l");
1311#elif(VKFFT_BACKEND==2)
1312 if (!strcmp(floatType, "float")) sprintf(vecType, "float2");
1313 if (!strcmp(floatType, "double")) sprintf(vecType, "double2");
1314 char cosDef[20] = "__cosf";
1315 char sinDef[20] = "__sinf";
1316 if (!strcmp(floatType, "double")) sprintf(LFending, "l");
1317#elif(VKFFT_BACKEND==3)
1318 if (!strcmp(floatType, "float")) sprintf(vecType, "float2");
1319 if (!strcmp(floatType, "double")) sprintf(vecType, "double2");
1320 char cosDef[20] = "native_cos";
1321 char sinDef[20] = "native_sin";
1322 //if (!strcmp(floatType, "double")) sprintf(LFending, "l");
1323#endif
1324 char* temp = sc->temp;
1325 //sprintf(temp, "loc_0");
1326 char* w = sc->w;
1327 //sprintf(w, "w");
1328 char* iw = sc->iw;
1329 //sprintf(iw, "iw");
1330 char convolutionInverse[30] = "";
1331 if (sc->convolutionStep) sprintf(convolutionInverse, ", %s inverse", uintType);
1332 switch (radix) {
1333 case 2: {
1334 /*if (sc->LUT) {
1335 sc->tempLen = sprintf(sc->tempStr, "void radix2(inout %s temp_0, inout %s temp_1, %s LUTId) {\n", vecType, vecType, uintType);
1336 }
1337 else {
1338 sc->tempLen = sprintf(sc->tempStr, "void radix2(inout %s temp_0, inout %s temp_1, %s angle) {\n", vecType, vecType, floatType);
1339 }*/
1340 /*VkAppendLine(sc, " {\n");
1341 sc->tempLen = sprintf(sc->tempStr, " %s %s;\n", vecType, temp);
1342 res = VkAppendLine(sc);
1343if (res != VKFFT_SUCCESS) return res;
1344 sc->tempLen = sprintf(sc->tempStr, " {\n\
1345 %s temp;\n", vecType);*/
1346 if (sc->LUT) {
1347 sc->tempLen = sprintf(sc->tempStr, " %s = twiddleLUT[LUTId];\n", w);
1348 res = VkAppendLine(sc);
1349 if (res != VKFFT_SUCCESS) return res;
1350 if (!sc->inverse) {
1351 sc->tempLen = sprintf(sc->tempStr, " %s.y = -%s.y;\n", w, w);
1352 res = VkAppendLine(sc);
1353 if (res != VKFFT_SUCCESS) return res;
1354 }
1355 }
1356 else {
1357 if (!strcmp(floatType, "float")) {
1358 sc->tempLen = sprintf(sc->tempStr, " %s.x = %s(angle);\n", w, cosDef);
1359 res = VkAppendLine(sc);
1360 if (res != VKFFT_SUCCESS) return res;
1361 sc->tempLen = sprintf(sc->tempStr, " %s.y = %s(angle);\n", w, sinDef);
1362 res = VkAppendLine(sc);
1363 if (res != VKFFT_SUCCESS) return res;
1364 }
1365 if (!strcmp(floatType, "double")) {
1366 sc->tempLen = sprintf(sc->tempStr, " %s = sincos_20(angle);\n", w);
1367 res = VkAppendLine(sc);
1368 if (res != VKFFT_SUCCESS) return res;
1369 }
1370 }
1371 res = VkMulComplex(sc, temp, regID[1], w, 0);
1372 if (res != VKFFT_SUCCESS) return res;
1373 res = VkSubComplex(sc, regID[1], regID[0], temp);
1374 if (res != VKFFT_SUCCESS) return res;
1375 res = VkAddComplex(sc, regID[0], regID[0], temp);
1376 if (res != VKFFT_SUCCESS) return res;
1377 /*VkAppendLine(sc, " }\n");
1378 sc->tempLen = sprintf(sc->tempStr, "\
1379temp.x = temp%s.x * w.x - temp%s.y * w.y;\n\
1380temp.y = temp%s.y * w.x + temp%s.x * w.y;\n\
1381temp%s = temp%s - temp;\n\
1382temp%s = temp%s + temp;\n\
1383}\n", regID[1], regID[1], regID[1], regID[1], regID[1], regID[0], regID[0], regID[0]);*/
1384 break;
1385 }
1386 case 3: {
1387 /* if (sc->LUT) {
1388 sc->tempLen = sprintf(sc->tempStr, "void radix3(inout %s temp_0, inout %s temp_1, inout %s temp_2, %s LUTId) {\n", vecType, vecType, vecType, uintType);
1389 }
1390 else {
1391 sc->tempLen = sprintf(sc->tempStr, "void radix3(inout %s temp_0, inout %s temp_1, inout %s temp_2, %s angle) {\n", vecType, vecType, vecType, floatType);
1392 }*/
1393 char* tf[2];
1394 //VkAppendLine(sc, " {\n");
1395 for (uint64_t i = 0; i < 2; i++) {
1396 tf[i] = (char*)malloc(sizeof(char) * 50);
1397 if (!tf[i]) {
1398 for (uint64_t j = 0; j < i; j++) {
1399 free(tf[j]);
1400 tf[j] = 0;
1401 }
1403 }
1404 }
1405
1406 sprintf(tf[0], "-0.5%s", LFending);
1407 sprintf(tf[1], "-0.8660254037844386467637231707529%s", LFending);
1408
1409 /*for (uint64_t i = 0; i < 3; i++) {
1410 sc->locID[i] = (char*)malloc(sizeof(char) * 50);
1411 sprintf(sc->locID[i], "loc_%" PRIu64 "", i);
1412 sc->tempLen = sprintf(sc->tempStr, " %s %s;\n", vecType, sc->locID[i]);
1413 res = VkAppendLine(sc);
1414if (res != VKFFT_SUCCESS) return res;
1415 }*/
1416 if (sc->LUT) {
1417 sc->tempLen = sprintf(sc->tempStr, " %s = twiddleLUT[LUTId];\n", w);
1418 res = VkAppendLine(sc);
1419 if (res != VKFFT_SUCCESS) return res;
1420 if (!sc->inverse) {
1421 sc->tempLen = sprintf(sc->tempStr, " %s.y = -%s.y;\n", w, w);
1422 res = VkAppendLine(sc);
1423 if (res != VKFFT_SUCCESS) return res;
1424 }
1425 }
1426 else {
1427 if (!strcmp(floatType, "float")) {
1428 sc->tempLen = sprintf(sc->tempStr, " %s.x = %s(angle*%.17f%s);\n", w, cosDef, 4.0 / 3.0, LFending);
1429 res = VkAppendLine(sc);
1430 if (res != VKFFT_SUCCESS) return res;
1431 sc->tempLen = sprintf(sc->tempStr, " %s.y = %s(angle*%.17f%s);\n", w, sinDef, 4.0 / 3.0, LFending);
1432 res = VkAppendLine(sc);
1433 if (res != VKFFT_SUCCESS) return res;
1434 //sc->tempLen = sprintf(sc->tempStr, " w = %s(cos(angle*%.17f), sin(angle*%.17f));\n\n", vecType, 4.0 / 3.0, 4.0 / 3.0);
1435 }
1436 if (!strcmp(floatType, "double")) {
1437 sc->tempLen = sprintf(sc->tempStr, " %s = sincos_20(angle*%.17f%s);\n", w, 4.0 / 3.0, LFending);
1438 res = VkAppendLine(sc);
1439 if (res != VKFFT_SUCCESS) return res;
1440 }
1441 }
1442 res = VkMulComplex(sc, sc->locID[2], regID[2], w, 0);
1443 /*sc->tempLen = sprintf(sc->tempStr, "\
1444loc_2.x = temp%s.x * w.x - temp%s.y * w.y;\n\
1445loc_2.y = temp%s.y * w.x + temp%s.x * w.y;\n", regID[2], regID[2], regID[2], regID[2]);*/
1446 if (sc->LUT) {
1447 sc->tempLen = sprintf(sc->tempStr, " %s = twiddleLUT[LUTId+%" PRIu64 "];\n", w, stageSize);
1448 res = VkAppendLine(sc);
1449 if (res != VKFFT_SUCCESS) return res;
1450 if (!sc->inverse) {
1451 sc->tempLen = sprintf(sc->tempStr, " %s.y = -%s.y;\n", w, w);
1452 res = VkAppendLine(sc);
1453 if (res != VKFFT_SUCCESS) return res;
1454 }
1455 }
1456 else {
1457 if (!strcmp(floatType, "float")) {
1458 sc->tempLen = sprintf(sc->tempStr, " %s.x = %s(angle*%.17f%s);\n", w, cosDef, 2.0 / 3.0, LFending);
1459 res = VkAppendLine(sc);
1460 if (res != VKFFT_SUCCESS) return res;
1461 sc->tempLen = sprintf(sc->tempStr, " %s.y = %s(angle*%.17f%s);\n", w, sinDef, 2.0 / 3.0, LFending);
1462 res = VkAppendLine(sc);
1463 if (res != VKFFT_SUCCESS) return res;
1464 //sc->tempLen = sprintf(sc->tempStr, " w = %s(cos(angle*%.17f), sin(angle*%.17f));\n\n", vecType, 2.0 / 3.0, 2.0 / 3.0);
1465 }
1466 if (!strcmp(floatType, "double")) {
1467 sc->tempLen = sprintf(sc->tempStr, " %s=sincos_20(angle*%.17f%s);\n", w, 2.0 / 3.0, LFending);
1468 res = VkAppendLine(sc);
1469 if (res != VKFFT_SUCCESS) return res;
1470 }
1471 }
1472 res = VkMulComplex(sc, sc->locID[1], regID[1], w, 0);
1473 if (res != VKFFT_SUCCESS) return res;
1474 /*sc->tempLen = sprintf(sc->tempStr, "\
1475loc_1.x = temp%s.x * w.x - temp%s.y * w.y;\n\
1476loc_1.y = temp%s.y * w.x + temp%s.x * w.y;\n", regID[1], regID[1], regID[1], regID[1]);*/
1477 res = VkAddComplex(sc, regID[1], sc->locID[1], sc->locID[2]);
1478 if (res != VKFFT_SUCCESS) return res;
1479 res = VkSubComplex(sc, regID[2], sc->locID[1], sc->locID[2]);
1480 if (res != VKFFT_SUCCESS) return res;
1481 /*sc->tempLen = sprintf(sc->tempStr, "\
1482temp%s = loc_1 + loc_2;\n\
1483temp%s = loc_1 - loc_2;\n", regID[1], regID[2]);*/
1484 res = VkAddComplex(sc, sc->locID[0], regID[0], regID[1]);
1485 if (res != VKFFT_SUCCESS) return res;
1486 res = VkFMAComplex(sc, sc->locID[1], regID[1], tf[0], regID[0]);
1487 if (res != VKFFT_SUCCESS) return res;
1488 res = VkMulComplexNumber(sc, sc->locID[2], regID[2], tf[1]);
1489 if (res != VKFFT_SUCCESS) return res;
1490 res = VkMovComplex(sc, regID[0], sc->locID[0]);
1491 if (res != VKFFT_SUCCESS) return res;
1492 /*sc->tempLen = sprintf(sc->tempStr, "\
1493loc_0 = temp%s + temp%s;\n\
1494loc_1 = temp%s - 0.5 * temp%s;\n\
1495loc_2 = -0.8660254037844386467637231707529 * temp%s;\n\
1496temp%s = loc_0;\n", regID[0], regID[1], regID[0], regID[1], regID[2], regID[0]);*/
1497
1498 if (stageAngle < 0)
1499 {
1500 res = VkShuffleComplex(sc, regID[1], sc->locID[1], sc->locID[2], 0);
1501 if (res != VKFFT_SUCCESS) return res;
1502 res = VkShuffleComplexInv(sc, regID[2], sc->locID[1], sc->locID[2], 0);
1503 if (res != VKFFT_SUCCESS) return res;
1504 /*sc->tempLen = sprintf(sc->tempStr, "\
1505temp%s.x = loc_1.x - loc_2.y; \n\
1506temp%s.y = loc_1.y + loc_2.x; \n\
1507temp%s.x = loc_1.x + loc_2.y; \n\
1508temp%s.y = loc_1.y - loc_2.x; \n", regID[1], regID[1], regID[2], regID[2]);*/
1509 }
1510 else {
1511 res = VkShuffleComplexInv(sc, regID[1], sc->locID[1], sc->locID[2], 0);
1512 if (res != VKFFT_SUCCESS) return res;
1513 res = VkShuffleComplex(sc, regID[2], sc->locID[1], sc->locID[2], 0);
1514 if (res != VKFFT_SUCCESS) return res;
1515 /*sc->tempLen = sprintf(sc->tempStr, "\
1516temp%s.x = loc_1.x + loc_2.y; \n\
1517temp%s.y = loc_1.y - loc_2.x; \n\
1518temp%s.x = loc_1.x - loc_2.y; \n\
1519temp%s.y = loc_1.y + loc_2.x; \n", regID[1], regID[1], regID[2], regID[2]);*/
1520 }
1521
1522 //VkAppendLine(sc, " }\n");
1523 for (uint64_t i = 0; i < 2; i++) {
1524 free(tf[i]);
1525 tf[i] = 0;
1526 //free(sc->locID[i]);
1527 }
1528 //free(sc->locID[2]);
1529 break;
1530 }
1531 case 4: {
1532 /*if (sc->LUT)
1533 sc->tempLen = sprintf(sc->tempStr, "void radix4(inout %s temp_0, inout %s temp_1, inout %s temp_2, inout %s temp_3, %s LUTId%s) {\n", vecType, vecType, vecType, vecType, uintType, convolutionInverse);
1534 else
1535 sc->tempLen = sprintf(sc->tempStr, "void radix4(inout %s temp_0, inout %s temp_1, inout %s temp_2, inout %s temp_3, %s angle%s) {\n", vecType, vecType, vecType, vecType, floatType, convolutionInverse);
1536 */
1537 //VkAppendLine(sc, " {\n");
1538 //sc->tempLen = sprintf(sc->tempStr, " %s %s;\n", vecType, temp);
1539 //res = VkAppendLine(sc);
1540 if (res != VKFFT_SUCCESS) return res;
1541 if (sc->LUT) {
1542 sc->tempLen = sprintf(sc->tempStr, " %s = twiddleLUT[LUTId];\n", w);
1543 res = VkAppendLine(sc);
1544 if (res != VKFFT_SUCCESS) return res;
1545 if (!sc->inverse) {
1546 sc->tempLen = sprintf(sc->tempStr, " %s.y = -%s.y;\n", w, w);
1547 res = VkAppendLine(sc);
1548 if (res != VKFFT_SUCCESS) return res;
1549 }
1550 }
1551 else {
1552 if (!strcmp(floatType, "float")) {
1553 sc->tempLen = sprintf(sc->tempStr, " %s.x = %s(angle);\n", w, cosDef);
1554 res = VkAppendLine(sc);
1555 if (res != VKFFT_SUCCESS) return res;
1556 sc->tempLen = sprintf(sc->tempStr, " %s.y = %s(angle);\n", w, sinDef);
1557 res = VkAppendLine(sc);
1558 if (res != VKFFT_SUCCESS) return res;
1559 }
1560 if (!strcmp(floatType, "double")) {
1561 sc->tempLen = sprintf(sc->tempStr, " %s = sincos_20(angle);\n", w);
1562 res = VkAppendLine(sc);
1563 if (res != VKFFT_SUCCESS) return res;
1564 }
1565 }
1566 res = VkMulComplex(sc, temp, regID[2], w, 0);
1567 if (res != VKFFT_SUCCESS) return res;
1568 res = VkSubComplex(sc, regID[2], regID[0], temp);
1569 if (res != VKFFT_SUCCESS) return res;
1570 res = VkAddComplex(sc, regID[0], regID[0], temp);
1571 if (res != VKFFT_SUCCESS) return res;
1572 res = VkMulComplex(sc, temp, regID[3], w, 0);
1573 if (res != VKFFT_SUCCESS) return res;
1574 res = VkSubComplex(sc, regID[3], regID[1], temp);
1575 if (res != VKFFT_SUCCESS) return res;
1576 res = VkAddComplex(sc, regID[1], regID[1], temp);
1577 if (res != VKFFT_SUCCESS) return res;
1578 /*sc->tempLen = sprintf(sc->tempStr, "\
1579temp.x=temp%s.x*w.x-temp%s.y*w.y;\n\
1580temp.y = temp%s.y * w.x + temp%s.x * w.y;\n\
1581temp%s = temp%s - temp;\n\
1582temp%s = temp%s + temp;\n\n\
1583temp.x=temp%s.x*w.x-temp%s.y*w.y;\n\
1584temp.y = temp%s.y * w.x + temp%s.x * w.y;\n\
1585temp%s = temp%s - temp;\n\
1586temp%s = temp%s + temp;\n\n\
1587//DIF 2nd stage with angle\n", regID[2], regID[2], regID[2], regID[2], regID[2], regID[0], regID[0], regID[0], regID[3], regID[3], regID[3], regID[3], regID[3], regID[1], regID[1], regID[1]);*/
1588 if (sc->LUT) {
1589 sc->tempLen = sprintf(sc->tempStr, " %s=twiddleLUT[LUTId+%" PRIu64 "];\n", w, stageSize);
1590 res = VkAppendLine(sc);
1591 if (res != VKFFT_SUCCESS) return res;
1592 if (!sc->inverse) {
1593 sc->tempLen = sprintf(sc->tempStr, " %s.y = -%s.y;\n", w, w);
1594 res = VkAppendLine(sc);
1595 if (res != VKFFT_SUCCESS) return res;
1596 }
1597 }
1598 else {
1599 if (!strcmp(floatType, "float")) {
1600 sc->tempLen = sprintf(sc->tempStr, " %s.x = %s(0.5%s*angle);\n", w, cosDef, LFending);
1601 res = VkAppendLine(sc);
1602 if (res != VKFFT_SUCCESS) return res;
1603 sc->tempLen = sprintf(sc->tempStr, " %s.y = %s(0.5%s*angle);\n", w, sinDef, LFending);
1604 res = VkAppendLine(sc);
1605 if (res != VKFFT_SUCCESS) return res;
1606 }
1607 if (!strcmp(floatType, "double")) {
1608 sc->tempLen = sprintf(sc->tempStr, " %s=normalize(%s + %s(1.0, 0.0));\n", w, w, vecType);
1609 res = VkAppendLine(sc);
1610 if (res != VKFFT_SUCCESS) return res;
1611 }
1612 }
1613 res = VkMulComplex(sc, temp, regID[1], w, 0);
1614 if (res != VKFFT_SUCCESS) return res;
1615 res = VkSubComplex(sc, regID[1], regID[0], temp);
1616 if (res != VKFFT_SUCCESS) return res;
1617 res = VkAddComplex(sc, regID[0], regID[0], temp);
1618 if (res != VKFFT_SUCCESS) return res;
1619 /*sc->tempLen = sprintf(sc->tempStr, "\
1620temp.x = temp%s.x * w.x - temp%s.y * w.y;\n\
1621temp.y = temp%s.y * w.x + temp%s.x * w.y;\n\
1622temp%s = temp%s - temp;\n\
1623temp%s = temp%s + temp;\n\n", regID[1], regID[1], regID[1], regID[1], regID[1], regID[0], regID[0], regID[0]);*/
1624 if (stageAngle < 0) {
1625 sc->tempLen = sprintf(sc->tempStr, " %s.x = %s.x;", temp, w);
1626 res = VkAppendLine(sc);
1627 if (res != VKFFT_SUCCESS) return res;
1628 sc->tempLen = sprintf(sc->tempStr, " %s.x = %s.y;\n", w, w);
1629 res = VkAppendLine(sc);
1630 if (res != VKFFT_SUCCESS) return res;
1631 sc->tempLen = sprintf(sc->tempStr, " %s.y = -%s.x;\n", w, temp);
1632 res = VkAppendLine(sc);
1633 if (res != VKFFT_SUCCESS) return res;
1634 //sc->tempLen = sprintf(sc->tempStr, " w = %s(w.y, -w.x);\n\n", vecType);
1635 }
1636 else {
1637 sc->tempLen = sprintf(sc->tempStr, " %s.x = %s.x;", temp, w);
1638 res = VkAppendLine(sc);
1639 if (res != VKFFT_SUCCESS) return res;
1640 sc->tempLen = sprintf(sc->tempStr, " %s.x = -%s.y;\n", w, w);
1641 res = VkAppendLine(sc);
1642 if (res != VKFFT_SUCCESS) return res;
1643 sc->tempLen = sprintf(sc->tempStr, " %s.y = %s.x;\n", w, temp);
1644 res = VkAppendLine(sc);
1645 if (res != VKFFT_SUCCESS) return res;
1646 //sc->tempLen = sprintf(sc->tempStr, " w = %s(-w.y, w.x);\n\n", vecType);
1647 }
1648 res = VkMulComplex(sc, temp, regID[3], w, 0);
1649 if (res != VKFFT_SUCCESS) return res;
1650 res = VkSubComplex(sc, regID[3], regID[2], temp);
1651 if (res != VKFFT_SUCCESS) return res;
1652 res = VkAddComplex(sc, regID[2], regID[2], temp);
1653 if (res != VKFFT_SUCCESS) return res;
1654 res = VkMovComplex(sc, temp, regID[1]);
1655 if (res != VKFFT_SUCCESS) return res;
1656 res = VkMovComplex(sc, regID[1], regID[2]);
1657 if (res != VKFFT_SUCCESS) return res;
1658 res = VkMovComplex(sc, regID[2], temp);
1659 if (res != VKFFT_SUCCESS) return res;
1660 /*VkAppendLine(sc, " }\n");
1661 sc->tempLen = sprintf(sc->tempStr, "\
1662temp.x = temp%s.x * w.x - temp%s.y * w.y;\n\
1663temp.y = temp%s.y * w.x + temp%s.x * w.y;\n\
1664temp%s = temp%s - temp;\n\
1665temp%s = temp%s + temp;\n\n\
1666temp = temp%s;\n\
1667temp%s = temp%s;\n\
1668temp%s = temp;\n\
1669}\n", regID[3], regID[3], regID[3], regID[3], regID[3], regID[2], regID[2], regID[2], regID[1], regID[1], regID[2], regID[2]);*/
1670 break;
1671 }
1672 case 5: {
1673 /*if (sc->LUT) {
1674 sc->tempLen = sprintf(sc->tempStr, "void radix5(inout %s temp_0, inout %s temp_1, inout %s temp_2, inout %s temp_3, inout %s temp_4, %s LUTId) {\n", vecType, vecType, vecType, vecType, vecType, uintType);
1675 }
1676 else {
1677 sc->tempLen = sprintf(sc->tempStr, "void radix5(inout %s temp_0, inout %s temp_1, inout %s temp_2, inout %s temp_3, inout %s temp_4, %s angle) {\n", vecType, vecType, vecType, vecType, vecType, floatType);
1678 }*/
1679 char* tf[5];
1680 //VkAppendLine(sc, " {\n");
1681 for (uint64_t i = 0; i < 5; i++) {
1682 tf[i] = (char*)malloc(sizeof(char) * 50);
1683 if (!tf[i]) {
1684 for (uint64_t j = 0; j < i; j++) {
1685 free(tf[j]);
1686 tf[j] = 0;
1687 }
1689 }
1690 }
1691 sprintf(tf[0], "-0.5%s", LFending);
1692 sprintf(tf[1], "1.538841768587626701285145288018455%s", LFending);
1693 sprintf(tf[2], "-0.363271264002680442947733378740309%s", LFending);
1694 sprintf(tf[3], "-0.809016994374947424102293417182819%s", LFending);
1695 sprintf(tf[4], "-0.587785252292473129168705954639073%s", LFending);
1696
1697 /*for (uint64_t i = 0; i < 5; i++) {
1698 sc->locID[i] = (char*)malloc(sizeof(char) * 50);
1699 sprintf(sc->locID[i], "loc_%" PRIu64 "", i);
1700 sc->tempLen = sprintf(sc->tempStr, " %s %s;\n", vecType, sc->locID[i]);
1701 res = VkAppendLine(sc);
1702if (res != VKFFT_SUCCESS) return res;
1703 }*/
1704 /*sc->tempLen = sprintf(sc->tempStr, " {\n\
1705 %s loc_0;\n %s loc_1;\n %s loc_2;\n %s loc_3;\n %s loc_4;\n", vecType, vecType, vecType, vecType, vecType);*/
1706 for (uint64_t i = radix - 1; i > 0; i--) {
1707 if (i == radix - 1) {
1708 if (sc->LUT) {
1709 sc->tempLen = sprintf(sc->tempStr, " %s = twiddleLUT[LUTId];\n", w);
1710 res = VkAppendLine(sc);
1711 if (res != VKFFT_SUCCESS) return res;
1712 if (!sc->inverse) {
1713 sc->tempLen = sprintf(sc->tempStr, " %s.y = -%s.y;\n", w, w);
1714 res = VkAppendLine(sc);
1715 if (res != VKFFT_SUCCESS) return res;
1716 }
1717 }
1718 else {
1719 if (!strcmp(floatType, "float")) {
1720 sc->tempLen = sprintf(sc->tempStr, " %s.x = %s(angle*%.17f%s);\n", w, cosDef, 2.0 * i / radix, LFending);
1721 res = VkAppendLine(sc);
1722 if (res != VKFFT_SUCCESS) return res;
1723 sc->tempLen = sprintf(sc->tempStr, " %s.y = %s(angle*%.17f%s);\n", w, sinDef, 2.0 * i / radix, LFending);
1724 res = VkAppendLine(sc);
1725 if (res != VKFFT_SUCCESS) return res;
1726 //sc->tempLen = sprintf(sc->tempStr, " w = %s(cos(angle*%.17f), sin(angle*%.17f));\n\n", vecType, 2.0 * i / radix, 2.0 * i / radix);
1727 }
1728 if (!strcmp(floatType, "double")) {
1729 sc->tempLen = sprintf(sc->tempStr, " %s = sincos_20(angle*%.17f%s);\n", w, 2.0 * i / radix, LFending);
1730 res = VkAppendLine(sc);
1731 if (res != VKFFT_SUCCESS) return res;
1732 }
1733 }
1734 }
1735 else {
1736 if (sc->LUT) {
1737 sc->tempLen = sprintf(sc->tempStr, " %s = twiddleLUT[LUTId+%" PRIu64 "];\n", w, (radix - 1 - i) * stageSize);
1738 res = VkAppendLine(sc);
1739 if (res != VKFFT_SUCCESS) return res;
1740 if (!sc->inverse) {
1741 sc->tempLen = sprintf(sc->tempStr, " %s.y = -%s.y;\n", w, w);
1742 res = VkAppendLine(sc);
1743 if (res != VKFFT_SUCCESS) return res;
1744 }
1745 }
1746 else {
1747 if (!strcmp(floatType, "float")) {
1748 sc->tempLen = sprintf(sc->tempStr, " %s.x = %s(angle*%.17f%s);\n", w, cosDef, 2.0 * i / radix, LFending);
1749 res = VkAppendLine(sc);
1750 if (res != VKFFT_SUCCESS) return res;
1751 sc->tempLen = sprintf(sc->tempStr, " %s.y = %s(angle*%.17f%s);\n", w, sinDef, 2.0 * i / radix, LFending);
1752 res = VkAppendLine(sc);
1753 if (res != VKFFT_SUCCESS) return res;
1754 //sc->tempLen = sprintf(sc->tempStr, " w = %s(cos(angle*%.17f), sin(angle*%.17f));\n\n", vecType, 2.0 * i / radix, 2.0 * i / radix);
1755 }
1756 if (!strcmp(floatType, "double")) {
1757 sc->tempLen = sprintf(sc->tempStr, " %s = sincos_20(angle*%.17f%s);\n", w, 2.0 * i / radix, LFending);
1758 res = VkAppendLine(sc);
1759 if (res != VKFFT_SUCCESS) return res;
1760 }
1761 }
1762 }
1763 res = VkMulComplex(sc, sc->locID[i], regID[i], w, 0);
1764 if (res != VKFFT_SUCCESS) return res;
1765 /*sc->tempLen = sprintf(sc->tempStr, "\
1766loc_%" PRIu64 ".x = temp%s.x * w.x - temp%s.y * w.y;\n\
1767loc_%" PRIu64 ".y = temp%s.y * w.x + temp%s.x * w.y;\n", i, regID[i], regID[i], i, regID[i], regID[i]);*/
1768 }
1769 res = VkAddComplex(sc, regID[1], sc->locID[1], sc->locID[4]);
1770 if (res != VKFFT_SUCCESS) return res;
1771 res = VkAddComplex(sc, regID[2], sc->locID[2], sc->locID[3]);
1772 if (res != VKFFT_SUCCESS) return res;
1773 res = VkSubComplex(sc, regID[3], sc->locID[2], sc->locID[3]);
1774 if (res != VKFFT_SUCCESS) return res;
1775 res = VkSubComplex(sc, regID[4], sc->locID[1], sc->locID[4]);
1776 if (res != VKFFT_SUCCESS) return res;
1777 res = VkSubComplex(sc, sc->locID[3], regID[1], regID[2]);
1778 if (res != VKFFT_SUCCESS) return res;
1779 res = VkAddComplex(sc, sc->locID[4], regID[3], regID[4]);
1780 if (res != VKFFT_SUCCESS) return res;
1781 /*sc->tempLen = sprintf(sc->tempStr, "\
1782temp%s = loc_1 + loc_4;\n\
1783temp%s = loc_2 + loc_3;\n\
1784temp%s = loc_2 - loc_3;\n\
1785temp%s = loc_1 - loc_4;\n\
1786loc_3 = temp%s - temp%s;\n\
1787loc_4 = temp%s + temp%s;\n", regID[1], regID[2], regID[3], regID[4], regID[1], regID[2], regID[3], regID[4]);*/
1788 res = VkAddComplex(sc, sc->locID[0], regID[0], regID[1]);
1789 if (res != VKFFT_SUCCESS) return res;
1790 res = VkAddComplex(sc, sc->locID[0], sc->locID[0], regID[2]);
1791 if (res != VKFFT_SUCCESS) return res;
1792 res = VkFMAComplex(sc, sc->locID[1], regID[1], tf[0], regID[0]);
1793 if (res != VKFFT_SUCCESS) return res;
1794 res = VkFMAComplex(sc, sc->locID[2], regID[2], tf[0], regID[0]);
1795 if (res != VKFFT_SUCCESS) return res;
1796 res = VkMulComplexNumber(sc, regID[3], regID[3], tf[1]);
1797 if (res != VKFFT_SUCCESS) return res;
1798 res = VkMulComplexNumber(sc, regID[4], regID[4], tf[2]);
1799 if (res != VKFFT_SUCCESS) return res;
1800 res = VkMulComplexNumber(sc, sc->locID[3], sc->locID[3], tf[3]);
1801 if (res != VKFFT_SUCCESS) return res;
1802 res = VkMulComplexNumber(sc, sc->locID[4], sc->locID[4], tf[4]);
1803 if (res != VKFFT_SUCCESS) return res;
1804 /*sc->tempLen = sprintf(sc->tempStr, "\
1805loc_0 = temp%s + temp%s + temp%s;\n\
1806loc_1 = temp%s - 0.5 * temp%s;\n\
1807loc_2 = temp%s - 0.5 * temp%s;\n\
1808temp%s *= 1.538841768587626701285145288018455;\n\
1809temp%s *= -0.363271264002680442947733378740309;\n\
1810loc_3 *= -0.809016994374947424102293417182819;\n\
1811loc_4 *= -0.587785252292473129168705954639073;\n", regID[0], regID[1], regID[2], regID[0], regID[1], regID[0], regID[2], regID[3], regID[4]);*/
1812 res = VkSubComplex(sc, sc->locID[1], sc->locID[1], sc->locID[3]);
1813 if (res != VKFFT_SUCCESS) return res;
1814 res = VkAddComplex(sc, sc->locID[2], sc->locID[2], sc->locID[3]);
1815 if (res != VKFFT_SUCCESS) return res;
1816 res = VkAddComplex(sc, sc->locID[3], regID[3], sc->locID[4]);
1817 if (res != VKFFT_SUCCESS) return res;
1818 res = VkAddComplex(sc, sc->locID[4], sc->locID[4], regID[4]);
1819 if (res != VKFFT_SUCCESS) return res;
1820 res = VkMovComplex(sc, regID[0], sc->locID[0]);
1821 if (res != VKFFT_SUCCESS) return res;
1822 /*sc->tempLen = sprintf(sc->tempStr, "\
1823loc_1 -= loc_3;\n\
1824loc_2 += loc_3;\n\
1825loc_3 = temp%s+loc_4;\n\
1826loc_4 += temp%s;\n\
1827temp%s = loc_0;\n", regID[3], regID[4], regID[0]);*/
1828
1829 if (stageAngle < 0)
1830 {
1831 res = VkShuffleComplex(sc, regID[1], sc->locID[1], sc->locID[4], 0);
1832 if (res != VKFFT_SUCCESS) return res;
1833 res = VkShuffleComplex(sc, regID[2], sc->locID[2], sc->locID[3], 0);
1834 if (res != VKFFT_SUCCESS) return res;
1835 res = VkShuffleComplexInv(sc, regID[3], sc->locID[2], sc->locID[3], 0);
1836 if (res != VKFFT_SUCCESS) return res;
1837 res = VkShuffleComplexInv(sc, regID[4], sc->locID[1], sc->locID[4], 0);
1838 if (res != VKFFT_SUCCESS) return res;
1839 /*sc->tempLen = sprintf(sc->tempStr, "\
1840temp%s.x = loc_1.x - loc_4.y; \n\
1841temp%s.y = loc_1.y + loc_4.x; \n\
1842temp%s.x = loc_2.x - loc_3.y; \n\
1843temp%s.y = loc_2.y + loc_3.x; \n\
1844temp%s.x = loc_2.x + loc_3.y; \n\
1845temp%s.y = loc_2.y - loc_3.x; \n\
1846temp%s.x = loc_1.x + loc_4.y; \n\
1847temp%s.y = loc_1.y - loc_4.x; \n", regID[1], regID[1], regID[2], regID[2], regID[3], regID[3], regID[4], regID[4]);*/
1848 }
1849 else {
1850 res = VkShuffleComplexInv(sc, regID[1], sc->locID[1], sc->locID[4], 0);
1851 if (res != VKFFT_SUCCESS) return res;
1852 res = VkShuffleComplexInv(sc, regID[2], sc->locID[2], sc->locID[3], 0);
1853 if (res != VKFFT_SUCCESS) return res;
1854 res = VkShuffleComplex(sc, regID[3], sc->locID[2], sc->locID[3], 0);
1855 if (res != VKFFT_SUCCESS) return res;
1856 res = VkShuffleComplex(sc, regID[4], sc->locID[1], sc->locID[4], 0);
1857 if (res != VKFFT_SUCCESS) return res;
1858 /*sc->tempLen = sprintf(sc->tempStr, "\
1859temp%s.x = loc_1.x + loc_4.y; \n\
1860temp%s.y = loc_1.y - loc_4.x; \n\
1861temp%s.x = loc_2.x + loc_3.y; \n\
1862temp%s.y = loc_2.y - loc_3.x; \n\
1863temp%s.x = loc_2.x - loc_3.y; \n\
1864temp%s.y = loc_2.y + loc_3.x; \n\
1865temp%s.x = loc_1.x - loc_4.y; \n\
1866temp%s.y = loc_1.y + loc_4.x; \n", regID[1], regID[1], regID[2], regID[2], regID[3], regID[3], regID[4], regID[4]);*/
1867 }
1868
1869 //VkAppendLine(sc, " }\n");
1870 for (uint64_t i = 0; i < 5; i++) {
1871 free(tf[i]);
1872 tf[i] = 0;
1873 //free(sc->locID[i]);
1874 }
1875 break;
1876 }
1877 case 7: {
1878 /*if (sc->LUT) {
1879 sc->tempLen = sprintf(sc->tempStr, "void radix5(inout %s temp_0, inout %s temp_1, inout %s temp_2, inout %s temp_3, inout %s temp_4, %s LUTId) {\n", vecType, vecType, vecType, vecType, vecType, uintType);
1880 }
1881 else {
1882 sc->tempLen = sprintf(sc->tempStr, "void radix5(inout %s temp_0, inout %s temp_1, inout %s temp_2, inout %s temp_3, inout %s temp_4, %s angle) {\n", vecType, vecType, vecType, vecType, vecType, floatType);
1883 }*/
1884 char* tf[8];
1885
1886 //VkAppendLine(sc, " {\n");
1887 for (uint64_t i = 0; i < 8; i++) {
1888 tf[i] = (char*)malloc(sizeof(char) * 50);
1889 if (!tf[i]) {
1890 for (uint64_t j = 0; j < i; j++) {
1891 free(tf[j]);
1892 tf[j] = 0;
1893 }
1895 }
1896 }
1897 sprintf(tf[0], "-1.16666666666666651863693004997913%s", LFending);
1898 sprintf(tf[1], "0.79015646852540022404554065360571%s", LFending);
1899 sprintf(tf[2], "0.05585426728964774240049351305970%s", LFending);
1900 sprintf(tf[3], "0.73430220123575240531721419756650%s", LFending);
1901 if (stageAngle < 0) {
1902 sprintf(tf[4], "0.44095855184409837868031445395900%s", LFending);
1903 sprintf(tf[5], "0.34087293062393136944265847887436%s", LFending);
1904 sprintf(tf[6], "-0.53396936033772524066165487965918%s", LFending);
1905 sprintf(tf[7], "0.87484229096165666561546458979137%s", LFending);
1906 }
1907 else {
1908 sprintf(tf[4], "-0.44095855184409837868031445395900%s", LFending);
1909 sprintf(tf[5], "-0.34087293062393136944265847887436%s", LFending);
1910 sprintf(tf[6], "0.53396936033772524066165487965918%s", LFending);
1911 sprintf(tf[7], "-0.87484229096165666561546458979137%s", LFending);
1912 }
1913 /*for (uint64_t i = 0; i < 7; i++) {
1914 sc->locID[i] = (char*)malloc(sizeof(char) * 50);
1915 sprintf(sc->locID[i], "loc_%" PRIu64 "", i);
1916 sc->tempLen = sprintf(sc->tempStr, " %s %s;\n", vecType, sc->locID[i]);
1917 res = VkAppendLine(sc);
1918if (res != VKFFT_SUCCESS) return res;
1919 }*/
1920 for (uint64_t i = radix - 1; i > 0; i--) {
1921 if (i == radix - 1) {
1922 if (sc->LUT) {
1923 sc->tempLen = sprintf(sc->tempStr, " %s = twiddleLUT[LUTId];\n", w);
1924 res = VkAppendLine(sc);
1925 if (res != VKFFT_SUCCESS) return res;
1926 if (!sc->inverse) {
1927 sc->tempLen = sprintf(sc->tempStr, " %s.y = -%s.y;\n", w, w);
1928 res = VkAppendLine(sc);
1929 if (res != VKFFT_SUCCESS) return res;
1930 }
1931 }
1932 else {
1933 if (!strcmp(floatType, "float")) {
1934 sc->tempLen = sprintf(sc->tempStr, " %s.x = %s(angle*%.17f%s);\n", w, cosDef, 2.0 * i / radix, LFending);
1935 res = VkAppendLine(sc);
1936 if (res != VKFFT_SUCCESS) return res;
1937 sc->tempLen = sprintf(sc->tempStr, " %s.y = %s(angle*%.17f%s);\n", w, sinDef, 2.0 * i / radix, LFending);
1938 res = VkAppendLine(sc);
1939 if (res != VKFFT_SUCCESS) return res;
1940 //sc->tempLen = sprintf(sc->tempStr, " w = %s(cos(angle*%.17f), sin(angle*%.17f));\n\n", vecType, 2.0 * i / radix, 2.0 * i / radix);
1941 }
1942 if (!strcmp(floatType, "double")) {
1943 sc->tempLen = sprintf(sc->tempStr, " %s = sincos_20(angle*%.17f%s);\n", w, 2.0 * i / radix, LFending);
1944 res = VkAppendLine(sc);
1945 if (res != VKFFT_SUCCESS) return res;
1946 }
1947 }
1948 }
1949 else {
1950 if (sc->LUT) {
1951 sc->tempLen = sprintf(sc->tempStr, " %s = twiddleLUT[LUTId+%" PRIu64 "];\n\n", w, (radix - 1 - i) * stageSize);
1952 res = VkAppendLine(sc);
1953 if (res != VKFFT_SUCCESS) return res;
1954 if (!sc->inverse) {
1955 sc->tempLen = sprintf(sc->tempStr, " %s.y = -%s.y;\n", w, w);
1956 res = VkAppendLine(sc);
1957 if (res != VKFFT_SUCCESS) return res;
1958 }
1959 }
1960 else {
1961 if (!strcmp(floatType, "float")) {
1962 sc->tempLen = sprintf(sc->tempStr, " %s.x = %s(angle*%.17f%s);\n", w, cosDef, 2.0 * i / radix, LFending);
1963 res = VkAppendLine(sc);
1964 if (res != VKFFT_SUCCESS) return res;
1965 sc->tempLen = sprintf(sc->tempStr, " %s.y = %s(angle*%.17f%s);\n", w, sinDef, 2.0 * i / radix, LFending);
1966 res = VkAppendLine(sc);
1967 if (res != VKFFT_SUCCESS) return res;
1968 //sc->tempLen = sprintf(sc->tempStr, " w = %s(cos(angle*%.17f), sin(angle*%.17f));\n\n", vecType, 2.0 * i / radix, 2.0 * i / radix);
1969 }
1970 if (!strcmp(floatType, "double")) {
1971 sc->tempLen = sprintf(sc->tempStr, " %s = sincos_20(angle*%.17f%s);\n", w, 2.0 * i / radix, LFending);
1972 res = VkAppendLine(sc);
1973 if (res != VKFFT_SUCCESS) return res;
1974 }
1975 }
1976 }
1977 res = VkMulComplex(sc, sc->locID[i], regID[i], w, 0);
1978 if (res != VKFFT_SUCCESS) return res;
1979 /*sc->tempLen = sprintf(sc->tempStr, "\
1980loc_%" PRIu64 ".x = temp%s.x * w.x - temp%s.y * w.y;\n\
1981loc_%" PRIu64 ".y = temp%s.y * w.x + temp%s.x * w.y;\n", i, regID[i], regID[i], i, regID[i], regID[i]);*/
1982 }
1983 res = VkMovComplex(sc, sc->locID[0], regID[0]);
1984 if (res != VKFFT_SUCCESS) return res;
1985 res = VkAddComplex(sc, regID[0], sc->locID[1], sc->locID[6]);
1986 if (res != VKFFT_SUCCESS) return res;
1987 res = VkSubComplex(sc, regID[1], sc->locID[1], sc->locID[6]);
1988 if (res != VKFFT_SUCCESS) return res;
1989 res = VkAddComplex(sc, regID[2], sc->locID[2], sc->locID[5]);
1990 if (res != VKFFT_SUCCESS) return res;
1991 res = VkSubComplex(sc, regID[3], sc->locID[2], sc->locID[5]);
1992 if (res != VKFFT_SUCCESS) return res;
1993 res = VkAddComplex(sc, regID[4], sc->locID[4], sc->locID[3]);
1994 if (res != VKFFT_SUCCESS) return res;
1995 res = VkSubComplex(sc, regID[5], sc->locID[4], sc->locID[3]);
1996 if (res != VKFFT_SUCCESS) return res;
1997 /*sc->tempLen = sprintf(sc->tempStr, "\
1998loc_0 = temp%s;\n\
1999temp%s = loc_1 + loc_6;\n\
2000temp%s = loc_1 - loc_6;\n\
2001temp%s = loc_2 + loc_5;\n\
2002temp%s = loc_2 - loc_5;\n\
2003temp%s = loc_4 + loc_3;\n\
2004temp%s = loc_4 - loc_3;\n", regID[0], regID[0], regID[1], regID[2], regID[3], regID[4], regID[5]);*/
2005 res = VkAddComplex(sc, sc->locID[5], regID[1], regID[3]);
2006 if (res != VKFFT_SUCCESS) return res;
2007 res = VkAddComplex(sc, sc->locID[5], sc->locID[5], regID[5]);
2008 if (res != VKFFT_SUCCESS) return res;
2009 res = VkAddComplex(sc, sc->locID[1], regID[0], regID[2]);
2010 if (res != VKFFT_SUCCESS) return res;
2011 res = VkAddComplex(sc, sc->locID[1], sc->locID[1], regID[4]);
2012 if (res != VKFFT_SUCCESS) return res;
2013 res = VkAddComplex(sc, sc->locID[0], sc->locID[0], sc->locID[1]);
2014 if (res != VKFFT_SUCCESS) return res;
2015 /*sc->tempLen = sprintf(sc->tempStr, "\
2016loc_5 = temp%s + temp%s + temp%s;\n\
2017loc_1 = temp%s + temp%s + temp%s;\n\
2018loc_0 += loc_1;\n", regID[1], regID[3], regID[5], regID[0], regID[2], regID[4]);*/
2019 res = VkSubComplex(sc, sc->locID[2], regID[0], regID[4]);
2020 if (res != VKFFT_SUCCESS) return res;
2021 res = VkSubComplex(sc, sc->locID[3], regID[4], regID[2]);
2022 if (res != VKFFT_SUCCESS) return res;
2023 res = VkSubComplex(sc, sc->locID[4], regID[2], regID[0]);
2024 if (res != VKFFT_SUCCESS) return res;
2025 /*sc->tempLen = sprintf(sc->tempStr, "\
2026loc_2 = temp%s - temp%s;\n\
2027loc_3 = temp%s - temp%s;\n\
2028loc_4 = temp%s - temp%s;\n", regID[0], regID[4], regID[4], regID[2], regID[2], regID[0]);*/
2029 res = VkSubComplex(sc, regID[0], regID[1], regID[5]);
2030 if (res != VKFFT_SUCCESS) return res;
2031 res = VkSubComplex(sc, regID[2], regID[5], regID[3]);
2032 if (res != VKFFT_SUCCESS) return res;
2033 res = VkSubComplex(sc, regID[4], regID[3], regID[1]);
2034 if (res != VKFFT_SUCCESS) return res;
2035 /*sc->tempLen = sprintf(sc->tempStr, "\
2036temp%s = temp%s - temp%s;\n\
2037temp%s = temp%s - temp%s;\n\
2038temp%s = temp%s - temp%s;\n", regID[0], regID[1], regID[5], regID[2], regID[5], regID[3], regID[4], regID[3], regID[1]);*/
2039
2040 res = VkMulComplexNumber(sc, sc->locID[1], sc->locID[1], tf[0]);
2041 if (res != VKFFT_SUCCESS) return res;
2042 res = VkMulComplexNumber(sc, sc->locID[2], sc->locID[2], tf[1]);
2043 if (res != VKFFT_SUCCESS) return res;
2044 res = VkMulComplexNumber(sc, sc->locID[3], sc->locID[3], tf[2]);
2045 if (res != VKFFT_SUCCESS) return res;
2046 res = VkMulComplexNumber(sc, sc->locID[4], sc->locID[4], tf[3]);
2047 if (res != VKFFT_SUCCESS) return res;
2048 res = VkMulComplexNumber(sc, sc->locID[5], sc->locID[5], tf[4]);
2049 if (res != VKFFT_SUCCESS) return res;
2050 res = VkMulComplexNumber(sc, regID[0], regID[0], tf[5]);
2051 if (res != VKFFT_SUCCESS) return res;
2052 res = VkMulComplexNumber(sc, regID[2], regID[2], tf[6]);
2053 if (res != VKFFT_SUCCESS) return res;
2054 res = VkMulComplexNumber(sc, regID[4], regID[4], tf[7]);
2055 if (res != VKFFT_SUCCESS) return res;
2056 /*sc->tempLen = sprintf(sc->tempStr, "\
2057loc_1 *= -1.16666666666666651863693004997913;\n\
2058loc_2 *= 0.79015646852540022404554065360571;\n\
2059loc_3 *= 0.05585426728964774240049351305970;\n\
2060loc_4 *= 0.73430220123575240531721419756650;\n\
2061loc_5 *= 0.44095855184409837868031445395900;\n\
2062temp%s *= 0.34087293062393136944265847887436;\n\
2063temp%s *= -0.53396936033772524066165487965918;\n\
2064temp%s *= 0.87484229096165666561546458979137;\n", regID[0], regID[2], regID[4]);*/
2065
2066 res = VkSubComplex(sc, regID[5], regID[4], regID[2]);
2067 if (res != VKFFT_SUCCESS) return res;
2068 res = VkAddComplexInv(sc, regID[6], regID[4], regID[0]);
2069 if (res != VKFFT_SUCCESS) return res;
2070 res = VkAddComplex(sc, regID[4], regID[0], regID[2]);
2071 if (res != VKFFT_SUCCESS) return res;
2072 /*sc->tempLen = sprintf(sc->tempStr, "\
2073temp%s = temp%s - temp%s;\n\
2074temp%s = - temp%s - temp%s;\n\
2075temp%s = temp%s + temp%s;\n", regID[5], regID[4], regID[2], regID[6], regID[4], regID[0], regID[4], regID[0], regID[2]);*/
2076 res = VkAddComplex(sc, regID[0], sc->locID[0], sc->locID[1]);
2077 if (res != VKFFT_SUCCESS) return res;
2078 res = VkAddComplex(sc, regID[1], sc->locID[2], sc->locID[3]);
2079 if (res != VKFFT_SUCCESS) return res;
2080 res = VkSubComplex(sc, regID[2], sc->locID[4], sc->locID[3]);
2081 if (res != VKFFT_SUCCESS) return res;
2082 res = VkAddComplexInv(sc, regID[3], sc->locID[2], sc->locID[4]);
2083 if (res != VKFFT_SUCCESS) return res;
2084 /*sc->tempLen = sprintf(sc->tempStr, "\
2085temp%s = loc_0 + loc_1;\n\
2086temp%s = loc_2 + loc_3;\n\
2087temp%s = loc_4 - loc_3;\n\
2088temp%s = - loc_2 - loc_4;\n", regID[0], regID[1], regID[2], regID[3]);*/
2089 res = VkAddComplex(sc, sc->locID[1], regID[0], regID[1]);
2090 if (res != VKFFT_SUCCESS) return res;
2091 res = VkAddComplex(sc, sc->locID[2], regID[0], regID[2]);
2092 if (res != VKFFT_SUCCESS) return res;
2093 res = VkAddComplex(sc, sc->locID[3], regID[0], regID[3]);
2094 if (res != VKFFT_SUCCESS) return res;
2095 res = VkAddComplex(sc, sc->locID[4], regID[4], sc->locID[5]);
2096 if (res != VKFFT_SUCCESS) return res;
2097 res = VkAddComplex(sc, sc->locID[6], regID[6], sc->locID[5]);
2098 if (res != VKFFT_SUCCESS) return res;
2099 res = VkAddComplex(sc, sc->locID[5], sc->locID[5], regID[5]);
2100 if (res != VKFFT_SUCCESS) return res;
2101 res = VkMovComplex(sc, regID[0], sc->locID[0]);
2102 if (res != VKFFT_SUCCESS) return res;
2103 /*sc->tempLen = sprintf(sc->tempStr, "\
2104loc_1 = temp%s + temp%s;\n\
2105loc_2 = temp%s + temp%s;\n\
2106loc_3 = temp%s + temp%s;\n\
2107loc_4 = temp%s + loc_5;\n\
2108loc_6 = temp%s + loc_5;\n\
2109loc_5 += temp%s;\n\
2110temp%s = loc_0;\n", regID[0], regID[1], regID[0], regID[2], regID[0], regID[3], regID[4], regID[6], regID[5], regID[0]);*/
2111 res = VkShuffleComplexInv(sc, regID[1], sc->locID[1], sc->locID[4], 0);
2112 if (res != VKFFT_SUCCESS) return res;
2113 res = VkShuffleComplexInv(sc, regID[2], sc->locID[3], sc->locID[6], 0);
2114 if (res != VKFFT_SUCCESS) return res;
2115 res = VkShuffleComplex(sc, regID[3], sc->locID[2], sc->locID[5], 0);
2116 if (res != VKFFT_SUCCESS) return res;
2117 res = VkShuffleComplexInv(sc, regID[4], sc->locID[2], sc->locID[5], 0);
2118 if (res != VKFFT_SUCCESS) return res;
2119 res = VkShuffleComplex(sc, regID[5], sc->locID[3], sc->locID[6], 0);
2120 if (res != VKFFT_SUCCESS) return res;
2121 res = VkShuffleComplex(sc, regID[6], sc->locID[1], sc->locID[4], 0);
2122 if (res != VKFFT_SUCCESS) return res;
2123
2124 /*sc->tempLen = sprintf(sc->tempStr, "\
2125temp%s.x = loc_1.x + loc_4.y; \n\
2126temp%s.y = loc_1.y - loc_4.x; \n\
2127temp%s.x = loc_3.x + loc_6.y; \n\
2128temp%s.y = loc_3.y - loc_6.x; \n\
2129temp%s.x = loc_2.x - loc_5.y; \n\
2130temp%s.y = loc_2.y + loc_5.x; \n\
2131temp%s.x = loc_2.x + loc_5.y; \n\
2132temp%s.y = loc_2.y - loc_5.x; \n\
2133temp%s.x = loc_3.x - loc_6.y; \n\
2134temp%s.y = loc_3.y + loc_6.x; \n\
2135temp%s.x = loc_1.x - loc_4.y; \n\
2136temp%s.y = loc_1.y + loc_4.x; \n", regID[1], regID[1], regID[2], regID[2], regID[3], regID[3], regID[4], regID[4], regID[5], regID[5], regID[6], regID[6]);
2137 VkAppendLine(sc, " }\n");*/
2138 /*for (uint64_t i = 0; i < 7; i++) {
2139 free(sc->locID[i]);
2140 }*/
2141 for (uint64_t i = 0; i < 8; i++) {
2142 free(tf[i]);
2143 tf[i] = 0;
2144 }
2145 break;
2146 }
2147 case 8: {
2148 /*if (sc->LUT)
2149 sc->tempLen = sprintf(sc->tempStr, "void radix8(inout %s temp_0, inout %s temp_1, inout %s temp_2, inout %s temp_3, inout %s temp_4, inout %s temp_5, inout %s temp_6, inout %s temp_7, %s LUTId%s) {\n", vecType, vecType, vecType, vecType, vecType, vecType, vecType, vecType, uintType, convolutionInverse);
2150 else
2151 sc->tempLen = sprintf(sc->tempStr, "void radix8(inout %s temp_0, inout %s temp_1, inout %s temp_2, inout %s temp_3, inout %s temp_4, inout %s temp_5, inout %s temp_6, inout %s temp_7, %s angle%s) {\n", vecType, vecType, vecType, vecType, vecType, vecType, vecType, vecType, floatType, convolutionInverse);
2152 */
2153 //VkAppendLine(sc, " {\n");
2154 /*sc->tempLen = sprintf(sc->tempStr, " %s %s;\n", vecType, temp);
2155 res = VkAppendLine(sc);
2156if (res != VKFFT_SUCCESS) return res;
2157 sc->tempLen = sprintf(sc->tempStr, " %s %s;\n", vecType, iw);
2158 res = VkAppendLine(sc);
2159if (res != VKFFT_SUCCESS) return res;*/
2160 if (sc->LUT) {
2161 sc->tempLen = sprintf(sc->tempStr, " %s = twiddleLUT[LUTId];\n", w);
2162 res = VkAppendLine(sc);
2163 if (res != VKFFT_SUCCESS) return res;
2164 if (!sc->inverse) {
2165 sc->tempLen = sprintf(sc->tempStr, " %s.y = -%s.y;\n", w, w);
2166 res = VkAppendLine(sc);
2167 if (res != VKFFT_SUCCESS) return res;
2168 }
2169 }
2170 else {
2171 if (!strcmp(floatType, "float")) {
2172 sc->tempLen = sprintf(sc->tempStr, " %s.x = %s(angle);\n", w, cosDef);
2173 res = VkAppendLine(sc);
2174 if (res != VKFFT_SUCCESS) return res;
2175 sc->tempLen = sprintf(sc->tempStr, " %s.y = %s(angle);\n", w, sinDef);
2176 res = VkAppendLine(sc);
2177 if (res != VKFFT_SUCCESS) return res;
2178 }
2179 if (!strcmp(floatType, "double")) {
2180 sc->tempLen = sprintf(sc->tempStr, " %s = sincos_20(angle);\n", w);
2181 res = VkAppendLine(sc);
2182 if (res != VKFFT_SUCCESS) return res;
2183 }
2184 }
2185 for (uint64_t i = 0; i < 4; i++) {
2186 res = VkMulComplex(sc, temp, regID[i + 4], w, 0);
2187 if (res != VKFFT_SUCCESS) return res;
2188 res = VkSubComplex(sc, regID[i + 4], regID[i], temp);
2189 if (res != VKFFT_SUCCESS) return res;
2190 res = VkAddComplex(sc, regID[i], regID[i], temp);
2191 if (res != VKFFT_SUCCESS) return res;
2192 /*sc->tempLen = sprintf(sc->tempStr, "\
2193temp.x=temp%s.x*w.x-temp%s.y*w.y;\n\
2194temp.y = temp%s.y * w.x + temp%s.x * w.y;\n\
2195temp%s = temp%s - temp;\n\
2196temp%s = temp%s + temp;\n\n", regID[i + 4], regID[i + 4], regID[i + 4], regID[i + 4], regID[i + 4], regID[i + 0], regID[i + 0], regID[i + 0]);*/
2197 }
2198 if (sc->LUT) {
2199 sc->tempLen = sprintf(sc->tempStr, " %s=twiddleLUT[LUTId+%" PRIu64 "];\n\n", w, stageSize);
2200 res = VkAppendLine(sc);
2201 if (res != VKFFT_SUCCESS) return res;
2202 if (!sc->inverse) {
2203 sc->tempLen = sprintf(sc->tempStr, " %s.y = -%s.y;\n", w, w);
2204 res = VkAppendLine(sc);
2205 if (res != VKFFT_SUCCESS) return res;
2206 }
2207 }
2208 else {
2209 if (!strcmp(floatType, "float")) {
2210 sc->tempLen = sprintf(sc->tempStr, " %s.x = %s(0.5%s*angle);\n", w, cosDef, LFending);
2211 res = VkAppendLine(sc);
2212 if (res != VKFFT_SUCCESS) return res;
2213 sc->tempLen = sprintf(sc->tempStr, " %s.y = %s(0.5%s*angle);\n", w, sinDef, LFending);
2214 res = VkAppendLine(sc);
2215 if (res != VKFFT_SUCCESS) return res;
2216 }
2217 if (!strcmp(floatType, "double")) {
2218 sc->tempLen = sprintf(sc->tempStr, " %s=normalize(%s + %s(1.0, 0.0));\n", w, w, vecType);
2219 res = VkAppendLine(sc);
2220 if (res != VKFFT_SUCCESS) return res;
2221 }
2222 }
2223 for (uint64_t i = 0; i < 2; i++) {
2224 res = VkMulComplex(sc, temp, regID[i + 2], w, 0);
2225 if (res != VKFFT_SUCCESS) return res;
2226 res = VkSubComplex(sc, regID[i + 2], regID[i], temp);
2227 if (res != VKFFT_SUCCESS) return res;
2228 res = VkAddComplex(sc, regID[i], regID[i], temp);
2229 if (res != VKFFT_SUCCESS) return res;
2230 /*sc->tempLen = sprintf(sc->tempStr, "\
2231temp.x=temp%s.x*w.x-temp%s.y*w.y;\n\
2232temp.y = temp%s.y * w.x + temp%s.x * w.y;\n\
2233temp%s = temp%s - temp;\n\
2234temp%s = temp%s + temp;\n\n", regID[i + 2], regID[i + 2], regID[i + 2], regID[i + 2], regID[i + 2], regID[i + 0], regID[i + 0], regID[i + 0]);*/
2235 }
2236 if (stageAngle < 0) {
2237 sc->tempLen = sprintf(sc->tempStr, " %s.x = %s.y;\n", iw, w);
2238 res = VkAppendLine(sc);
2239 if (res != VKFFT_SUCCESS) return res;
2240 sc->tempLen = sprintf(sc->tempStr, " %s.y = -%s.x;\n", iw, w);
2241 res = VkAppendLine(sc);
2242 if (res != VKFFT_SUCCESS) return res;
2243 //sc->tempLen = sprintf(sc->tempStr, " w = %s(w.y, -w.x);\n\n", vecType);
2244 }
2245 else {
2246 sc->tempLen = sprintf(sc->tempStr, " %s.x = -%s.y;\n", iw, w);
2247 res = VkAppendLine(sc);
2248 if (res != VKFFT_SUCCESS) return res;
2249 sc->tempLen = sprintf(sc->tempStr, " %s.y = %s.x;\n", iw, w);
2250 res = VkAppendLine(sc);
2251 if (res != VKFFT_SUCCESS) return res;
2252 //sc->tempLen = sprintf(sc->tempStr, " iw = %s(-w.y, w.x);\n\n", vecType);
2253 }
2254
2255 for (uint64_t i = 4; i < 6; i++) {
2256 res = VkMulComplex(sc, temp, regID[i + 2], iw, 0);
2257 if (res != VKFFT_SUCCESS) return res;
2258 res = VkSubComplex(sc, regID[i + 2], regID[i], temp);
2259 if (res != VKFFT_SUCCESS) return res;
2260 res = VkAddComplex(sc, regID[i], regID[i], temp);
2261 if (res != VKFFT_SUCCESS) return res;
2262 /*sc->tempLen = sprintf(sc->tempStr, "\
2263temp.x = temp%s.x * iw.x - temp%s.y * iw.y;\n\
2264temp.y = temp%s.y * iw.x + temp%s.x * iw.y;\n\
2265temp%s = temp%s - temp;\n\
2266temp%s = temp%s + temp;\n\n", regID[i + 2], regID[i + 2], regID[i + 2], regID[i + 2], regID[i + 2], regID[i + 0], regID[i + 0], regID[i + 0]);*/
2267 }
2268
2269 if (sc->LUT) {
2270 sc->tempLen = sprintf(sc->tempStr, " %s=twiddleLUT[LUTId+%" PRIu64 "];\n\n", w, 2 * stageSize);
2271 res = VkAppendLine(sc);
2272 if (res != VKFFT_SUCCESS) return res;
2273 if (!sc->inverse) {
2274 sc->tempLen = sprintf(sc->tempStr, " %s.y = -%s.y;\n", w, w);
2275 res = VkAppendLine(sc);
2276 if (res != VKFFT_SUCCESS) return res;
2277 }
2278 }
2279 else {
2280 if (!strcmp(floatType, "float")) {
2281 sc->tempLen = sprintf(sc->tempStr, " %s.x = %s(0.25%s*angle);\n", w, cosDef, LFending);
2282 res = VkAppendLine(sc);
2283 if (res != VKFFT_SUCCESS) return res;
2284 sc->tempLen = sprintf(sc->tempStr, " %s.y = %s(0.25%s*angle);\n", w, sinDef, LFending);
2285 res = VkAppendLine(sc);
2286 if (res != VKFFT_SUCCESS) return res;
2287 //sc->tempLen = sprintf(sc->tempStr, " w = %s(cos(0.25*angle), sin(0.25*angle));\n\n", vecType);
2288 }
2289 if (!strcmp(floatType, "double")) {
2290 sc->tempLen = sprintf(sc->tempStr, " %s=normalize(%s + %s(1.0, 0.0));\n", w, w, vecType);
2291 res = VkAppendLine(sc);
2292 if (res != VKFFT_SUCCESS) return res;
2293 }
2294 }
2295 res = VkMulComplex(sc, temp, regID[1], w, 0);
2296 if (res != VKFFT_SUCCESS) return res;
2297 res = VkSubComplex(sc, regID[1], regID[0], temp);
2298 if (res != VKFFT_SUCCESS) return res;
2299 res = VkAddComplex(sc, regID[0], regID[0], temp);
2300 if (res != VKFFT_SUCCESS) return res;
2301 /*sc->tempLen = sprintf(sc->tempStr, "\
2302temp.x=temp%s.x*w.x-temp%s.y*w.y;\n\
2303temp.y = temp%s.y * w.x + temp%s.x * w.y;\n\
2304temp%s = temp%s - temp;\n\
2305temp%s = temp%s + temp;\n\n", regID[1], regID[1], regID[1], regID[1], regID[1], regID[0], regID[0], regID[0]);*/
2306 if (stageAngle < 0) {
2307 sc->tempLen = sprintf(sc->tempStr, " %s.x = %s.y;\n", iw, w);
2308 res = VkAppendLine(sc);
2309 if (res != VKFFT_SUCCESS) return res;
2310 sc->tempLen = sprintf(sc->tempStr, " %s.y = -%s.x;\n", iw, w);
2311 res = VkAppendLine(sc);
2312 if (res != VKFFT_SUCCESS) return res;
2313 //sc->tempLen = sprintf(sc->tempStr, " w = %s(w.y, -w.x);\n\n", vecType);
2314 }
2315 else {
2316 sc->tempLen = sprintf(sc->tempStr, " %s.x = -%s.y;\n", iw, w);
2317 res = VkAppendLine(sc);
2318 if (res != VKFFT_SUCCESS) return res;
2319 sc->tempLen = sprintf(sc->tempStr, " %s.y = %s.x;\n", iw, w);
2320 res = VkAppendLine(sc);
2321 if (res != VKFFT_SUCCESS) return res;
2322 //sc->tempLen = sprintf(sc->tempStr, " iw = %s(-w.y, w.x);\n\n", vecType);
2323 }
2324 res = VkMulComplex(sc, temp, regID[3], iw, 0);
2325 if (res != VKFFT_SUCCESS) return res;
2326 res = VkSubComplex(sc, regID[3], regID[2], temp);
2327 if (res != VKFFT_SUCCESS) return res;
2328 res = VkAddComplex(sc, regID[2], regID[2], temp);
2329 if (res != VKFFT_SUCCESS) return res;
2330 /*sc->tempLen = sprintf(sc->tempStr, "\
2331temp.x = temp%s.x * iw.x - temp%s.y * iw.y;\n\
2332temp.y = temp%s.y * iw.x + temp%s.x * iw.y;\n\
2333temp%s = temp%s - temp;\n\
2334temp%s = temp%s + temp;\n\n", regID[3], regID[3], regID[3], regID[3], regID[3], regID[2], regID[2], regID[2]);*/
2335 if (stageAngle < 0) {
2336 sc->tempLen = sprintf(sc->tempStr, " %s.x = %s.x * loc_SQRT1_2 + %s.y * loc_SQRT1_2;\n", iw, w, w);
2337 res = VkAppendLine(sc);
2338 if (res != VKFFT_SUCCESS) return res;
2339 sc->tempLen = sprintf(sc->tempStr, " %s.y = %s.y * loc_SQRT1_2 - %s.x * loc_SQRT1_2;\n\n", iw, w, w);
2340 res = VkAppendLine(sc);
2341 if (res != VKFFT_SUCCESS) return res;
2342 }
2343 else {
2344 sc->tempLen = sprintf(sc->tempStr, " %s.x = %s.x * loc_SQRT1_2 - %s.y * loc_SQRT1_2;\n", iw, w, w);
2345 res = VkAppendLine(sc);
2346 if (res != VKFFT_SUCCESS) return res;
2347 sc->tempLen = sprintf(sc->tempStr, " %s.y = %s.y * loc_SQRT1_2 + %s.x * loc_SQRT1_2;\n\n", iw, w, w);
2348 res = VkAppendLine(sc);
2349 if (res != VKFFT_SUCCESS) return res;
2350 }
2351 res = VkMulComplex(sc, temp, regID[5], iw, 0);
2352 if (res != VKFFT_SUCCESS) return res;
2353 res = VkSubComplex(sc, regID[5], regID[4], temp);
2354 if (res != VKFFT_SUCCESS) return res;
2355 res = VkAddComplex(sc, regID[4], regID[4], temp);
2356 if (res != VKFFT_SUCCESS) return res;
2357 /*sc->tempLen = sprintf(sc->tempStr, "\
2358temp.x = temp%s.x * iw.x - temp%s.y * iw.y;\n\
2359temp.y = temp%s.y * iw.x + temp%s.x * iw.y;\n\
2360temp%s = temp%s - temp;\n\
2361temp%s = temp%s + temp;\n\n", regID[5], regID[5], regID[5], regID[5], regID[5], regID[4], regID[4], regID[4]);*/
2362 if (stageAngle < 0) {
2363 sc->tempLen = sprintf(sc->tempStr, " %s.x = %s.y;\n", w, iw);
2364 res = VkAppendLine(sc);
2365 if (res != VKFFT_SUCCESS) return res;
2366 sc->tempLen = sprintf(sc->tempStr, " %s.y = -%s.x;\n", w, iw);
2367 res = VkAppendLine(sc);
2368 if (res != VKFFT_SUCCESS) return res;
2369 //sc->tempLen = sprintf(sc->tempStr, " w = %s(iw.y, -iw.x);\n\n", vecType);
2370 }
2371 else {
2372 sc->tempLen = sprintf(sc->tempStr, " %s.x = -%s.y;\n", w, iw);
2373 res = VkAppendLine(sc);
2374 if (res != VKFFT_SUCCESS) return res;
2375 sc->tempLen = sprintf(sc->tempStr, " %s.y = %s.x;\n", w, iw);
2376 res = VkAppendLine(sc);
2377 if (res != VKFFT_SUCCESS) return res;
2378 //sc->tempLen = sprintf(sc->tempStr, " w = %s(-iw.y, iw.x);\n\n", vecType);
2379 }
2380 res = VkMulComplex(sc, temp, regID[7], w, 0);
2381 if (res != VKFFT_SUCCESS) return res;
2382 res = VkSubComplex(sc, regID[7], regID[6], temp);
2383 if (res != VKFFT_SUCCESS) return res;
2384 res = VkAddComplex(sc, regID[6], regID[6], temp);
2385 if (res != VKFFT_SUCCESS) return res;
2386 res = VkMovComplex(sc, temp, regID[1]);
2387 if (res != VKFFT_SUCCESS) return res;
2388 res = VkMovComplex(sc, regID[1], regID[4]);
2389 if (res != VKFFT_SUCCESS) return res;
2390 res = VkMovComplex(sc, regID[4], temp);
2391 if (res != VKFFT_SUCCESS) return res;
2392 res = VkMovComplex(sc, temp, regID[3]);
2393 if (res != VKFFT_SUCCESS) return res;
2394 res = VkMovComplex(sc, regID[3], regID[6]);
2395 if (res != VKFFT_SUCCESS) return res;
2396 res = VkMovComplex(sc, regID[6], temp);
2397 if (res != VKFFT_SUCCESS) return res;
2398 /*sc->tempLen = sprintf(sc->tempStr, "\
2399temp.x = temp%s.x * w.x - temp%s.y * w.y;\n\
2400temp.y = temp%s.y * w.x + temp%s.x * w.y;\n\
2401temp%s = temp%s - temp;\n\
2402temp%s = temp%s + temp;\n\n\
2403temp = temp%s;\n\
2404temp%s = temp%s;\n\
2405temp%s = temp;\n\n\
2406temp = temp%s;\n\
2407temp%s = temp%s;\n\
2408temp%s = temp;\n\
2409}\n\n", regID[7], regID[7], regID[7], regID[7], regID[7], regID[6], regID[6], regID[6], regID[1], regID[1], regID[4], regID[4], regID[3], regID[3], regID[6], regID[6]);
2410 //VkAppendLine(sc, " }\n");*/
2411
2412 break;
2413 }
2414 case 11: {
2415
2416 char* tf[20];
2417 //char* tf2[4];
2418 //char* tf2inv[4];
2419 //VkAppendLine(sc, " {\n");
2420 for (uint64_t i = 0; i < 20; i++) {
2421 tf[i] = (char*)malloc(sizeof(char) * 50);
2422 if (!tf[i]) {
2423 for (uint64_t j = 0; j < i; j++) {
2424 free(tf[j]);
2425 tf[j] = 0;
2426 }
2428 }
2429 //tf2[i] = (char*)malloc(sizeof(char) * 50);
2430 //tf2inv[i] = (char*)malloc(sizeof(char) * 50);
2431 }
2432 sprintf(tf[0], "-1.100000000000000%s", LFending);
2433
2434 sprintf(tf[2], "0.253097611605959%s", LFending);
2435 sprintf(tf[3], "-1.288200610773679%s", LFending);
2436 sprintf(tf[4], "0.304632239669212%s", LFending);
2437 sprintf(tf[5], "-0.391339615511917%s", LFending);
2438 sprintf(tf[6], "-2.871022253392850%s", LFending);
2439 sprintf(tf[7], "1.374907986616384%s", LFending);
2440 sprintf(tf[8], "0.817178135341212%s", LFending);
2441 sprintf(tf[9], "1.800746506445679%s", LFending);
2442 sprintf(tf[10], "-0.859492973614498%s", LFending);
2443
2444 if (stageAngle < 0) {
2445 sprintf(tf[1], "0.331662479035540%s", LFending);
2446 sprintf(tf[11], "-2.373470454748280%s", LFending);
2447 sprintf(tf[12], "-0.024836393087493%s", LFending);
2448 sprintf(tf[13], "0.474017017512829%s", LFending);
2449 sprintf(tf[14], "0.742183927770612%s", LFending);
2450 sprintf(tf[15], "1.406473309094609%s", LFending);
2451 sprintf(tf[16], "-1.191364552195948%s", LFending);
2452 sprintf(tf[17], "0.708088885039503%s", LFending);
2453 sprintf(tf[18], "0.258908260614168%s", LFending);
2454 sprintf(tf[19], "-0.049929922194110%s", LFending);
2455 }
2456 else {
2457 sprintf(tf[1], "-0.331662479035540%s", LFending);
2458 sprintf(tf[11], "2.373470454748280%s", LFending);
2459 sprintf(tf[12], "0.024836393087493%s", LFending);
2460 sprintf(tf[13], "-0.474017017512829%s", LFending);
2461 sprintf(tf[14], "-0.742183927770612%s", LFending);
2462 sprintf(tf[15], "-1.406473309094609%s", LFending);
2463 sprintf(tf[16], "1.191364552195948%s", LFending);
2464 sprintf(tf[17], "-0.708088885039503%s", LFending);
2465 sprintf(tf[18], "-0.258908260614168%s", LFending);
2466 sprintf(tf[19], "0.049929922194110%s", LFending);
2467 }
2468 for (uint64_t i = radix - 1; i > 0; i--) {
2469 if (i == radix - 1) {
2470 if (sc->LUT) {
2471 sc->tempLen = sprintf(sc->tempStr, " %s = twiddleLUT[LUTId];\n", w);
2472 res = VkAppendLine(sc);
2473 if (res != VKFFT_SUCCESS) return res;
2474 if (!sc->inverse) {
2475 sc->tempLen = sprintf(sc->tempStr, " %s.y = -%s.y;\n", w, w);
2476 res = VkAppendLine(sc);
2477 if (res != VKFFT_SUCCESS) return res;
2478 }
2479 }
2480 else {
2481 if (!strcmp(floatType, "float")) {
2482 sc->tempLen = sprintf(sc->tempStr, " %s.x = %s(angle*%.17f%s);\n", w, cosDef, 2.0 * i / radix, LFending);
2483 res = VkAppendLine(sc);
2484 if (res != VKFFT_SUCCESS) return res;
2485 sc->tempLen = sprintf(sc->tempStr, " %s.y = %s(angle*%.17f%s);\n", w, sinDef, 2.0 * i / radix, LFending);
2486 res = VkAppendLine(sc);
2487 if (res != VKFFT_SUCCESS) return res;
2488 //sc->tempLen = sprintf(sc->tempStr, " w = %s(cos(angle*%.17f), sin(angle*%.17f));\n\n", vecType, 2.0 * i / radix, 2.0 * i / radix);
2489 }
2490 if (!strcmp(floatType, "double")) {
2491 sc->tempLen = sprintf(sc->tempStr, " %s = sincos_20(angle*%.17f%s);\n", w, 2.0 * i / radix, LFending);
2492 res = VkAppendLine(sc);
2493 if (res != VKFFT_SUCCESS) return res;
2494 }
2495 }
2496 }
2497 else {
2498 if (sc->LUT) {
2499 sc->tempLen = sprintf(sc->tempStr, " %s = twiddleLUT[LUTId+%" PRIu64 "];\n\n", w, (radix - 1 - i) * stageSize);
2500 res = VkAppendLine(sc);
2501 if (res != VKFFT_SUCCESS) return res;
2502 if (!sc->inverse) {
2503 sc->tempLen = sprintf(sc->tempStr, " %s.y = -%s.y;\n", w, w);
2504 res = VkAppendLine(sc);
2505 if (res != VKFFT_SUCCESS) return res;
2506 }
2507 }
2508 else {
2509 if (!strcmp(floatType, "float")) {
2510 sc->tempLen = sprintf(sc->tempStr, " %s.x = %s(angle*%.17f%s);\n", w, cosDef, 2.0 * i / radix, LFending);
2511 res = VkAppendLine(sc);
2512 if (res != VKFFT_SUCCESS) return res;
2513 sc->tempLen = sprintf(sc->tempStr, " %s.y = %s(angle*%.17f%s);\n", w, sinDef, 2.0 * i / radix, LFending);
2514 res = VkAppendLine(sc);
2515 if (res != VKFFT_SUCCESS) return res;
2516 //sc->tempLen = sprintf(sc->tempStr, " w = %s(cos(angle*%.17f), sin(angle*%.17f));\n\n", vecType, 2.0 * i / radix, 2.0 * i / radix);
2517 }
2518 if (!strcmp(floatType, "double")) {
2519 sc->tempLen = sprintf(sc->tempStr, " %s = sincos_20(angle*%.17f%s);\n", w, 2.0 * i / radix, LFending);
2520 res = VkAppendLine(sc);
2521 if (res != VKFFT_SUCCESS) return res;
2522 }
2523 }
2524 }
2525 res = VkMulComplex(sc, sc->locID[i], regID[i], w, 0);
2526 if (res != VKFFT_SUCCESS) return res;
2527 }
2528 res = VkMovComplex(sc, sc->locID[0], regID[0]);
2529 if (res != VKFFT_SUCCESS) return res;
2530 uint64_t permute[11] = { 0,1,9,4,3,5,10,2,7,8,6 };
2531 res = VkPermute(sc, permute, 11, 0, 0);
2532 if (res != VKFFT_SUCCESS) return res;
2533 for (uint64_t i = 0; i < 5; i++) {
2534 res = VkAddComplex(sc, regID[i + 1], sc->locID[i + 1], sc->locID[i + 6]);
2535 if (res != VKFFT_SUCCESS) return res;
2536 res = VkSubComplex(sc, regID[i + 6], sc->locID[i + 1], sc->locID[i + 6]);
2537 if (res != VKFFT_SUCCESS) return res;
2538 }
2539 res = VkMovComplex(sc, sc->locID[1], regID[1]);
2540 if (res != VKFFT_SUCCESS) return res;
2541 for (uint64_t i = 0; i < 4; i++) {
2542 res = VkAddComplex(sc, sc->locID[1], sc->locID[1], regID[i + 2]);
2543 if (res != VKFFT_SUCCESS) return res;
2544 res = VkSubComplex(sc, sc->locID[i + 3], regID[i + 1], regID[5]);
2545 if (res != VKFFT_SUCCESS) return res;
2546 }
2547 res = VkMovComplex(sc, sc->locID[2], regID[6]);
2548 if (res != VKFFT_SUCCESS) return res;
2549 for (uint64_t i = 0; i < 4; i++) {
2550 res = VkAddComplex(sc, sc->locID[2], sc->locID[2], regID[i + 7]);
2551 if (res != VKFFT_SUCCESS) return res;
2552 res = VkSubComplex(sc, sc->locID[i + 7], regID[i + 6], regID[10]);
2553 if (res != VKFFT_SUCCESS) return res;
2554 }
2555
2556 res = VkAddComplex(sc, regID[0], sc->locID[0], sc->locID[1]);
2557 if (res != VKFFT_SUCCESS) return res;
2558 res = VkMulComplexNumber(sc, regID[1], sc->locID[1], tf[0]);
2559 if (res != VKFFT_SUCCESS) return res;
2560 res = VkMulComplexNumberImag(sc, regID[2], sc->locID[2], tf[1], sc->locID[0]);
2561 if (res != VKFFT_SUCCESS) return res;
2562 for (uint64_t k = 0; k < 2; k++) {
2563 res = VkAddComplex(sc, regID[k * 4 + 3], sc->locID[k * 4 + 3], sc->locID[k * 4 + 5]);
2564 if (res != VKFFT_SUCCESS) return res;
2565 res = VkAddComplex(sc, regID[k * 4 + 4], sc->locID[k * 4 + 4], sc->locID[k * 4 + 6]);
2566 if (res != VKFFT_SUCCESS) return res;
2567 res = VkAddComplex(sc, regID[k * 4 + 5], sc->locID[k * 4 + 3], sc->locID[k * 4 + 4]);
2568 if (res != VKFFT_SUCCESS) return res;
2569 res = VkAddComplex(sc, regID[k * 4 + 6], sc->locID[k * 4 + 5], sc->locID[k * 4 + 6]);
2570 if (res != VKFFT_SUCCESS) return res;
2571 res = VkAddComplex(sc, sc->locID[1], regID[k * 4 + 3], regID[k * 4 + 4]);
2572 if (res != VKFFT_SUCCESS) return res;
2573
2574 if (k == 0) {
2575 res = VkMulComplexNumber(sc, sc->locID[k * 4 + 3], sc->locID[k * 4 + 3], tf[k * 9 + 2]);
2576 if (res != VKFFT_SUCCESS) return res;
2577 res = VkMulComplexNumber(sc, sc->locID[k * 4 + 4], sc->locID[k * 4 + 4], tf[k * 9 + 3]);
2578 if (res != VKFFT_SUCCESS) return res;
2579 res = VkMulComplexNumber(sc, regID[k * 4 + 5], regID[k * 4 + 5], tf[k * 9 + 4]);
2580 if (res != VKFFT_SUCCESS) return res;
2581 res = VkMulComplexNumber(sc, sc->locID[k * 4 + 5], sc->locID[k * 4 + 5], tf[k * 9 + 5]);
2582 if (res != VKFFT_SUCCESS) return res;
2583 res = VkMulComplexNumber(sc, sc->locID[k * 4 + 6], sc->locID[k * 4 + 6], tf[k * 9 + 6]);
2584 if (res != VKFFT_SUCCESS) return res;
2585 res = VkMulComplexNumber(sc, regID[k * 4 + 6], regID[k * 4 + 6], tf[k * 9 + 7]);
2586 if (res != VKFFT_SUCCESS) return res;
2587 res = VkMulComplexNumber(sc, regID[k * 4 + 3], regID[k * 4 + 3], tf[k * 9 + 8]);
2588 if (res != VKFFT_SUCCESS) return res;
2589 res = VkMulComplexNumber(sc, regID[k * 4 + 4], regID[k * 4 + 4], tf[k * 9 + 9]);
2590 if (res != VKFFT_SUCCESS) return res;
2591 res = VkMulComplexNumber(sc, sc->locID[1], sc->locID[1], tf[k * 9 + 10]);
2592 if (res != VKFFT_SUCCESS) return res;
2593 }
2594 else {
2595 res = VkMulComplexNumberImag(sc, sc->locID[k * 4 + 3], sc->locID[k * 4 + 3], tf[k * 9 + 2], sc->locID[0]);
2596 if (res != VKFFT_SUCCESS) return res;
2597 res = VkMulComplexNumberImag(sc, sc->locID[k * 4 + 4], sc->locID[k * 4 + 4], tf[k * 9 + 3], sc->locID[0]);
2598 if (res != VKFFT_SUCCESS) return res;
2599 res = VkMulComplexNumberImag(sc, regID[k * 4 + 5], regID[k * 4 + 5], tf[k * 9 + 4], sc->locID[0]);
2600 if (res != VKFFT_SUCCESS) return res;
2601 res = VkMulComplexNumberImag(sc, sc->locID[k * 4 + 5], sc->locID[k * 4 + 5], tf[k * 9 + 5], sc->locID[0]);
2602 if (res != VKFFT_SUCCESS) return res;
2603 res = VkMulComplexNumberImag(sc, sc->locID[k * 4 + 6], sc->locID[k * 4 + 6], tf[k * 9 + 6], sc->locID[0]);
2604 if (res != VKFFT_SUCCESS) return res;
2605 res = VkMulComplexNumberImag(sc, regID[k * 4 + 6], regID[k * 4 + 6], tf[k * 9 + 7], sc->locID[0]);
2606 if (res != VKFFT_SUCCESS) return res;
2607 res = VkMulComplexNumberImag(sc, regID[k * 4 + 3], regID[k * 4 + 3], tf[k * 9 + 8], sc->locID[0]);
2608 if (res != VKFFT_SUCCESS) return res;
2609 res = VkMulComplexNumberImag(sc, regID[k * 4 + 4], regID[k * 4 + 4], tf[k * 9 + 9], sc->locID[0]);
2610 if (res != VKFFT_SUCCESS) return res;
2611 res = VkMulComplexNumberImag(sc, sc->locID[1], sc->locID[1], tf[k * 9 + 10], sc->locID[0]);
2612 if (res != VKFFT_SUCCESS) return res;
2613 }
2614
2615 res = VkAddComplex(sc, sc->locID[k * 4 + 3], sc->locID[k * 4 + 3], regID[k * 4 + 3]);
2616 if (res != VKFFT_SUCCESS) return res;
2617 res = VkAddComplex(sc, sc->locID[k * 4 + 5], sc->locID[k * 4 + 5], regID[k * 4 + 3]);
2618 if (res != VKFFT_SUCCESS) return res;
2619
2620 res = VkAddComplex(sc, sc->locID[k * 4 + 4], sc->locID[k * 4 + 4], regID[k * 4 + 4]);
2621 if (res != VKFFT_SUCCESS) return res;
2622 res = VkAddComplex(sc, sc->locID[k * 4 + 6], sc->locID[k * 4 + 6], regID[k * 4 + 4]);
2623 if (res != VKFFT_SUCCESS) return res;
2624
2625 res = VkAddComplex(sc, regID[k * 4 + 5], regID[k * 4 + 5], sc->locID[1]);
2626 if (res != VKFFT_SUCCESS) return res;
2627 res = VkAddComplex(sc, regID[k * 4 + 6], regID[k * 4 + 6], sc->locID[1]);
2628 if (res != VKFFT_SUCCESS) return res;
2629
2630 res = VkAddComplex(sc, regID[k * 4 + 3], sc->locID[k * 4 + 3], regID[k * 4 + 5]);
2631 if (res != VKFFT_SUCCESS) return res;
2632 res = VkAddComplex(sc, regID[k * 4 + 4], sc->locID[k * 4 + 4], regID[k * 4 + 5]);
2633 if (res != VKFFT_SUCCESS) return res;
2634
2635 res = VkAddComplex(sc, regID[k * 4 + 5], sc->locID[k * 4 + 5], regID[k * 4 + 6]);
2636 if (res != VKFFT_SUCCESS) return res;
2637 res = VkAddComplex(sc, regID[k * 4 + 6], sc->locID[k * 4 + 6], regID[k * 4 + 6]);
2638 if (res != VKFFT_SUCCESS) return res;
2639
2640 }
2641 res = VkAddComplex(sc, regID[1], regID[0], regID[1]);
2642 if (res != VKFFT_SUCCESS) return res;
2643
2644 res = VkMovComplex(sc, sc->locID[5], regID[1]);
2645 if (res != VKFFT_SUCCESS) return res;
2646 for (uint64_t i = 0; i < 4; i++) {
2647 res = VkAddComplex(sc, sc->locID[i + 1], regID[1], regID[i + 3]);
2648 if (res != VKFFT_SUCCESS) return res;
2649 res = VkSubComplex(sc, sc->locID[5], sc->locID[5], regID[i + 3]);
2650 if (res != VKFFT_SUCCESS) return res;
2651 }
2652 res = VkMovComplex(sc, sc->locID[10], regID[2]);
2653 if (res != VKFFT_SUCCESS) return res;
2654 for (uint64_t i = 0; i < 4; i++) {
2655 res = VkAddComplex(sc, sc->locID[i + 6], regID[2], regID[i + 7]);
2656 if (res != VKFFT_SUCCESS) return res;
2657 res = VkSubComplex(sc, sc->locID[10], sc->locID[10], regID[i + 7]);
2658 if (res != VKFFT_SUCCESS) return res;
2659 }
2660 for (uint64_t i = 0; i < 5; i++) {
2661 res = VkAddComplex(sc, regID[i + 1], sc->locID[i + 1], sc->locID[i + 6]);
2662 if (res != VKFFT_SUCCESS) return res;
2663 res = VkSubComplex(sc, regID[i + 6], sc->locID[i + 1], sc->locID[i + 6]);
2664 if (res != VKFFT_SUCCESS) return res;
2665 }
2666 uint64_t permute2[11] = { 0,10,1,8,7,9,4,2,3,6,5 };
2667 res = VkPermute(sc, permute2, 11, 1, regID);
2668 if (res != VKFFT_SUCCESS) return res;
2669
2670 for (uint64_t i = 0; i < 20; i++) {
2671 free(tf[i]);
2672 tf[i] = 0;
2673 }
2674 break;
2675 }
2676 case 13: {
2677
2678 char* tf[20];
2679 //char* tf2[4];
2680 //char* tf2inv[4];
2681 //VkAppendLine(sc, " {\n");
2682 for (uint64_t i = 0; i < 20; i++) {
2683 tf[i] = (char*)malloc(sizeof(char) * 50);
2684 if (!tf[i]) {
2685 for (uint64_t j = 0; j < i; j++) {
2686 free(tf[j]);
2687 tf[j] = 0;
2688 }
2690 }
2691 //tf2[i] = (char*)malloc(sizeof(char) * 50);
2692 //tf2inv[i] = (char*)malloc(sizeof(char) * 50);
2693 }
2694 sprintf(tf[0], "-1.083333333333333%s", LFending);
2695 sprintf(tf[1], "-0.300462606288666%s", LFending);
2696 sprintf(tf[5], "1.007074065727533%s", LFending);
2697 sprintf(tf[6], "0.731245990975348%s", LFending);
2698 sprintf(tf[7], "-0.579440018900960%s", LFending);
2699 sprintf(tf[8], "0.531932498429674%s", LFending);
2700 sprintf(tf[9], "-0.508814921720398%s", LFending);
2701 sprintf(tf[10], "-0.007705858903092%s", LFending);
2702
2703 if (stageAngle < 0) {
2704 sprintf(tf[2], "-0.749279330626139%s", LFending);
2705 sprintf(tf[3], "0.401002128321867%s", LFending);
2706 sprintf(tf[4], "0.174138601152136%s", LFending);
2707 sprintf(tf[11], "-2.511393318389568%s", LFending);
2708 sprintf(tf[12], "-1.823546408682421%s", LFending);
2709 sprintf(tf[13], "1.444979909023996%s", LFending);
2710 sprintf(tf[14], "-1.344056915177370%s", LFending);
2711 sprintf(tf[15], "-0.975932420775946%s", LFending);
2712 sprintf(tf[16], "0.773329778651105%s", LFending);
2713 sprintf(tf[17], "1.927725116783469%s", LFending);
2714 sprintf(tf[18], "1.399739414729183%s", LFending);
2715 sprintf(tf[19], "-1.109154843837551%s", LFending);
2716 }
2717 else {
2718 sprintf(tf[2], "0.749279330626139%s", LFending);
2719 sprintf(tf[3], "-0.401002128321867%s", LFending);
2720 sprintf(tf[4], "-0.174138601152136%s", LFending);
2721 sprintf(tf[11], "2.511393318389568%s", LFending);
2722 sprintf(tf[12], "1.823546408682421%s", LFending);
2723 sprintf(tf[13], "-1.444979909023996%s", LFending);
2724 sprintf(tf[14], "1.344056915177370%s", LFending);
2725 sprintf(tf[15], "0.975932420775946%s", LFending);
2726 sprintf(tf[16], "-0.773329778651105%s", LFending);
2727 sprintf(tf[17], "-1.927725116783469%s", LFending);
2728 sprintf(tf[18], "-1.399739414729183%s", LFending);
2729 sprintf(tf[19], "1.109154843837551%s", LFending);
2730 }
2731 for (uint64_t i = radix - 1; i > 0; i--) {
2732 if (i == radix - 1) {
2733 if (sc->LUT) {
2734 sc->tempLen = sprintf(sc->tempStr, " %s = twiddleLUT[LUTId];\n", w);
2735 res = VkAppendLine(sc);
2736 if (res != VKFFT_SUCCESS) return res;
2737 if (!sc->inverse) {
2738 sc->tempLen = sprintf(sc->tempStr, " %s.y = -%s.y;\n", w, w);
2739 res = VkAppendLine(sc);
2740 if (res != VKFFT_SUCCESS) return res;
2741 }
2742 }
2743 else {
2744 if (!strcmp(floatType, "float")) {
2745 sc->tempLen = sprintf(sc->tempStr, " %s.x = %s(angle*%.17f%s);\n", w, cosDef, 2.0 * i / radix, LFending);
2746 res = VkAppendLine(sc);
2747 if (res != VKFFT_SUCCESS) return res;
2748 sc->tempLen = sprintf(sc->tempStr, " %s.y = %s(angle*%.17f%s);\n", w, sinDef, 2.0 * i / radix, LFending);
2749 res = VkAppendLine(sc);
2750 if (res != VKFFT_SUCCESS) return res;
2751 //sc->tempLen = sprintf(sc->tempStr, " w = %s(cos(angle*%.17f), sin(angle*%.17f));\n\n", vecType, 2.0 * i / radix, 2.0 * i / radix);
2752 }
2753 if (!strcmp(floatType, "double")) {
2754 sc->tempLen = sprintf(sc->tempStr, " %s = sincos_20(angle*%.17f%s);\n", w, 2.0 * i / radix, LFending);
2755 res = VkAppendLine(sc);
2756 if (res != VKFFT_SUCCESS) return res;
2757 }
2758 }
2759 }
2760 else {
2761 if (sc->LUT) {
2762 sc->tempLen = sprintf(sc->tempStr, " %s = twiddleLUT[LUTId+%" PRIu64 "];\n\n", w, (radix - 1 - i) * stageSize);
2763 res = VkAppendLine(sc);
2764 if (res != VKFFT_SUCCESS) return res;
2765 if (!sc->inverse) {
2766 sc->tempLen = sprintf(sc->tempStr, " %s.y = -%s.y;\n", w, w);
2767 res = VkAppendLine(sc);
2768 if (res != VKFFT_SUCCESS) return res;
2769 }
2770 }
2771 else {
2772 if (!strcmp(floatType, "float")) {
2773 sc->tempLen = sprintf(sc->tempStr, " %s.x = %s(angle*%.17f%s);\n", w, cosDef, 2.0 * i / radix, LFending);
2774 res = VkAppendLine(sc);
2775 if (res != VKFFT_SUCCESS) return res;
2776 sc->tempLen = sprintf(sc->tempStr, " %s.y = %s(angle*%.17f%s);\n", w, sinDef, 2.0 * i / radix, LFending);
2777 res = VkAppendLine(sc);
2778 if (res != VKFFT_SUCCESS) return res;
2779 //sc->tempLen = sprintf(sc->tempStr, " w = %s(cos(angle*%.17f), sin(angle*%.17f));\n\n", vecType, 2.0 * i / radix, 2.0 * i / radix);
2780 }
2781 if (!strcmp(floatType, "double")) {
2782 sc->tempLen = sprintf(sc->tempStr, " %s = sincos_20(angle*%.17f%s);\n", w, 2.0 * i / radix, LFending);
2783 res = VkAppendLine(sc);
2784 if (res != VKFFT_SUCCESS) return res;
2785 }
2786 }
2787 }
2788 res = VkMulComplex(sc, sc->locID[i], regID[i], w, 0);
2789 if (res != VKFFT_SUCCESS) return res;
2790
2791 }
2792 res = VkMovComplex(sc, sc->locID[0], regID[0]);
2793 if (res != VKFFT_SUCCESS) return res;
2794 uint64_t permute[13] = { 0,1,3,9,5,2,6,12,10,4,8,11,7 };
2795 res = VkPermute(sc, permute, 13, 0, 0);
2796 if (res != VKFFT_SUCCESS) return res;
2797 for (uint64_t i = 0; i < 6; i++) {
2798 res = VkSubComplex(sc, regID[i + 7], sc->locID[i + 1], sc->locID[i + 7]);
2799 if (res != VKFFT_SUCCESS) return res;
2800 res = VkAddComplex(sc, sc->locID[i + 1], sc->locID[i + 1], sc->locID[i + 7]);
2801 if (res != VKFFT_SUCCESS) return res;
2802 }
2803 for (uint64_t i = 0; i < 3; i++) {
2804 res = VkAddComplex(sc, regID[i + 1], sc->locID[i + 1], sc->locID[i + 4]);
2805 if (res != VKFFT_SUCCESS) return res;
2806 res = VkSubComplex(sc, regID[i + 4], sc->locID[i + 1], sc->locID[i + 4]);
2807 if (res != VKFFT_SUCCESS) return res;
2808 }
2809 for (uint64_t i = 0; i < 4; i++) {
2810 res = VkAddComplex(sc, sc->locID[i + 1], regID[i * 3 + 1], regID[i * 3 + 2]);
2811 if (res != VKFFT_SUCCESS) return res;
2812 res = VkSubComplex(sc, sc->locID[i * 2 + 5], regID[i * 3 + 1], regID[i * 3 + 3]);
2813 if (res != VKFFT_SUCCESS) return res;
2814 res = VkAddComplex(sc, sc->locID[i + 1], sc->locID[i + 1], regID[i * 3 + 3]);
2815 if (res != VKFFT_SUCCESS) return res;
2816 res = VkSubComplex(sc, sc->locID[i * 2 + 6], regID[i * 3 + 2], regID[i * 3 + 3]);
2817 if (res != VKFFT_SUCCESS) return res;
2818 }
2819
2820 res = VkAddComplex(sc, regID[0], sc->locID[0], sc->locID[1]);
2821 if (res != VKFFT_SUCCESS) return res;
2822 res = VkMulComplexNumber(sc, regID[1], sc->locID[1], tf[0]);
2823 if (res != VKFFT_SUCCESS) return res;
2824 res = VkMulComplexNumber(sc, regID[2], sc->locID[2], tf[1]);
2825 if (res != VKFFT_SUCCESS) return res;
2826 for (uint64_t k = 0; k < 3; k++) {
2827 res = VkAddComplex(sc, regID[k * 2 + 4], sc->locID[k * 2 + 3], sc->locID[k * 2 + 4]);
2828
2829 if (k == 0) {
2830 res = VkMulComplexNumberImag(sc, sc->locID[k * 2 + 3], sc->locID[k * 2 + 3], tf[k * 3 + 2], sc->locID[0]);
2831 if (res != VKFFT_SUCCESS) return res;
2832 res = VkMulComplexNumberImag(sc, sc->locID[k * 2 + 4], sc->locID[k * 2 + 4], tf[k * 3 + 3], sc->locID[0]);
2833 if (res != VKFFT_SUCCESS) return res;
2834 res = VkMulComplexNumberImag(sc, regID[k * 2 + 4], regID[k * 2 + 4], tf[k * 3 + 4], sc->locID[0]);
2835 if (res != VKFFT_SUCCESS) return res;
2836 }
2837 else {
2838 res = VkMulComplexNumber(sc, sc->locID[k * 2 + 3], sc->locID[k * 2 + 3], tf[k * 3 + 2]);
2839 if (res != VKFFT_SUCCESS) return res;
2840 res = VkMulComplexNumber(sc, sc->locID[k * 2 + 4], sc->locID[k * 2 + 4], tf[k * 3 + 3]);
2841 if (res != VKFFT_SUCCESS) return res;
2842 res = VkMulComplexNumber(sc, regID[k * 2 + 4], regID[k * 2 + 4], tf[k * 3 + 4]);
2843 if (res != VKFFT_SUCCESS) return res;
2844 }
2845
2846 res = VkAddComplex(sc, regID[k * 2 + 3], sc->locID[k * 2 + 3], regID[k * 2 + 4]);
2847 if (res != VKFFT_SUCCESS) return res;
2848 res = VkAddComplex(sc, regID[k * 2 + 4], sc->locID[k * 2 + 4], regID[k * 2 + 4]);
2849 if (res != VKFFT_SUCCESS) return res;
2850
2851 }
2852 res = VkAddComplex(sc, regID[9], sc->locID[9], sc->locID[11]);
2853 if (res != VKFFT_SUCCESS) return res;
2854 res = VkAddComplex(sc, regID[10], sc->locID[10], sc->locID[12]);
2855 if (res != VKFFT_SUCCESS) return res;
2856 res = VkAddComplex(sc, regID[11], sc->locID[9], sc->locID[10]);
2857 if (res != VKFFT_SUCCESS) return res;
2858 res = VkAddComplex(sc, regID[12], sc->locID[11], sc->locID[12]);
2859 if (res != VKFFT_SUCCESS) return res;
2860 res = VkAddComplex(sc, sc->locID[1], regID[9], regID[10]);
2861 if (res != VKFFT_SUCCESS) return res;
2862
2863 res = VkMulComplexNumberImag(sc, sc->locID[9], sc->locID[9], tf[11], sc->locID[0]);
2864 if (res != VKFFT_SUCCESS) return res;
2865 res = VkMulComplexNumberImag(sc, sc->locID[10], sc->locID[10], tf[12], sc->locID[0]);
2866 if (res != VKFFT_SUCCESS) return res;
2867 res = VkMulComplexNumberImag(sc, regID[11], regID[11], tf[13], sc->locID[0]);
2868 if (res != VKFFT_SUCCESS) return res;
2869 res = VkMulComplexNumberImag(sc, sc->locID[11], sc->locID[11], tf[14], sc->locID[0]);
2870 if (res != VKFFT_SUCCESS) return res;
2871 res = VkMulComplexNumberImag(sc, sc->locID[12], sc->locID[12], tf[15], sc->locID[0]);
2872 if (res != VKFFT_SUCCESS) return res;
2873 res = VkMulComplexNumberImag(sc, regID[12], regID[12], tf[16], sc->locID[0]);
2874 if (res != VKFFT_SUCCESS) return res;
2875 res = VkMulComplexNumberImag(sc, regID[9], regID[9], tf[17], sc->locID[0]);
2876 if (res != VKFFT_SUCCESS) return res;
2877 res = VkMulComplexNumberImag(sc, regID[10], regID[10], tf[18], sc->locID[0]);
2878 if (res != VKFFT_SUCCESS) return res;
2879 res = VkMulComplexNumberImag(sc, sc->locID[1], sc->locID[1], tf[19], sc->locID[0]);
2880 if (res != VKFFT_SUCCESS) return res;
2881
2882 res = VkAddComplex(sc, sc->locID[9], sc->locID[9], regID[9]);
2883 if (res != VKFFT_SUCCESS) return res;
2884 res = VkAddComplex(sc, sc->locID[11], sc->locID[11], regID[9]);
2885 if (res != VKFFT_SUCCESS) return res;
2886 res = VkAddComplex(sc, sc->locID[10], sc->locID[10], regID[10]);
2887 if (res != VKFFT_SUCCESS) return res;
2888 res = VkAddComplex(sc, sc->locID[12], sc->locID[12], regID[10]);
2889 if (res != VKFFT_SUCCESS) return res;
2890 res = VkAddComplex(sc, regID[11], regID[11], sc->locID[1]);
2891 if (res != VKFFT_SUCCESS) return res;
2892 res = VkAddComplex(sc, regID[12], regID[12], sc->locID[1]);
2893 if (res != VKFFT_SUCCESS) return res;
2894
2895 res = VkAddComplex(sc, regID[9], sc->locID[9], regID[11]);
2896 if (res != VKFFT_SUCCESS) return res;
2897 res = VkAddComplex(sc, regID[10], sc->locID[10], regID[11]);
2898 if (res != VKFFT_SUCCESS) return res;
2899 res = VkAddComplex(sc, regID[11], sc->locID[11], regID[12]);
2900 if (res != VKFFT_SUCCESS) return res;
2901 res = VkAddComplex(sc, regID[12], sc->locID[12], regID[12]);
2902 if (res != VKFFT_SUCCESS) return res;
2903
2904 res = VkAddComplex(sc, regID[1], regID[0], regID[1]);
2905 if (res != VKFFT_SUCCESS) return res;
2906
2907 for (uint64_t i = 0; i < 4; i++) {
2908 res = VkAddComplex(sc, sc->locID[i * 3 + 1], regID[i + 1], regID[i * 2 + 5]);
2909 if (res != VKFFT_SUCCESS) return res;
2910 res = VkSubComplex(sc, sc->locID[i * 3 + 3], regID[i + 1], regID[i * 2 + 5]);
2911 if (res != VKFFT_SUCCESS) return res;
2912 res = VkAddComplex(sc, sc->locID[i * 3 + 2], regID[i + 1], regID[i * 2 + 6]);
2913 if (res != VKFFT_SUCCESS) return res;
2914 res = VkSubComplex(sc, sc->locID[i * 3 + 3], sc->locID[i * 3 + 3], regID[i * 2 + 6]);
2915 if (res != VKFFT_SUCCESS) return res;
2916 }
2917 for (uint64_t i = 0; i < 3; i++) {
2918 res = VkAddComplex(sc, regID[i + 1], sc->locID[i + 1], sc->locID[i + 4]);
2919 if (res != VKFFT_SUCCESS) return res;
2920 res = VkSubComplex(sc, sc->locID[i + 4], sc->locID[i + 1], sc->locID[i + 4]);
2921 if (res != VKFFT_SUCCESS) return res;
2922 res = VkMovComplex(sc, sc->locID[i + 1], regID[i + 1]);
2923 if (res != VKFFT_SUCCESS) return res;
2924 }
2925 for (uint64_t i = 0; i < 6; i++) {
2926 res = VkAddComplex(sc, regID[i + 1], sc->locID[i + 1], sc->locID[i + 7]);
2927 if (res != VKFFT_SUCCESS) return res;
2928 res = VkSubComplex(sc, regID[i + 7], sc->locID[i + 1], sc->locID[i + 7]);
2929 if (res != VKFFT_SUCCESS) return res;
2930 }
2931 uint64_t permute2[13] = { 0,12,1,10,5,3,2,8,9,11,4,7,6 };
2932 res = VkPermute(sc, permute2, 13, 1, regID);
2933 if (res != VKFFT_SUCCESS) return res;
2934
2935 for (uint64_t i = 0; i < 20; i++) {
2936 free(tf[i]);
2937 tf[i] = 0;
2938 }
2939 break;
2940 }
2941 }
2942 return res;
2943}
// Appends the shared-memory declarations of the generated kernel: a
// `sharedStride` variable of type `uintType` and the `sdata` buffer, and
// records the resulting footprint in sc->usedSharedMemory.
//
// sc         - layout/state of the kernel being generated; text is staged in
//              sc->tempStr / sc->tempLen and committed with VkAppendLine(sc).
// floatType  - "float" or "double"; selects the two-component vector type name
//              and the per-element size (8 or 16) used in the size arithmetic.
// uintType   - type name used for the emitted `sharedStride` declaration.
// sharedType - selects one of the two layouts handled by the switch below;
//              any other value emits nothing.
// Returns `res`; the first VkAppendLine result that is not VKFFT_SUCCESS is
// returned immediately.
//
// NOTE(review): no declaration of `res` is visible in this listing, and the
// embedded listing numbers skip (2945, 2990, 2994, 2996, 2998, 2999, 3034), so
// some statements may be missing here - confirm against the actual header.
2944static inline VkFFTResult appendSharedMemoryVkFFT(VkFFTSpecializationConstantsLayout* sc, const char* floatType, const char* uintType, uint64_t sharedType) {
// NOTE(review): vecType stays uninitialized if floatType is neither "float"
// nor "double", yet it is later passed to sprintf("%s").
2946 char vecType[30];
2947 char sharedDefinitions[20] = "";
// vecSize is the divisor/multiplier converting between sc->sharedMemSize and an
// element count (presumably bytes per complex element - confirm).
2948 uint64_t vecSize = 1;
2949 uint64_t maxSequenceSharedMemory = 0;
2950 //uint64_t maxSequenceSharedMemoryPow2 = 0;
2951 if (!strcmp(floatType, "float"))
2952 {
// Backend 0 is the Vulkan/glslang build and backend 1 the CUDA/NVRTC build (see
// the includes at the top of the file); backends 2 and 3 are presumably HIP and
// OpenCL, judging by the `__shared__` / `__local` qualifiers - confirm.
2953#if(VKFFT_BACKEND==0)
2954 sprintf(vecType, "vec2");
2955 sprintf(sharedDefinitions, "shared");
2956#elif(VKFFT_BACKEND==1)
2957 sprintf(vecType, "float2");
2958 sprintf(sharedDefinitions, "__shared__");
2959#elif(VKFFT_BACKEND==2)
2960 sprintf(vecType, "float2");
2961 sprintf(sharedDefinitions, "__shared__");
2962#elif(VKFFT_BACKEND==3)
2963 sprintf(vecType, "float2");
2964 sprintf(sharedDefinitions, "__local");
2965#endif
2966 vecSize = 8;
2967 }
2968 if (!strcmp(floatType, "double")) {
2969#if(VKFFT_BACKEND==0)
2970 sprintf(vecType, "dvec2");
2971 sprintf(sharedDefinitions, "shared");
2972#elif(VKFFT_BACKEND==1)
2973 sprintf(vecType, "double2");
2974 sprintf(sharedDefinitions, "__shared__");
2975#elif(VKFFT_BACKEND==2)
2976 sprintf(vecType, "double2");
2977 sprintf(sharedDefinitions, "__shared__");
2978#elif(VKFFT_BACKEND==3)
2979 sprintf(vecType, "double2");
2980 sprintf(sharedDefinitions, "__local");
2981#endif
2982 vecSize = 16;
2983 }
// Number of vector elements that fit into sc->sharedMemSize.
2984 maxSequenceSharedMemory = sc->sharedMemSize / vecSize;
2985 //maxSequenceSharedMemoryPow2 = sc->sharedMemSizePow2 / vecSize;
// Two extra elements are accounted for when sequences are merged for R2C on axis 0.
2986 uint64_t mergeR2C = (sc->mergeSequencesR2C && (sc->axis_id == 0)) ? 2 : 0;
2987 switch (sharedType) {
2988 case 0: case 5: case 6: case 110: case 120: case 130: case 140: case 142: case 144://single_c2c + single_r2c
2989 {
// If fftDim exceeds half the bank count and is a power of two, the first-stage
// stride is scaled by (banks/2 + 1)/(banks/2); otherwise it is fftDim/registerBoost.
2991 sc->sharedStrideBankConflictFirstStages = ((sc->fftDim > sc->numSharedBanks / 2) && ((sc->fftDim & (sc->fftDim - 1)) == 0)) ? sc->fftDim / sc->registerBoost * (sc->numSharedBanks / 2 + 1) / (sc->numSharedBanks / 2) : sc->fftDim / sc->registerBoost;
// Read/write stride: fftDim/registerBoost plus 1, or plus (banks/2)/localSize[1]
// when localSize[1] is smaller than half the bank count.
2992 sc->sharedStrideReadWriteConflict = ((sc->numSharedBanks / 2 <= sc->localSize[1])) ? sc->fftDim / sc->registerBoost + 1 : sc->fftDim / sc->registerBoost + (sc->numSharedBanks / 2) / sc->localSize[1];
// Raise the stride to at least fftDim/registerBoost + mergeR2C.
2993 if (sc->sharedStrideReadWriteConflict < sc->fftDim / sc->registerBoost + mergeR2C) sc->sharedStrideReadWriteConflict = sc->fftDim / sc->registerBoost + mergeR2C;
// NOTE(review): sc->maxSharedStride is read here but no assignment to it is
// visible in this case (listing lines 2994/2996 are absent) - confirm.
2995 sc->usedSharedMemory = vecSize * sc->localSize[1] * sc->maxSharedStride;
2997
3000 //sc->maxSharedStride += mergeR2C;
3001 //printf("%" PRIu64 " %" PRIu64 " %" PRIu64 " %" PRIu64 " %" PRIu64 "\n", sc->maxSharedStride, sc->sharedStrideBankConflictFirstStages, sc->sharedStrideReadWriteConflict, sc->localSize[1], sc->fftDim);
3002 sc->tempLen = sprintf(sc->tempStr, "%s sharedStride = %" PRIu64 ";\n", uintType, sc->sharedStrideReadWriteConflict);
3003 res = VkAppendLine(sc);
3004 if (res != VKFFT_SUCCESS) return res;
// Backends 0 and 3 emit a fixed-size `sdata` array of localSize[1] * maxSharedStride
// elements; backends 1 and 2 emit `sdata` as a pointer cast from `shared`.
3005#if(VKFFT_BACKEND==0)
3006 sc->tempLen = sprintf(sc->tempStr, "%s %s sdata[%" PRIu64 "];// sharedStride - fft size, gl_WorkGroupSize.y - grouped consecutive ffts\n\n", sharedDefinitions, vecType, sc->localSize[1] * sc->maxSharedStride);
3007 res = VkAppendLine(sc);
3008 if (res != VKFFT_SUCCESS) return res;
3009#elif(VKFFT_BACKEND==1)
3010 //sc->tempLen = sprintf(sc->tempStr, "%s %s sdata[%" PRIu64 "];// sharedStride - fft size, gl_WorkGroupSize.y - grouped consecutive ffts\n\n", sharedDefinitions, vecType, sc->localSize[1] * sc->maxSharedStride);
3011 sc->tempLen = sprintf(sc->tempStr, "%s* sdata = (%s*)shared;\n\n", vecType, vecType);
3012 res = VkAppendLine(sc);
3013 if (res != VKFFT_SUCCESS) return res;
3014 //sc->tempLen = sprintf(sc->tempStr, "%s %s sdata[];// sharedStride - fft size, gl_WorkGroupSize.y - grouped consecutive ffts\n\n", sharedDefinitions, vecType);
3015#elif(VKFFT_BACKEND==2)
3016 //sc->tempLen = sprintf(sc->tempStr, "%s %s sdata[%" PRIu64 "];// sharedStride - fft size, gl_WorkGroupSize.y - grouped consecutive ffts\n\n", sharedDefinitions, vecType, sc->localSize[1] * sc->maxSharedStride);
3017 sc->tempLen = sprintf(sc->tempStr, "%s* sdata = (%s*)shared;\n\n", vecType, vecType);
3018 res = VkAppendLine(sc);
3019 if (res != VKFFT_SUCCESS) return res;
3020 //sc->tempLen = sprintf(sc->tempStr, "%s %s sdata[];// sharedStride - fft size, gl_WorkGroupSize.y - grouped consecutive ffts\n\n", sharedDefinitions, vecType);
3021#elif(VKFFT_BACKEND==3)
3022 sc->tempLen = sprintf(sc->tempStr, "%s %s sdata[%" PRIu64 "];// sharedStride - fft size, gl_WorkGroupSize.y - grouped consecutive ffts\n\n", sharedDefinitions, vecType, sc->localSize[1] * sc->maxSharedStride);
3023 res = VkAppendLine(sc);
3024 if (res != VKFFT_SUCCESS) return res;
3025#endif
// Same expression as the assignment made before the emission above.
3026 sc->usedSharedMemory = vecSize * sc->localSize[1] * sc->maxSharedStride;
3027 break;
3028 }
3029 case 1: case 2: case 111: case 121: case 131: case 141: case 143: case 145://grouped_c2c + single_c2c_strided
3030 {
// Extra stride applied only when the axis is swapped and localSize[0] is a
// multiple of 4; it is (banks/2)/fftDim for short FFTs and 1 otherwise.
3031 uint64_t shift = (sc->fftDim < (sc->numSharedBanks / 2)) ? (sc->numSharedBanks / 2) / sc->fftDim : 1;
3032 sc->sharedStrideReadWriteConflict = ((sc->axisSwapped) && ((sc->localSize[0] % 4) == 0)) ? sc->localSize[0] + shift : sc->localSize[0];
// Fall back to the plain localSize[0] stride when the padded layout would
// exceed the element capacity computed from sc->sharedMemSize.
3033 sc->maxSharedStride = ((maxSequenceSharedMemory < sc->sharedStrideReadWriteConflict* sc->fftDim / sc->registerBoost)) ? sc->localSize[0] : sc->sharedStrideReadWriteConflict;
3035 sc->tempLen = sprintf(sc->tempStr, "%s sharedStride = %" PRIu64 ";\n", uintType, sc->maxSharedStride);
3036 res = VkAppendLine(sc);
3037 if (res != VKFFT_SUCCESS) return res;
// Backends 0 and 3 size the array as maxSharedStride * (fftDim + mergeR2C) / registerBoost;
// backends 1 and 2 again alias `sdata` onto `shared`.
3038#if(VKFFT_BACKEND==0)
3039 sc->tempLen = sprintf(sc->tempStr, "%s %s sdata[%" PRIu64 "];\n\n", sharedDefinitions, vecType, sc->maxSharedStride * (sc->fftDim + mergeR2C) / sc->registerBoost);
3040 res = VkAppendLine(sc);
3041 if (res != VKFFT_SUCCESS) return res;
3042#elif(VKFFT_BACKEND==1)
3043 //sc->tempLen = sprintf(sc->tempStr, "%s %s sdata[%" PRIu64 "];\n\n", sharedDefinitions, vecType, sc->maxSharedStride * (sc->fftDim + mergeR2C) / sc->registerBoost);
3044 sc->tempLen = sprintf(sc->tempStr, "%s* sdata = (%s*)shared;\n\n", vecType, vecType);
3045 res = VkAppendLine(sc);
3046 if (res != VKFFT_SUCCESS) return res;
3047 //sc->tempLen = sprintf(sc->tempStr, "%s %s sdata[];\n\n", sharedDefinitions, vecType);
3048#elif(VKFFT_BACKEND==2)
3049 //sc->tempLen = sprintf(sc->tempStr, "%s %s sdata[%" PRIu64 "];\n\n", sharedDefinitions, vecType, sc->maxSharedStride * (sc->fftDim + mergeR2C) / sc->registerBoost);
3050 sc->tempLen = sprintf(sc->tempStr, "%s* sdata = (%s*)shared;\n\n", vecType, vecType);
3051 res = VkAppendLine(sc);
3052 if (res != VKFFT_SUCCESS) return res;
3053 //sc->tempLen = sprintf(sc->tempStr, "%s %s sdata[];\n\n", sharedDefinitions, vecType);
3054#elif(VKFFT_BACKEND==3)
3055 sc->tempLen = sprintf(sc->tempStr, "%s %s sdata[%" PRIu64 "];\n\n", sharedDefinitions, vecType, sc->maxSharedStride * (sc->fftDim + mergeR2C) / sc->registerBoost);
3056 res = VkAppendLine(sc);
3057 if (res != VKFFT_SUCCESS) return res;
3058#endif
3059 sc->usedSharedMemory = vecSize * sc->maxSharedStride * (sc->fftDim + mergeR2C) / sc->registerBoost;
3060 break;
3061 }
3062 }
3063 return res;
3064}
3065static inline VkFFTResult appendInitialization(VkFFTSpecializationConstantsLayout* sc, const char* floatType, const char* uintType, uint64_t initType) {
3067 char vecType[30];
3068#if(VKFFT_BACKEND==0)
3069 if (!strcmp(floatType, "float")) sprintf(vecType, "vec2");
3070 if (!strcmp(floatType, "double")) sprintf(vecType, "dvec2");
3071#elif(VKFFT_BACKEND==1)
3072 if (!strcmp(floatType, "float")) sprintf(vecType, "float2");
3073 if (!strcmp(floatType, "double")) sprintf(vecType, "double2");
3074#elif(VKFFT_BACKEND==2)
3075 if (!strcmp(floatType, "float")) sprintf(vecType, "float2");
3076 if (!strcmp(floatType, "double")) sprintf(vecType, "double2");
3077#elif(VKFFT_BACKEND==3)
3078 if (!strcmp(floatType, "float")) sprintf(vecType, "float2");
3079 if (!strcmp(floatType, "double")) sprintf(vecType, "double2");
3080#endif
3081 //sc->tempLen = sprintf(sc->tempStr, " uint dum=gl_LocalInvocationID.x;\n");
3082 uint64_t logicalStoragePerThread = sc->registers_per_thread * sc->registerBoost;
3083 uint64_t logicalRegistersPerThread = sc->registers_per_thread;
3084 if (sc->convolutionStep) {
3085 for (uint64_t i = 0; i < sc->registers_per_thread; i++) {
3086 sc->tempLen = sprintf(sc->tempStr, " %s temp_%" PRIu64 ";\n", vecType, i);
3087 res = VkAppendLine(sc);
3088 if (res != VKFFT_SUCCESS) return res;
3089 sc->tempLen = sprintf(sc->tempStr, " temp_%" PRIu64 ".x=0;\n", i);
3090 res = VkAppendLine(sc);
3091 if (res != VKFFT_SUCCESS) return res;
3092 sc->tempLen = sprintf(sc->tempStr, " temp_%" PRIu64 ".y=0;\n", i);
3093 res = VkAppendLine(sc);
3094 if (res != VKFFT_SUCCESS) return res;
3095 }
3096 for (uint64_t j = 1; j < sc->matrixConvolution; j++) {
3097 for (uint64_t i = 0; i < sc->min_registers_per_thread; i++) {
3098 sc->tempLen = sprintf(sc->tempStr, " %s temp_%" PRIu64 "_%" PRIu64 ";\n", vecType, i, j);
3099 res = VkAppendLine(sc);
3100 if (res != VKFFT_SUCCESS) return res;
3101 sc->tempLen = sprintf(sc->tempStr, " temp_%" PRIu64 "_%" PRIu64 ".x=0;\n", i, j);
3102 res = VkAppendLine(sc);
3103 if (res != VKFFT_SUCCESS) return res;
3104 sc->tempLen = sprintf(sc->tempStr, " temp_%" PRIu64 "_%" PRIu64 ".y=0;\n", i, j);
3105 res = VkAppendLine(sc);
3106 if (res != VKFFT_SUCCESS) return res;
3107 }
3108 }
3109 }
3110 else {
3111 for (uint64_t i = 0; i < sc->registers_per_thread; i++) {
3112 sc->tempLen = sprintf(sc->tempStr, " %s temp_%" PRIu64 ";\n", vecType, i);
3113 res = VkAppendLine(sc);
3114 if (res != VKFFT_SUCCESS) return res;
3115 sc->tempLen = sprintf(sc->tempStr, " temp_%" PRIu64 ".x=0;\n", i);
3116 res = VkAppendLine(sc);
3117 if (res != VKFFT_SUCCESS) return res;
3118 sc->tempLen = sprintf(sc->tempStr, " temp_%" PRIu64 ".y=0;\n", i);
3119 res = VkAppendLine(sc);
3120 if (res != VKFFT_SUCCESS) return res;
3121 }
3122 }
3123 //sc->tempLen = sprintf(sc->tempStr, " uint dum=gl_LocalInvocationID.y;//gl_LocalInvocationID.x/gl_WorkGroupSize.x;\n");
3124 //sc->tempLen = sprintf(sc->tempStr, " dum=dum/gl_LocalInvocationID.x-1;\n");
3125 //sc->tempLen = sprintf(sc->tempStr, " dummy=dummy/gl_LocalInvocationID.x-1;\n");
3126 sc->regIDs = (char**)malloc(sizeof(char*) * logicalStoragePerThread);
3127 if (!sc->regIDs) return VKFFT_ERROR_MALLOC_FAILED;
3128 for (uint64_t i = 0; i < logicalStoragePerThread; i++) {
3129 sc->regIDs[i] = (char*)malloc(sizeof(char) * 50);
3130 if (!sc->regIDs[i]) {
3131 for (uint64_t j = 0; j < i; j++) {
3132 free(sc->regIDs[j]);
3133 sc->regIDs[j] = 0;
3134 }
3135 free(sc->regIDs);
3136 sc->regIDs = 0;
3138 }
3139 if (i < logicalRegistersPerThread)
3140 sprintf(sc->regIDs[i], "temp_%" PRIu64 "", i);
3141 else
3142 sprintf(sc->regIDs[i], "temp_%" PRIu64 "", i);
3143 //sprintf(sc->regIDs[i], "%" PRIu64 "[%" PRIu64 "]", i / logicalRegistersPerThread, i % logicalRegistersPerThread);
3144 //sprintf(sc->regIDs[i], "s[%" PRIu64 "]", i - logicalRegistersPerThread);
3145
3146 }
3147 if (sc->registerBoost > 1) {
3148 //sc->tempLen = sprintf(sc->tempStr, " %s sort0;\n", vecType);
3149 //sc->tempLen = sprintf(sc->tempStr, " %s temps[%" PRIu64 "];\n", vecType, (sc->registerBoost -1)* logicalRegistersPerThread);
3150 for (uint64_t i = 1; i < sc->registerBoost; i++) {
3151 //sc->tempLen = sprintf(sc->tempStr, " %s temp%" PRIu64 "[%" PRIu64 "];\n", vecType, i, logicalRegistersPerThread);
3152 for (uint64_t j = 0; j < sc->registers_per_thread; j++) {
3153 sc->tempLen = sprintf(sc->tempStr, " %s temp_%" PRIu64 ";\n", vecType, j + i * sc->registers_per_thread);
3154 res = VkAppendLine(sc);
3155 if (res != VKFFT_SUCCESS) return res;
3156 sc->tempLen = sprintf(sc->tempStr, " temp_%" PRIu64 ".x=0;\n", j + i * sc->registers_per_thread);
3157 res = VkAppendLine(sc);
3158 if (res != VKFFT_SUCCESS) return res;
3159 sc->tempLen = sprintf(sc->tempStr, " temp_%" PRIu64 ".y=0;\n", j + i * sc->registers_per_thread);
3160 res = VkAppendLine(sc);
3161 if (res != VKFFT_SUCCESS) return res;
3162 }
3163 /*sc->tempLen = sprintf(sc->tempStr, "\
3164for(uint i=0; i<%" PRIu64 "; i++)\n\
3165temp%" PRIu64 "[i]=%s(dum, dum);\n", logicalRegistersPerThread, i, vecType);*/
3166 }
3167 }
3168 sc->tempLen = sprintf(sc->tempStr, " %s w;\n", vecType);
3169 res = VkAppendLine(sc);
3170 if (res != VKFFT_SUCCESS) return res;
3171 sc->tempLen = sprintf(sc->tempStr, " w.x=0;\n");
3172 res = VkAppendLine(sc);
3173 if (res != VKFFT_SUCCESS) return res;
3174 sc->tempLen = sprintf(sc->tempStr, " w.y=0;\n");
3175 res = VkAppendLine(sc);
3176 if (res != VKFFT_SUCCESS) return res;
3177 sprintf(sc->w, "w");
3178 uint64_t maxNonPow2Radix = 1;
3179 if (sc->fftDim % 3 == 0) maxNonPow2Radix = 3;
3180 if (sc->fftDim % 5 == 0) maxNonPow2Radix = 5;
3181 if (sc->fftDim % 7 == 0) maxNonPow2Radix = 7;
3182 if (sc->fftDim % 11 == 0) maxNonPow2Radix = 11;
3183 if (sc->fftDim % 13 == 0) maxNonPow2Radix = 13;
3184 for (uint64_t i = 0; i < maxNonPow2Radix; i++) {
3185 sprintf(sc->locID[i], "loc_%" PRIu64 "", i);
3186 sc->tempLen = sprintf(sc->tempStr, " %s %s;\n", vecType, sc->locID[i]);
3187 res = VkAppendLine(sc);
3188 if (res != VKFFT_SUCCESS) return res;
3189 sc->tempLen = sprintf(sc->tempStr, " %s.x=0;\n", sc->locID[i]);
3190 res = VkAppendLine(sc);
3191 if (res != VKFFT_SUCCESS) return res;
3192 sc->tempLen = sprintf(sc->tempStr, " %s.y=0;\n", sc->locID[i]);
3193 res = VkAppendLine(sc);
3194 if (res != VKFFT_SUCCESS) return res;
3195 }
3196 sprintf(sc->temp, "%s", sc->locID[0]);
3197 uint64_t useRadix8 = 0;
3198 for (uint64_t i = 0; i < sc->numStages; i++)
3199 if (sc->stageRadix[i] == 8) useRadix8 = 1;
3200 if (useRadix8 == 1) {
3201 if (maxNonPow2Radix > 1) sprintf(sc->iw, "%s", sc->locID[1]);
3202 else {
3203 sc->tempLen = sprintf(sc->tempStr, " %s iw;\n", vecType);
3204 res = VkAppendLine(sc);
3205 if (res != VKFFT_SUCCESS) return res;
3206 sc->tempLen = sprintf(sc->tempStr, " iw.x=0;\n");
3207 res = VkAppendLine(sc);
3208 if (res != VKFFT_SUCCESS) return res;
3209 sc->tempLen = sprintf(sc->tempStr, " iw.y=0;\n");
3210 res = VkAppendLine(sc);
3211 if (res != VKFFT_SUCCESS) return res;
3212 sprintf(sc->iw, "iw");
3213 }
3214 }
3215 //sc->tempLen = sprintf(sc->tempStr, " %s %s;\n", vecType, sc->tempReg);
3216 sc->tempLen = sprintf(sc->tempStr, " %s %s=0;\n", uintType, sc->stageInvocationID);
3217 res = VkAppendLine(sc);
3218 if (res != VKFFT_SUCCESS) return res;
3219 sc->tempLen = sprintf(sc->tempStr, " %s %s=0;\n", uintType, sc->blockInvocationID);
3220 res = VkAppendLine(sc);
3221 if (res != VKFFT_SUCCESS) return res;
3222 sc->tempLen = sprintf(sc->tempStr, " %s %s=0;\n", uintType, sc->sdataID);
3223 res = VkAppendLine(sc);
3224 if (res != VKFFT_SUCCESS) return res;
3225 sc->tempLen = sprintf(sc->tempStr, " %s %s=0;\n", uintType, sc->combinedID);
3226 res = VkAppendLine(sc);
3227 if (res != VKFFT_SUCCESS) return res;
3228 sc->tempLen = sprintf(sc->tempStr, " %s %s=0;\n", uintType, sc->inoutID);
3229 res = VkAppendLine(sc);
3230 if (res != VKFFT_SUCCESS) return res;
3231 if (sc->LUT) {
3232 sc->tempLen = sprintf(sc->tempStr, " %s LUTId=0;\n", uintType);
3233 res = VkAppendLine(sc);
3234 if (res != VKFFT_SUCCESS) return res;
3235 }
3236 else {
3237 sc->tempLen = sprintf(sc->tempStr, " %s angle=0;\n", floatType);
3238 res = VkAppendLine(sc);
3239 if (res != VKFFT_SUCCESS) return res;
3240 }
3241 if (((sc->stageStartSize > 1) && (!((sc->stageStartSize > 1) && (!sc->reorderFourStep) && (sc->inverse)))) || (((sc->stageStartSize > 1) && (!sc->reorderFourStep) && (sc->inverse))) || (sc->performDCT)) {
3242 sc->tempLen = sprintf(sc->tempStr, " %s mult;\n", vecType);
3243 res = VkAppendLine(sc);
3244 if (res != VKFFT_SUCCESS) return res;
3245 sc->tempLen = sprintf(sc->tempStr, " mult.x = 0;\n");
3246 res = VkAppendLine(sc);
3247 if (res != VKFFT_SUCCESS) return res;
3248 sc->tempLen = sprintf(sc->tempStr, " mult.y = 0;\n");
3249 res = VkAppendLine(sc);
3250 if (res != VKFFT_SUCCESS) return res;
3251 }
3252 if (sc->cacheShuffle) {
3253 sc->tempLen = sprintf(sc->tempStr, "\
3254 %s tshuffle= ((%s>>1))%%(%" PRIu64 ");\n\
3255 %s shuffle[%" PRIu64 "];\n", uintType, sc->gl_LocalInvocationID_x, sc->registers_per_thread, vecType, sc->registers_per_thread);
3256 res = VkAppendLine(sc);
3257 if (res != VKFFT_SUCCESS) return res;
3258 for (uint64_t i = 0; i < sc->registers_per_thread; i++) {
3259 /*sc->tempLen = sprintf(sc->tempStr, "\
3260shuffle[%" PRIu64 "];\n", i, vecType);*/
3261 sc->tempLen = sprintf(sc->tempStr, " shuffle[%" PRIu64 "].x = 0;\n", i);
3262 res = VkAppendLine(sc);
3263 if (res != VKFFT_SUCCESS) return res;
3264 sc->tempLen = sprintf(sc->tempStr, " shuffle[%" PRIu64 "].y = 0;\n", i);
3265 res = VkAppendLine(sc);
3266 if (res != VKFFT_SUCCESS) return res;
3267 }
3268 }
3269 return res;
3270}
// NOTE(review): the signature line of this function and any declaration of
// `res` are not present in this listing (embedded numbers 3271 and 3273 are
// skipped); the body reads `sc` and returns `res`.
//
// Appends opening guards of the form
//   if(!((<id> >= <left>)&&(<id> < <right>))) {
// to the generated code through sc->tempStr / VkAppendLine(sc). A guard is
// emitted for a dimension d only when sc->performZeropaddingFull[d] is set and
// sc->fft_zeropad_left_full[d] < sc->fft_zeropad_right_full[d]. Which
// dimensions are considered depends on sc->frequencyZeropadding, sc->axis_id
// and sc->supportAxis. Each emitted line leaves a `{` open in the generated
// code (presumably closed by a separate routine - confirm).
// Returns the first VkAppendLine result that is not VKFFT_SUCCESS, else `res`.
3272 //return if sequence is full of zeros from the start
3274 if ((sc->frequencyZeropadding)) {
// Frequency zero-padding: guards are built for dimensions below the current axis.
3275 switch (sc->axis_id) {
3276 case 0: {
3277 break;
3278 }
3279 case 1: {
3280 if (!sc->supportAxis) {
// X index expression; the workgroup-shift term is added only when
// sc->performWorkGroupShift[0] is set.
3281 char idX[500] = "";
3282 if (sc->performWorkGroupShift[0])
3283 sprintf(idX, "(%s + consts.workGroupShiftX * %s)", sc->gl_GlobalInvocationID_x, sc->gl_WorkGroupSize_x);
3284 else
3285 sprintf(idX, "%s", sc->gl_GlobalInvocationID_x);
3286 if (sc->performZeropaddingFull[0]) {
3287 if (sc->fft_zeropad_left_full[0] < sc->fft_zeropad_right_full[0]) {
3288 sc->tempLen = sprintf(sc->tempStr, " if(!((%s >= %" PRIu64 ")&&(%s < %" PRIu64 "))) {\n", idX, sc->fft_zeropad_left_full[0], idX, sc->fft_zeropad_right_full[0]);
3289 res = VkAppendLine(sc);
3290 if (res != VKFFT_SUCCESS) return res;
3291 }
3292 }
3293
3294 }
3295 break;
3296 }
3297 case 2: {
3298 if (!sc->supportAxis) {
// Here the Y index is taken from the z invocation ID (with the Z workgroup shift).
3299 char idY[500] = "";
3300 if (sc->performWorkGroupShift[1])//y axis is along z workgroup here
3301 sprintf(idY, "(%s + consts.workGroupShiftZ * %s)", sc->gl_GlobalInvocationID_z, sc->gl_WorkGroupSize_z);
3302 else
3303 sprintf(idY, "%s", sc->gl_GlobalInvocationID_z);
3304
3305 char idX[500] = "";
3306 if (sc->performWorkGroupShift[0])
3307 sprintf(idX, "(%s + consts.workGroupShiftX * %s)", sc->gl_GlobalInvocationID_x, sc->gl_WorkGroupSize_x);
3308 else
3309 sprintf(idX, "%s", sc->gl_GlobalInvocationID_x);
// Up to two guards may be opened here (X first, then Y).
3310 if (sc->performZeropaddingFull[0]) {
3311 if (sc->fft_zeropad_left_full[0] < sc->fft_zeropad_right_full[0]) {
3312 sc->tempLen = sprintf(sc->tempStr, " if(!((%s >= %" PRIu64 ")&&(%s < %" PRIu64 "))) {\n", idX, sc->fft_zeropad_left_full[0], idX, sc->fft_zeropad_right_full[0]);
3313 res = VkAppendLine(sc);
3314 if (res != VKFFT_SUCCESS) return res;
3315 }
3316 }
3317 if (sc->performZeropaddingFull[1]) {
3318 if (sc->fft_zeropad_left_full[1] < sc->fft_zeropad_right_full[1]) {
3319 sc->tempLen = sprintf(sc->tempStr, " if(!((%s >= %" PRIu64 ")&&(%s < %" PRIu64 "))) {\n", idY, sc->fft_zeropad_left_full[1], idY, sc->fft_zeropad_right_full[1]);
3320 res = VkAppendLine(sc);
3321 if (res != VKFFT_SUCCESS) return res;
3322 }
3323 }
3324 }
3325 else {
// Support axis: only the Y guard is considered, with Y taken from the x invocation ID.
3326 char idY[500] = "";
3327 if (sc->performWorkGroupShift[1])//for support axes y is along x workgroup
3328 sprintf(idY, "(%s + consts.workGroupShiftX * %s)", sc->gl_GlobalInvocationID_x, sc->gl_WorkGroupSize_x);
3329 else
3330 sprintf(idY, "%s", sc->gl_GlobalInvocationID_x);
3331 if (sc->performZeropaddingFull[1]) {
3332 if (sc->fft_zeropad_left_full[1] < sc->fft_zeropad_right_full[1]) {
3333 sc->tempLen = sprintf(sc->tempStr, " if(!((%s >= %" PRIu64 ")&&(%s < %" PRIu64 "))) {\n", idY, sc->fft_zeropad_left_full[1], idY, sc->fft_zeropad_right_full[1]);
3334 res = VkAppendLine(sc);
3335 if (res != VKFFT_SUCCESS) return res;
3336 }
3337 }
3338 }
3339 break;
3340 }
3341 }
3342 }
3343 else {
// No frequency zero-padding: guards are built for dimensions above the current axis.
3344 switch (sc->axis_id) {
3345 case 0: {
3346 char idY[500] = "";
3347 if (sc->axisSwapped) {
// Swapped axes: Y is composed from the local x invocation ID and the y
// workgroup ID scaled by sc->localSize[0].
3348 if (sc->performWorkGroupShift[1])
3349 sprintf(idY, "(%s + (%s + consts.workGroupShiftY) * %" PRIu64 ")", sc->gl_LocalInvocationID_x, sc->gl_WorkGroupID_y, sc->localSize[0]);
3350 else
3351 sprintf(idY, "%s + %s * %" PRIu64 "", sc->gl_LocalInvocationID_x, sc->gl_WorkGroupID_y, sc->localSize[0]);
3352
3353 char idZ[500] = "";
3354 if (sc->performWorkGroupShift[2])
3355 sprintf(idZ, "(%s + consts.workGroupShiftZ * %s)", sc->gl_GlobalInvocationID_z, sc->gl_WorkGroupSize_z);
3356 else
3357 sprintf(idZ, "%s", sc->gl_GlobalInvocationID_z);
3358 if (sc->performZeropaddingFull[1]) {
3359 if (sc->fft_zeropad_left_full[1] < sc->fft_zeropad_right_full[1]) {
3360 sc->tempLen = sprintf(sc->tempStr, " if(!((%s >= %" PRIu64 ")&&(%s < %" PRIu64 "))) {\n", idY, sc->fft_zeropad_left_full[1], idY, sc->fft_zeropad_right_full[1]);
3361 res = VkAppendLine(sc);
3362 if (res != VKFFT_SUCCESS) return res;
3363 }
3364 }
3365 if (sc->performZeropaddingFull[2]) {
3366 if (sc->fft_zeropad_left_full[2] < sc->fft_zeropad_right_full[2]) {
3367 sc->tempLen = sprintf(sc->tempStr, " if(!((%s >= %" PRIu64 ")&&(%s < %" PRIu64 "))) {\n", idZ, sc->fft_zeropad_left_full[2], idZ, sc->fft_zeropad_right_full[2]);
3368 res = VkAppendLine(sc);
3369 if (res != VKFFT_SUCCESS) return res;
3370 }
3371 }
3372 }
3373 else {
// Non-swapped axes: Y comes directly from the global y invocation ID.
3374 if (sc->performWorkGroupShift[1])
3375 sprintf(idY, "(%s + consts.workGroupShiftY * %s)", sc->gl_GlobalInvocationID_y, sc->gl_WorkGroupSize_y);
3376 else
3377 sprintf(idY, "%s", sc->gl_GlobalInvocationID_y);
3378
3379 char idZ[500] = "";
3380 if (sc->performWorkGroupShift[2])
3381 sprintf(idZ, "(%s + consts.workGroupShiftZ * %s)", sc->gl_GlobalInvocationID_z, sc->gl_WorkGroupSize_z);
3382 else
3383 sprintf(idZ, "%s", sc->gl_GlobalInvocationID_z);
3384 if (sc->performZeropaddingFull[1]) {
3385 if (sc->fft_zeropad_left_full[1] < sc->fft_zeropad_right_full[1]) {
3386 sc->tempLen = sprintf(sc->tempStr, " if(!((%s >= %" PRIu64 ")&&(%s < %" PRIu64 "))) {\n", idY, sc->fft_zeropad_left_full[1], idY, sc->fft_zeropad_right_full[1]);
3387 res = VkAppendLine(sc);
3388 if (res != VKFFT_SUCCESS) return res;
3389 }
3390 }
3391 if (sc->performZeropaddingFull[2]) {
3392 if (sc->fft_zeropad_left_full[2] < sc->fft_zeropad_right_full[2]) {
3393 sc->tempLen = sprintf(sc->tempStr, " if(!((%s >= %" PRIu64 ")&&(%s < %" PRIu64 "))) {\n", idZ, sc->fft_zeropad_left_full[2], idZ, sc->fft_zeropad_right_full[2]);
3394 res = VkAppendLine(sc);
3395 if (res != VKFFT_SUCCESS) return res;
3396 }
3397 }
3398 }
3399 break;
3400 }
3401 case 1: {
// Only the Z guard is considered for axis 1.
3402 char idZ[500] = "";
3403 if (sc->performWorkGroupShift[2])
3404 sprintf(idZ, "(%s + consts.workGroupShiftZ * %s)", sc->gl_GlobalInvocationID_z, sc->gl_WorkGroupSize_z);
3405 else
3406 sprintf(idZ, "%s", sc->gl_GlobalInvocationID_z);
3407 if (sc->performZeropaddingFull[2]) {
3408 if (sc->fft_zeropad_left_full[2] < sc->fft_zeropad_right_full[2]) {
3409 sc->tempLen = sprintf(sc->tempStr, " if(!((%s >= %" PRIu64 ")&&(%s < %" PRIu64 "))) {\n", idZ, sc->fft_zeropad_left_full[2], idZ, sc->fft_zeropad_right_full[2]);
3410 res = VkAppendLine(sc);
3411 if (res != VKFFT_SUCCESS) return res;
3412 }
3413 }
3414
3415 break;
3416 }
3417 case 2: {
3418
3419 break;
3420 }
3421 }
3422 }
3423 return res;
3424}
// NOTE(review): the signature line of this function and any declaration of
// `res` are not present in this listing (embedded numbers 3425 and 3427 are
// skipped); the body reads `sc` and returns `res`.
//
// Appends " }\n" to the generated code through sc->tempStr / VkAppendLine(sc),
// once per dimension d for which sc->performZeropaddingFull[d] is set and
// sc->fft_zeropad_left_full[d] < sc->fft_zeropad_right_full[d]. The dimensions
// considered depend on sc->frequencyZeropadding, sc->axis_id and
// sc->supportAxis (presumably mirroring the routine that opens the
// corresponding `if(...) {` guards - confirm).
// The idX / idY / idZ strings built below are never referenced by the text
// emitted in this function.
// Returns the first VkAppendLine result that is not VKFFT_SUCCESS, else `res`.
3426 //return if sequence is full of zeros from the start
3428 if ((sc->frequencyZeropadding)) {
3429 switch (sc->axis_id) {
3430 case 0: {
3431 break;
3432 }
3433 case 1: {
3434 if (!sc->supportAxis) {
3435 char idX[500] = "";
3436 if (sc->performWorkGroupShift[0])
3437 sprintf(idX, "(%s + consts.workGroupShiftX * %s)", sc->gl_GlobalInvocationID_x, sc->gl_WorkGroupSize_x);
3438 else
3439 sprintf(idX, "%s", sc->gl_GlobalInvocationID_x);
// One closing brace for the X guard.
3440 if (sc->performZeropaddingFull[0]) {
3441 if (sc->fft_zeropad_left_full[0] < sc->fft_zeropad_right_full[0]) {
3442 sc->tempLen = sprintf(sc->tempStr, " }\n");
3443 res = VkAppendLine(sc);
3444 if (res != VKFFT_SUCCESS) return res;
3445 }
3446 }
3447
3448 }
3449 break;
3450 }
3451 case 2: {
3452 if (!sc->supportAxis) {
3453 char idY[500] = "";
3454 if (sc->performWorkGroupShift[1])//y axis is along z workgroup here
3455 sprintf(idY, "(%s + consts.workGroupShiftZ * %s)", sc->gl_GlobalInvocationID_z, sc->gl_WorkGroupSize_z);
3456 else
3457 sprintf(idY, "%s", sc->gl_GlobalInvocationID_z);
3458
3459 char idX[500] = "";
3460 if (sc->performWorkGroupShift[0])
3461 sprintf(idX, "(%s + consts.workGroupShiftX * %s)", sc->gl_GlobalInvocationID_x, sc->gl_WorkGroupSize_x);
3462 else
3463 sprintf(idX, "%s", sc->gl_GlobalInvocationID_x);
// Up to two closing braces: one under the dimension-0 condition, one under the
// dimension-1 condition.
3464 if (sc->performZeropaddingFull[0]) {
3465 if (sc->fft_zeropad_left_full[0] < sc->fft_zeropad_right_full[0]) {
3466 sc->tempLen = sprintf(sc->tempStr, " }\n");
3467 res = VkAppendLine(sc);
3468 if (res != VKFFT_SUCCESS) return res;
3469 }
3470 }
3471 if (sc->performZeropaddingFull[1]) {
3472 if (sc->fft_zeropad_left_full[1] < sc->fft_zeropad_right_full[1]) {
3473 sc->tempLen = sprintf(sc->tempStr, " }\n");
3474 res = VkAppendLine(sc);
3475 if (res != VKFFT_SUCCESS) return res;
3476 }
3477 }
3478 }
3479 else {
3480 char idY[500] = "";
3481 if (sc->performWorkGroupShift[1])//for support axes y is along x workgroup
3482 sprintf(idY, "(%s + consts.workGroupShiftX * %s)", sc->gl_GlobalInvocationID_x, sc->gl_WorkGroupSize_x);
3483 else
3484 sprintf(idY, "%s", sc->gl_GlobalInvocationID_x);
3485 if (sc->performZeropaddingFull[1]) {
3486 if (sc->fft_zeropad_left_full[1] < sc->fft_zeropad_right_full[1]) {
3487 sc->tempLen = sprintf(sc->tempStr, " }\n");
3488 res = VkAppendLine(sc);
3489 if (res != VKFFT_SUCCESS) return res;
3490 }
3491 }
3492 }
3493 break;
3494 }
3495 }
3496 }
3497 else {
// No frequency zero-padding: dimensions 1 and 2 for axis 0, dimension 2 for axis 1.
3498 switch (sc->axis_id) {
3499 case 0: {
// idY is declared but not used in this case.
3500 char idY[500] = "";
3501 if (sc->performZeropaddingFull[1]) {
3502 if (sc->fft_zeropad_left_full[1] < sc->fft_zeropad_right_full[1]) {
3503 sc->tempLen = sprintf(sc->tempStr, " }\n");
3504 res = VkAppendLine(sc);
3505 if (res != VKFFT_SUCCESS) return res;
3506 }
3507 }
3508 if (sc->performZeropaddingFull[2]) {
3509 if (sc->fft_zeropad_left_full[2] < sc->fft_zeropad_right_full[2]) {
3510 sc->tempLen = sprintf(sc->tempStr, " }\n");
3511 res = VkAppendLine(sc);
3512 if (res != VKFFT_SUCCESS) return res;
3513 }
3514 }
3515 break;
3516 }
3517 case 1: {
3518 char idZ[500] = "";
3519 if (sc->performWorkGroupShift[2])
3520 sprintf(idZ, "(%s + consts.workGroupShiftZ * %s)", sc->gl_GlobalInvocationID_z, sc->gl_WorkGroupSize_z);
3521 else
3522 sprintf(idZ, "%s", sc->gl_GlobalInvocationID_z);
3523 if (sc->performZeropaddingFull[2]) {
3524 if (sc->fft_zeropad_left_full[2] < sc->fft_zeropad_right_full[2]) {
3525 sc->tempLen = sprintf(sc->tempStr, " }\n");
3526 res = VkAppendLine(sc);
3527 if (res != VKFFT_SUCCESS) return res;
3528 }
3529 }
3530 break;
3531 }
3532 case 2: {
3533
3534 break;
3535 }
3536 }
3537 }
3538 return res;
3539}
3540
// NOTE(review): the signature line of this function and any declaration of
// `res` are not present in this listing (embedded numbers 3541 and 3543 are
// skipped); the body reads `sc` and `readStage` and returns `res`.
//
// Appends opening guards of the form
//   if(!((<id> >= <left>)&&(<id> < <right>))) {
// to the generated code through sc->tempStr / VkAppendLine(sc). A guard is
// emitted for a dimension d only when sc->performZeropaddingFull[d] is set and
// sc->fft_zeropad_left_full[d] < sc->fft_zeropad_right_full[d].
// With sc->frequencyZeropadding set, the index expressions come from the
// invocation IDs. Otherwise, for axis 0, they are derived from sc->inoutID and
// the input strides (readStage non-zero) or output strides (readStage zero).
// Returns the first VkAppendLine result that is not VKFFT_SUCCESS, else `res`.
3542 //return if sequence is full of zeros from the start
3544 if ((sc->frequencyZeropadding)) {
3545 switch (sc->axis_id) {
3546 case 0: {
3547 break;
3548 }
3549 case 1: {
3550 if (!sc->supportAxis) {
// X index expression; the workgroup-shift term is added only when
// sc->performWorkGroupShift[0] is set.
3551 char idX[500] = "";
3552 if (sc->performWorkGroupShift[0])
3553 sprintf(idX, "(%s + consts.workGroupShiftX * %s)", sc->gl_GlobalInvocationID_x, sc->gl_WorkGroupSize_x);
3554 else
3555 sprintf(idX, "%s", sc->gl_GlobalInvocationID_x);
3556 if (sc->performZeropaddingFull[0]) {
3557 if (sc->fft_zeropad_left_full[0] < sc->fft_zeropad_right_full[0]) {
3558 sc->tempLen = sprintf(sc->tempStr, " if(!((%s >= %" PRIu64 ")&&(%s < %" PRIu64 "))) {\n", idX, sc->fft_zeropad_left_full[0], idX, sc->fft_zeropad_right_full[0]);
3559 res = VkAppendLine(sc);
3560 if (res != VKFFT_SUCCESS) return res;
3561 }
3562 }
3563
3564 }
3565 break;
3566 }
3567 case 2: {
3568 if (!sc->supportAxis) {
// Here the Y index is taken from the z invocation ID (with the Z workgroup shift).
3569 char idY[500] = "";
3570 if (sc->performWorkGroupShift[1])//y axis is along z workgroup here
3571 sprintf(idY, "(%s + consts.workGroupShiftZ * %s)", sc->gl_GlobalInvocationID_z, sc->gl_WorkGroupSize_z);
3572 else
3573 sprintf(idY, "%s", sc->gl_GlobalInvocationID_z);
3574
3575 char idX[500] = "";
3576 if (sc->performWorkGroupShift[0])
3577 sprintf(idX, "(%s + consts.workGroupShiftX * %s)", sc->gl_GlobalInvocationID_x, sc->gl_WorkGroupSize_x);
3578 else
3579 sprintf(idX, "%s", sc->gl_GlobalInvocationID_x);
// Up to two guards may be opened here (X first, then Y).
3580 if (sc->performZeropaddingFull[0]) {
3581 if (sc->fft_zeropad_left_full[0] < sc->fft_zeropad_right_full[0]) {
3582 sc->tempLen = sprintf(sc->tempStr, " if(!((%s >= %" PRIu64 ")&&(%s < %" PRIu64 "))) {\n", idX, sc->fft_zeropad_left_full[0], idX, sc->fft_zeropad_right_full[0]);
3583 res = VkAppendLine(sc);
3584 if (res != VKFFT_SUCCESS) return res;
3585 }
3586 }
3587 if (sc->performZeropaddingFull[1]) {
3588 if (sc->fft_zeropad_left_full[1] < sc->fft_zeropad_right_full[1]) {
3589 sc->tempLen = sprintf(sc->tempStr, " if(!((%s >= %" PRIu64 ")&&(%s < %" PRIu64 "))) {\n", idY, sc->fft_zeropad_left_full[1], idY, sc->fft_zeropad_right_full[1]);
3590 res = VkAppendLine(sc);
3591 if (res != VKFFT_SUCCESS) return res;
3592 }
3593 }
3594 }
3595 else {
// Support axis: only the Y guard is considered, with Y taken from the x invocation ID.
3596 char idY[500] = "";
3597 if (sc->performWorkGroupShift[1])//for support axes y is along x workgroup
3598 sprintf(idY, "(%s + consts.workGroupShiftX * %s)", sc->gl_GlobalInvocationID_x, sc->gl_WorkGroupSize_x);
3599 else
3600 sprintf(idY, "%s", sc->gl_GlobalInvocationID_x);
3601 if (sc->performZeropaddingFull[1]) {
3602 if (sc->fft_zeropad_left_full[1] < sc->fft_zeropad_right_full[1]) {
3603 sc->tempLen = sprintf(sc->tempStr, " if(!((%s >= %" PRIu64 ")&&(%s < %" PRIu64 "))) {\n", idY, sc->fft_zeropad_left_full[1], idY, sc->fft_zeropad_right_full[1]);
3604 res = VkAppendLine(sc);
3605 if (res != VKFFT_SUCCESS) return res;
3606 }
3607 }
3608 }
3609 break;
3610 }
3611 }
3612 }
3613 else {
3614 switch (sc->axis_id) {
3615 case 0: {
3616 char idY[500] = "";
3617 char idZ[500] = "";
// `mult` is computed but not referenced again in this function.
3618 uint64_t mult = (sc->mergeSequencesR2C) ? 2 : 1;
// Y and Z indices are recovered from the flat index named by sc->inoutID:
// (inoutID / stride[d]) % (stride[d+1] / stride[d]), using the input strides
// for the read stage and the output strides otherwise.
3619 if (readStage) {
3620 sprintf(idY, "(%s/%" PRIu64 ") %% %" PRIu64 "", sc->inoutID, sc->inputStride[1], sc->inputStride[2] / sc->inputStride[1]);
3621 sprintf(idZ, "(%s/%" PRIu64 ") %% %" PRIu64 "", sc->inoutID, sc->inputStride[2], sc->inputStride[3] / sc->inputStride[2]);
3622 }
3623 else {
3624 sprintf(idY, "(%s/%" PRIu64 ") %% %" PRIu64 "", sc->inoutID, sc->outputStride[1], sc->outputStride[2] / sc->outputStride[1]);
3625 sprintf(idZ, "(%s/%" PRIu64 ") %% %" PRIu64 "", sc->inoutID, sc->outputStride[2], sc->outputStride[3] / sc->outputStride[2]);
3626
3627 }
3628 if (sc->performZeropaddingFull[1]) {
3629 if (sc->fft_zeropad_left_full[1] < sc->fft_zeropad_right_full[1]) {
3630 sc->tempLen = sprintf(sc->tempStr, " if(!((%s >= %" PRIu64 ")&&(%s < %" PRIu64 "))) {\n", idY, sc->fft_zeropad_left_full[1], idY, sc->fft_zeropad_right_full[1]);
3631 res = VkAppendLine(sc);
3632 if (res != VKFFT_SUCCESS) return res;
3633 }
3634 }
3635 if (sc->performZeropaddingFull[2]) {
3636 if (sc->fft_zeropad_left_full[2] < sc->fft_zeropad_right_full[2]) {
3637 sc->tempLen = sprintf(sc->tempStr, " if(!((%s >= %" PRIu64 ")&&(%s < %" PRIu64 "))) {\n", idZ, sc->fft_zeropad_left_full[2], idZ, sc->fft_zeropad_right_full[2]);
3638 res = VkAppendLine(sc);
3639 if (res != VKFFT_SUCCESS) return res;
3640 }
3641 }
3642 break;
3643 }
3644 case 1: {
// Only the Z guard is considered for axis 1; Z comes from the z invocation ID.
3645 char idZ[500] = "";
3646 if (sc->performWorkGroupShift[2])
3647 sprintf(idZ, "(%s + consts.workGroupShiftZ * %s)", sc->gl_GlobalInvocationID_z, sc->gl_WorkGroupSize_z);
3648 else
3649 sprintf(idZ, "%s", sc->gl_GlobalInvocationID_z);
3650 if (sc->performZeropaddingFull[2]) {
3651 if (sc->fft_zeropad_left_full[2] < sc->fft_zeropad_right_full[2]) {
3652 sc->tempLen = sprintf(sc->tempStr, " if(!((%s >= %" PRIu64 ")&&(%s < %" PRIu64 "))) {\n", idZ, sc->fft_zeropad_left_full[2], idZ, sc->fft_zeropad_right_full[2]);
3653 res = VkAppendLine(sc);
3654 if (res != VKFFT_SUCCESS) return res;
3655 }
3656 }
3657
3658 break;
3659 }
3660 case 2: {
3661
3662 break;
3663 }
3664 }
3665 }
3666 return res;
3667}
3669 //return if sequence is full of zeros from the start
3671 if ((sc->frequencyZeropadding)) {
3672 switch (sc->axis_id) {
3673 case 0: {
3674 break;
3675 }
3676 case 1: {
3677 char idX[500] = "";
3678 if (sc->performWorkGroupShift[0])
3679 sprintf(idX, "(%s + consts.workGroupShiftX * %s)", sc->gl_GlobalInvocationID_x, sc->gl_WorkGroupSize_x);
3680 else
3681 sprintf(idX, "%s", sc->gl_GlobalInvocationID_x);
3682 if (sc->performZeropaddingFull[0]) {
3683 if (sc->fft_zeropad_left_full[0] < sc->fft_zeropad_right_full[0]) {
3684 sc->tempLen = sprintf(sc->tempStr, " }\n");
3685 res = VkAppendLine(sc);
3686 if (res != VKFFT_SUCCESS) return res;
3687 }
3688 }
3689 break;
3690 }
3691 case 2: {
3692 if (sc->performZeropaddingFull[0]) {
3693 if (sc->fft_zeropad_left_full[0] < sc->fft_zeropad_right_full[0]) {
3694 sc->tempLen = sprintf(sc->tempStr, " }\n");
3695 res = VkAppendLine(sc);
3696 if (res != VKFFT_SUCCESS) return res;
3697 }
3698 }
3699 if (sc->performZeropaddingFull[1]) {
3700 if (sc->fft_zeropad_left_full[1] < sc->fft_zeropad_right_full[1]) {
3701 sc->tempLen = sprintf(sc->tempStr, " }\n");
3702 res = VkAppendLine(sc);
3703 if (res != VKFFT_SUCCESS) return res;
3704 }
3705 }
3706 break;
3707 }
3708 }
3709 }
3710 else {
3711 switch (sc->axis_id) {
3712 case 0: {
3713 if (sc->performZeropaddingFull[1]) {
3714 if (sc->fft_zeropad_left_full[1] < sc->fft_zeropad_right_full[1]) {
3715 sc->tempLen = sprintf(sc->tempStr, " }\n");
3716 res = VkAppendLine(sc);
3717 if (res != VKFFT_SUCCESS) return res;
3718 }
3719 }
3720 if (sc->performZeropaddingFull[2]) {
3721 if (sc->fft_zeropad_left_full[2] < sc->fft_zeropad_right_full[2]) {
3722 sc->tempLen = sprintf(sc->tempStr, " }\n");
3723 res = VkAppendLine(sc);
3724 if (res != VKFFT_SUCCESS) return res;
3725 }
3726 }
3727 break;
3728 }
3729 case 1: {
3730 if (sc->performZeropaddingFull[2]) {
3731 if (sc->fft_zeropad_left_full[2] < sc->fft_zeropad_right_full[2]) {
3732 sc->tempLen = sprintf(sc->tempStr, " }\n");
3733 res = VkAppendLine(sc);
3734 if (res != VKFFT_SUCCESS) return res;
3735 }
3736 }
3737 break;
3738 }
3739 case 2: {
3740
3741 break;
3742 }
3743 }
3744 }
3745 return res;
3746}
3747static inline VkFFTResult appendSetSMToZero(VkFFTSpecializationConstantsLayout* sc, const char* floatType, const char* floatTypeMemory, const char* uintType, uint64_t readType) {
3749 //appendZeropadStart(sc);
3750 for (uint64_t k = 0; k < sc->registerBoost; k++) {
3751 for (uint64_t i = 0; i < sc->min_registers_per_thread; i++) {
3752 switch (readType) {
3753 case 0: case 5: case 6: case 110: case 120: case 130: case 140: case 142: case 144:
3754 {
3755 if (sc->localSize[1] == 1)
3756 sc->tempLen = sprintf(sc->tempStr, " combinedID = %s + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, (i + k * sc->min_registers_per_thread) * sc->localSize[0]);
3757 else
3758 sc->tempLen = sprintf(sc->tempStr, " combinedID = (%s + %" PRIu64 " * %s) + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, sc->localSize[0], sc->gl_LocalInvocationID_y, (i + k * sc->min_registers_per_thread) * sc->localSize[0] * sc->localSize[1]);
3759 res = VkAppendLine(sc);
3760 if (res != VKFFT_SUCCESS) return res;
3761
3762 if (sc->axisSwapped) {
3763 sc->tempLen = sprintf(sc->tempStr, " sdata[(combinedID %% %" PRIu64 ") * sharedStride + (combinedID / %" PRIu64 ")].x = 0;\n", sc->fftDim, sc->fftDim);
3764 res = VkAppendLine(sc);
3765 if (res != VKFFT_SUCCESS) return res;
3766 sc->tempLen = sprintf(sc->tempStr, " sdata[(combinedID %% %" PRIu64 ") * sharedStride + (combinedID / %" PRIu64 ")].y = 0;\n", sc->fftDim, sc->fftDim);
3767 res = VkAppendLine(sc);
3768 if (res != VKFFT_SUCCESS) return res;
3769 }
3770 else {
3771 sc->tempLen = sprintf(sc->tempStr, " sdata[(combinedID %% %" PRIu64 ") + (combinedID / %" PRIu64 ") * sharedStride].x = 0;\n", sc->fftDim, sc->fftDim);
3772 res = VkAppendLine(sc);
3773 if (res != VKFFT_SUCCESS) return res;
3774 sc->tempLen = sprintf(sc->tempStr, " sdata[(combinedID %% %" PRIu64 ") + (combinedID / %" PRIu64 ") * sharedStride].y = 0;\n", sc->fftDim, sc->fftDim);
3775 res = VkAppendLine(sc);
3776 if (res != VKFFT_SUCCESS) return res;
3777 }
3778 break;
3779 }
3780 case 1: case 2: case 111: case 121: case 131: case 141: case 143: case 145://single_c2c
3781 {
3782 sc->tempLen = sprintf(sc->tempStr, " sdata[%s*(%s+%" PRIu64 ")+%s].x=0;\n", sc->sharedStride, sc->gl_LocalInvocationID_y, (i + k * sc->min_registers_per_thread) * sc->localSize[1], sc->gl_LocalInvocationID_x);
3783 res = VkAppendLine(sc);
3784 if (res != VKFFT_SUCCESS) return res;
3785 sc->tempLen = sprintf(sc->tempStr, " sdata[%s*(%s+%" PRIu64 ")+%s].y=0;\n", sc->sharedStride, sc->gl_LocalInvocationID_y, (i + k * sc->min_registers_per_thread) * sc->localSize[1], sc->gl_LocalInvocationID_x);
3786 res = VkAppendLine(sc);
3787 if (res != VKFFT_SUCCESS) return res;
3788
3789 break;
3790 }
3791 }
3792 }
3793 }
3794
3795
3796 //res = appendZeropadEnd(sc);
3797 //if (res != VKFFT_SUCCESS) return res;
3798 return res;
3799}
3802 switch (readType) {
3803 case 0: //single_c2c
3804 {
3805 if ((sc->localSize[1] > 1) || ((sc->performR2C) && (sc->actualInverse)) || (sc->localSize[0] * sc->stageRadix[0] * (sc->registers_per_thread_per_radix[sc->stageRadix[0]] / sc->stageRadix[0]) > sc->fftDim))
3806 sc->readToRegisters = 0;
3807 else
3808 sc->readToRegisters = 1;
3809 break;
3810 }
3811 case 1: //grouped_c2c
3812 {
3813 if (sc->localSize[1] * sc->stageRadix[0] * (sc->registers_per_thread_per_radix[sc->stageRadix[0]] / sc->stageRadix[0]) > sc->fftDim)
3814 sc->readToRegisters = 0;
3815 else
3816 sc->readToRegisters = 1;
3817 break;
3818 }
3819 case 2: //single_c2c_strided
3820 {
3821 if (sc->localSize[1] * sc->stageRadix[0] * (sc->registers_per_thread_per_radix[sc->stageRadix[0]] / sc->stageRadix[0]) > sc->fftDim)
3822 sc->readToRegisters = 0;
3823 else
3824 sc->readToRegisters = 1;
3825 break;
3826 }
3827 case 5://single_r2c
3828 {
3829 if ((sc->axisSwapped) || (sc->localSize[1] > 1) || (sc->localSize[0] * sc->stageRadix[0] * (sc->registers_per_thread_per_radix[sc->stageRadix[0]] / sc->stageRadix[0]) > sc->fftDim))
3830 sc->readToRegisters = 0;
3831 else
3832 sc->readToRegisters = 1;
3833 break;
3834 }
3835 case 6: //single_c2r
3836 {
3837 sc->readToRegisters = 1;
3838 break;
3839 }
3840 case 110: case 111: case 120: case 121: case 130: case 131: case 140: case 141: case 142: case 143:
3841 {
3842 sc->readToRegisters = 0;
3843 break;
3844 }
3845 case 144: case 145:
3846 {
3847 sc->readToRegisters = 1;
3848 break;
3849 }
3850 }
3851 return res;
3852}
3853static inline VkFFTResult appendReadDataVkFFT(VkFFTSpecializationConstantsLayout* sc, const char* floatType, const char* floatTypeMemory, const char* uintType, uint64_t readType) {
3855 double double_PI = 3.1415926535897932384626433832795;
3856 char vecType[30];
3857 char inputsStruct[20] = "";
3858 char LFending[4] = "";
3859 if (!strcmp(floatType, "float")) sprintf(LFending, "f");
3860#if(VKFFT_BACKEND==0)
3861 if (sc->inputBufferBlockNum == 1)
3862 sprintf(inputsStruct, "inputs");
3863 else
3864 sprintf(inputsStruct, ".inputs");
3865 if (!strcmp(floatType, "float")) sprintf(vecType, "vec2");
3866 if (!strcmp(floatType, "double")) sprintf(vecType, "dvec2");
3867 if (!strcmp(floatType, "double")) sprintf(LFending, "LF");
3868 char cosDef[20] = "cos";
3869 char sinDef[20] = "sin";
3870#elif(VKFFT_BACKEND==1)
3871 if (!strcmp(floatType, "float")) sprintf(vecType, "float2");
3872 if (!strcmp(floatType, "double")) sprintf(vecType, "double2");
3873 if (!strcmp(floatType, "double")) sprintf(LFending, "l");
3874 sprintf(inputsStruct, "inputs");
3875 char cosDef[20] = "__cosf";
3876 char sinDef[20] = "__sinf";
3877#elif(VKFFT_BACKEND==2)
3878 if (!strcmp(floatType, "float")) sprintf(vecType, "float2");
3879 if (!strcmp(floatType, "double")) sprintf(vecType, "double2");
3880 if (!strcmp(floatType, "double")) sprintf(LFending, "l");
3881 sprintf(inputsStruct, "inputs");
3882 char cosDef[20] = "__cosf";
3883 char sinDef[20] = "__sinf";
3884#elif(VKFFT_BACKEND==3)
3885 if (!strcmp(floatType, "float")) sprintf(vecType, "float2");
3886 if (!strcmp(floatType, "double")) sprintf(vecType, "double2");
3887 sprintf(inputsStruct, "inputs");
3888 char cosDef[20] = "native_cos";
3889 char sinDef[20] = "native_sin";
3890#endif
3891 char convTypeLeft[20] = "";
3892 char convTypeRight[20] = "";
3893 if ((!strcmp(floatType, "float")) && (strcmp(floatTypeMemory, "float"))) {
3894 if ((readType == 5) || (readType == 110) || (readType == 111) || (readType == 120) || (readType == 121) || (readType == 130) || (readType == 131) || (readType == 140) || (readType == 141) || (readType == 142) || (readType == 143) || (readType == 144) || (readType == 145)) {
3895#if(VKFFT_BACKEND==0)
3896 sprintf(convTypeLeft, "float(");
3897 sprintf(convTypeRight, ")");
3898#elif(VKFFT_BACKEND==1)
3899 sprintf(convTypeLeft, "(float)");
3900 //sprintf(convTypeRight, "");
3901#elif(VKFFT_BACKEND==2)
3902 sprintf(convTypeLeft, "(float)");
3903 //sprintf(convTypeRight, "");
3904#elif(VKFFT_BACKEND==3)
3905 sprintf(convTypeLeft, "(float)");
3906 //sprintf(convTypeRight, "");
3907#endif
3908 }
3909 else {
3910#if(VKFFT_BACKEND==0)
3911 sprintf(convTypeLeft, "vec2(");
3912 sprintf(convTypeRight, ")");
3913#elif(VKFFT_BACKEND==1)
3914 sprintf(convTypeLeft, "conv_float2(");
3915 sprintf(convTypeRight, ")");
3916#elif(VKFFT_BACKEND==2)
3917 sprintf(convTypeLeft, "conv_float2(");
3918 sprintf(convTypeRight, ")");
3919#elif(VKFFT_BACKEND==3)
3920 sprintf(convTypeLeft, "conv_float2(");
3921 sprintf(convTypeRight, ")");
3922#endif
3923 }
3924 }
3925 if ((!strcmp(floatType, "double")) && (strcmp(floatTypeMemory, "double"))) {
3926 if ((readType == 5) || (readType == 110) || (readType == 111) || (readType == 120) || (readType == 121) || (readType == 130) || (readType == 131) || (readType == 140) || (readType == 141) || (readType == 142) || (readType == 143) || (readType == 144) || (readType == 145)) {
3927#if(VKFFT_BACKEND==0)
3928 sprintf(convTypeLeft, "double(");
3929 sprintf(convTypeRight, ")");
3930#elif(VKFFT_BACKEND==1)
3931 sprintf(convTypeLeft, "(double)");
3932 //sprintf(convTypeRight, "");
3933#elif(VKFFT_BACKEND==2)
3934 sprintf(convTypeLeft, "(double)");
3935 //sprintf(convTypeRight, "");
3936#elif(VKFFT_BACKEND==3)
3937 sprintf(convTypeLeft, "(double)");
3938 //sprintf(convTypeRight, "");
3939#endif
3940 }
3941 else {
3942#if(VKFFT_BACKEND==0)
3943 sprintf(convTypeLeft, "dvec2(");
3944 sprintf(convTypeRight, ")");
3945#elif(VKFFT_BACKEND==1)
3946 sprintf(convTypeLeft, "conv_double2(");
3947 sprintf(convTypeRight, ")");
3948#elif(VKFFT_BACKEND==2)
3949 sprintf(convTypeLeft, "conv_double2(");
3950 sprintf(convTypeRight, ")");
3951#elif(VKFFT_BACKEND==3)
3952 sprintf(convTypeLeft, "conv_double2(");
3953 sprintf(convTypeRight, ")");
3954#endif
3955 }
3956 }
3957 char index_x[2000] = "";
3958 char index_y[2000] = "";
3959 char requestCoordinate[100] = "";
3960 if (sc->convolutionStep) {
3961 if (sc->matrixConvolution > 1) {
3962 sprintf(requestCoordinate, "coordinate");
3963 }
3964 }
3965 char requestBatch[100] = "";
3966 if (sc->convolutionStep) {
3967 if (sc->numKernels > 1) {
3968 sprintf(requestBatch, "0");//if one buffer - multiple kernel convolution
3969 }
3970 }
3971 //appendZeropadStart(sc);
3972 switch (readType) {
3973 case 0://single_c2c
3974 {
3975 //sc->tempLen = sprintf(sc->tempStr, " return;\n");
3976 char shiftX[500] = "";
3977 if (sc->performWorkGroupShift[0])
3978 sprintf(shiftX, " + consts.workGroupShiftX ");
3979 char shiftY[500] = "";
3980 if (sc->axisSwapped) {
3981 if (sc->performWorkGroupShift[1])
3982 sprintf(shiftY, " + consts.workGroupShiftY*%s ", sc->gl_WorkGroupSize_x);
3983 }
3984 else {
3985 if (sc->performWorkGroupShift[1])
3986 sprintf(shiftY, " + consts.workGroupShiftY*%s ", sc->gl_WorkGroupSize_y);
3987 }
3988 char shiftY2[100] = "";
3989 if (sc->performWorkGroupShift[1])
3990 sprintf(shiftY, " + consts.workGroupShiftY ");
3991 if (sc->fftDim < sc->fft_dim_full) {
3992 if (sc->axisSwapped) {
3993 sc->tempLen = sprintf(sc->tempStr, " %s numActiveThreads = ((%s/%" PRIu64 ")==%" PRIu64 ") ? %" PRIu64 " : %" PRIu64 ";\n", uintType, sc->gl_WorkGroupID_x, sc->firstStageStartSize / sc->fftDim, ((uint64_t)floor(sc->fft_dim_full / ((double)sc->localSize[0] * sc->fftDim))) / (sc->firstStageStartSize / sc->fftDim), (sc->fft_dim_full - (sc->firstStageStartSize / sc->fftDim) * ((((uint64_t)floor(sc->fft_dim_full / ((double)sc->localSize[0] * sc->fftDim))) / (sc->firstStageStartSize / sc->fftDim)) * sc->localSize[0] * sc->fftDim)) / sc->min_registers_per_thread / (sc->firstStageStartSize / sc->fftDim), sc->localSize[0] * sc->localSize[1]);// sc->fft_dim_full, sc->gl_WorkGroupID_x, shiftX, sc->firstStageStartSize / sc->fftDim, sc->fftDim, sc->gl_WorkGroupID_x, shiftX, sc->firstStageStartSize / sc->fftDim, sc->localSize[0] * sc->firstStageStartSize, sc->fft_dim_full / (sc->localSize[0] * sc->fftDim));
3994 //sc->tempLen = sprintf(sc->tempStr, " if (numActiveThreads>%" PRIu64 ") numActiveThreads = %" PRIu64 ";\n", sc->localSize[0]* sc->localSize[1], sc->localSize[0]* sc->localSize[1]);
3995 //sprintf(sc->disableThreadsStart, " if((%s+%" PRIu64 "*%s)< numActiveThreads) {\n", sc->gl_LocalInvocationID_x, sc->localSize[0], sc->gl_LocalInvocationID_y);
3996 res = VkAppendLine(sc);
3997 if (res != VKFFT_SUCCESS) return res;
3998 sprintf(sc->disableThreadsStart, " if(%s * %" PRIu64 " + (((%s%s) %% %" PRIu64 ") * %" PRIu64 " + ((%s%s) / %" PRIu64 ") * %" PRIu64 ") < %" PRIu64 ") {\n", sc->gl_LocalInvocationID_x, sc->firstStageStartSize, sc->gl_WorkGroupID_x, shiftX, sc->firstStageStartSize / sc->fftDim, sc->fftDim, sc->gl_WorkGroupID_x, shiftX, sc->firstStageStartSize / sc->fftDim, sc->localSize[0] * sc->firstStageStartSize, sc->fft_dim_full);
3999 sc->tempLen = sprintf(sc->tempStr, " if((%s+%" PRIu64 "*%s)< numActiveThreads) {\n", sc->gl_LocalInvocationID_x, sc->localSize[0], sc->gl_LocalInvocationID_y);
4000 res = VkAppendLine(sc);
4001 if (res != VKFFT_SUCCESS) return res;
4002 sprintf(sc->disableThreadsEnd, "}");
4003 }
4004 else {
4005 sprintf(sc->disableThreadsStart, " if(%s * %" PRIu64 " + (((%s%s) %% %" PRIu64 ") * %" PRIu64 " + ((%s%s) / %" PRIu64 ") * %" PRIu64 ") < %" PRIu64 ") {\n", sc->gl_LocalInvocationID_y, sc->firstStageStartSize, sc->gl_WorkGroupID_x, shiftX, sc->firstStageStartSize / sc->fftDim, sc->fftDim, sc->gl_WorkGroupID_x, shiftX, sc->firstStageStartSize / sc->fftDim, sc->localSize[1] * sc->firstStageStartSize, sc->fft_dim_full);
4007 if (res != VKFFT_SUCCESS) return res;
4008 sprintf(sc->disableThreadsEnd, "}");
4009 }
4010 }
4011 else {
4012 sc->tempLen = sprintf(sc->tempStr, " { \n");
4013 res = VkAppendLine(sc);
4014 if (res != VKFFT_SUCCESS) return res;
4015 }
4016
4017 if (sc->fftDim == sc->fft_dim_full) {
4018 for (uint64_t k = 0; k < sc->registerBoost; k++) {
4019 for (uint64_t i = 0; i < sc->min_registers_per_thread; i++) {
4020
4021 if (sc->localSize[1] == 1)
4022 sc->tempLen = sprintf(sc->tempStr, " combinedID = %s + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, (i + k * sc->min_registers_per_thread) * sc->localSize[0]);
4023 else
4024 sc->tempLen = sprintf(sc->tempStr, " combinedID = (%s + %" PRIu64 " * %s) + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, sc->localSize[0], sc->gl_LocalInvocationID_y, (i + k * sc->min_registers_per_thread) * sc->localSize[0] * sc->localSize[1]);
4025 res = VkAppendLine(sc);
4026 if (res != VKFFT_SUCCESS) return res;
4027 if (sc->inputStride[0] > 1)
4028 sc->tempLen = sprintf(sc->tempStr, " inoutID = (combinedID %% %" PRIu64 ") * %" PRIu64 " + (combinedID / %" PRIu64 ") * %" PRIu64 ";\n", sc->fftDim, sc->inputStride[0], sc->fftDim, sc->inputStride[1]);
4029 else
4030 sc->tempLen = sprintf(sc->tempStr, " inoutID = (combinedID %% %" PRIu64 ") + (combinedID / %" PRIu64 ") * %" PRIu64 ";\n", sc->fftDim, sc->fftDim, sc->inputStride[1]);
4031 res = VkAppendLine(sc);
4032 if (res != VKFFT_SUCCESS) return res;
4033 if (sc->axisSwapped) {
4034 if (sc->size[sc->axis_id + 1] % sc->localSize[0] != 0) {
4035 sc->tempLen = sprintf(sc->tempStr, " if(combinedID / %" PRIu64 " + (%s%s)*%" PRIu64 "< %" PRIu64 "){", sc->fftDim, sc->gl_WorkGroupID_y, shiftY2, sc->localSize[0], sc->size[sc->axis_id + 1]);
4036 res = VkAppendLine(sc);
4037 if (res != VKFFT_SUCCESS) return res;
4038 }
4039 }
4040 else {
4041 if (sc->size[sc->axis_id + 1] % sc->localSize[1] != 0) {
4042 sc->tempLen = sprintf(sc->tempStr, " if(combinedID / %" PRIu64 " + (%s%s)*%" PRIu64 "< %" PRIu64 "){", sc->fftDim, sc->gl_WorkGroupID_y, shiftY2, sc->localSize[1], sc->size[sc->axis_id + 1]);
4043 res = VkAppendLine(sc);
4044 if (res != VKFFT_SUCCESS) return res;
4045 }
4046 }
4047 if (sc->zeropadBluestein[0]) {
4048 sc->tempLen = sprintf(sc->tempStr, " if((combinedID %% %" PRIu64 ") < %" PRIu64 "){\n", sc->fft_dim_full, sc->fft_zeropad_Bluestein_left_read[sc->axis_id]);
4049 res = VkAppendLine(sc);
4050 if (res != VKFFT_SUCCESS) return res;
4051 }
4052 if (sc->zeropad[0]) {
4053 sc->tempLen = sprintf(sc->tempStr, " if((inoutID %% %" PRIu64 " < %" PRIu64 ")||(inoutID %% %" PRIu64 " >= %" PRIu64 ")){\n", sc->inputStride[1], sc->fft_zeropad_left_read[sc->axis_id], sc->inputStride[1], sc->fft_zeropad_right_read[sc->axis_id]);
4054 res = VkAppendLine(sc);
4055 if (res != VKFFT_SUCCESS) return res;
4056 }
4057 sc->tempLen = sprintf(sc->tempStr, " %s = ", sc->inoutID);
4058 res = VkAppendLine(sc);
4059 if (res != VKFFT_SUCCESS) return res;
4060 res = indexInputVkFFT(sc, uintType, readType, sc->inoutID, 0, requestCoordinate, requestBatch);
4061 if (res != VKFFT_SUCCESS) return res;
4062 sc->tempLen = sprintf(sc->tempStr, ";\n");
4063 res = VkAppendLine(sc);
4064 if (res != VKFFT_SUCCESS) return res;
4066 if (res != VKFFT_SUCCESS) return res;
4067 if (sc->readToRegisters) {
4068 if (sc->inputBufferBlockNum == 1)
4069 sc->tempLen = sprintf(sc->tempStr, " %s = %s%s[%s]%s;\n", sc->regIDs[i + k * sc->registers_per_thread], convTypeLeft, inputsStruct, sc->inoutID, convTypeRight);
4070 else
4071 sc->tempLen = sprintf(sc->tempStr, " %s = %sinputBlocks[%s / %" PRIu64 "]%s[%s %% %" PRIu64 "]%s;\n", sc->regIDs[i + k * sc->registers_per_thread], convTypeLeft, sc->inoutID, sc->inputBufferBlockSize, inputsStruct, sc->inoutID, sc->inputBufferBlockSize, convTypeRight);
4072 }
4073 else {
4074 if (sc->axisSwapped) {
4075 if (sc->inputBufferBlockNum == 1)
4076 sc->tempLen = sprintf(sc->tempStr, " sdata[(combinedID %% %" PRIu64 ") * sharedStride + (combinedID / %" PRIu64 ")] = %s%s[%s]%s;\n", sc->fftDim, sc->fftDim, convTypeLeft, inputsStruct, sc->inoutID, convTypeRight);
4077 else
4078 sc->tempLen = sprintf(sc->tempStr, " sdata[(combinedID %% %" PRIu64 ") * sharedStride + (combinedID / %" PRIu64 ")] = %sinputBlocks[%s / %" PRIu64 "]%s[%s %% %" PRIu64 "]%s;\n", sc->fftDim, sc->fftDim, convTypeLeft, sc->inoutID, sc->inputBufferBlockSize, inputsStruct, sc->inoutID, sc->inputBufferBlockSize, convTypeRight);
4079 }
4080 else {
4081 if (sc->inputBufferBlockNum == 1)
4082 sc->tempLen = sprintf(sc->tempStr, " sdata[(combinedID %% %" PRIu64 ") + (combinedID / %" PRIu64 ") * sharedStride] = %s%s[%s]%s;\n", sc->fftDim, sc->fftDim, convTypeLeft, inputsStruct, sc->inoutID, convTypeRight);
4083 else
4084 sc->tempLen = sprintf(sc->tempStr, " sdata[(combinedID %% %" PRIu64 ") + (combinedID / %" PRIu64 ") * sharedStride] = %sinputBlocks[%s / %" PRIu64 "]%s[%s %% %" PRIu64 "]%s;\n", sc->fftDim, sc->fftDim, convTypeLeft, sc->inoutID, sc->inputBufferBlockSize, inputsStruct, sc->inoutID, sc->inputBufferBlockSize, convTypeRight);
4085 }
4086 }
4087 res = VkAppendLine(sc);
4088 if (res != VKFFT_SUCCESS) return res;
4090 if (res != VKFFT_SUCCESS) return res;
4091 if (sc->zeropad[0]) {
4092 sc->tempLen = sprintf(sc->tempStr, " }else{\n");
4093 res = VkAppendLine(sc);
4094 if (res != VKFFT_SUCCESS) return res;
4095 if (sc->readToRegisters) {
4096 sc->tempLen = sprintf(sc->tempStr, " %s.x =0;%s.y = 0;\n", sc->regIDs[i + k * sc->registers_per_thread], sc->regIDs[i + k * sc->registers_per_thread]);
4097 res = VkAppendLine(sc);
4098 if (res != VKFFT_SUCCESS) return res;
4099 }
4100 else {
4101 if (sc->axisSwapped) {
4102 sc->tempLen = sprintf(sc->tempStr, " sdata[(combinedID %% %" PRIu64 ") * sharedStride + (combinedID / %" PRIu64 ")].x = 0;\n", sc->fftDim, sc->fftDim);
4103 res = VkAppendLine(sc);
4104 if (res != VKFFT_SUCCESS) return res;
4105 sc->tempLen = sprintf(sc->tempStr, " sdata[(combinedID %% %" PRIu64 ") * sharedStride + (combinedID / %" PRIu64 ")].y = 0;\n", sc->fftDim, sc->fftDim);
4106 res = VkAppendLine(sc);
4107 if (res != VKFFT_SUCCESS) return res;
4108 }
4109 else {
4110 sc->tempLen = sprintf(sc->tempStr, " sdata[(combinedID %% %" PRIu64 ") + (combinedID / %" PRIu64 ") * sharedStride].x = 0;\n", sc->fftDim, sc->fftDim);
4111 res = VkAppendLine(sc);
4112 if (res != VKFFT_SUCCESS) return res;
4113 sc->tempLen = sprintf(sc->tempStr, " sdata[(combinedID %% %" PRIu64 ") + (combinedID / %" PRIu64 ") * sharedStride].y = 0;\n", sc->fftDim, sc->fftDim);
4114 res = VkAppendLine(sc);
4115 if (res != VKFFT_SUCCESS) return res;
4116 }
4117 }
4118 sc->tempLen = sprintf(sc->tempStr, " }\n");
4119 res = VkAppendLine(sc);
4120 if (res != VKFFT_SUCCESS) return res;
4121 }
4122 if (sc->zeropadBluestein[0]) {
4123 sc->tempLen = sprintf(sc->tempStr, " }else{\n");
4124 res = VkAppendLine(sc);
4125 if (res != VKFFT_SUCCESS) return res;
4126 if (sc->readToRegisters) {
4127 sc->tempLen = sprintf(sc->tempStr, " %s.x =0;%s.y = 0;\n", sc->regIDs[i + k * sc->registers_per_thread], sc->regIDs[i + k * sc->registers_per_thread]);
4128 res = VkAppendLine(sc);
4129 if (res != VKFFT_SUCCESS) return res;
4130 }
4131 else {
4132 if (sc->axisSwapped) {
4133 sc->tempLen = sprintf(sc->tempStr, " sdata[(combinedID %% %" PRIu64 ") * sharedStride + (combinedID / %" PRIu64 ")].x = 0;\n", sc->fftDim, sc->fftDim);
4134 res = VkAppendLine(sc);
4135 if (res != VKFFT_SUCCESS) return res;
4136 sc->tempLen = sprintf(sc->tempStr, " sdata[(combinedID %% %" PRIu64 ") * sharedStride + (combinedID / %" PRIu64 ")].y = 0;\n", sc->fftDim, sc->fftDim);
4137 res = VkAppendLine(sc);
4138 if (res != VKFFT_SUCCESS) return res;
4139 }
4140 else {
4141 sc->tempLen = sprintf(sc->tempStr, " sdata[(combinedID %% %" PRIu64 ") + (combinedID / %" PRIu64 ") * sharedStride].x = 0;\n", sc->fftDim, sc->fftDim);
4142 res = VkAppendLine(sc);
4143 if (res != VKFFT_SUCCESS) return res;
4144 sc->tempLen = sprintf(sc->tempStr, " sdata[(combinedID %% %" PRIu64 ") + (combinedID / %" PRIu64 ") * sharedStride].y = 0;\n", sc->fftDim, sc->fftDim);
4145 res = VkAppendLine(sc);
4146 if (res != VKFFT_SUCCESS) return res;
4147 }
4148 }
4149 sc->tempLen = sprintf(sc->tempStr, " }\n");
4150 res = VkAppendLine(sc);
4151 if (res != VKFFT_SUCCESS) return res;
4152 }
4153 if (sc->axisSwapped) {
4154 if (sc->size[sc->axis_id + 1] % sc->localSize[0] != 0) {
4155 sc->tempLen = sprintf(sc->tempStr, " }");
4156 res = VkAppendLine(sc);
4157 if (res != VKFFT_SUCCESS) return res;
4158 }
4159 }
4160 else {
4161 if (sc->size[sc->axis_id + 1] % sc->localSize[1] != 0) {
4162 sc->tempLen = sprintf(sc->tempStr, " }");
4163 res = VkAppendLine(sc);
4164 if (res != VKFFT_SUCCESS) return res;
4165 }
4166 }
4167
4168 }
4169 }
4170 }
4171 else {
4172 for (uint64_t k = 0; k < sc->registerBoost; k++) {
4173 for (uint64_t i = 0; i < sc->min_registers_per_thread; i++) {
4174 /*
4175 if (sc->localSize[1] == 1)
4176 sc->tempLen = sprintf(sc->tempStr, " combinedID = %s + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, (i + k * sc->min_registers_per_thread) * sc->localSize[0]);
4177 else
4178 sc->tempLen = sprintf(sc->tempStr, " combinedID = (%s + %" PRIu64 " * %s) + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, sc->localSize[0], sc->gl_LocalInvocationID_y, (i + k * sc->min_registers_per_thread) * sc->localSize[0] * sc->localSize[1]);
4179
4180 sc->tempLen = sprintf(sc->tempStr, " inoutID = (combinedID %% %" PRIu64 ") + (combinedID / %" PRIu64 ") * %" PRIu64 " + (((%s%s) %% %" PRIu64 ") * %" PRIu64 " + ((%s%s) / %" PRIu64 ") * %" PRIu64 ");\n", sc->fftDim, sc->fftDim, sc->firstStageStartSize, sc->gl_WorkGroupID_x, shiftX, sc->firstStageStartSize / sc->fftDim, sc->fftDim, sc->gl_WorkGroupID_x, shiftX, sc->firstStageStartSize / sc->fftDim, sc->localSize[1] * sc->firstStageStartSize);
4181 */
4182 if (sc->axisSwapped) {
4183 if (sc->localSize[1] == 1)
4184 sc->tempLen = sprintf(sc->tempStr, " combinedID = %s + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, (i + k * sc->min_registers_per_thread) * sc->localSize[0]);
4185 else
4186 sc->tempLen = sprintf(sc->tempStr, " combinedID = (%s + %" PRIu64 " * %s) + %" PRIu64 "*numActiveThreads;\n", sc->gl_LocalInvocationID_x, sc->localSize[0], sc->gl_LocalInvocationID_y, (i + k * sc->min_registers_per_thread));
4187 res = VkAppendLine(sc);
4188 if (res != VKFFT_SUCCESS) return res;
4189 sc->tempLen = sprintf(sc->tempStr, " inoutID = (combinedID %% %" PRIu64 ") + (combinedID / %" PRIu64 ") * %" PRIu64 " + (((%s%s) %% %" PRIu64 ") * %" PRIu64 " + ((%s%s) / %" PRIu64 ") * %" PRIu64 ");\n", sc->fftDim, sc->fftDim, sc->firstStageStartSize, sc->gl_WorkGroupID_x, shiftX, sc->firstStageStartSize / sc->fftDim, sc->fftDim, sc->gl_WorkGroupID_x, shiftX, sc->firstStageStartSize / sc->fftDim, sc->localSize[0] * sc->firstStageStartSize);
4190 res = VkAppendLine(sc);
4191 if (res != VKFFT_SUCCESS) return res;
4192 }
4193 else {
4194 sc->tempLen = sprintf(sc->tempStr, " inoutID = %s+%" PRIu64 "+%s * %" PRIu64 " + (((%s%s) %% %" PRIu64 ") * %" PRIu64 " + ((%s%s) / %" PRIu64 ") * %" PRIu64 ");\n", sc->gl_LocalInvocationID_x, (i + k * sc->min_registers_per_thread) * sc->localSize[0], sc->gl_LocalInvocationID_y, sc->firstStageStartSize, sc->gl_WorkGroupID_x, shiftX, sc->firstStageStartSize / sc->fftDim, sc->fftDim, sc->gl_WorkGroupID_x, shiftX, sc->firstStageStartSize / sc->fftDim, sc->localSize[1] * sc->firstStageStartSize);
4195 res = VkAppendLine(sc);
4196 if (res != VKFFT_SUCCESS) return res;
4197 }
4198 if (sc->zeropad[0]) {
4199 sc->tempLen = sprintf(sc->tempStr, " if((inoutID %% %" PRIu64 " < %" PRIu64 ")||(inoutID %% %" PRIu64 " >= %" PRIu64 ")){\n", sc->fft_dim_full, sc->fft_zeropad_left_read[sc->axis_id], sc->fft_dim_full, sc->fft_zeropad_right_read[sc->axis_id]);
4200 res = VkAppendLine(sc);
4201 if (res != VKFFT_SUCCESS) return res;
4202 }
4203 sc->tempLen = sprintf(sc->tempStr, " %s = ", sc->inoutID);
4204 res = VkAppendLine(sc);
4205 if (res != VKFFT_SUCCESS) return res;
4206 res = indexInputVkFFT(sc, uintType, readType, sc->inoutID, 0, requestCoordinate, requestBatch);
4207 if (res != VKFFT_SUCCESS) return res;
4208 sc->tempLen = sprintf(sc->tempStr, ";\n");
4209 res = VkAppendLine(sc);
4210 if (res != VKFFT_SUCCESS) return res;
4212 if (res != VKFFT_SUCCESS) return res;
4213 if (sc->readToRegisters) {
4214 if (sc->inputBufferBlockNum == 1)
4215 sc->tempLen = sprintf(sc->tempStr, " %s = %s%s[%s]%s;\n", sc->regIDs[i + k * sc->registers_per_thread], convTypeLeft, inputsStruct, sc->inoutID, convTypeRight);
4216 else
4217 sc->tempLen = sprintf(sc->tempStr, " %s = %sinputBlocks[%s / %" PRIu64 "]%s[%s %% %" PRIu64 "]%s;\n", sc->regIDs[i + k * sc->registers_per_thread], convTypeLeft, sc->inoutID, sc->inputBufferBlockSize, inputsStruct, sc->inoutID, sc->inputBufferBlockSize, convTypeRight);
4218 res = VkAppendLine(sc);
4219 if (res != VKFFT_SUCCESS) return res;
4220 }
4221 else {
4222 if (sc->axisSwapped) {
4223
4224 if (sc->inputBufferBlockNum == 1)
4225 sc->tempLen = sprintf(sc->tempStr, " sdata[(combinedID / %" PRIu64 ") + sharedStride*(combinedID %% %" PRIu64 ")] = %s%s[inoutID]%s;\n", sc->fftDim, sc->fftDim, convTypeLeft, inputsStruct, convTypeRight);
4226 else
4227 sc->tempLen = sprintf(sc->tempStr, " sdata[(combinedID / %" PRIu64 ") + sharedStride*(combinedID %% %" PRIu64 ")] = %sinputBlocks[inoutID / %" PRIu64 "]%s[inoutID %% %" PRIu64 "]%s;\n", sc->fftDim, sc->fftDim, convTypeLeft, sc->inputBufferBlockSize, inputsStruct, sc->inputBufferBlockSize, convTypeRight);
4228 res = VkAppendLine(sc);
4229 if (res != VKFFT_SUCCESS) return res;
4230 }
4231 else {
4232 if (sc->inputBufferBlockNum == 1)
4233 sc->tempLen = sprintf(sc->tempStr, " sdata[sharedStride*%s + (%s + %" PRIu64 ")] = %s%s[inoutID]%s;\n", sc->gl_LocalInvocationID_y, sc->gl_LocalInvocationID_x, (i + k * sc->min_registers_per_thread) * sc->localSize[0], convTypeLeft, inputsStruct, convTypeRight);
4234 else
4235 sc->tempLen = sprintf(sc->tempStr, " sdata[sharedStride*%s + (%s + %" PRIu64 ")] = %sinputBlocks[inoutID / %" PRIu64 "]%s[inoutID %% %" PRIu64 "]%s;\n", sc->gl_LocalInvocationID_y, sc->gl_LocalInvocationID_x, (i + k * sc->min_registers_per_thread) * sc->localSize[0], convTypeLeft, sc->inputBufferBlockSize, inputsStruct, sc->inputBufferBlockSize, convTypeRight);
4236 res = VkAppendLine(sc);
4237 if (res != VKFFT_SUCCESS) return res;
4238 }
4239 }
4241 if (res != VKFFT_SUCCESS) return res;
4242 if (sc->zeropad[0]) {
4243 sc->tempLen = sprintf(sc->tempStr, " }else{\n");
4244 res = VkAppendLine(sc);
4245 if (res != VKFFT_SUCCESS) return res;
4246 if (sc->readToRegisters) {
4247 sc->tempLen = sprintf(sc->tempStr, " %s.x = 0; %s.y = 0;\n", sc->regIDs[i + k * sc->registers_per_thread], sc->regIDs[i + k * sc->registers_per_thread]);
4248 res = VkAppendLine(sc);
4249 if (res != VKFFT_SUCCESS) return res;
4250 }
4251 else {
4252 if (sc->axisSwapped) {
4253 sc->tempLen = sprintf(sc->tempStr, " sdata[(combinedID / %" PRIu64 ") + sharedStride*(combinedID %% %" PRIu64 ")].x = 0;\n", sc->fftDim, sc->fftDim);
4254 res = VkAppendLine(sc);
4255 if (res != VKFFT_SUCCESS) return res;
4256 sc->tempLen = sprintf(sc->tempStr, " sdata[(combinedID / %" PRIu64 ") + sharedStride*(combinedID %% %" PRIu64 ")].y = 0;\n", sc->fftDim, sc->fftDim);
4257 res = VkAppendLine(sc);
4258 if (res != VKFFT_SUCCESS) return res;
4259 }
4260 else {
4261 sc->tempLen = sprintf(sc->tempStr, " sdata[sharedStride*%s + (%s + %" PRIu64 ")].x = 0;\n", sc->gl_LocalInvocationID_y, sc->gl_LocalInvocationID_x, (i + k * sc->min_registers_per_thread) * sc->localSize[0]);
4262 res = VkAppendLine(sc);
4263 if (res != VKFFT_SUCCESS) return res;
4264 sc->tempLen = sprintf(sc->tempStr, " sdata[sharedStride*%s + (%s + %" PRIu64 ")].y = 0;\n", sc->gl_LocalInvocationID_y, sc->gl_LocalInvocationID_x, (i + k * sc->min_registers_per_thread) * sc->localSize[0]);
4265 res = VkAppendLine(sc);
4266 if (res != VKFFT_SUCCESS) return res;
4267 }
4268 }
4269 sc->tempLen = sprintf(sc->tempStr, " }\n");
4270 res = VkAppendLine(sc);
4271 if (res != VKFFT_SUCCESS) return res;
4272 }
4273 }
4274 }
4275 }
4276 sc->tempLen = sprintf(sc->tempStr, " }\n");
4277 res = VkAppendLine(sc);
4278 if (res != VKFFT_SUCCESS) return res;
4279 break;
4280 }
4281 case 1://grouped_c2c
4282 {
4283 char shiftX[500] = "";
4284 if (sc->performWorkGroupShift[0])
4285 sprintf(shiftX, " + consts.workGroupShiftX * %s ", sc->gl_WorkGroupSize_x);
4286
4287 sprintf(sc->disableThreadsStart, " if (((%s%s) / %" PRIu64 ") %% (%" PRIu64 ")+((%s%s) / %" PRIu64 ") * (%" PRIu64 ") < %" PRIu64 ") {\n", sc->gl_GlobalInvocationID_x, shiftX, sc->fft_dim_x, sc->stageStartSize, sc->gl_GlobalInvocationID_x, shiftX, sc->fft_dim_x * sc->stageStartSize, sc->fftDim * sc->stageStartSize, sc->size[sc->axis_id]);
4289 if (res != VKFFT_SUCCESS) return res;
4290 sprintf(sc->disableThreadsEnd, "}");
4291 for (uint64_t k = 0; k < sc->registerBoost; k++) {
4292 for (uint64_t i = 0; i < sc->min_registers_per_thread; i++) {
4293 sc->tempLen = sprintf(sc->tempStr, " inoutID = (%" PRIu64 " * (%s + %" PRIu64 ") + ((%s%s) / %" PRIu64 ") %% (%" PRIu64 ")+((%s%s) / %" PRIu64 ") * (%" PRIu64 "));\n", sc->stageStartSize, sc->gl_LocalInvocationID_y, (i + k * sc->min_registers_per_thread) * sc->localSize[1], sc->gl_GlobalInvocationID_x, shiftX, sc->fft_dim_x, sc->stageStartSize, sc->gl_GlobalInvocationID_x, shiftX, sc->fft_dim_x * sc->stageStartSize, sc->fftDim * sc->stageStartSize);
4294 res = VkAppendLine(sc);
4295 if (res != VKFFT_SUCCESS) return res;
4296 if (sc->zeropadBluestein[0]) {
4297 sc->tempLen = sprintf(sc->tempStr, " if((inoutID %% %" PRIu64 ") < %" PRIu64 "){\n", sc->fft_dim_full, sc->fft_zeropad_Bluestein_left_read[sc->axis_id]);
4298 res = VkAppendLine(sc);
4299 if (res != VKFFT_SUCCESS) return res;
4300 }
4301 if (sc->zeropad[0]) {
4302 sc->tempLen = sprintf(sc->tempStr, " if((inoutID %% %" PRIu64 " < %" PRIu64 ")||(inoutID %% %" PRIu64 " >= %" PRIu64 ")){\n", sc->fft_dim_full, sc->fft_zeropad_left_read[sc->axis_id], sc->fft_dim_full, sc->fft_zeropad_right_read[sc->axis_id]);
4303 res = VkAppendLine(sc);
4304 if (res != VKFFT_SUCCESS) return res;
4305 }
4306 sc->tempLen = sprintf(sc->tempStr, " %s = ", sc->inoutID);
4307 res = VkAppendLine(sc);
4308 if (res != VKFFT_SUCCESS) return res;
4309 sprintf(index_x, "(%s%s) %% (%" PRIu64 ")", sc->gl_GlobalInvocationID_x, shiftX, sc->fft_dim_x);
4310 res = indexInputVkFFT(sc, uintType, readType, index_x, sc->inoutID, requestCoordinate, requestBatch);
4311 if (res != VKFFT_SUCCESS) return res;
4312 sc->tempLen = sprintf(sc->tempStr, ";\n");
4313 res = VkAppendLine(sc);
4314 if (res != VKFFT_SUCCESS) return res;
4316 if (res != VKFFT_SUCCESS) return res;
4317 if (sc->readToRegisters) {
4318 if (sc->inputBufferBlockNum == 1)
4319 sc->tempLen = sprintf(sc->tempStr, " %s=%s%s[%s]%s;\n", sc->regIDs[i + k * sc->registers_per_thread], convTypeLeft, inputsStruct, sc->inoutID, convTypeRight);
4320 else
4321 sc->tempLen = sprintf(sc->tempStr, " %s=%sinputBlocks[%s / %" PRIu64 "]%s[%s %% %" PRIu64 "]%s;\n", sc->regIDs[i + k * sc->registers_per_thread], convTypeLeft, sc->inoutID, sc->inputBufferBlockSize, inputsStruct, sc->inoutID, sc->inputBufferBlockSize, convTypeRight);
4322 res = VkAppendLine(sc);
4323 if (res != VKFFT_SUCCESS) return res;
4324 }
4325 else {
4326 if (sc->inputBufferBlockNum == 1)
4327 sc->tempLen = sprintf(sc->tempStr, " sdata[%s*(%s+%" PRIu64 ")+%s]=%s%s[%s]%s;\n", sc->sharedStride, sc->gl_LocalInvocationID_y, (i + k * sc->min_registers_per_thread) * sc->localSize[1], sc->gl_LocalInvocationID_x, convTypeLeft, inputsStruct, sc->inoutID, convTypeRight);
4328 else
4329 sc->tempLen = sprintf(sc->tempStr, " sdata[%s*(%s+%" PRIu64 ")+%s]=%sinputBlocks[%s / %" PRIu64 "]%s[%s %% %" PRIu64 "]%s;\n", sc->sharedStride, sc->gl_LocalInvocationID_y, (i + k * sc->min_registers_per_thread) * sc->localSize[1], sc->gl_LocalInvocationID_x, convTypeLeft, sc->inoutID, sc->inputBufferBlockSize, inputsStruct, sc->inoutID, sc->inputBufferBlockSize, convTypeRight);
4330 res = VkAppendLine(sc);
4331 if (res != VKFFT_SUCCESS) return res;
4332 }
4334 if (res != VKFFT_SUCCESS) return res;
4335 if (sc->zeropad[0]) {
4336 sc->tempLen = sprintf(sc->tempStr, " }else{\n");
4337 res = VkAppendLine(sc);
4338 if (res != VKFFT_SUCCESS) return res;
4339 if (sc->readToRegisters) {
4340 sc->tempLen = sprintf(sc->tempStr, " %s.x = 0; %s.y = 0;\n", sc->regIDs[i + k * sc->registers_per_thread], sc->regIDs[i + k * sc->registers_per_thread]);
4341 res = VkAppendLine(sc);
4342 if (res != VKFFT_SUCCESS) return res;
4343 }
4344 else {
4345 sc->tempLen = sprintf(sc->tempStr, " sdata[%s*(%s+%" PRIu64 ")+%s].x=0;\n", sc->sharedStride, sc->gl_LocalInvocationID_y, (i + k * sc->min_registers_per_thread) * sc->localSize[1], sc->gl_LocalInvocationID_x);
4346 res = VkAppendLine(sc);
4347 if (res != VKFFT_SUCCESS) return res;
4348 sc->tempLen = sprintf(sc->tempStr, " sdata[%s*(%s+%" PRIu64 ")+%s].y=0;\n", sc->sharedStride, sc->gl_LocalInvocationID_y, (i + k * sc->min_registers_per_thread) * sc->localSize[1], sc->gl_LocalInvocationID_x);
4349 res = VkAppendLine(sc);
4350 if (res != VKFFT_SUCCESS) return res;
4351 }
4352 sc->tempLen = sprintf(sc->tempStr, " }\n");
4353 res = VkAppendLine(sc);
4354 if (res != VKFFT_SUCCESS) return res;
4355 }
4356 if (sc->zeropadBluestein[0]) {
4357 sc->tempLen = sprintf(sc->tempStr, " }else{\n");
4358 res = VkAppendLine(sc);
4359 if (res != VKFFT_SUCCESS) return res;
4360 if (sc->readToRegisters) {
4361 sc->tempLen = sprintf(sc->tempStr, " %s.x = 0; %s.y = 0;\n", sc->regIDs[i + k * sc->registers_per_thread], sc->regIDs[i + k * sc->registers_per_thread]);
4362 res = VkAppendLine(sc);
4363 if (res != VKFFT_SUCCESS) return res;
4364 }
4365 else {
4366 sc->tempLen = sprintf(sc->tempStr, " sdata[%s*(%s+%" PRIu64 ")+%s].x=0;\n", sc->sharedStride, sc->gl_LocalInvocationID_y, (i + k * sc->min_registers_per_thread) * sc->localSize[1], sc->gl_LocalInvocationID_x);
4367 res = VkAppendLine(sc);
4368 if (res != VKFFT_SUCCESS) return res;
4369 sc->tempLen = sprintf(sc->tempStr, " sdata[%s*(%s+%" PRIu64 ")+%s].y=0;\n", sc->sharedStride, sc->gl_LocalInvocationID_y, (i + k * sc->min_registers_per_thread) * sc->localSize[1], sc->gl_LocalInvocationID_x);
4370 res = VkAppendLine(sc);
4371 if (res != VKFFT_SUCCESS) return res;
4372 }
4373 sc->tempLen = sprintf(sc->tempStr, " }\n");
4374 res = VkAppendLine(sc);
4375 if (res != VKFFT_SUCCESS) return res;
4376 }
4377 }
4378 }
4379 sc->tempLen = sprintf(sc->tempStr, " }\n");
4380 res = VkAppendLine(sc);
4381 if (res != VKFFT_SUCCESS) return res;
4382 break;
4383 }
4384 case 2://single_c2c_strided
4385 {
4386 char shiftX[500] = "";
4387 if (sc->performWorkGroupShift[0])
4388 sprintf(shiftX, " + consts.workGroupShiftX * %s ", sc->gl_WorkGroupSize_x);
4389
4390 //sc->tempLen = sprintf(sc->tempStr, " if(gl_GlobalInvolcationID.x%s >= %" PRIu64 ") return; \n", shiftX, sc->size[0] / axis->specializationConstants.fftDim);
4391 sprintf(sc->disableThreadsStart, " if (((%s%s) / %" PRIu64 ") * (%" PRIu64 ") < %" PRIu64 ") {\n", sc->gl_GlobalInvocationID_x, shiftX, sc->stageStartSize, sc->stageStartSize * sc->fftDim, sc->fft_dim_full);
4393 if (res != VKFFT_SUCCESS) return res;
4394 sprintf(sc->disableThreadsEnd, "}");
4395 for (uint64_t k = 0; k < sc->registerBoost; k++) {
4396 for (uint64_t i = 0; i < sc->min_registers_per_thread; i++) {
4397 sc->tempLen = sprintf(sc->tempStr, " inoutID = (%s%s) %% (%" PRIu64 ") + %" PRIu64 " * (%s + %" PRIu64 ") + ((%s%s) / %" PRIu64 ") * (%" PRIu64 ");\n", sc->gl_GlobalInvocationID_x, shiftX, sc->stageStartSize, sc->stageStartSize, sc->gl_LocalInvocationID_y, (i + k * sc->min_registers_per_thread) * sc->localSize[1], sc->gl_GlobalInvocationID_x, shiftX, sc->stageStartSize, sc->stageStartSize * sc->fftDim);
4398 res = VkAppendLine(sc);
4399 if (res != VKFFT_SUCCESS) return res;
4400 if (sc->zeropadBluestein[0]) {
4401 sc->tempLen = sprintf(sc->tempStr, " if((inoutID %% %" PRIu64 ") < %" PRIu64 "){\n", sc->fft_dim_full, sc->fft_zeropad_Bluestein_left_read[sc->axis_id]);
4402 res = VkAppendLine(sc);
4403 if (res != VKFFT_SUCCESS) return res;
4404 }
4405 if (sc->zeropad[0]) {
4406 sc->tempLen = sprintf(sc->tempStr, " if((inoutID %% %" PRIu64 " < %" PRIu64 ")||(inoutID %% %" PRIu64 " >= %" PRIu64 ")){\n", sc->fft_dim_full, sc->fft_zeropad_left_read[sc->axis_id], sc->fft_dim_full, sc->fft_zeropad_right_read[sc->axis_id]);
4407 res = VkAppendLine(sc);
4408 if (res != VKFFT_SUCCESS) return res;
4409 }
4410 sc->tempLen = sprintf(sc->tempStr, " %s = ", sc->inoutID);
4411 res = VkAppendLine(sc);
4412 if (res != VKFFT_SUCCESS) return res;
4413 res = indexInputVkFFT(sc, uintType, readType, sc->inoutID, 0, requestCoordinate, requestBatch);
4414 if (res != VKFFT_SUCCESS) return res;
4415 sc->tempLen = sprintf(sc->tempStr, ";\n");
4416 res = VkAppendLine(sc);
4417 if (res != VKFFT_SUCCESS) return res;
4419 if (res != VKFFT_SUCCESS) return res;
4420 if (sc->readToRegisters) {
4421 if (sc->inputBufferBlockNum == 1)
4422 sc->tempLen = sprintf(sc->tempStr, " %s=%s%s[%s]%s;\n", sc->regIDs[i + k * sc->registers_per_thread], convTypeLeft, inputsStruct, sc->inoutID, convTypeRight);
4423 else
4424 sc->tempLen = sprintf(sc->tempStr, " %s=%sinputBlocks[%s / %" PRIu64 "]%s[%s %% %" PRIu64 "]%s;\n", sc->regIDs[i + k * sc->registers_per_thread], convTypeLeft, sc->inoutID, sc->inputBufferBlockSize, inputsStruct, sc->inoutID, sc->inputBufferBlockSize, convTypeRight);
4425 res = VkAppendLine(sc);
4426 if (res != VKFFT_SUCCESS) return res;
4427 }
4428 else {
4429 if (sc->inputBufferBlockNum == 1)
4430 sc->tempLen = sprintf(sc->tempStr, " sdata[%s*(%s+%" PRIu64 ")+%s]=%s%s[%s]%s;\n", sc->sharedStride, sc->gl_LocalInvocationID_y, (i + k * sc->min_registers_per_thread) * sc->localSize[1], sc->gl_LocalInvocationID_x, convTypeLeft, inputsStruct, sc->inoutID, convTypeRight);
4431 else
4432 sc->tempLen = sprintf(sc->tempStr, " sdata[%s*(%s+%" PRIu64 ")+%s]=%sinputBlocks[%s / %" PRIu64 "]%s[%s %% %" PRIu64 "]%s;\n", sc->sharedStride, sc->gl_LocalInvocationID_y, (i + k * sc->min_registers_per_thread) * sc->localSize[1], sc->gl_LocalInvocationID_x, convTypeLeft, sc->inoutID, sc->inputBufferBlockSize, inputsStruct, sc->inoutID, sc->inputBufferBlockSize, convTypeRight);
4433 res = VkAppendLine(sc);
4434 if (res != VKFFT_SUCCESS) return res;
4435 }
4437 if (res != VKFFT_SUCCESS) return res;
4438 if (sc->zeropad[0]) {
4439 sc->tempLen = sprintf(sc->tempStr, " }else{\n");
4440 res = VkAppendLine(sc);
4441 if (res != VKFFT_SUCCESS) return res;
4442 if (sc->readToRegisters) {
4443 sc->tempLen = sprintf(sc->tempStr, " %s.x = 0; %s.y = 0;\n", sc->regIDs[i + k * sc->registers_per_thread], sc->regIDs[i + k * sc->registers_per_thread]);
4444 res = VkAppendLine(sc);
4445 if (res != VKFFT_SUCCESS) return res;
4446 }
4447 else {
4448 sc->tempLen = sprintf(sc->tempStr, " sdata[%s*(%s+%" PRIu64 ")+%s].x=0;\n", sc->sharedStride, sc->gl_LocalInvocationID_y, (i + k * sc->min_registers_per_thread) * sc->localSize[1], sc->gl_LocalInvocationID_x);
4449 res = VkAppendLine(sc);
4450 if (res != VKFFT_SUCCESS) return res;
4451 sc->tempLen = sprintf(sc->tempStr, " sdata[%s*(%s+%" PRIu64 ")+%s].y=0;\n", sc->sharedStride, sc->gl_LocalInvocationID_y, (i + k * sc->min_registers_per_thread) * sc->localSize[1], sc->gl_LocalInvocationID_x);
4452 res = VkAppendLine(sc);
4453 if (res != VKFFT_SUCCESS) return res;
4454 }
4455 sc->tempLen = sprintf(sc->tempStr, " }\n");
4456 res = VkAppendLine(sc);
4457 if (res != VKFFT_SUCCESS) return res;
4458 }
4459 if (sc->zeropadBluestein[0]) {
4460 sc->tempLen = sprintf(sc->tempStr, " }else{\n");
4461 res = VkAppendLine(sc);
4462 if (res != VKFFT_SUCCESS) return res;
4463 if (sc->readToRegisters) {
4464 sc->tempLen = sprintf(sc->tempStr, " %s.x = 0; %s.y = 0;\n", sc->regIDs[i + k * sc->registers_per_thread], sc->regIDs[i + k * sc->registers_per_thread]);
4465 res = VkAppendLine(sc);
4466 if (res != VKFFT_SUCCESS) return res;
4467 }
4468 else {
4469 sc->tempLen = sprintf(sc->tempStr, " sdata[%s*(%s+%" PRIu64 ")+%s].x=0;\n", sc->sharedStride, sc->gl_LocalInvocationID_y, (i + k * sc->min_registers_per_thread) * sc->localSize[1], sc->gl_LocalInvocationID_x);
4470 res = VkAppendLine(sc);
4471 if (res != VKFFT_SUCCESS) return res;
4472 sc->tempLen = sprintf(sc->tempStr, " sdata[%s*(%s+%" PRIu64 ")+%s].y=0;\n", sc->sharedStride, sc->gl_LocalInvocationID_y, (i + k * sc->min_registers_per_thread) * sc->localSize[1], sc->gl_LocalInvocationID_x);
4473 res = VkAppendLine(sc);
4474 if (res != VKFFT_SUCCESS) return res;
4475 }
4476 sc->tempLen = sprintf(sc->tempStr, " }\n");
4477 res = VkAppendLine(sc);
4478 if (res != VKFFT_SUCCESS) return res;
4479 }
4480 }
4481 }
4482 sc->tempLen = sprintf(sc->tempStr, " }\n");
4483 res = VkAppendLine(sc);
4484 if (res != VKFFT_SUCCESS) return res;
4485 break;
4486 }
4487 case 5://single_r2c
4488 {
4489 char shiftX[500] = "";
4490 if (sc->performWorkGroupShift[0])
4491 sprintf(shiftX, " + consts.workGroupShiftX ");
4492 char shiftY[500] = "";
4493 if (sc->performWorkGroupShift[1])
4494 sprintf(shiftY, " + consts.workGroupShiftY ");
4495 uint64_t mult = (sc->mergeSequencesR2C) ? 2 : 1;
4496 if (sc->fftDim == sc->fft_dim_full) {
4497 for (uint64_t k = 0; k < sc->registerBoost; k++) {
4498 for (uint64_t i = 0; i < sc->min_registers_per_thread; i++) {
4499
4500 if (sc->localSize[1] == 1)
4501 sc->tempLen = sprintf(sc->tempStr, " combinedID = %s + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, (i + k * sc->min_registers_per_thread) * sc->localSize[0]);
4502 else
4503 sc->tempLen = sprintf(sc->tempStr, " combinedID = (%s + %" PRIu64 " * %s) + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, sc->localSize[0], sc->gl_LocalInvocationID_y, (i + k * sc->min_registers_per_thread) * sc->localSize[0] * sc->localSize[1]);
4504 res = VkAppendLine(sc);
4505 if (res != VKFFT_SUCCESS) return res;
4506
4507 if (sc->inputStride[0] > 1)
4508 sc->tempLen = sprintf(sc->tempStr, " inoutID = (combinedID %% %" PRIu64 ") * %" PRIu64 " + (combinedID / %" PRIu64 ") * %" PRIu64 ";\n", sc->fftDim, sc->inputStride[0], sc->fftDim, mult * sc->inputStride[1]);
4509 else
4510 sc->tempLen = sprintf(sc->tempStr, " inoutID = (combinedID %% %" PRIu64 ") + (combinedID / %" PRIu64 ") * %" PRIu64 ";\n", sc->fftDim, sc->fftDim, mult * sc->inputStride[1]);
4511 res = VkAppendLine(sc);
4512 if (res != VKFFT_SUCCESS) return res;
4513 if (sc->axisSwapped) {
4514 if ((uint64_t)ceil(sc->size[1] / (double)mult) % sc->localSize[0] != 0) {
4515 sc->tempLen = sprintf(sc->tempStr, " if(combinedID / %" PRIu64 " + (%s%s)*%" PRIu64 "< %" PRIu64 "){", sc->fftDim, sc->gl_WorkGroupID_y, shiftY, sc->localSize[0], (uint64_t)ceil(sc->size[1] / (double)mult));
4516 res = VkAppendLine(sc);
4517 if (res != VKFFT_SUCCESS) return res;
4518 }
4519 }
4520 else {
4521 if ((uint64_t)ceil(sc->size[1] / (double)mult) % sc->localSize[1] != 0) {
4522 sc->tempLen = sprintf(sc->tempStr, " if(combinedID / %" PRIu64 " + (%s%s)*%" PRIu64 "< %" PRIu64 "){", sc->fftDim, sc->gl_WorkGroupID_y, shiftY, sc->localSize[1], (uint64_t)ceil(sc->size[1] / (double)mult));
4523 res = VkAppendLine(sc);
4524 if (res != VKFFT_SUCCESS) return res;
4525 }
4526 }
4527 if (sc->zeropadBluestein[0]) {
4528 sc->tempLen = sprintf(sc->tempStr, " if((combinedID %% %" PRIu64 ") < %" PRIu64 "){\n", sc->fft_dim_full, sc->fft_zeropad_Bluestein_left_read[sc->axis_id]);
4529 res = VkAppendLine(sc);
4530 if (res != VKFFT_SUCCESS) return res;
4531 }
4532 if (sc->zeropad[0]) {
4533 sc->tempLen = sprintf(sc->tempStr, " if((inoutID %% %" PRIu64 " < %" PRIu64 ")||(inoutID %% %" PRIu64 " >= %" PRIu64 ")){\n", sc->inputStride[1], sc->fft_zeropad_left_read[sc->axis_id], sc->inputStride[1], sc->fft_zeropad_right_read[sc->axis_id]);
4534 res = VkAppendLine(sc);
4535 if (res != VKFFT_SUCCESS) return res;
4536 }
4537 sc->tempLen = sprintf(sc->tempStr, " %s = ", sc->inoutID);
4538 res = VkAppendLine(sc);
4539 if (res != VKFFT_SUCCESS) return res;
4540 indexInputVkFFT(sc, uintType, readType, sc->inoutID, 0, requestCoordinate, requestBatch);
4541 if (res != VKFFT_SUCCESS) return res;
4542 sc->tempLen = sprintf(sc->tempStr, ";\n");
4543 res = VkAppendLine(sc);
4544 if (res != VKFFT_SUCCESS) return res;
4546 if (res != VKFFT_SUCCESS) return res;
4547 if (sc->readToRegisters) {
4548 if (sc->inputBufferBlockNum == 1)
4549 sc->tempLen = sprintf(sc->tempStr, " %s.x = %s%s[%s]%s;\n", sc->regIDs[i + k * sc->registers_per_thread], convTypeLeft, inputsStruct, sc->inoutID, convTypeRight);
4550 else
4551 sc->tempLen = sprintf(sc->tempStr, " %s.x = %sinputBlocks[%s / %" PRIu64 "]%s[%s %% %" PRIu64 "]%s;\n", sc->regIDs[i + k * sc->registers_per_thread], convTypeLeft, sc->inoutID, sc->inputBufferBlockSize, inputsStruct, sc->inoutID, sc->inputBufferBlockSize, convTypeRight);
4552 res = VkAppendLine(sc);
4553 if (res != VKFFT_SUCCESS) return res;
4554 if (sc->mergeSequencesR2C) {
4555 if (sc->inputBufferBlockNum == 1)
4556 sc->tempLen = sprintf(sc->tempStr, " %s.y = %s%s[(%s + %" PRIu64 ")]%s;\n", sc->regIDs[i + k * sc->registers_per_thread], convTypeLeft, inputsStruct, sc->inoutID, sc->inputStride[1], convTypeRight);
4557 else
4558 sc->tempLen = sprintf(sc->tempStr, " %s.y = %sinputBlocks[(%s + %" PRIu64 ")/ %" PRIu64 "]%s[(%s + %" PRIu64 ") %% %" PRIu64 "]%s;\n", sc->regIDs[i + k * sc->registers_per_thread], convTypeLeft, sc->inoutID, sc->inputStride[1], sc->inputBufferBlockSize, inputsStruct, sc->inoutID, sc->inputStride[1], sc->inputBufferBlockSize, convTypeRight);
4559 res = VkAppendLine(sc);
4560 if (res != VKFFT_SUCCESS) return res;
4561 }
4562 else {
4563 if (sc->inputBufferBlockNum == 1)
4564 sc->tempLen = sprintf(sc->tempStr, " %s.y = 0;\n", sc->regIDs[i + k * sc->registers_per_thread]);
4565 else
4566 sc->tempLen = sprintf(sc->tempStr, " %s.y = 0;\n", sc->regIDs[i + k * sc->registers_per_thread]);
4567 res = VkAppendLine(sc);
4568 if (res != VKFFT_SUCCESS) return res;
4569 }
4570 }
4571 else {
4572 if (sc->axisSwapped) {
4573
4574 if (sc->inputBufferBlockNum == 1)
4575 sc->tempLen = sprintf(sc->tempStr, " sdata[(combinedID %% %" PRIu64 ") * sharedStride + (combinedID / %" PRIu64 ")].x = %s%s[%s]%s;\n", sc->fftDim, sc->fftDim, convTypeLeft, inputsStruct, sc->inoutID, convTypeRight);
4576 else
4577 sc->tempLen = sprintf(sc->tempStr, " sdata[(combinedID %% %" PRIu64 ") * sharedStride + (combinedID / %" PRIu64 ")].x = %sinputBlocks[%s / %" PRIu64 "]%s[%s %% %" PRIu64 "]%s;\n", sc->fftDim, sc->fftDim, convTypeLeft, sc->inoutID, sc->inputBufferBlockSize, inputsStruct, sc->inoutID, sc->inputBufferBlockSize, convTypeRight);
4578 res = VkAppendLine(sc);
4579 if (res != VKFFT_SUCCESS) return res;
4580
4581 if (sc->mergeSequencesR2C) {
4582 sc->tempLen = sprintf(sc->tempStr, " inoutID += %" PRIu64 ";\n", sc->inputStride[1]);
4583 res = VkAppendLine(sc);
4584 if (res != VKFFT_SUCCESS) return res;
4585 if (sc->inputBufferBlockNum == 1)
4586 sc->tempLen = sprintf(sc->tempStr, " sdata[(combinedID %% %" PRIu64 ")* sharedStride + (combinedID / %" PRIu64 ")].y = %s%s[inoutID]%s;\n", sc->fftDim, sc->fftDim, convTypeLeft, inputsStruct, convTypeRight);
4587 else
4588 sc->tempLen = sprintf(sc->tempStr, " sdata[(combinedID %% %" PRIu64 ")* sharedStride + (combinedID / %" PRIu64 ")].y = %sinputBlocks[inoutID / %" PRIu64 "]%s[inoutID %% %" PRIu64 "]%s;\n", sc->fftDim, sc->fftDim, convTypeLeft, sc->inputBufferBlockSize, inputsStruct, sc->inputBufferBlockSize, convTypeRight);
4589 res = VkAppendLine(sc);
4590 if (res != VKFFT_SUCCESS) return res;
4591 }
4592 else {
4593 if (sc->inputBufferBlockNum == 1)
4594 sc->tempLen = sprintf(sc->tempStr, " sdata[(combinedID %% %" PRIu64 ") * sharedStride+ (combinedID / %" PRIu64 ")].y = 0;\n", sc->fftDim, sc->fftDim);
4595 else
4596 sc->tempLen = sprintf(sc->tempStr, " sdata[(combinedID %% %" PRIu64 ") * sharedStride + (combinedID / %" PRIu64 ")].y = 0;\n", sc->fftDim, sc->fftDim);
4597 res = VkAppendLine(sc);
4598 if (res != VKFFT_SUCCESS) return res;
4599 }
4600 }
4601 else {
4602 if (sc->inputBufferBlockNum == 1)
4603 sc->tempLen = sprintf(sc->tempStr, " sdata[(combinedID %% %" PRIu64 ") + (combinedID / %" PRIu64 ") * sharedStride].x = %s%s[inoutID]%s;\n", sc->fftDim, sc->fftDim, convTypeLeft, inputsStruct, convTypeRight);
4604 else
4605 sc->tempLen = sprintf(sc->tempStr, " sdata[(combinedID %% %" PRIu64 ") + (combinedID / %" PRIu64 ") * sharedStride].x = %sinputBlocks[inoutID / %" PRIu64 "]%s[inoutID %% %" PRIu64 "]%s;\n", sc->fftDim, sc->fftDim, convTypeLeft, sc->inputBufferBlockSize, inputsStruct, sc->inputBufferBlockSize, convTypeRight);
4606 res = VkAppendLine(sc);
4607 if (res != VKFFT_SUCCESS) return res;
4608 if (sc->mergeSequencesR2C) {
4609 sc->tempLen = sprintf(sc->tempStr, " inoutID += %" PRIu64 ";\n", sc->inputStride[1]);
4610 res = VkAppendLine(sc);
4611 if (res != VKFFT_SUCCESS) return res;
4612 if (sc->inputBufferBlockNum == 1)
4613 sc->tempLen = sprintf(sc->tempStr, " sdata[(combinedID %% %" PRIu64 ") + (combinedID / %" PRIu64 ") * sharedStride].y = %s%s[inoutID]%s;\n", sc->fftDim, sc->fftDim, convTypeLeft, inputsStruct, convTypeRight);
4614 else
4615 sc->tempLen = sprintf(sc->tempStr, " sdata[(combinedID %% %" PRIu64 ") + (combinedID / %" PRIu64 ") * sharedStride].y = %sinputBlocks[inoutID / %" PRIu64 "]%s[inoutID %% %" PRIu64 "]%s;\n", sc->fftDim, sc->fftDim, convTypeLeft, sc->inputBufferBlockSize, inputsStruct, sc->inputBufferBlockSize, convTypeRight);
4616 res = VkAppendLine(sc);
4617 if (res != VKFFT_SUCCESS) return res;
4618 }
4619 else {
4620 if (sc->inputBufferBlockNum == 1)
4621 sc->tempLen = sprintf(sc->tempStr, " sdata[(combinedID %% %" PRIu64 ") + (combinedID / %" PRIu64 ") * sharedStride].y = 0;\n", sc->fftDim, sc->fftDim);
4622 else
4623 sc->tempLen = sprintf(sc->tempStr, " sdata[(combinedID %% %" PRIu64 ") + (combinedID / %" PRIu64 ") * sharedStride].y = 0;\n", sc->fftDim, sc->fftDim);
4624 res = VkAppendLine(sc);
4625 if (res != VKFFT_SUCCESS) return res;
4626 }
4627 }
4628
4629 }
4631 if (res != VKFFT_SUCCESS) return res;
4632 if (sc->zeropad[0]) {
4633 sc->tempLen = sprintf(sc->tempStr, " }else{\n");
4634 res = VkAppendLine(sc);
4635 if (res != VKFFT_SUCCESS) return res;
4636 if (sc->readToRegisters) {
4637 sc->tempLen = sprintf(sc->tempStr, " %s.x = 0; %s.y = 0;\n", sc->regIDs[i + k * sc->registers_per_thread], sc->regIDs[i + k * sc->registers_per_thread]);
4638 res = VkAppendLine(sc);
4639 if (res != VKFFT_SUCCESS) return res;
4640 }
4641 else {
4642 if (sc->axisSwapped) {
4643 sc->tempLen = sprintf(sc->tempStr, " sdata[(combinedID %% %" PRIu64 ") * sharedStride + (combinedID / %" PRIu64 ")].x = 0;\n", sc->fftDim, sc->fftDim);
4644 res = VkAppendLine(sc);
4645 if (res != VKFFT_SUCCESS) return res;
4646 sc->tempLen = sprintf(sc->tempStr, " sdata[(combinedID %% %" PRIu64 ") * sharedStride + (combinedID / %" PRIu64 ")].y = 0;\n", sc->fftDim, sc->fftDim);
4647 res = VkAppendLine(sc);
4648 if (res != VKFFT_SUCCESS) return res;
4649 }
4650 else {
4651 sc->tempLen = sprintf(sc->tempStr, " sdata[(combinedID %% %" PRIu64 ") + (combinedID / %" PRIu64 ") * sharedStride].x = 0;\n", sc->fftDim, sc->fftDim);
4652 res = VkAppendLine(sc);
4653 if (res != VKFFT_SUCCESS) return res;
4654 sc->tempLen = sprintf(sc->tempStr, " sdata[(combinedID %% %" PRIu64 ") + (combinedID / %" PRIu64 ") * sharedStride].y = 0;\n", sc->fftDim, sc->fftDim);
4655 res = VkAppendLine(sc);
4656 if (res != VKFFT_SUCCESS) return res;
4657
4658 }
4659
4660 }
4661 sc->tempLen = sprintf(sc->tempStr, " }\n");
4662 res = VkAppendLine(sc);
4663 if (res != VKFFT_SUCCESS) return res;
4664 }
4665 if (sc->zeropadBluestein[0]) {
4666 sc->tempLen = sprintf(sc->tempStr, " }else{\n");
4667 res = VkAppendLine(sc);
4668 if (res != VKFFT_SUCCESS) return res;
4669 if (sc->readToRegisters) {
4670 sc->tempLen = sprintf(sc->tempStr, " %s.x = 0; %s.y = 0;\n", sc->regIDs[i + k * sc->registers_per_thread], sc->regIDs[i + k * sc->registers_per_thread]);
4671 res = VkAppendLine(sc);
4672 if (res != VKFFT_SUCCESS) return res;
4673 }
4674 else {
4675 if (sc->axisSwapped) {
4676 sc->tempLen = sprintf(sc->tempStr, " sdata[(combinedID %% %" PRIu64 ") * sharedStride + (combinedID / %" PRIu64 ")].x = 0;\n", sc->fftDim, sc->fftDim);
4677 res = VkAppendLine(sc);
4678 if (res != VKFFT_SUCCESS) return res;
4679 sc->tempLen = sprintf(sc->tempStr, " sdata[(combinedID %% %" PRIu64 ") * sharedStride + (combinedID / %" PRIu64 ")].y = 0;\n", sc->fftDim, sc->fftDim);
4680 res = VkAppendLine(sc);
4681 if (res != VKFFT_SUCCESS) return res;
4682 }
4683 else {
4684 sc->tempLen = sprintf(sc->tempStr, " sdata[(combinedID %% %" PRIu64 ") + (combinedID / %" PRIu64 ") * sharedStride].x = 0;\n", sc->fftDim, sc->fftDim);
4685 res = VkAppendLine(sc);
4686 if (res != VKFFT_SUCCESS) return res;
4687 sc->tempLen = sprintf(sc->tempStr, " sdata[(combinedID %% %" PRIu64 ") + (combinedID / %" PRIu64 ") * sharedStride].y = 0;\n", sc->fftDim, sc->fftDim);
4688 res = VkAppendLine(sc);
4689 if (res != VKFFT_SUCCESS) return res;
4690
4691 }
4692
4693 }
4694 sc->tempLen = sprintf(sc->tempStr, " }\n");
4695 res = VkAppendLine(sc);
4696 if (res != VKFFT_SUCCESS) return res;
4697 }
4698 if (sc->axisSwapped) {
4699 if ((uint64_t)ceil(sc->size[1] / (double)mult) % sc->localSize[0] != 0) {
4700 sc->tempLen = sprintf(sc->tempStr, " }");
4701 res = VkAppendLine(sc);
4702 if (res != VKFFT_SUCCESS) return res;
4703 }
4704 }
4705 else {
4706 if ((uint64_t)ceil(sc->size[1] / (double)mult) % sc->localSize[1] != 0) {
4707 sc->tempLen = sprintf(sc->tempStr, " }");
4708 res = VkAppendLine(sc);
4709 if (res != VKFFT_SUCCESS) return res;
4710 }
4711 }
4712 }
4713 }
4714 }
4715 else {
4716 //Not implemented
4717 }
4718 break;
4719 }
4720 case 6: {//single_c2r
4721 //sc->tempLen = sprintf(sc->tempStr, " return;\n");
4722 char shiftX[500] = "";
4723 if (sc->performWorkGroupShift[0])
4724 sprintf(shiftX, " + consts.workGroupShiftX ");
4725 char shiftY[500] = "";
4726 if (sc->performWorkGroupShift[1])
4727 sprintf(shiftY, " + consts.workGroupShiftY*%s ", sc->gl_WorkGroupSize_y);
4728 char shiftY2[100] = "";
4729 if (sc->performWorkGroupShift[1])
4730 sprintf(shiftY, " + consts.workGroupShiftY ");
4731 if (sc->fftDim < sc->fft_dim_full) {
4732 if (sc->axisSwapped)
4733 sprintf(sc->disableThreadsStart, " if(%s * %" PRIu64 " + (((%s%s) %% %" PRIu64 ") * %" PRIu64 " + ((%s%s) / %" PRIu64 ") * %" PRIu64 ") < %" PRIu64 ") {\n", sc->gl_LocalInvocationID_x, sc->firstStageStartSize, sc->gl_WorkGroupID_x, shiftX, sc->firstStageStartSize / sc->fftDim, sc->fftDim, sc->gl_WorkGroupID_x, shiftX, sc->firstStageStartSize / sc->fftDim, sc->localSize[0] * sc->firstStageStartSize, sc->fft_dim_full);
4734 else
4735 sprintf(sc->disableThreadsStart, " if(%s * %" PRIu64 " + (((%s%s) %% %" PRIu64 ") * %" PRIu64 " + ((%s%s) / %" PRIu64 ") * %" PRIu64 ") < %" PRIu64 ") {\n", sc->gl_LocalInvocationID_y, sc->firstStageStartSize, sc->gl_WorkGroupID_x, shiftX, sc->firstStageStartSize / sc->fftDim, sc->fftDim, sc->gl_WorkGroupID_x, shiftX, sc->firstStageStartSize / sc->fftDim, sc->localSize[1] * sc->firstStageStartSize, sc->fft_dim_full);
4736
4738 if (res != VKFFT_SUCCESS) return res;
4739 sprintf(sc->disableThreadsEnd, "}");
4740 }
4741 else {
4742 sc->tempLen = sprintf(sc->tempStr, " { \n");
4743 res = VkAppendLine(sc);
4744 if (res != VKFFT_SUCCESS) return res;
4745 }
4746
4747 uint64_t mult = (sc->mergeSequencesR2C) ? 2 : 1;
4748 if (sc->fftDim == sc->fft_dim_full) {
4750 for (uint64_t k = 0; k < sc->registerBoost; k++) {
4751 uint64_t num_in = (sc->axisSwapped) ? (uint64_t)ceil(mult * (sc->fftDim / 2 + 1) / (double)sc->localSize[1]) : (uint64_t)ceil(mult * (sc->fftDim / 2 + 1) / (double)sc->localSize[0]);
4752 //num_in =(uint64_t)ceil(num_in / (double)sc->min_registers_per_thread);
4753 for (uint64_t i = 0; i < num_in; i++) {
4754 if (sc->localSize[1] == 1)
4755 sc->tempLen = sprintf(sc->tempStr, " combinedID = %s + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, (i + k * num_in) * sc->localSize[0]);
4756 else
4757 sc->tempLen = sprintf(sc->tempStr, " combinedID = (%s + %" PRIu64 " * %s) + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, sc->localSize[0], sc->gl_LocalInvocationID_y, (i + k * num_in) * sc->localSize[0] * sc->localSize[1]);
4758 res = VkAppendLine(sc);
4759 if (res != VKFFT_SUCCESS) return res;
4760
4761 if (sc->inputStride[0] > 1)
4762 sc->tempLen = sprintf(sc->tempStr, " inoutID = (combinedID %% %" PRIu64 ") * %" PRIu64 " + (combinedID / %" PRIu64 ") * %" PRIu64 ";\n", sc->fftDim / 2 + 1, sc->inputStride[0], sc->fftDim / 2 + 1, sc->inputStride[1]);
4763 else
4764 sc->tempLen = sprintf(sc->tempStr, " inoutID = (combinedID %% %" PRIu64 ") + (combinedID / %" PRIu64 ") * %" PRIu64 ";\n", sc->fftDim / 2 + 1, sc->fftDim / 2 + 1, sc->inputStride[1]);
4765 res = VkAppendLine(sc);
4766 if (res != VKFFT_SUCCESS) return res;
4767 if (sc->axisSwapped) {
4768 if (sc->size[sc->axis_id + 1] % sc->localSize[0] != 0) {
4769 sc->tempLen = sprintf(sc->tempStr, " if(combinedID / %" PRIu64 " + (%s%s)*%" PRIu64 "< %" PRIu64 "){\n", sc->fftDim / 2 + 1, sc->gl_WorkGroupID_y, shiftY2, mult * sc->localSize[0], sc->size[sc->axis_id + 1]);
4770 res = VkAppendLine(sc);
4771 if (res != VKFFT_SUCCESS) return res;
4772 }
4773 if ((1 + i + k * num_in) * sc->localSize[0] * sc->localSize[1] >= mult * (sc->fftDim / 2 + 1) * sc->localSize[0]) {
4774 sc->tempLen = sprintf(sc->tempStr, " if(combinedID < %" PRIu64 "){\n", mult * (sc->fftDim / 2 + 1) * sc->localSize[0]);
4775 res = VkAppendLine(sc);
4776 if (res != VKFFT_SUCCESS) return res;
4777 }
4778 }
4779 else {
4780 if (sc->size[sc->axis_id + 1] % sc->localSize[1] != 0) {
4781 sc->tempLen = sprintf(sc->tempStr, " if(combinedID / %" PRIu64 " + (%s%s)*%" PRIu64 "< %" PRIu64 "){\n", sc->fftDim / 2 + 1, sc->gl_WorkGroupID_y, shiftY2, mult * sc->localSize[1], sc->size[sc->axis_id + 1]);
4782 res = VkAppendLine(sc);
4783 if (res != VKFFT_SUCCESS) return res;
4784 }
4785 if ((1 + i + k * num_in) * sc->localSize[0] * sc->localSize[1] >= mult * (sc->fftDim / 2 + 1) * sc->localSize[1]) {
4786 sc->tempLen = sprintf(sc->tempStr, " if(combinedID < %" PRIu64 "){\n", mult * (sc->fftDim / 2 + 1) * sc->localSize[1]);
4787 res = VkAppendLine(sc);
4788 if (res != VKFFT_SUCCESS) return res;
4789 }
4790 }
4791 if (sc->zeropad[0]) {
4792 sc->tempLen = sprintf(sc->tempStr, " if((inoutID %% %" PRIu64 " < %" PRIu64 ")||(inoutID %% %" PRIu64 " >= %" PRIu64 ")){\n", sc->inputStride[1], sc->fft_zeropad_left_read[sc->axis_id], sc->inputStride[1], sc->fft_zeropad_right_read[sc->axis_id]);
4793 res = VkAppendLine(sc);
4794 if (res != VKFFT_SUCCESS) return res;
4795 }
4796 sc->tempLen = sprintf(sc->tempStr, " %s = ", sc->inoutID);
4797 res = VkAppendLine(sc);
4798 if (res != VKFFT_SUCCESS) return res;
4799 res = indexInputVkFFT(sc, uintType, readType, sc->inoutID, 0, requestCoordinate, requestBatch);
4800 sc->tempLen = sprintf(sc->tempStr, ";\n");
4801 res = VkAppendLine(sc);
4802 if (res != VKFFT_SUCCESS) return res;
4804 if (res != VKFFT_SUCCESS) return res;
4805 if (0) {
4806 //not enabled
4807 if (sc->inputBufferBlockNum == 1)
4808 sc->tempLen = sprintf(sc->tempStr, " %s = %s%s[%s]%s;\n", sc->regIDs[i + k * sc->registers_per_thread], convTypeLeft, inputsStruct, sc->inoutID, convTypeRight);
4809 else
4810 sc->tempLen = sprintf(sc->tempStr, " %s = %sinputBlocks[%s / %" PRIu64 "]%s[%s %% %" PRIu64 "]%s;\n", sc->regIDs[i + k * sc->registers_per_thread], convTypeLeft, sc->inoutID, sc->inputBufferBlockSize, inputsStruct, sc->inoutID, sc->inputBufferBlockSize, convTypeRight);
4811 res = VkAppendLine(sc);
4812 if (res != VKFFT_SUCCESS) return res;
4813 }
4814 else {
4815 if (!sc->axisSwapped) {
4816 if (sc->inputBufferBlockNum == 1)
4817 sc->tempLen = sprintf(sc->tempStr, " sdata[(combinedID %% %" PRIu64 ") + (combinedID / %" PRIu64 ") * sharedStride] = %s%s[%s]%s;\n", mult * (sc->fftDim / 2 + 1), mult * (sc->fftDim / 2 + 1), convTypeLeft, inputsStruct, sc->inoutID, convTypeRight);
4818 else
4819 sc->tempLen = sprintf(sc->tempStr, " sdata[(combinedID %% %" PRIu64 ") + (combinedID / %" PRIu64 ") * sharedStride] = %sinputBlocks[%s / %" PRIu64 "]%s[%s %% %" PRIu64 "]%s;\n", mult * (sc->fftDim / 2 + 1), mult * (sc->fftDim / 2 + 1), convTypeLeft, sc->inoutID, sc->inputBufferBlockSize, inputsStruct, sc->inoutID, sc->inputBufferBlockSize, convTypeRight);
4820 res = VkAppendLine(sc);
4821 if (res != VKFFT_SUCCESS) return res;
4822 }
4823 else {
4824 if (sc->inputBufferBlockNum == 1)
4825 sc->tempLen = sprintf(sc->tempStr, " sdata[(combinedID %% %" PRIu64 ") * sharedStride + (combinedID / %" PRIu64 ")] = %s%s[%s]%s;\n", mult * (sc->fftDim / 2 + 1), mult * (sc->fftDim / 2 + 1), convTypeLeft, inputsStruct, sc->inoutID, convTypeRight);
4826 else
4827 sc->tempLen = sprintf(sc->tempStr, " sdata[(combinedID %% %" PRIu64 ") * sharedStride + (combinedID / %" PRIu64 ")] = %sinputBlocks[%s / %" PRIu64 "]%s[%s %% %" PRIu64 "]%s;\n", mult * (sc->fftDim / 2 + 1), mult * (sc->fftDim / 2 + 1), convTypeLeft, sc->inoutID, sc->inputBufferBlockSize, inputsStruct, sc->inoutID, sc->inputBufferBlockSize, convTypeRight);
4828 res = VkAppendLine(sc);
4829 if (res != VKFFT_SUCCESS) return res;
4830 }
4831 }
4833 if (res != VKFFT_SUCCESS) return res;
4834 if (sc->zeropad[0]) {
4835 sc->tempLen = sprintf(sc->tempStr, " }else{\n");
4836 res = VkAppendLine(sc);
4837 if (res != VKFFT_SUCCESS) return res;
4838 if (0) {
4839 //not enabled
4840 sc->tempLen = sprintf(sc->tempStr, " %s.x =0;%s.y = 0;\n", sc->regIDs[i + k * sc->registers_per_thread], sc->regIDs[i + k * sc->registers_per_thread]);
4841 res = VkAppendLine(sc);
4842 if (res != VKFFT_SUCCESS) return res;
4843 }
4844 else {
4845 if (!sc->axisSwapped) {
4846 sc->tempLen = sprintf(sc->tempStr, " sdata[(combinedID %% %" PRIu64 ") + (combinedID / %" PRIu64 ") * sharedStride].x = 0;\n", mult * (sc->fftDim / 2 + 1), mult * (sc->fftDim / 2 + 1));
4847 res = VkAppendLine(sc);
4848 if (res != VKFFT_SUCCESS) return res;
4849 sc->tempLen = sprintf(sc->tempStr, " sdata[(combinedID %% %" PRIu64 ") + (combinedID / %" PRIu64 ") * sharedStride].y = 0;\n", mult * (sc->fftDim / 2 + 1), mult * (sc->fftDim / 2 + 1));
4850 res = VkAppendLine(sc);
4851 if (res != VKFFT_SUCCESS) return res;
4852 }
4853 else {
4854 sc->tempLen = sprintf(sc->tempStr, " sdata[(combinedID %% %" PRIu64 ") * sharedStride + (combinedID / %" PRIu64 ")].x = 0;\n", mult * (sc->fftDim / 2 + 1), mult * (sc->fftDim / 2 + 1));
4855 res = VkAppendLine(sc);
4856 if (res != VKFFT_SUCCESS) return res;
4857 sc->tempLen = sprintf(sc->tempStr, " sdata[(combinedID %% %" PRIu64 ") * sharedStride + (combinedID / %" PRIu64 ")].y = 0;\n", mult * (sc->fftDim / 2 + 1), mult * (sc->fftDim / 2 + 1));
4858 res = VkAppendLine(sc);
4859 if (res != VKFFT_SUCCESS) return res;
4860 }
4861 }
4862 sc->tempLen = sprintf(sc->tempStr, " }\n");
4863 res = VkAppendLine(sc);
4864 if (res != VKFFT_SUCCESS) return res;
4865 }
4866 if (sc->axisSwapped) {
4867 if ((1 + i + k * num_in) * sc->localSize[0] * sc->localSize[1] >= mult * (sc->fftDim / 2 + 1) * sc->localSize[0]) {
4868 sc->tempLen = sprintf(sc->tempStr, " }\n");
4869 res = VkAppendLine(sc);
4870 if (res != VKFFT_SUCCESS) return res;
4871 }
4872 }
4873 else {
4874 if ((1 + i + k * num_in) * sc->localSize[0] * sc->localSize[1] >= mult * (sc->fftDim / 2 + 1) * sc->localSize[1]) {
4875 sc->tempLen = sprintf(sc->tempStr, " }\n");
4876 res = VkAppendLine(sc);
4877 if (res != VKFFT_SUCCESS) return res;
4878 }
4879 }
4880 if (sc->axisSwapped) {
4881 if (sc->size[sc->axis_id + 1] % sc->localSize[0] != 0) {
4882 sc->tempLen = sprintf(sc->tempStr, " }\n");
4883 res = VkAppendLine(sc);
4884 if (res != VKFFT_SUCCESS) return res;
4885 }
4886 }
4887 else {
4888 if (sc->size[sc->axis_id + 1] % sc->localSize[1] != 0) {
4889 sc->tempLen = sprintf(sc->tempStr, " }\n");
4890 res = VkAppendLine(sc);
4891 if (res != VKFFT_SUCCESS) return res;
4892 }
4893 }
4894
4895 }
4896 res = appendBarrierVkFFT(sc, 1);
4897 if (res != VKFFT_SUCCESS) return res;
4898 for (uint64_t i = 0; i < sc->min_registers_per_thread; i++) {
4899 if (sc->mergeSequencesR2C) {
4900 if (sc->axisSwapped) {
4901 if (i < ((sc->fftDim / 2 + 1) / sc->localSize[1])) {
4902 sc->tempLen = sprintf(sc->tempStr, " %s.x = sdata[%s + (%s+%" PRIu64 ") * sharedStride].x - sdata[%s + (%s+%" PRIu64 ") * sharedStride].y;\n", sc->regIDs[i + k * sc->registers_per_thread], sc->gl_LocalInvocationID_x, sc->gl_LocalInvocationID_y, i * sc->localSize[1], sc->gl_LocalInvocationID_x, sc->gl_LocalInvocationID_y, i * sc->localSize[1] + (int64_t)ceil(sc->fftDim / 2.0) + (1 - sc->fftDim % 2));
4903 res = VkAppendLine(sc);
4904 if (res != VKFFT_SUCCESS) return res;
4905 sc->tempLen = sprintf(sc->tempStr, " %s.y = sdata[%s + (%s+%" PRIu64 ") * sharedStride].y + sdata[%s + (%s+%" PRIu64 ") * sharedStride].x;\n", sc->regIDs[i + k * sc->registers_per_thread], sc->gl_LocalInvocationID_x, sc->gl_LocalInvocationID_y, i * sc->localSize[1], sc->gl_LocalInvocationID_x, sc->gl_LocalInvocationID_y, i * sc->localSize[1] + (int64_t)ceil(sc->fftDim / 2.0) + (1 - sc->fftDim % 2));
4906 res = VkAppendLine(sc);
4907 if (res != VKFFT_SUCCESS) return res;
4908 }
4909 else {
4910 if (i >= (uint64_t)ceil((sc->fftDim / 2 + 1) / (double)sc->localSize[1])) {
4911 if (((uint64_t)ceil(sc->fftDim / 2.0) - 1 - (sc->localSize[1] - ((sc->fftDim / 2) % sc->localSize[1] + 1))) > (i - ((int64_t)ceil((sc->fftDim / 2 + 1) / (double)sc->localSize[1]))) * sc->localSize[1]) {
4912 if (sc->zeropadBluestein[0]) {
4913 sc->tempLen = sprintf(sc->tempStr, " if(%" PRIu64 " > %s){\n", ((uint64_t)ceil(sc->fftDim / 2.0) - 1 - (sc->localSize[1] - ((sc->fftDim / 2) % sc->localSize[1] + 1))) - (i - ((int64_t)ceil((sc->fftDim / 2 + 1) / (double)sc->localSize[1]))) * sc->localSize[1], sc->gl_LocalInvocationID_y);
4914 res = VkAppendLine(sc);
4915 if (res != VKFFT_SUCCESS) return res;
4916 }
4917 sc->tempLen = sprintf(sc->tempStr, " %s.x = sdata[%s + (%" PRIu64 "-%s) * sharedStride].x + sdata[%s + (%" PRIu64 "-%s) * sharedStride].y;\n", sc->regIDs[i + k * sc->registers_per_thread], sc->gl_LocalInvocationID_x, (uint64_t)ceil(sc->fftDim / 2.0) - 1 - (sc->localSize[1] - ((sc->fftDim / 2) % sc->localSize[1] + 1)) - (i - (int64_t)ceil((sc->fftDim / 2 + 1) / (double)sc->localSize[1])) * sc->localSize[1], sc->gl_LocalInvocationID_y, sc->gl_LocalInvocationID_x, (int64_t)ceil(sc->fftDim / 2.0) + (1 - sc->fftDim % 2) + (uint64_t)ceil(sc->fftDim / 2.0) - 1 - (sc->localSize[1] - ((sc->fftDim / 2) % sc->localSize[1] + 1)) - (i - (int64_t)ceil((sc->fftDim / 2 + 1) / (double)sc->localSize[1])) * sc->localSize[1], sc->gl_LocalInvocationID_y);
4918 res = VkAppendLine(sc);
4919 if (res != VKFFT_SUCCESS) return res;
4920 sc->tempLen = sprintf(sc->tempStr, " %s.y = -sdata[%s + (%" PRIu64 "-%s) * sharedStride].y + sdata[%s + (%" PRIu64 "-%s) * sharedStride].x;\n", sc->regIDs[i + k * sc->registers_per_thread], sc->gl_LocalInvocationID_x, (uint64_t)ceil(sc->fftDim / 2.0) - 1 - (sc->localSize[1] - ((sc->fftDim / 2) % sc->localSize[1] + 1)) - (i - (int64_t)ceil((sc->fftDim / 2 + 1) / (double)sc->localSize[1])) * sc->localSize[1], sc->gl_LocalInvocationID_y, sc->gl_LocalInvocationID_x, (int64_t)ceil(sc->fftDim / 2.0) + (1 - sc->fftDim % 2) + (uint64_t)ceil(sc->fftDim / 2.0) - 1 - (sc->localSize[1] - ((sc->fftDim / 2) % sc->localSize[1] + 1)) - (i - (int64_t)ceil((sc->fftDim / 2 + 1) / (double)sc->localSize[1])) * sc->localSize[1], sc->gl_LocalInvocationID_y);
4921 res = VkAppendLine(sc);
4922 if (res != VKFFT_SUCCESS) return res;
4923 if (sc->zeropadBluestein[0]) {
4924 sc->tempLen = sprintf(sc->tempStr, " }else{\n");
4925 res = VkAppendLine(sc);
4926 if (res != VKFFT_SUCCESS) return res;
4927 sc->tempLen = sprintf(sc->tempStr, " %s.x = 0;\n", sc->regIDs[i + k * sc->registers_per_thread]);
4928 res = VkAppendLine(sc);
4929 if (res != VKFFT_SUCCESS) return res;
4930 sc->tempLen = sprintf(sc->tempStr, " %s.y = 0;\n", sc->regIDs[i + k * sc->registers_per_thread]);
4931 res = VkAppendLine(sc);
4932 if (res != VKFFT_SUCCESS) return res;
4933 sc->tempLen = sprintf(sc->tempStr, " }\n");
4934 res = VkAppendLine(sc);
4935 if (res != VKFFT_SUCCESS) return res;
4936 }
4937 }
4938 else {
4939 sc->tempLen = sprintf(sc->tempStr, " %s.x = 0;\n", sc->regIDs[i + k * sc->registers_per_thread]);
4940 res = VkAppendLine(sc);
4941 if (res != VKFFT_SUCCESS) return res;
4942 sc->tempLen = sprintf(sc->tempStr, " %s.y = 0;\n", sc->regIDs[i + k * sc->registers_per_thread]);
4943 res = VkAppendLine(sc);
4944 if (res != VKFFT_SUCCESS) return res;
4945 }
4946 }
4947 else {
4948 sc->tempLen = sprintf(sc->tempStr, " if(%s < %" PRIu64 "){;\n", sc->gl_LocalInvocationID_y, (sc->fftDim / 2 + 1) % sc->localSize[1]);
4949 res = VkAppendLine(sc);
4950 if (res != VKFFT_SUCCESS) return res;
4951 sc->tempLen = sprintf(sc->tempStr, " %s.x = sdata[%s + (%s+%" PRIu64 ") * sharedStride].x - sdata[%s + (%s+%" PRIu64 ") * sharedStride].y;\n", sc->regIDs[i + k * sc->registers_per_thread], sc->gl_LocalInvocationID_x, sc->gl_LocalInvocationID_y, i * sc->localSize[1], sc->gl_LocalInvocationID_x, sc->gl_LocalInvocationID_y, i * sc->localSize[1] + (int64_t)ceil(sc->fftDim / 2.0) + (1 - sc->fftDim % 2));
4952 res = VkAppendLine(sc);
4953 if (res != VKFFT_SUCCESS) return res;
4954 sc->tempLen = sprintf(sc->tempStr, " %s.y = sdata[%s + (%s+%" PRIu64 ") * sharedStride].y + sdata[%s + (%s+%" PRIu64 ") * sharedStride].x;\n", sc->regIDs[i + k * sc->registers_per_thread], sc->gl_LocalInvocationID_x, sc->gl_LocalInvocationID_y, i * sc->localSize[1], sc->gl_LocalInvocationID_x, sc->gl_LocalInvocationID_y, i * sc->localSize[1] + (int64_t)ceil(sc->fftDim / 2.0) + (1 - sc->fftDim % 2));
4955 res = VkAppendLine(sc);
4956 if (res != VKFFT_SUCCESS) return res;
4957 sc->tempLen = sprintf(sc->tempStr, " }else{\n");
4958 res = VkAppendLine(sc);
4959 if (res != VKFFT_SUCCESS) return res;
4960 sc->tempLen = sprintf(sc->tempStr, " %s.x = sdata[%s + (%" PRIu64 "-%s) * sharedStride].x + sdata[%s + (%" PRIu64 "-%s) * sharedStride].y;\n", sc->regIDs[i + k * sc->registers_per_thread], sc->gl_LocalInvocationID_x, (uint64_t)ceil(sc->fftDim / 2.0) - 1 - (sc->localSize[1] - ((sc->fftDim / 2) % sc->localSize[1] + 1)) - (i - (int64_t)ceil((sc->fftDim / 2 + 1) / (double)sc->localSize[1])) * sc->localSize[1], sc->gl_LocalInvocationID_y, sc->gl_LocalInvocationID_x, (int64_t)ceil(sc->fftDim / 2.0) + (1 - sc->fftDim % 2) + (uint64_t)ceil(sc->fftDim / 2.0) - 1 - (sc->localSize[1] - ((sc->fftDim / 2) % sc->localSize[1] + 1)) - (i - (int64_t)ceil((sc->fftDim / 2 + 1) / (double)sc->localSize[1])) * sc->localSize[1], sc->gl_LocalInvocationID_y);
4961 res = VkAppendLine(sc);
4962 if (res != VKFFT_SUCCESS) return res;
4963 sc->tempLen = sprintf(sc->tempStr, " %s.y = -sdata[%s + (%" PRIu64 "-%s) * sharedStride].y + sdata[%s + (%" PRIu64 "-%s) * sharedStride].x;\n", sc->regIDs[i + k * sc->registers_per_thread], sc->gl_LocalInvocationID_x, (uint64_t)ceil(sc->fftDim / 2.0) - 1 - (sc->localSize[1] - ((sc->fftDim / 2) % sc->localSize[1] + 1)) - (i - (int64_t)ceil((sc->fftDim / 2 + 1) / (double)sc->localSize[1])) * sc->localSize[1], sc->gl_LocalInvocationID_y, sc->gl_LocalInvocationID_x, (int64_t)ceil(sc->fftDim / 2.0) + (1 - sc->fftDim % 2) + (uint64_t)ceil(sc->fftDim / 2.0) - 1 - (sc->localSize[1] - ((sc->fftDim / 2) % sc->localSize[1] + 1)) - (i - (int64_t)ceil((sc->fftDim / 2 + 1) / (double)sc->localSize[1])) * sc->localSize[1], sc->gl_LocalInvocationID_y);
4964 res = VkAppendLine(sc);
4965 if (res != VKFFT_SUCCESS) return res;
4966 sc->tempLen = sprintf(sc->tempStr, " }\n");
4967 res = VkAppendLine(sc);
4968 if (res != VKFFT_SUCCESS) return res;
4969 }
4970 }
4971 }
4972 else {
4973 if (i < ((sc->fftDim / 2 + 1) / sc->localSize[0])) {
4974 sc->tempLen = sprintf(sc->tempStr, " %s.x = sdata[%s * sharedStride + (%s+%" PRIu64 ")].x - sdata[%s * sharedStride + (%s+%" PRIu64 ")].y;\n", sc->regIDs[i + k * sc->registers_per_thread], sc->gl_LocalInvocationID_y, sc->gl_LocalInvocationID_x, i * sc->localSize[0], sc->gl_LocalInvocationID_y, sc->gl_LocalInvocationID_x, i * sc->localSize[0] + (int64_t)ceil(sc->fftDim / 2.0) + (1 - sc->fftDim % 2));
4975 res = VkAppendLine(sc);
4976 if (res != VKFFT_SUCCESS) return res;
4977 sc->tempLen = sprintf(sc->tempStr, " %s.y = sdata[%s * sharedStride + (%s+%" PRIu64 ")].y + sdata[%s * sharedStride + (%s+%" PRIu64 ")].x;\n", sc->regIDs[i + k * sc->registers_per_thread], sc->gl_LocalInvocationID_y, sc->gl_LocalInvocationID_x, i * sc->localSize[0], sc->gl_LocalInvocationID_y, sc->gl_LocalInvocationID_x, i * sc->localSize[0] + (int64_t)ceil(sc->fftDim / 2.0) + (1 - sc->fftDim % 2));
4978 res = VkAppendLine(sc);
4979 if (res != VKFFT_SUCCESS) return res;
4980 }
4981 else {
4982 if (i >= (uint64_t)ceil((sc->fftDim / 2 + 1) / (double)sc->localSize[0])) {
4983 if (((uint64_t)ceil(sc->fftDim / 2.0) - 1 - (sc->localSize[0] - ((sc->fftDim / 2) % sc->localSize[0] + 1))) > (i - ((int64_t)ceil((sc->fftDim / 2 + 1) / (double)sc->localSize[0]))) * sc->localSize[0]) {
4984 if (sc->zeropadBluestein[0]) {
4985 sc->tempLen = sprintf(sc->tempStr, " if(%" PRIu64 " > %s){\n", ((uint64_t)ceil(sc->fftDim / 2.0) - 1 - (sc->localSize[0] - ((sc->fftDim / 2) % sc->localSize[0] + 1))) - (i - ((int64_t)ceil((sc->fftDim / 2 + 1) / (double)sc->localSize[0]))) * sc->localSize[0], sc->gl_LocalInvocationID_x);
4986 res = VkAppendLine(sc);
4987 if (res != VKFFT_SUCCESS) return res;
4988 }
4989 sc->tempLen = sprintf(sc->tempStr, " %s.x = sdata[%s * sharedStride + (%" PRIu64 "-%s)].x + sdata[%s * sharedStride + (%" PRIu64 "-%s)].y;\n", sc->regIDs[i + k * sc->registers_per_thread], sc->gl_LocalInvocationID_y, (uint64_t)ceil(sc->fftDim / 2.0) - 1 - (sc->localSize[0] - ((sc->fftDim / 2) % sc->localSize[0] + 1)) - (i - (int64_t)ceil((sc->fftDim / 2 + 1) / (double)sc->localSize[0])) * sc->localSize[0], sc->gl_LocalInvocationID_x, sc->gl_LocalInvocationID_y, (int64_t)ceil(sc->fftDim / 2.0) + (1 - sc->fftDim % 2) + (uint64_t)ceil(sc->fftDim / 2.0) - 1 - (sc->localSize[0] - ((sc->fftDim / 2) % sc->localSize[0] + 1)) - (i - (int64_t)ceil((sc->fftDim / 2 + 1) / (double)sc->localSize[0])) * sc->localSize[0], sc->gl_LocalInvocationID_x);
4990 res = VkAppendLine(sc);
4991 if (res != VKFFT_SUCCESS) return res;
4992 sc->tempLen = sprintf(sc->tempStr, " %s.y = -sdata[%s * sharedStride + (%" PRIu64 "-%s)].y + sdata[%s * sharedStride + (%" PRIu64 "-%s)].x;\n", sc->regIDs[i + k * sc->registers_per_thread], sc->gl_LocalInvocationID_y, (uint64_t)ceil(sc->fftDim / 2.0) - 1 - (sc->localSize[0] - ((sc->fftDim / 2) % sc->localSize[0] + 1)) - (i - (int64_t)ceil((sc->fftDim / 2 + 1) / (double)sc->localSize[0])) * sc->localSize[0], sc->gl_LocalInvocationID_x, sc->gl_LocalInvocationID_y, (int64_t)ceil(sc->fftDim / 2.0) + (1 - sc->fftDim % 2) + (uint64_t)ceil(sc->fftDim / 2.0) - 1 - (sc->localSize[0] - ((sc->fftDim / 2) % sc->localSize[0] + 1)) - (i - (int64_t)ceil((sc->fftDim / 2 + 1) / (double)sc->localSize[0])) * sc->localSize[0], sc->gl_LocalInvocationID_x);
4993 res = VkAppendLine(sc);
4994 if (res != VKFFT_SUCCESS) return res;
4995 if (sc->zeropadBluestein[0]) {
4996 sc->tempLen = sprintf(sc->tempStr, " }else{\n");
4997 res = VkAppendLine(sc);
4998 if (res != VKFFT_SUCCESS) return res;
4999 sc->tempLen = sprintf(sc->tempStr, " %s.x = 0;\n", sc->regIDs[i + k * sc->registers_per_thread]);
5000 res = VkAppendLine(sc);
5001 if (res != VKFFT_SUCCESS) return res;
5002 sc->tempLen = sprintf(sc->tempStr, " %s.y = 0;\n", sc->regIDs[i + k * sc->registers_per_thread]);
5003 res = VkAppendLine(sc);
5004 if (res != VKFFT_SUCCESS) return res;
5005 sc->tempLen = sprintf(sc->tempStr, " }\n");
5006 res = VkAppendLine(sc);
5007 if (res != VKFFT_SUCCESS) return res;
5008 }
5009 }
5010 else {
5011 sc->tempLen = sprintf(sc->tempStr, " %s.x = 0;\n", sc->regIDs[i + k * sc->registers_per_thread]);
5012 res = VkAppendLine(sc);
5013 if (res != VKFFT_SUCCESS) return res;
5014 sc->tempLen = sprintf(sc->tempStr, " %s.y = 0;\n", sc->regIDs[i + k * sc->registers_per_thread]);
5015 res = VkAppendLine(sc);
5016 if (res != VKFFT_SUCCESS) return res;
5017 }
5018 }
5019 else {
5020 sc->tempLen = sprintf(sc->tempStr, " if(%s < %" PRIu64 "){;\n", sc->gl_LocalInvocationID_x, (sc->fftDim / 2 + 1) % sc->localSize[0]);
5021 res = VkAppendLine(sc);
5022 if (res != VKFFT_SUCCESS) return res;
5023 sc->tempLen = sprintf(sc->tempStr, " %s.x = sdata[%s * sharedStride + (%s+%" PRIu64 ")].x - sdata[%s * sharedStride + (%s+%" PRIu64 ")].y;\n", sc->regIDs[i + k * sc->registers_per_thread], sc->gl_LocalInvocationID_y, sc->gl_LocalInvocationID_x, i * sc->localSize[0], sc->gl_LocalInvocationID_y, sc->gl_LocalInvocationID_x, i * sc->localSize[0] + (int64_t)ceil(sc->fftDim / 2.0) + (1 - sc->fftDim % 2));
5024 res = VkAppendLine(sc);
5025 if (res != VKFFT_SUCCESS) return res;
5026 sc->tempLen = sprintf(sc->tempStr, " %s.y = sdata[%s * sharedStride + (%s+%" PRIu64 ")].y + sdata[%s * sharedStride + (%s+%" PRIu64 ")].x;\n", sc->regIDs[i + k * sc->registers_per_thread], sc->gl_LocalInvocationID_y, sc->gl_LocalInvocationID_x, i * sc->localSize[0], sc->gl_LocalInvocationID_y, sc->gl_LocalInvocationID_x, i * sc->localSize[0] + (int64_t)ceil(sc->fftDim / 2.0) + (1 - sc->fftDim % 2));
5027 res = VkAppendLine(sc);
5028 if (res != VKFFT_SUCCESS) return res;
5029 sc->tempLen = sprintf(sc->tempStr, " }else{\n");
5030 res = VkAppendLine(sc);
5031 if (res != VKFFT_SUCCESS) return res;
5032 sc->tempLen = sprintf(sc->tempStr, " %s.x = sdata[%s * sharedStride + (%" PRIu64 "-%s)].x + sdata[%s * sharedStride + (%" PRIu64 "-%s)].y;\n", sc->regIDs[i + k * sc->registers_per_thread], sc->gl_LocalInvocationID_y, (uint64_t)ceil(sc->fftDim / 2.0) - 1 - (sc->localSize[0] - ((sc->fftDim / 2) % sc->localSize[0] + 1)) - (i - (int64_t)ceil((sc->fftDim / 2 + 1) / (double)sc->localSize[0])) * sc->localSize[0], sc->gl_LocalInvocationID_x, sc->gl_LocalInvocationID_y, (int64_t)ceil(sc->fftDim / 2.0) + (1 - sc->fftDim % 2) + (uint64_t)ceil(sc->fftDim / 2.0) - 1 - (sc->localSize[0] - ((sc->fftDim / 2) % sc->localSize[0] + 1)) - (i - (int64_t)ceil((sc->fftDim / 2 + 1) / (double)sc->localSize[0])) * sc->localSize[0], sc->gl_LocalInvocationID_x);
5033 res = VkAppendLine(sc);
5034 if (res != VKFFT_SUCCESS) return res;
5035 sc->tempLen = sprintf(sc->tempStr, " %s.y = -sdata[%s * sharedStride + (%" PRIu64 "-%s)].y + sdata[%s * sharedStride + (%" PRIu64 "-%s)].x;\n", sc->regIDs[i + k * sc->registers_per_thread], sc->gl_LocalInvocationID_y, (uint64_t)ceil(sc->fftDim / 2.0) - 1 - (sc->localSize[0] - ((sc->fftDim / 2) % sc->localSize[0] + 1)) - (i - (int64_t)ceil((sc->fftDim / 2 + 1) / (double)sc->localSize[0])) * sc->localSize[0], sc->gl_LocalInvocationID_x, sc->gl_LocalInvocationID_y, (int64_t)ceil(sc->fftDim / 2.0) + (1 - sc->fftDim % 2) + (uint64_t)ceil(sc->fftDim / 2.0) - 1 - (sc->localSize[0] - ((sc->fftDim / 2) % sc->localSize[0] + 1)) - (i - (int64_t)ceil((sc->fftDim / 2 + 1) / (double)sc->localSize[0])) * sc->localSize[0], sc->gl_LocalInvocationID_x);
5036 res = VkAppendLine(sc);
5037 if (res != VKFFT_SUCCESS) return res;
5038 sc->tempLen = sprintf(sc->tempStr, " }\n");
5039 res = VkAppendLine(sc);
5040 if (res != VKFFT_SUCCESS) return res;
5041 }
5042 }
5043 }
5044 }
5045 else {
5046 if (sc->axisSwapped) {
5047 if (i < ((sc->fftDim / 2 + 1) / sc->localSize[1])) {
5048 sc->tempLen = sprintf(sc->tempStr, " %s.x = sdata[%s + (%s+%" PRIu64 ") * sharedStride].x;\n", sc->regIDs[i + k * sc->registers_per_thread], sc->gl_LocalInvocationID_x, sc->gl_LocalInvocationID_y, i * sc->localSize[1]);
5049 res = VkAppendLine(sc);
5050 if (res != VKFFT_SUCCESS) return res;
5051 sc->tempLen = sprintf(sc->tempStr, " %s.y = sdata[%s + (%s+%" PRIu64 ") * sharedStride].y;\n", sc->regIDs[i + k * sc->registers_per_thread], sc->gl_LocalInvocationID_x, sc->gl_LocalInvocationID_y, i * sc->localSize[1]);
5052 res = VkAppendLine(sc);
5053 if (res != VKFFT_SUCCESS) return res;
5054 }
5055 else {
5056 if (i >= (uint64_t)ceil((sc->fftDim / 2 + 1) / (double)sc->localSize[1])) {
5057 if (((uint64_t)ceil(sc->fftDim / 2.0) - 1 - (sc->localSize[1] - ((sc->fftDim / 2) % sc->localSize[1] + 1))) > (i - ((int64_t)ceil((sc->fftDim / 2 + 1) / (double)sc->localSize[1]))) * sc->localSize[1]) {
5058 if (sc->zeropadBluestein[0]) {
5059 sc->tempLen = sprintf(sc->tempStr, " if(%" PRIu64 " > %s){\n", ((uint64_t)ceil(sc->fftDim / 2.0) - 1 - (sc->localSize[1] - ((sc->fftDim / 2) % sc->localSize[1] + 1))) - (i - ((int64_t)ceil((sc->fftDim / 2 + 1) / (double)sc->localSize[1]))) * sc->localSize[1], sc->gl_LocalInvocationID_y);
5060 res = VkAppendLine(sc);
5061 if (res != VKFFT_SUCCESS) return res;
5062 }
5063 sc->tempLen = sprintf(sc->tempStr, " %s.x = sdata[%s + (%" PRIu64 "-%s) * sharedStride].x;\n", sc->regIDs[i + k * sc->registers_per_thread], sc->gl_LocalInvocationID_x, (uint64_t)ceil(sc->fftDim / 2.0) - 1 - (sc->localSize[1] - ((sc->fftDim / 2) % sc->localSize[1] + 1)) - (i - (int64_t)ceil((sc->fftDim / 2 + 1) / (double)sc->localSize[1])) * sc->localSize[1], sc->gl_LocalInvocationID_y);
5064 res = VkAppendLine(sc);
5065 if (res != VKFFT_SUCCESS) return res;
5066 sc->tempLen = sprintf(sc->tempStr, " %s.y = -sdata[%s + (%" PRIu64 "-%s) * sharedStride].y;\n", sc->regIDs[i + k * sc->registers_per_thread], sc->gl_LocalInvocationID_x, (uint64_t)ceil(sc->fftDim / 2.0) - 1 - (sc->localSize[1] - ((sc->fftDim / 2) % sc->localSize[1] + 1)) - (i - (int64_t)ceil((sc->fftDim / 2 + 1) / (double)sc->localSize[1])) * sc->localSize[1], sc->gl_LocalInvocationID_y);
5067 res = VkAppendLine(sc);
5068 if (res != VKFFT_SUCCESS) return res;
5069 if (sc->zeropadBluestein[0]) {
5070 sc->tempLen = sprintf(sc->tempStr, " }else{\n");
5071 res = VkAppendLine(sc);
5072 if (res != VKFFT_SUCCESS) return res;
5073 sc->tempLen = sprintf(sc->tempStr, " %s.x = 0;\n", sc->regIDs[i + k * sc->registers_per_thread]);
5074 res = VkAppendLine(sc);
5075 if (res != VKFFT_SUCCESS) return res;
5076 sc->tempLen = sprintf(sc->tempStr, " %s.y = 0;\n", sc->regIDs[i + k * sc->registers_per_thread]);
5077 res = VkAppendLine(sc);
5078 if (res != VKFFT_SUCCESS) return res;
5079 sc->tempLen = sprintf(sc->tempStr, " }\n");
5080 res = VkAppendLine(sc);
5081 if (res != VKFFT_SUCCESS) return res;
5082 }
5083 }
5084 else {
5085 sc->tempLen = sprintf(sc->tempStr, " %s.x = 0;\n", sc->regIDs[i + k * sc->registers_per_thread]);
5086 res = VkAppendLine(sc);
5087 if (res != VKFFT_SUCCESS) return res;
5088 sc->tempLen = sprintf(sc->tempStr, " %s.y = 0;\n", sc->regIDs[i + k * sc->registers_per_thread]);
5089 res = VkAppendLine(sc);
5090 if (res != VKFFT_SUCCESS) return res;
5091 }
5092 }
5093 else {
5094 sc->tempLen = sprintf(sc->tempStr, " if(%s < %" PRIu64 "){;\n", sc->gl_LocalInvocationID_y, (sc->fftDim / 2 + 1) % sc->localSize[1]);
5095 res = VkAppendLine(sc);
5096 if (res != VKFFT_SUCCESS) return res;
5097 sc->tempLen = sprintf(sc->tempStr, " %s.x = sdata[%s + (%s+%" PRIu64 ") * sharedStride].x;\n", sc->regIDs[i + k * sc->registers_per_thread], sc->gl_LocalInvocationID_x, sc->gl_LocalInvocationID_y, i * sc->localSize[1]);
5098 res = VkAppendLine(sc);
5099 if (res != VKFFT_SUCCESS) return res;
5100 sc->tempLen = sprintf(sc->tempStr, " %s.y = sdata[%s + (%s+%" PRIu64 ") * sharedStride].y;\n", sc->regIDs[i + k * sc->registers_per_thread], sc->gl_LocalInvocationID_x, sc->gl_LocalInvocationID_y, i * sc->localSize[1]);
5101 res = VkAppendLine(sc);
5102 if (res != VKFFT_SUCCESS) return res;
5103 sc->tempLen = sprintf(sc->tempStr, " }else{\n");
5104 res = VkAppendLine(sc);
5105 if (res != VKFFT_SUCCESS) return res;
5106 sc->tempLen = sprintf(sc->tempStr, " %s.x = sdata[%s + (%" PRIu64 "-%s) * sharedStride].x;\n", sc->regIDs[i + k * sc->registers_per_thread], sc->gl_LocalInvocationID_x, (uint64_t)ceil(sc->fftDim / 2.0) - 1 + (sc->fftDim / 2 + 1) % sc->localSize[1], sc->gl_LocalInvocationID_y);
5107 res = VkAppendLine(sc);
5108 if (res != VKFFT_SUCCESS) return res;
5109 sc->tempLen = sprintf(sc->tempStr, " %s.y = -sdata[%s + (%" PRIu64 "-%s) * sharedStride].y;\n", sc->regIDs[i + k * sc->registers_per_thread], sc->gl_LocalInvocationID_x, (uint64_t)ceil(sc->fftDim / 2.0) - 1 + (sc->fftDim / 2 + 1) % sc->localSize[1], sc->gl_LocalInvocationID_y);
5110 res = VkAppendLine(sc);
5111 if (res != VKFFT_SUCCESS) return res;
5112 sc->tempLen = sprintf(sc->tempStr, " }\n");
5113 res = VkAppendLine(sc);
5114 if (res != VKFFT_SUCCESS) return res;
5115 }
5116 }
5117 }
5118 else {
5119 if (i < ((sc->fftDim / 2 + 1) / sc->localSize[0])) {
5120 sc->tempLen = sprintf(sc->tempStr, " %s.x = sdata[%s * sharedStride + (%s+%" PRIu64 ")].x;\n", sc->regIDs[i + k * sc->registers_per_thread], sc->gl_LocalInvocationID_y, sc->gl_LocalInvocationID_x, i * sc->localSize[0]);
5121 res = VkAppendLine(sc);
5122 if (res != VKFFT_SUCCESS) return res;
5123 sc->tempLen = sprintf(sc->tempStr, " %s.y = sdata[%s * sharedStride + (%s+%" PRIu64 ")].y;\n", sc->regIDs[i + k * sc->registers_per_thread], sc->gl_LocalInvocationID_y, sc->gl_LocalInvocationID_x, i * sc->localSize[0]);
5124 res = VkAppendLine(sc);
5125 if (res != VKFFT_SUCCESS) return res;
5126 }
5127 else {
5128 if (i >= (uint64_t)ceil((sc->fftDim / 2 + 1) / (double)sc->localSize[0])) {
5129 if (((uint64_t)ceil(sc->fftDim / 2.0) - 1 - (sc->localSize[0] - ((sc->fftDim / 2) % sc->localSize[0] + 1))) > (i - ((int64_t)ceil((sc->fftDim / 2 + 1) / (double)sc->localSize[0]))) * sc->localSize[0]) {
5130 if (sc->zeropadBluestein[0]) {
5131 sc->tempLen = sprintf(sc->tempStr, " if(%" PRIu64 " > %s){\n", ((uint64_t)ceil(sc->fftDim / 2.0) - 1 - (sc->localSize[0] - ((sc->fftDim / 2) % sc->localSize[0] + 1))) - (i - ((int64_t)ceil((sc->fftDim / 2 + 1) / (double)sc->localSize[0]))) * sc->localSize[0], sc->gl_LocalInvocationID_x);
5132 res = VkAppendLine(sc);
5133 if (res != VKFFT_SUCCESS) return res;
5134 }
5135 sc->tempLen = sprintf(sc->tempStr, " %s.x = sdata[%s * sharedStride + (%" PRIu64 "-%s)].x;\n", sc->regIDs[i + k * sc->registers_per_thread], sc->gl_LocalInvocationID_y, (uint64_t)ceil(sc->fftDim / 2.0) - 1 - (sc->localSize[0] - ((sc->fftDim / 2) % sc->localSize[0] + 1)) - (i - (int64_t)ceil((sc->fftDim / 2 + 1) / (double)sc->localSize[0])) * sc->localSize[0], sc->gl_LocalInvocationID_x);
5136 res = VkAppendLine(sc);
5137 if (res != VKFFT_SUCCESS) return res;
5138 sc->tempLen = sprintf(sc->tempStr, " %s.y = -sdata[%s * sharedStride + (%" PRIu64 "-%s)].y;\n", sc->regIDs[i + k * sc->registers_per_thread], sc->gl_LocalInvocationID_y, (uint64_t)ceil(sc->fftDim / 2.0) - 1 - (sc->localSize[0] - ((sc->fftDim / 2) % sc->localSize[0] + 1)) - (i - (int64_t)ceil((sc->fftDim / 2 + 1) / (double)sc->localSize[0])) * sc->localSize[0], sc->gl_LocalInvocationID_x);
5139 res = VkAppendLine(sc);
5140 if (res != VKFFT_SUCCESS) return res;
5141 if (sc->zeropadBluestein[0]) {
5142 sc->tempLen = sprintf(sc->tempStr, " }else{\n");
5143 res = VkAppendLine(sc);
5144 if (res != VKFFT_SUCCESS) return res;
5145 sc->tempLen = sprintf(sc->tempStr, " %s.x = 0;\n", sc->regIDs[i + k * sc->registers_per_thread]);
5146 res = VkAppendLine(sc);
5147 if (res != VKFFT_SUCCESS) return res;
5148 sc->tempLen = sprintf(sc->tempStr, " %s.y = 0;\n", sc->regIDs[i + k * sc->registers_per_thread]);
5149 res = VkAppendLine(sc);
5150 if (res != VKFFT_SUCCESS) return res;
5151 sc->tempLen = sprintf(sc->tempStr, " }\n");
5152 res = VkAppendLine(sc);
5153 if (res != VKFFT_SUCCESS) return res;
5154 }
5155 }
5156 else {
5157 sc->tempLen = sprintf(sc->tempStr, " %s.x = 0;\n", sc->regIDs[i + k * sc->registers_per_thread]);
5158 res = VkAppendLine(sc);
5159 if (res != VKFFT_SUCCESS) return res;
5160 sc->tempLen = sprintf(sc->tempStr, " %s.y = 0;\n", sc->regIDs[i + k * sc->registers_per_thread]);
5161 res = VkAppendLine(sc);
5162 if (res != VKFFT_SUCCESS) return res;
5163 }
5164 }
5165 else {
5166 sc->tempLen = sprintf(sc->tempStr, " if(%s < %" PRIu64 "){;\n", sc->gl_LocalInvocationID_x, (sc->fftDim / 2 + 1) % sc->localSize[0]);
5167 res = VkAppendLine(sc);
5168 if (res != VKFFT_SUCCESS) return res;
5169 sc->tempLen = sprintf(sc->tempStr, " %s.x = sdata[%s * sharedStride + (%s+%" PRIu64 ")].x;\n", sc->regIDs[i + k * sc->registers_per_thread], sc->gl_LocalInvocationID_y, sc->gl_LocalInvocationID_x, i * sc->localSize[0]);
5170 res = VkAppendLine(sc);
5171 if (res != VKFFT_SUCCESS) return res;
5172 sc->tempLen = sprintf(sc->tempStr, " %s.y = sdata[%s * sharedStride + (%s+%" PRIu64 ")].y;\n", sc->regIDs[i + k * sc->registers_per_thread], sc->gl_LocalInvocationID_y, sc->gl_LocalInvocationID_x, i * sc->localSize[0]);
5173 res = VkAppendLine(sc);
5174 if (res != VKFFT_SUCCESS) return res;
5175 sc->tempLen = sprintf(sc->tempStr, " }else{\n");
5176 res = VkAppendLine(sc);
5177 if (res != VKFFT_SUCCESS) return res;
5178 sc->tempLen = sprintf(sc->tempStr, " %s.x = sdata[%s * sharedStride + (%" PRIu64 "-%s)].x;\n", sc->regIDs[i + k * sc->registers_per_thread], sc->gl_LocalInvocationID_y, (uint64_t)ceil(sc->fftDim / 2.0) - 1 + (sc->fftDim / 2 + 1) % sc->localSize[0], sc->gl_LocalInvocationID_x);
5179 res = VkAppendLine(sc);
5180 if (res != VKFFT_SUCCESS) return res;
5181 sc->tempLen = sprintf(sc->tempStr, " %s.y = -sdata[%s * sharedStride + (%" PRIu64 "-%s)].y;\n", sc->regIDs[i + k * sc->registers_per_thread], sc->gl_LocalInvocationID_y, (uint64_t)ceil(sc->fftDim / 2.0) - 1 + (sc->fftDim / 2 + 1) % sc->localSize[0], sc->gl_LocalInvocationID_x);
5182 res = VkAppendLine(sc);
5183 if (res != VKFFT_SUCCESS) return res;
5184 sc->tempLen = sprintf(sc->tempStr, " }\n");
5185 res = VkAppendLine(sc);
5186 if (res != VKFFT_SUCCESS) return res;
5187 }
5188 }
5189 }
5190
5191 }
5192 }
5193 }
5194 //sc->readToRegisters = 1;
5195 if (sc->zeropadBluestein[0]) sc->fftDim = sc->fft_dim_full;
5196 }
5197 else {
5198
5199 }
5200 sc->tempLen = sprintf(sc->tempStr, " }\n");
5201 res = VkAppendLine(sc);
5202 if (res != VKFFT_SUCCESS) return res;
5203 break;
5204 }
5205 case 110://DCT-I nonstrided
5206 {
5207 char shiftX[500] = "";
5208 if (sc->performWorkGroupShift[0])
5209 sprintf(shiftX, " + consts.workGroupShiftX ");
5210 char shiftY[500] = "";
5211 if (sc->performWorkGroupShift[1])
5212 sprintf(shiftY, " + consts.workGroupShiftY ");
5213 uint64_t mult = (sc->mergeSequencesR2C) ? 2 : 1;
5214 if (sc->fftDim == sc->fft_dim_full) {
5215 if (sc->zeropadBluestein[0]) {
5216 res = appendSetSMToZero(sc, floatType, floatTypeMemory, uintType, readType);
5217 if (res != VKFFT_SUCCESS) return res;
5218 res = appendBarrierVkFFT(sc, 1);
5219 if (res != VKFFT_SUCCESS) return res;
5221 }
5222 sc->fftDim = (sc->fftDim + 2)/2;
5223 uint64_t num_in = (sc->axisSwapped) ? (uint64_t)ceil((sc->fftDim) / (double)sc->localSize[1]) : (uint64_t)ceil((sc->fftDim) / (double)sc->localSize[0]);
5224 for (uint64_t k = 0; k < sc->registerBoost; k++) {
5225 for (uint64_t i = 0; i < num_in; i++) {
5226
5227 if (sc->localSize[1] == 1)
5228 sc->tempLen = sprintf(sc->tempStr, " combinedID = %s + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, (i + k * num_in) * sc->localSize[0]);
5229 else
5230 sc->tempLen = sprintf(sc->tempStr, " combinedID = (%s + %" PRIu64 " * %s) + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, sc->localSize[0], sc->gl_LocalInvocationID_y, (i + k * num_in) * sc->localSize[0] * sc->localSize[1]);
5231 res = VkAppendLine(sc);
5232 if (res != VKFFT_SUCCESS) return res;
5233 if (sc->inputStride[0] > 1)
5234 sc->tempLen = sprintf(sc->tempStr, " inoutID = (combinedID %% %" PRIu64 ") * %" PRIu64 " + (combinedID / %" PRIu64 ") * %" PRIu64 ";\n", sc->fftDim, sc->inputStride[0], sc->fftDim, mult * sc->inputStride[1]);
5235 else
5236 sc->tempLen = sprintf(sc->tempStr, " inoutID = (combinedID %% %" PRIu64 ") + (combinedID / %" PRIu64 ") * %" PRIu64 ";\n", sc->fftDim, sc->fftDim, mult * sc->inputStride[1]);
5237 res = VkAppendLine(sc);
5238 if (res != VKFFT_SUCCESS) return res;
5239 if (sc->axisSwapped) {
5240 if ((uint64_t)ceil(sc->size[1] / (double)mult) % sc->localSize[0] != 0) {
5241 sc->tempLen = sprintf(sc->tempStr, " if(combinedID / %" PRIu64 " + (%s%s)*%" PRIu64 "< %" PRIu64 "){", sc->fftDim, sc->gl_WorkGroupID_y, shiftY, sc->localSize[0], (uint64_t)ceil(sc->size[1] / (double)mult));
5242 res = VkAppendLine(sc);
5243 if (res != VKFFT_SUCCESS) return res;
5244 }
5245 if (sc->zeropadBluestein[0]) {
5246 sc->tempLen = sprintf(sc->tempStr, " if(combinedID < %" PRIu64 "){\n", sc->fftDim * sc->localSize[0]);
5247 res = VkAppendLine(sc);
5248 if (res != VKFFT_SUCCESS) return res;
5249 }
5250 if ((1 + i + k * num_in) * sc->localSize[1] >= (sc->fftDim)) {
5251 sc->tempLen = sprintf(sc->tempStr, " if(combinedID < %" PRIu64 "){\n", (sc->fftDim) * sc->localSize[0]);
5252 res = VkAppendLine(sc);
5253 if (res != VKFFT_SUCCESS) return res;
5254 }
5255 }
5256 else {
5257 if ((uint64_t)ceil(sc->size[1] / (double)mult) % sc->localSize[1] != 0) {
5258 sc->tempLen = sprintf(sc->tempStr, " if(combinedID / %" PRIu64 " + (%s%s)*%" PRIu64 "< %" PRIu64 "){", sc->fftDim, sc->gl_WorkGroupID_y, shiftY, sc->localSize[1], (uint64_t)ceil(sc->size[1] / (double)mult));
5259 res = VkAppendLine(sc);
5260 if (res != VKFFT_SUCCESS) return res;
5261 }
5262 if (sc->zeropadBluestein[0]) {
5263 sc->tempLen = sprintf(sc->tempStr, " if(combinedID < %" PRIu64 "){\n", sc->fftDim * sc->localSize[1]);
5264 res = VkAppendLine(sc);
5265 if (res != VKFFT_SUCCESS) return res;
5266 }
5267 if ((1 + i + k * num_in) * sc->localSize[0] >= (sc->fftDim)) {
5268 sc->tempLen = sprintf(sc->tempStr, " if(combinedID < %" PRIu64 "){\n", (sc->fftDim) * sc->localSize[1]);
5269 res = VkAppendLine(sc);
5270 if (res != VKFFT_SUCCESS) return res;
5271 }
5272 }
5273 if (sc->zeropad[0]) {
5274 sc->tempLen = sprintf(sc->tempStr, " if((inoutID %% %" PRIu64 " < %" PRIu64 ")||(inoutID %% %" PRIu64 " >= %" PRIu64 ")){\n", sc->inputStride[1], sc->fft_zeropad_left_read[sc->axis_id], sc->inputStride[1], sc->fft_zeropad_right_read[sc->axis_id]);
5275 res = VkAppendLine(sc);
5276 if (res != VKFFT_SUCCESS) return res;
5277 }
5278 sc->tempLen = sprintf(sc->tempStr, " %s = ", sc->inoutID);
5279 res = VkAppendLine(sc);
5280 if (res != VKFFT_SUCCESS) return res;
5281 indexInputVkFFT(sc, uintType, readType, sc->inoutID, 0, requestCoordinate, requestBatch);
5282 if (res != VKFFT_SUCCESS) return res;
5283 sc->tempLen = sprintf(sc->tempStr, ";\n");
5284 res = VkAppendLine(sc);
5285 if (res != VKFFT_SUCCESS) return res;
5287 if (res != VKFFT_SUCCESS) return res;
5288 if (sc->axisSwapped) {
5289 sc->tempLen = sprintf(sc->tempStr, " sdataID = (combinedID %% %" PRIu64 ") * sharedStride + (combinedID / %" PRIu64 ");\n", sc->fftDim, sc->fftDim);
5290 res = VkAppendLine(sc);
5291 if (res != VKFFT_SUCCESS) return res;
5292
5293 if (sc->inputBufferBlockNum == 1)
5294 sc->tempLen = sprintf(sc->tempStr, " sdata[sdataID].x = %s%s[%s]%s;\n", convTypeLeft, inputsStruct, sc->inoutID, convTypeRight);
5295 else
5296 sc->tempLen = sprintf(sc->tempStr, " sdata[sdataID].x = %sinputBlocks[%s / %" PRIu64 "]%s[%s %% %" PRIu64 "]%s;\n", convTypeLeft, sc->inoutID, sc->inputBufferBlockSize, inputsStruct, sc->inoutID, sc->inputBufferBlockSize, convTypeRight);
5297 res = VkAppendLine(sc);
5298 if (res != VKFFT_SUCCESS) return res;
5299
5300 if (sc->mergeSequencesR2C) {
5301 sc->tempLen = sprintf(sc->tempStr, " inoutID += %" PRIu64 ";\n", sc->inputStride[1]);
5302 res = VkAppendLine(sc);
5303 if (res != VKFFT_SUCCESS) return res;
5304
5305 if (sc->inputBufferBlockNum == 1)
5306 sc->tempLen = sprintf(sc->tempStr, " sdata[sdataID].y = %s%s[inoutID]%s;\n", convTypeLeft, inputsStruct, convTypeRight);
5307 else
5308 sc->tempLen = sprintf(sc->tempStr, " sdata[sdataID].y = %sinputBlocks[inoutID / %" PRIu64 "]%s[inoutID %% %" PRIu64 "]%s;\n", convTypeLeft, sc->inputBufferBlockSize, inputsStruct, sc->inputBufferBlockSize, convTypeRight);
5309 res = VkAppendLine(sc);
5310 if (res != VKFFT_SUCCESS) return res;
5311 }
5312 else {
5313 if (sc->inputBufferBlockNum == 1)
5314 sc->tempLen = sprintf(sc->tempStr, " sdata[sdataID].y = 0;\n");
5315 else
5316 sc->tempLen = sprintf(sc->tempStr, " sdata[sdataID].y = 0;\n");
5317 res = VkAppendLine(sc);
5318 if (res != VKFFT_SUCCESS) return res;
5319 }
5320 sc->tempLen = sprintf(sc->tempStr, " if (((combinedID %% %" PRIu64 ")>0)&&((combinedID %% %" PRIu64 ") < %" PRIu64 ")){\n", sc->fftDim, sc->fftDim, sc->fftDim-1);
5321 res = VkAppendLine(sc);
5322 if (res != VKFFT_SUCCESS) return res;
5323 sc->tempLen = sprintf(sc->tempStr, " inoutID = (%" PRIu64 " - combinedID %% %" PRIu64 ") * sharedStride + (combinedID / %" PRIu64 ");\n", 2*sc->fftDim - 2, sc->fftDim, sc->fftDim);
5324 res = VkAppendLine(sc);
5325 if (res != VKFFT_SUCCESS) return res;
5326 sc->tempLen = sprintf(sc->tempStr, " sdata[inoutID] = sdata[sdataID];\n");
5327 res = VkAppendLine(sc);
5328 if (res != VKFFT_SUCCESS) return res;
5329 sc->tempLen = sprintf(sc->tempStr, " }\n");
5330 res = VkAppendLine(sc);
5331 if (res != VKFFT_SUCCESS) return res;
5332 }
5333 else {
5334 sc->tempLen = sprintf(sc->tempStr, " sdataID = (combinedID %% %" PRIu64 ") + (combinedID / %" PRIu64 ") * sharedStride;\n", sc->fftDim, sc->fftDim);
5335 res = VkAppendLine(sc);
5336 if (res != VKFFT_SUCCESS) return res;
5337 if (sc->inputBufferBlockNum == 1)
5338 sc->tempLen = sprintf(sc->tempStr, " sdata[sdataID].x = %s%s[inoutID]%s;\n", convTypeLeft, inputsStruct, convTypeRight);
5339 else
5340 sc->tempLen = sprintf(sc->tempStr, " sdata[sdataID].x = %sinputBlocks[inoutID / %" PRIu64 "]%s[inoutID %% %" PRIu64 "]%s;\n", convTypeLeft, sc->inputBufferBlockSize, inputsStruct, sc->inputBufferBlockSize, convTypeRight);
5341 res = VkAppendLine(sc);
5342 if (res != VKFFT_SUCCESS) return res;
5343 if (sc->mergeSequencesR2C) {
5344 sc->tempLen = sprintf(sc->tempStr, " inoutID += %" PRIu64 ";\n", sc->inputStride[1]);
5345 res = VkAppendLine(sc);
5346 if (res != VKFFT_SUCCESS) return res;
5347 if (sc->inputBufferBlockNum == 1)
5348 sc->tempLen = sprintf(sc->tempStr, " sdata[sdataID].y = %s%s[inoutID]%s;\n", convTypeLeft, inputsStruct, convTypeRight);
5349 else
5350 sc->tempLen = sprintf(sc->tempStr, " sdata[sdataID].y = %sinputBlocks[inoutID / %" PRIu64 "]%s[inoutID %% %" PRIu64 "]%s;\n", convTypeLeft, sc->inputBufferBlockSize, inputsStruct, sc->inputBufferBlockSize, convTypeRight);
5351 res = VkAppendLine(sc);
5352 if (res != VKFFT_SUCCESS) return res;
5353 }
5354 else {
5355 if (sc->inputBufferBlockNum == 1)
5356 sc->tempLen = sprintf(sc->tempStr, " sdata[sdataID].y = 0;\n");
5357 else
5358 sc->tempLen = sprintf(sc->tempStr, " sdata[sdataID].y = 0;\n");
5359 res = VkAppendLine(sc);
5360 if (res != VKFFT_SUCCESS) return res;
5361 }
5362 sc->tempLen = sprintf(sc->tempStr, " if (((combinedID %% %" PRIu64 ")>0)&&((combinedID %% %" PRIu64 ") < %" PRIu64 ")){\n", sc->fftDim, sc->fftDim, sc->fftDim - 1);
5363 res = VkAppendLine(sc);
5364 if (res != VKFFT_SUCCESS) return res;
5365 sc->tempLen = sprintf(sc->tempStr, " inoutID = (%" PRIu64 " - combinedID %% %" PRIu64 ") + (combinedID / %" PRIu64 ") * sharedStride;\n", 2 * sc->fftDim - 2, sc->fftDim, sc->fftDim);
5366 res = VkAppendLine(sc);
5367 if (res != VKFFT_SUCCESS) return res;
5368 sc->tempLen = sprintf(sc->tempStr, " sdata[inoutID] = sdata[sdataID];\n");
5369 res = VkAppendLine(sc);
5370 if (res != VKFFT_SUCCESS) return res;
5371 sc->tempLen = sprintf(sc->tempStr, " }\n");
5372 res = VkAppendLine(sc);
5373 if (res != VKFFT_SUCCESS) return res;
5374 }
5376 if (res != VKFFT_SUCCESS) return res;
5377 if (sc->zeropadBluestein[0]) {
5378 sc->tempLen = sprintf(sc->tempStr, " }\n");
5379 res = VkAppendLine(sc);
5380 if (res != VKFFT_SUCCESS) return res;
5381 }
5382 if (sc->zeropad[0]) {
5383 sc->tempLen = sprintf(sc->tempStr, " }else{\n");
5384 res = VkAppendLine(sc);
5385 if (res != VKFFT_SUCCESS) return res;
5386
5387 if (sc->axisSwapped) {
5388 sc->tempLen = sprintf(sc->tempStr, " sdataID = (combinedID %% %" PRIu64 ") * sharedStride + (combinedID / %" PRIu64 ");\n", sc->fftDim, sc->fftDim);
5389 res = VkAppendLine(sc);
5390 if (res != VKFFT_SUCCESS) return res;
5391 }
5392 else {
5393 sc->tempLen = sprintf(sc->tempStr, " sdataID = (combinedID %% %" PRIu64 ") + (combinedID / %" PRIu64 ") * sharedStride;\n", sc->fftDim, sc->fftDim);
5394 res = VkAppendLine(sc);
5395 if (res != VKFFT_SUCCESS) return res;
5396 }
5397
5398 sc->tempLen = sprintf(sc->tempStr, " sdata[sdataID].x = 0;\n");
5399 res = VkAppendLine(sc);
5400 if (res != VKFFT_SUCCESS) return res;
5401 sc->tempLen = sprintf(sc->tempStr, " sdata[sdataID].y = 0;\n");
5402 res = VkAppendLine(sc);
5403 if (res != VKFFT_SUCCESS) return res;
5404 if (sc->axisSwapped) {
5405 sc->tempLen = sprintf(sc->tempStr, " if (((combinedID %% %" PRIu64 ")>0)&&((combinedID %% %" PRIu64 ") < %" PRIu64 ")){\n", sc->fftDim, sc->fftDim, sc->fftDim - 1);
5406 res = VkAppendLine(sc);
5407 if (res != VKFFT_SUCCESS) return res;
5408 sc->tempLen = sprintf(sc->tempStr, " inoutID = (%" PRIu64 " - combinedID %% %" PRIu64 ") * sharedStride + (combinedID / %" PRIu64 ");\n", 2 * sc->fftDim - 2, sc->fftDim, sc->fftDim);
5409 res = VkAppendLine(sc);
5410 if (res != VKFFT_SUCCESS) return res;
5411 sc->tempLen = sprintf(sc->tempStr, " sdata[inoutID] = sdata[sdataID];\n");
5412 res = VkAppendLine(sc);
5413 if (res != VKFFT_SUCCESS) return res;
5414 sc->tempLen = sprintf(sc->tempStr, " }\n");
5415 res = VkAppendLine(sc);
5416 if (res != VKFFT_SUCCESS) return res;
5417 }
5418 else {
5419 sc->tempLen = sprintf(sc->tempStr, " if (((combinedID %% %" PRIu64 ")>0)&&((combinedID %% %" PRIu64 ") < %" PRIu64 ")){\n", sc->fftDim, sc->fftDim, sc->fftDim - 1);
5420 res = VkAppendLine(sc);
5421 if (res != VKFFT_SUCCESS) return res;
5422 sc->tempLen = sprintf(sc->tempStr, " inoutID = (%" PRIu64 " - combinedID %% %" PRIu64 ") + (combinedID / %" PRIu64 ") * sharedStride;\n", 2 * sc->fftDim - 2, sc->fftDim, sc->fftDim);
5423 res = VkAppendLine(sc);
5424 if (res != VKFFT_SUCCESS) return res;
5425 sc->tempLen = sprintf(sc->tempStr, " sdata[inoutID] = sdata[sdataID];\n");
5426 res = VkAppendLine(sc);
5427 if (res != VKFFT_SUCCESS) return res;
5428 sc->tempLen = sprintf(sc->tempStr, " }\n");
5429 res = VkAppendLine(sc);
5430 if (res != VKFFT_SUCCESS) return res;
5431 }
5432 sc->tempLen = sprintf(sc->tempStr, " }\n");
5433 res = VkAppendLine(sc);
5434 if (res != VKFFT_SUCCESS) return res;
5435 }
5436 if (sc->axisSwapped) {
5437 if ((1 + i + k * num_in) * sc->localSize[1] >= (sc->fftDim)) {
5438 sc->tempLen = sprintf(sc->tempStr, " }\n");
5439 res = VkAppendLine(sc);
5440 if (res != VKFFT_SUCCESS) return res;
5441 }
5442 if ((uint64_t)ceil(sc->size[1] / (double)mult) % sc->localSize[0] != 0) {
5443 sc->tempLen = sprintf(sc->tempStr, " }");
5444 res = VkAppendLine(sc);
5445 if (res != VKFFT_SUCCESS) return res;
5446 }
5447 }
5448 else {
5449 if ((1 + i + k * num_in) * sc->localSize[0] >= (sc->fftDim)) {
5450 sc->tempLen = sprintf(sc->tempStr, " }\n");
5451 res = VkAppendLine(sc);
5452 if (res != VKFFT_SUCCESS) return res;
5453 }
5454 if ((uint64_t)ceil(sc->size[1] / (double)mult) % sc->localSize[1] != 0) {
5455 sc->tempLen = sprintf(sc->tempStr, " }");
5456 res = VkAppendLine(sc);
5457 if (res != VKFFT_SUCCESS) return res;
5458 }
5459 }
5460 }
5461 }
5462 sc->fftDim = 2 * sc->fftDim - 2;
5463 if (sc->zeropadBluestein[0]) sc->fftDim = sc->fft_dim_full;
5464 }
5465 else {
5466 //Not implemented
5467 }
5468 break;
5469 }
5470 case 111://DCT-I strided
5471 {
5472 char shiftX[500] = "";
5473 if (sc->performWorkGroupShift[0])
5474 sprintf(shiftX, " + consts.workGroupShiftX ");
5475 char shiftX2[500] = "";
5476 if (sc->performWorkGroupShift[0])
5477 sprintf(shiftX2, " + consts.workGroupShiftX * %s ", sc->gl_WorkGroupSize_x);
5478 char shiftY[500] = "";
5479 if (sc->performWorkGroupShift[1])
5480 sprintf(shiftY, " + consts.workGroupShiftY ");
5481 uint64_t mult = (sc->mergeSequencesR2C) ? 2 : 1;
5482 if (sc->fftDim == sc->fft_dim_full) {
5483 if (sc->zeropadBluestein[0]) {
5484 res = appendSetSMToZero(sc, floatType, floatTypeMemory, uintType, readType);
5485 if (res != VKFFT_SUCCESS) return res;
5486 res = appendBarrierVkFFT(sc, 1);
5487 if (res != VKFFT_SUCCESS) return res;
5489 }
5490 sc->fftDim = (sc->fftDim + 2)/2;
5491 uint64_t num_in = (uint64_t)ceil((sc->fftDim) / (double)sc->localSize[1]);
5492 for (uint64_t k = 0; k < sc->registerBoost; k++) {
5493 for (uint64_t i = 0; i < num_in; i++) {
5494
5495 //sc->tempLen = sprintf(sc->tempStr, " combinedID = (%s + %" PRIu64 " * %s) + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, sc->localSize[0], sc->gl_LocalInvocationID_y, (i + k * mult * sc->min_registers_per_thread) * sc->localSize[0] * sc->localSize[1]);
5496 //res = VkAppendLine(sc);
5497 //if (res != VKFFT_SUCCESS) return res;
5498
5499 if ((uint64_t)ceil(sc->size[0] / (double)mult) % sc->localSize[0] != 0) {
5500 sc->tempLen = sprintf(sc->tempStr, " if ((%s%s) < %" PRIu64 ") {\n", sc->gl_GlobalInvocationID_x, shiftX2, (uint64_t)ceil(sc->size[0] / (double)mult));
5501 res = VkAppendLine(sc);
5502 if (res != VKFFT_SUCCESS) return res;
5503 }
5504 if (sc->mergeSequencesR2C)
5505 sc->tempLen = sprintf(sc->tempStr, " combinedID = (%s + %" PRIu64 ") / %" PRIu64 ";\n", sc->gl_LocalInvocationID_y, (i + k * num_in) * sc->localSize[1], mult);
5506 else
5507 sc->tempLen = sprintf(sc->tempStr, " combinedID = (%s + %" PRIu64 ");\n", sc->gl_LocalInvocationID_y, (i + k * num_in) * sc->localSize[1]);
5508 res = VkAppendLine(sc);
5509 if (res != VKFFT_SUCCESS) return res;
5510 if (sc->zeropadBluestein[0]) {
5511 sc->tempLen = sprintf(sc->tempStr, " if((combinedID %% %" PRIu64 ") < %" PRIu64 "){\n", sc->fft_dim_full, sc->fft_zeropad_Bluestein_left_read[sc->axis_id]);
5512 res = VkAppendLine(sc);
5513 if (res != VKFFT_SUCCESS) return res;
5514 }
5515 if ((1 + i + k * num_in) * sc->localSize[1] >= (sc->fftDim)) {
5516 sc->tempLen = sprintf(sc->tempStr, " if(combinedID < %" PRIu64 "){\n", (sc->fftDim));
5517 res = VkAppendLine(sc);
5518 if (res != VKFFT_SUCCESS) return res;
5519 }
5520 if (sc->mergeSequencesR2C)
5521 sc->tempLen = sprintf(sc->tempStr, " //sdataID = (((combinedID %% %" PRIu64 ") %% 2) * %" PRIu64 " + (1-2*((combinedID %% %" PRIu64 ") %% 2)) * ((combinedID %% %" PRIu64 ")/2)) * sharedStride + (%s + ((%s + %" PRIu64 ") %% %" PRIu64 ") * %" PRIu64 ") / %" PRIu64 ";\n", sc->fftDim, sc->fftDim - 1, sc->fftDim, sc->fftDim, sc->gl_LocalInvocationID_x, sc->gl_LocalInvocationID_y, (i + k * sc->min_registers_per_thread) * sc->localSize[1], mult, sc->localSize[0], mult);
5522 else
5523 sc->tempLen = sprintf(sc->tempStr, " sdataID = (combinedID %% %" PRIu64 ") * sharedStride + %s;\n", sc->fftDim, sc->gl_LocalInvocationID_x);
5524
5525 res = VkAppendLine(sc);
5526 if (res != VKFFT_SUCCESS) return res;
5527 sc->tempLen = sprintf(sc->tempStr, " %s = ", sc->inoutID);
5528 res = VkAppendLine(sc);
5529 if (res != VKFFT_SUCCESS) return res;
5530 if (sc->mergeSequencesR2C) {
5531 sprintf(index_x, "(%s + %" PRIu64 " * ((%s %% %" PRIu64 ") + (%s%s) * %" PRIu64 ")) %% (%" PRIu64 ")", sc->gl_LocalInvocationID_x, sc->localSize[0], sc->gl_LocalInvocationID_y, mult, sc->gl_WorkGroupID_x, shiftX, mult, sc->fft_dim_x);
5532
5533 sprintf(index_y, "(%s/%" PRIu64 " + %" PRIu64 ")", sc->gl_LocalInvocationID_y, mult, (i + k * num_in) * sc->localSize[1]);
5534 }
5535 else {
5536 sprintf(index_x, "(%s%s) %% (%" PRIu64 ")", sc->gl_GlobalInvocationID_x, shiftX2, sc->fft_dim_x);
5537 sprintf(index_y, "(%s + %" PRIu64 ")", sc->gl_LocalInvocationID_y, (i + k * num_in) * sc->localSize[1]);
5538 }
5539 res = indexInputVkFFT(sc, uintType, readType, index_x, index_y, requestCoordinate, requestBatch);
5540 if (res != VKFFT_SUCCESS) return res;
5541 sc->tempLen = sprintf(sc->tempStr, ";\n");
5542 res = VkAppendLine(sc);
5543 if (res != VKFFT_SUCCESS) return res;
5545 if (res != VKFFT_SUCCESS) return res;
5546 if (sc->zeropad[0]) {
5547 sc->tempLen = sprintf(sc->tempStr, " if((%s %% %" PRIu64 " < %" PRIu64 ")||(%s %% %" PRIu64 " >= %" PRIu64 ")){\n", index_y, sc->fft_dim_full, sc->fft_zeropad_left_read[sc->axis_id], index_y, sc->fft_dim_full, sc->fft_zeropad_right_read[sc->axis_id]);
5548 res = VkAppendLine(sc);
5549 if (res != VKFFT_SUCCESS) return res;
5550 }
5551 if (sc->inputBufferBlockNum == 1)
5552 sc->tempLen = sprintf(sc->tempStr, " %s.x = %s%s[%s]%s;\n", sc->regIDs[0], convTypeLeft, inputsStruct, sc->inoutID, convTypeRight);
5553 else
5554 sc->tempLen = sprintf(sc->tempStr, " %s.x = %sinputBlocks[%s / %" PRIu64 "]%s[%s %% %" PRIu64 "]%s;\n", sc->regIDs[0], convTypeLeft, sc->inoutID, sc->inputBufferBlockSize, inputsStruct, sc->inoutID, sc->inputBufferBlockSize, convTypeRight);
5555 res = VkAppendLine(sc);
5556 if (res != VKFFT_SUCCESS) return res;
5557
5558 if (sc->mergeSequencesR2C) {
5559 sc->tempLen = sprintf(sc->tempStr, " if ((%s %% 2) == 0) {\n", sc->gl_LocalInvocationID_x);
5560 res = VkAppendLine(sc);
5561 if (res != VKFFT_SUCCESS) return res;
5562 sc->tempLen = sprintf(sc->tempStr, " sdata[sdataID].x = %s.x;\n", sc->regIDs[0]);
5563 res = VkAppendLine(sc);
5564 if (res != VKFFT_SUCCESS) return res;
5565 sc->tempLen = sprintf(sc->tempStr, " } else {\n");
5566 res = VkAppendLine(sc);
5567 if (res != VKFFT_SUCCESS) return res;
5568 sc->tempLen = sprintf(sc->tempStr, " sdata[sdataID].y = %s.x;\n", sc->regIDs[0]);
5569 res = VkAppendLine(sc);
5570 if (res != VKFFT_SUCCESS) return res;
5571 sc->tempLen = sprintf(sc->tempStr, " }\n");
5572 res = VkAppendLine(sc);
5573 if (res != VKFFT_SUCCESS) return res;
5574 }
5575 else {
5576 sc->tempLen = sprintf(sc->tempStr, " sdata[sdataID].x = %s.x;\n", sc->regIDs[0]);
5577 res = VkAppendLine(sc);
5578 if (res != VKFFT_SUCCESS) return res;
5579 sc->tempLen = sprintf(sc->tempStr, " sdata[sdataID].y = 0;\n");
5580 res = VkAppendLine(sc);
5581 if (res != VKFFT_SUCCESS) return res;
5582 }
5583 sc->tempLen = sprintf(sc->tempStr, " if (((combinedID %% %" PRIu64 ")>0)&&((combinedID %% %" PRIu64 ") < %" PRIu64 ")){\n", sc->fftDim, sc->fftDim, sc->fftDim - 1);
5584 res = VkAppendLine(sc);
5585 if (res != VKFFT_SUCCESS) return res;
5586 sc->tempLen = sprintf(sc->tempStr, " inoutID = (%" PRIu64 " - combinedID %% %" PRIu64 ") * sharedStride + %s;\n", 2 * sc->fftDim - 2, sc->fftDim, sc->gl_LocalInvocationID_x);
5587 res = VkAppendLine(sc);
5588 if (res != VKFFT_SUCCESS) return res;
5589 sc->tempLen = sprintf(sc->tempStr, " sdata[inoutID] = sdata[sdataID];\n");
5590 res = VkAppendLine(sc);
5591 if (res != VKFFT_SUCCESS) return res;
5592 sc->tempLen = sprintf(sc->tempStr, " }\n");
5593 res = VkAppendLine(sc);
5594 if (res != VKFFT_SUCCESS) return res;
5596 if (res != VKFFT_SUCCESS) return res;
5597 if (sc->zeropad[0]) {
5598 sc->tempLen = sprintf(sc->tempStr, " }else{\n");
5599 res = VkAppendLine(sc);
5600 if (res != VKFFT_SUCCESS) return res;
5601
5602 if (sc->mergeSequencesR2C) {
5603 sc->tempLen = sprintf(sc->tempStr, " if ((%s %% 2) == 0) {\n", sc->gl_LocalInvocationID_x);
5604 res = VkAppendLine(sc);
5605 if (res != VKFFT_SUCCESS) return res;
5606 sc->tempLen = sprintf(sc->tempStr, " sdata[sdataID].x = 0;\n");
5607 res = VkAppendLine(sc);
5608 if (res != VKFFT_SUCCESS) return res;
5609 sc->tempLen = sprintf(sc->tempStr, " } else {\n");
5610 res = VkAppendLine(sc);
5611 if (res != VKFFT_SUCCESS) return res;
5612 sc->tempLen = sprintf(sc->tempStr, " sdata[sdataID].y = 0;\n");
5613 res = VkAppendLine(sc);
5614 if (res != VKFFT_SUCCESS) return res;
5615 sc->tempLen = sprintf(sc->tempStr, " }\n");
5616 res = VkAppendLine(sc);
5617 if (res != VKFFT_SUCCESS) return res;
5618 }
5619 else {
5620 sc->tempLen = sprintf(sc->tempStr, " sdata[sdataID].x = 0;\n");
5621 res = VkAppendLine(sc);
5622 if (res != VKFFT_SUCCESS) return res;
5623 sc->tempLen = sprintf(sc->tempStr, " sdata[sdataID].y = 0;\n");
5624 res = VkAppendLine(sc);
5625 if (res != VKFFT_SUCCESS) return res;
5626 }
5627 sc->tempLen = sprintf(sc->tempStr, " if (((combinedID %% %" PRIu64 ")>0)&&((combinedID %% %" PRIu64 ") < %" PRIu64 ")){\n", sc->fftDim, sc->fftDim, sc->fftDim - 1);
5628 res = VkAppendLine(sc);
5629 if (res != VKFFT_SUCCESS) return res;
5630 sc->tempLen = sprintf(sc->tempStr, " inoutID = (%" PRIu64 " - combinedID %% %" PRIu64 ") * sharedStride + %s;\n", 2 * sc->fftDim - 2, sc->fftDim, sc->gl_LocalInvocationID_x);
5631 res = VkAppendLine(sc);
5632 if (res != VKFFT_SUCCESS) return res;
5633 sc->tempLen = sprintf(sc->tempStr, " sdata[inoutID] = sdata[sdataID];\n");
5634 res = VkAppendLine(sc);
5635 if (res != VKFFT_SUCCESS) return res;
5636 sc->tempLen = sprintf(sc->tempStr, " }\n");
5637 res = VkAppendLine(sc);
5638 if (res != VKFFT_SUCCESS) return res;
5639 sc->tempLen = sprintf(sc->tempStr, " }\n");
5640 res = VkAppendLine(sc);
5641 if (res != VKFFT_SUCCESS) return res;
5642 }
5643 if ((1 + i + k * num_in) * sc->localSize[1] >= (sc->fftDim)) {
5644 sc->tempLen = sprintf(sc->tempStr, " }\n");
5645 res = VkAppendLine(sc);
5646 if (res != VKFFT_SUCCESS) return res;
5647 }
5648 if (sc->zeropadBluestein[0]) {
5649 sc->tempLen = sprintf(sc->tempStr, " }\n");
5650 res = VkAppendLine(sc);
5651 if (res != VKFFT_SUCCESS) return res;
5652 }
5653 if ((uint64_t)ceil(sc->size[0] / (double)mult) % sc->localSize[0] != 0) {
5654 sc->tempLen = sprintf(sc->tempStr, " }\n");
5655 res = VkAppendLine(sc);
5656 if (res != VKFFT_SUCCESS) return res;
5657 }
5658 }
5659 }
5660 sc->fftDim = 2 * sc->fftDim - 2;
5661 if (sc->zeropadBluestein[0]) sc->fftDim = sc->fft_dim_full;
5662 }
5663 else {
5664 //Not implemented
5665 }
5666 break;
5667 }
5668 case 120://DCT-II nonstrided
5669 {
5670 char shiftX[500] = "";
5671 if (sc->performWorkGroupShift[0])
5672 sprintf(shiftX, " + consts.workGroupShiftX ");
5673 char shiftY[500] = "";
5674 if (sc->performWorkGroupShift[1])
5675 sprintf(shiftY, " + consts.workGroupShiftY ");
5676 uint64_t mult = (sc->mergeSequencesR2C) ? 2 : 1;
5677 if (sc->fftDim == sc->fft_dim_full) {
5678 if (sc->zeropadBluestein[0]) {
5679 res = appendSetSMToZero(sc, floatType, floatTypeMemory, uintType, readType);
5680 if (res != VKFFT_SUCCESS) return res;
5681 res = appendBarrierVkFFT(sc, 1);
5682 if (res != VKFFT_SUCCESS) return res;
5684 }
5685 for (uint64_t k = 0; k < sc->registerBoost; k++) {
5686 for (uint64_t i = 0; i < sc->min_registers_per_thread; i++) {
5687
5688 if (sc->localSize[1] == 1)
5689 sc->tempLen = sprintf(sc->tempStr, " combinedID = %s + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, (i + k * sc->min_registers_per_thread) * sc->localSize[0]);
5690 else
5691 sc->tempLen = sprintf(sc->tempStr, " combinedID = (%s + %" PRIu64 " * %s) + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, sc->localSize[0], sc->gl_LocalInvocationID_y, (i + k * sc->min_registers_per_thread) * sc->localSize[0] * sc->localSize[1]);
5692 res = VkAppendLine(sc);
5693 if (res != VKFFT_SUCCESS) return res;
5694 if (sc->inputStride[0] > 1)
5695 sc->tempLen = sprintf(sc->tempStr, " inoutID = (combinedID %% %" PRIu64 ") * %" PRIu64 " + (combinedID / %" PRIu64 ") * %" PRIu64 ";\n", sc->fftDim, sc->inputStride[0], sc->fftDim, mult * sc->inputStride[1]);
5696 else
5697 sc->tempLen = sprintf(sc->tempStr, " inoutID = (combinedID %% %" PRIu64 ") + (combinedID / %" PRIu64 ") * %" PRIu64 ";\n", sc->fftDim, sc->fftDim, mult * sc->inputStride[1]);
5698 res = VkAppendLine(sc);
5699 if (res != VKFFT_SUCCESS) return res;
5700 if (sc->axisSwapped) {
5701 if ((uint64_t)ceil(sc->size[1] / (double)mult) % sc->localSize[0] != 0) {
5702 sc->tempLen = sprintf(sc->tempStr, " if(combinedID / %" PRIu64 " + (%s%s)*%" PRIu64 "< %" PRIu64 "){", sc->fftDim, sc->gl_WorkGroupID_y, shiftY, sc->localSize[0], (uint64_t)ceil(sc->size[1] / (double)mult));
5703 res = VkAppendLine(sc);
5704 if (res != VKFFT_SUCCESS) return res;
5705 }
5706 if (sc->zeropadBluestein[0]) {
5707 sc->tempLen = sprintf(sc->tempStr, " if(combinedID < %" PRIu64 "){\n", sc->fftDim * sc->localSize[0]);
5708 res = VkAppendLine(sc);
5709 if (res != VKFFT_SUCCESS) return res;
5710 }
5711 }
5712 else {
5713 if ((uint64_t)ceil(sc->size[1] / (double)mult) % sc->localSize[1] != 0) {
5714 sc->tempLen = sprintf(sc->tempStr, " if(combinedID / %" PRIu64 " + (%s%s)*%" PRIu64 "< %" PRIu64 "){", sc->fftDim, sc->gl_WorkGroupID_y, shiftY, sc->localSize[1], (uint64_t)ceil(sc->size[1] / (double)mult));
5715 res = VkAppendLine(sc);
5716 if (res != VKFFT_SUCCESS) return res;
5717 }
5718 if (sc->zeropadBluestein[0]) {
5719 sc->tempLen = sprintf(sc->tempStr, " if(combinedID < %" PRIu64 "){\n", sc->fftDim * sc->localSize[1]);
5720 res = VkAppendLine(sc);
5721 if (res != VKFFT_SUCCESS) return res;
5722 }
5723 }
5724 if (sc->zeropad[0]) {
5725 sc->tempLen = sprintf(sc->tempStr, " if((inoutID %% %" PRIu64 " < %" PRIu64 ")||(inoutID %% %" PRIu64 " >= %" PRIu64 ")){\n", sc->inputStride[1], sc->fft_zeropad_left_read[sc->axis_id], sc->inputStride[1], sc->fft_zeropad_right_read[sc->axis_id]);
5726 res = VkAppendLine(sc);
5727 if (res != VKFFT_SUCCESS) return res;
5728 }
5729 sc->tempLen = sprintf(sc->tempStr, " %s = ", sc->inoutID);
5730 res = VkAppendLine(sc);
5731 if (res != VKFFT_SUCCESS) return res;
5732 indexInputVkFFT(sc, uintType, readType, sc->inoutID, 0, requestCoordinate, requestBatch);
5733 if (res != VKFFT_SUCCESS) return res;
5734 sc->tempLen = sprintf(sc->tempStr, ";\n");
5735 res = VkAppendLine(sc);
5736 if (res != VKFFT_SUCCESS) return res;
5738 if (res != VKFFT_SUCCESS) return res;
5739 if (sc->axisSwapped) {
5740 sc->tempLen = sprintf(sc->tempStr, " sdataID = (((combinedID %% %" PRIu64 ") %% 2) * %" PRIu64 " + (1-2*((combinedID %% %" PRIu64 ") %% 2)) * ((combinedID %% %" PRIu64 ")/2)) * sharedStride + (combinedID / %" PRIu64 ");\n", sc->fftDim, sc->fftDim - 1, sc->fftDim, sc->fftDim, sc->fftDim);
5741 res = VkAppendLine(sc);
5742 if (res != VKFFT_SUCCESS) return res;
5743
5744 if (sc->inputBufferBlockNum == 1)
5745 sc->tempLen = sprintf(sc->tempStr, " sdata[sdataID].x = %s%s[%s]%s;\n", convTypeLeft, inputsStruct, sc->inoutID, convTypeRight);
5746 else
5747 sc->tempLen = sprintf(sc->tempStr, " sdata[sdataID].x = %sinputBlocks[%s / %" PRIu64 "]%s[%s %% %" PRIu64 "]%s;\n", convTypeLeft, sc->inoutID, sc->inputBufferBlockSize, inputsStruct, sc->inoutID, sc->inputBufferBlockSize, convTypeRight);
5748 res = VkAppendLine(sc);
5749 if (res != VKFFT_SUCCESS) return res;
5750
5751 if (sc->mergeSequencesR2C) {
5752 sc->tempLen = sprintf(sc->tempStr, " inoutID += %" PRIu64 ";\n", sc->inputStride[1]);
5753 res = VkAppendLine(sc);
5754 if (res != VKFFT_SUCCESS) return res;
5755
5756 if (sc->inputBufferBlockNum == 1)
5757 sc->tempLen = sprintf(sc->tempStr, " sdata[sdataID].y = %s%s[inoutID]%s;\n", convTypeLeft, inputsStruct, convTypeRight);
5758 else
5759 sc->tempLen = sprintf(sc->tempStr, " sdata[sdataID].y = %sinputBlocks[inoutID / %" PRIu64 "]%s[inoutID %% %" PRIu64 "]%s;\n", convTypeLeft, sc->inputBufferBlockSize, inputsStruct, sc->inputBufferBlockSize, convTypeRight);
5760 res = VkAppendLine(sc);
5761 if (res != VKFFT_SUCCESS) return res;
5762 }
5763 else {
5764 if (sc->inputBufferBlockNum == 1)
5765 sc->tempLen = sprintf(sc->tempStr, " sdata[sdataID].y = 0;\n");
5766 else
5767 sc->tempLen = sprintf(sc->tempStr, " sdata[sdataID].y = 0;\n");
5768 res = VkAppendLine(sc);
5769 if (res != VKFFT_SUCCESS) return res;
5770 }
5771 }
5772 else {
5773 sc->tempLen = sprintf(sc->tempStr, " sdataID = (((combinedID %% %" PRIu64 ") %% 2) * %" PRIu64 " + (1-2*((combinedID %% %" PRIu64 ") %% 2)) * ((combinedID %% %" PRIu64 ")/2)) + (combinedID / %" PRIu64 ") * sharedStride;\n", sc->fftDim, sc->fftDim - 1, sc->fftDim, sc->fftDim, sc->fftDim);
5774 res = VkAppendLine(sc);
5775 if (res != VKFFT_SUCCESS) return res;
5776 if (sc->inputBufferBlockNum == 1)
5777 sc->tempLen = sprintf(sc->tempStr, " sdata[sdataID].x = %s%s[inoutID]%s;\n", convTypeLeft, inputsStruct, convTypeRight);
5778 else
5779 sc->tempLen = sprintf(sc->tempStr, " sdata[sdataID].x = %sinputBlocks[inoutID / %" PRIu64 "]%s[inoutID %% %" PRIu64 "]%s;\n", convTypeLeft, sc->inputBufferBlockSize, inputsStruct, sc->inputBufferBlockSize, convTypeRight);
5780 res = VkAppendLine(sc);
5781 if (res != VKFFT_SUCCESS) return res;
5782 if (sc->mergeSequencesR2C) {
5783 sc->tempLen = sprintf(sc->tempStr, " inoutID += %" PRIu64 ";\n", sc->inputStride[1]);
5784 res = VkAppendLine(sc);
5785 if (res != VKFFT_SUCCESS) return res;
5786 if (sc->inputBufferBlockNum == 1)
5787 sc->tempLen = sprintf(sc->tempStr, " sdata[sdataID].y = %s%s[inoutID]%s;\n", convTypeLeft, inputsStruct, convTypeRight);
5788 else
5789 sc->tempLen = sprintf(sc->tempStr, " sdata[sdataID].y = %sinputBlocks[inoutID / %" PRIu64 "]%s[inoutID %% %" PRIu64 "]%s;\n", convTypeLeft, sc->inputBufferBlockSize, inputsStruct, sc->inputBufferBlockSize, convTypeRight);
5790 res = VkAppendLine(sc);
5791 if (res != VKFFT_SUCCESS) return res;
5792 }
5793 else {
5794 if (sc->inputBufferBlockNum == 1)
5795 sc->tempLen = sprintf(sc->tempStr, " sdata[sdataID].y = 0;\n");
5796 else
5797 sc->tempLen = sprintf(sc->tempStr, " sdata[sdataID].y = 0;\n");
5798 res = VkAppendLine(sc);
5799 if (res != VKFFT_SUCCESS) return res;
5800 }
5801 }
5803 if (res != VKFFT_SUCCESS) return res;
5804 if (sc->zeropadBluestein[0]) {
5805 sc->tempLen = sprintf(sc->tempStr, " }\n");
5806 res = VkAppendLine(sc);
5807 if (res != VKFFT_SUCCESS) return res;
5808 }
5809 if (sc->zeropad[0]) {
5810 sc->tempLen = sprintf(sc->tempStr, " }else{\n");
5811 res = VkAppendLine(sc);
5812 if (res != VKFFT_SUCCESS) return res;
5813
5814 if (sc->axisSwapped) {
5815 sc->tempLen = sprintf(sc->tempStr, " sdataID = (((combinedID %% %" PRIu64 ") %% 2) * %" PRIu64 " + (1-2*((combinedID %% %" PRIu64 ") %% 2)) * ((combinedID %% %" PRIu64 ")/2)) * sharedStride + (combinedID / %" PRIu64 ");\n", sc->fftDim, sc->fftDim - 1, sc->fftDim, sc->fftDim, sc->fftDim);
5816 res = VkAppendLine(sc);
5817 if (res != VKFFT_SUCCESS) return res;
5818 }
5819 else {
5820 sc->tempLen = sprintf(sc->tempStr, " sdataID = (((combinedID %% %" PRIu64 ") %% 2) * %" PRIu64 " + (1-2*((combinedID %% %" PRIu64 ") %% 2)) * ((combinedID %% %" PRIu64 ")/2)) + (combinedID / %" PRIu64 ") * sharedStride;\n", sc->fftDim, sc->fftDim - 1, sc->fftDim, sc->fftDim, sc->fftDim);
5821 res = VkAppendLine(sc);
5822 if (res != VKFFT_SUCCESS) return res;
5823 }
5824
5825 sc->tempLen = sprintf(sc->tempStr, " sdata[sdataID].x = 0;\n");
5826 res = VkAppendLine(sc);
5827 if (res != VKFFT_SUCCESS) return res;
5828 sc->tempLen = sprintf(sc->tempStr, " sdata[sdataID].y = 0;\n");
5829 res = VkAppendLine(sc);
5830 if (res != VKFFT_SUCCESS) return res;
5831 sc->tempLen = sprintf(sc->tempStr, " }\n");
5832 res = VkAppendLine(sc);
5833 if (res != VKFFT_SUCCESS) return res;
5834 }
5835 if (sc->axisSwapped) {
5836 if ((uint64_t)ceil(sc->size[1] / (double)mult) % sc->localSize[0] != 0) {
5837 sc->tempLen = sprintf(sc->tempStr, " }");
5838 res = VkAppendLine(sc);
5839 if (res != VKFFT_SUCCESS) return res;
5840 }
5841 }
5842 else {
5843 if ((uint64_t)ceil(sc->size[1] / (double)mult) % sc->localSize[1] != 0) {
5844 sc->tempLen = sprintf(sc->tempStr, " }");
5845 res = VkAppendLine(sc);
5846 if (res != VKFFT_SUCCESS) return res;
5847 }
5848 }
5849 }
5850 }
5851 if (sc->zeropadBluestein[0]) sc->fftDim = sc->fft_dim_full;
5852 }
5853 else {
5854 //Not implemented
5855 }
5856 break;
5857 }
5858 case 121://DCT-II strided
5859 {
5860 char shiftX[500] = "";
5861 if (sc->performWorkGroupShift[0])
5862 sprintf(shiftX, " + consts.workGroupShiftX ");
5863 char shiftX2[500] = "";
5864 if (sc->performWorkGroupShift[0])
5865 sprintf(shiftX2, " + consts.workGroupShiftX * %s ", sc->gl_WorkGroupSize_x);
5866 char shiftY[500] = "";
5867 if (sc->performWorkGroupShift[1])
5868 sprintf(shiftY, " + consts.workGroupShiftY ");
5869 uint64_t mult = (sc->mergeSequencesR2C) ? 2 : 1;
5870 if (sc->fftDim == sc->fft_dim_full) {
5871 if (sc->zeropadBluestein[0]) {
5872 res = appendSetSMToZero(sc, floatType, floatTypeMemory, uintType, readType);
5873 if (res != VKFFT_SUCCESS) return res;
5874 res = appendBarrierVkFFT(sc, 1);
5875 if (res != VKFFT_SUCCESS) return res;
5877 }
5878 for (uint64_t k = 0; k < sc->registerBoost; k++) {
5879 for (uint64_t i = 0; i < mult * sc->min_registers_per_thread; i++) {
5880
5881 //sc->tempLen = sprintf(sc->tempStr, " combinedID = (%s + %" PRIu64 " * %s) + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, sc->localSize[0], sc->gl_LocalInvocationID_y, (i + k * mult * sc->min_registers_per_thread) * sc->localSize[0] * sc->localSize[1]);
5882 //res = VkAppendLine(sc);
5883 //if (res != VKFFT_SUCCESS) return res;
5884
5885 if ((uint64_t)ceil(sc->size[0] / (double)mult) % sc->localSize[0] != 0) {
5886 sc->tempLen = sprintf(sc->tempStr, " if ((%s%s) < %" PRIu64 ") {\n", sc->gl_GlobalInvocationID_x, shiftX2, (uint64_t)ceil(sc->size[0] / (double)mult));
5887 res = VkAppendLine(sc);
5888 if (res != VKFFT_SUCCESS) return res;
5889 }
5890 if (sc->mergeSequencesR2C)
5891 sc->tempLen = sprintf(sc->tempStr, " combinedID = (%s + %" PRIu64 ") / %" PRIu64 ";\n", sc->gl_LocalInvocationID_y, (i + k * sc->min_registers_per_thread) * sc->localSize[1], mult);
5892 else
5893 sc->tempLen = sprintf(sc->tempStr, " combinedID = (%s + %" PRIu64 ");\n", sc->gl_LocalInvocationID_y, (i + k * sc->min_registers_per_thread) * sc->localSize[1]);
5894 res = VkAppendLine(sc);
5895 if (res != VKFFT_SUCCESS) return res;
5896 if (sc->zeropadBluestein[0]) {
5897 sc->tempLen = sprintf(sc->tempStr, " if((combinedID %% %" PRIu64 ") < %" PRIu64 "){\n", sc->fft_dim_full, sc->fft_zeropad_Bluestein_left_read[sc->axis_id]);
5898 res = VkAppendLine(sc);
5899 if (res != VKFFT_SUCCESS) return res;
5900 }
5901 if (sc->mergeSequencesR2C)
5902 sc->tempLen = sprintf(sc->tempStr, " sdataID = (((combinedID %% %" PRIu64 ") %% 2) * %" PRIu64 " + (1-2*((combinedID %% %" PRIu64 ") %% 2)) * ((combinedID %% %" PRIu64 ")/2)) * sharedStride + (%s + ((%s + %" PRIu64 ") %% %" PRIu64 ") * %" PRIu64 ") / %" PRIu64 ";\n", sc->fftDim, sc->fftDim - 1, sc->fftDim, sc->fftDim, sc->gl_LocalInvocationID_x, sc->gl_LocalInvocationID_y, (i + k * sc->min_registers_per_thread) * sc->localSize[1], mult, sc->localSize[0], mult);
5903 else
5904 sc->tempLen = sprintf(sc->tempStr, " sdataID = (((combinedID %% %" PRIu64 ") %% 2) * %" PRIu64 " + (1-2*((combinedID %% %" PRIu64 ") %% 2)) * ((combinedID %% %" PRIu64 ")/2)) * sharedStride + %s;\n", sc->fftDim, sc->fftDim - 1, sc->fftDim, sc->fftDim, sc->gl_LocalInvocationID_x);
5905
5906 res = VkAppendLine(sc);
5907 if (res != VKFFT_SUCCESS) return res;
5908 sc->tempLen = sprintf(sc->tempStr, " %s = ", sc->inoutID);
5909 res = VkAppendLine(sc);
5910 if (res != VKFFT_SUCCESS) return res;
5911 if (sc->mergeSequencesR2C) {
5912 sprintf(index_x, "(%s + %" PRIu64 " * ((%s %% %" PRIu64 ") + (%s%s) * %" PRIu64 ")) %% (%" PRIu64 ")", sc->gl_LocalInvocationID_x, sc->localSize[0], sc->gl_LocalInvocationID_y, mult, sc->gl_WorkGroupID_x, shiftX, mult, sc->fft_dim_x);
5913
5914 sprintf(index_y, "(%s/%" PRIu64 " + %" PRIu64 ")", sc->gl_LocalInvocationID_y, mult, (i + k * sc->min_registers_per_thread) * sc->localSize[1]);
5915 }
5916 else {
5917 sprintf(index_x, "(%s%s) %% (%" PRIu64 ")", sc->gl_GlobalInvocationID_x, shiftX2, sc->fft_dim_x);
5918 sprintf(index_y, "(%s + %" PRIu64 ")", sc->gl_LocalInvocationID_y, (i + k * sc->min_registers_per_thread) * sc->localSize[1]);
5919 }
5920 res = indexInputVkFFT(sc, uintType, readType, index_x, index_y, requestCoordinate, requestBatch);
5921 if (res != VKFFT_SUCCESS) return res;
5922 sc->tempLen = sprintf(sc->tempStr, ";\n");
5923 res = VkAppendLine(sc);
5924 if (res != VKFFT_SUCCESS) return res;
5926 if (res != VKFFT_SUCCESS) return res;
5927 if (sc->zeropad[0]) {
5928 sc->tempLen = sprintf(sc->tempStr, " if((%s %% %" PRIu64 " < %" PRIu64 ")||(%s %% %" PRIu64 " >= %" PRIu64 ")){\n", index_y, sc->fft_dim_full, sc->fft_zeropad_left_read[sc->axis_id], index_y, sc->fft_dim_full, sc->fft_zeropad_right_read[sc->axis_id]);
5929 res = VkAppendLine(sc);
5930 if (res != VKFFT_SUCCESS) return res;
5931 }
5932 if (sc->inputBufferBlockNum == 1)
5933 sc->tempLen = sprintf(sc->tempStr, " %s.x = %s%s[%s]%s;\n", sc->regIDs[0], convTypeLeft, inputsStruct, sc->inoutID, convTypeRight);
5934 else
5935 sc->tempLen = sprintf(sc->tempStr, " %s.x = %sinputBlocks[%s / %" PRIu64 "]%s[%s %% %" PRIu64 "]%s;\n", sc->regIDs[0], convTypeLeft, sc->inoutID, sc->inputBufferBlockSize, inputsStruct, sc->inoutID, sc->inputBufferBlockSize, convTypeRight);
5936 res = VkAppendLine(sc);
5937 if (res != VKFFT_SUCCESS) return res;
5938
5939 if (sc->mergeSequencesR2C) {
5940 sc->tempLen = sprintf(sc->tempStr, " if ((%s %% 2) == 0) {\n", sc->gl_LocalInvocationID_x);
5941 res = VkAppendLine(sc);
5942 if (res != VKFFT_SUCCESS) return res;
5943 sc->tempLen = sprintf(sc->tempStr, " sdata[sdataID].x = %s.x;\n", sc->regIDs[0]);
5944 res = VkAppendLine(sc);
5945 if (res != VKFFT_SUCCESS) return res;
5946 sc->tempLen = sprintf(sc->tempStr, " } else {\n");
5947 res = VkAppendLine(sc);
5948 if (res != VKFFT_SUCCESS) return res;
5949 sc->tempLen = sprintf(sc->tempStr, " sdata[sdataID].y = %s.x;\n", sc->regIDs[0]);
5950 res = VkAppendLine(sc);
5951 if (res != VKFFT_SUCCESS) return res;
5952 sc->tempLen = sprintf(sc->tempStr, " }\n");
5953 res = VkAppendLine(sc);
5954 if (res != VKFFT_SUCCESS) return res;
5955 }
5956 else {
5957 sc->tempLen = sprintf(sc->tempStr, " sdata[sdataID].x = %s.x;\n", sc->regIDs[0]);
5958 res = VkAppendLine(sc);
5959 if (res != VKFFT_SUCCESS) return res;
5960 sc->tempLen = sprintf(sc->tempStr, " sdata[sdataID].y = 0;\n");
5961 res = VkAppendLine(sc);
5962 if (res != VKFFT_SUCCESS) return res;
5963 }
5965 if (res != VKFFT_SUCCESS) return res;
5966 if (sc->zeropad[0]) {
5967 sc->tempLen = sprintf(sc->tempStr, " }else{\n");
5968 res = VkAppendLine(sc);
5969 if (res != VKFFT_SUCCESS) return res;
5970
5971 if (sc->mergeSequencesR2C) {
5972 sc->tempLen = sprintf(sc->tempStr, " if ((%s %% 2) == 0) {\n", sc->gl_LocalInvocationID_x);
5973 res = VkAppendLine(sc);
5974 if (res != VKFFT_SUCCESS) return res;
5975 sc->tempLen = sprintf(sc->tempStr, " sdata[sdataID].x = 0;\n");
5976 res = VkAppendLine(sc);
5977 if (res != VKFFT_SUCCESS) return res;
5978 sc->tempLen = sprintf(sc->tempStr, " } else {\n");
5979 res = VkAppendLine(sc);
5980 if (res != VKFFT_SUCCESS) return res;
5981 sc->tempLen = sprintf(sc->tempStr, " sdata[sdataID].y = 0;\n");
5982 res = VkAppendLine(sc);
5983 if (res != VKFFT_SUCCESS) return res;
5984 sc->tempLen = sprintf(sc->tempStr, " }\n");
5985 res = VkAppendLine(sc);
5986 if (res != VKFFT_SUCCESS) return res;
5987 }
5988 else {
5989 sc->tempLen = sprintf(sc->tempStr, " sdata[sdataID].x = 0;\n");
5990 res = VkAppendLine(sc);
5991 if (res != VKFFT_SUCCESS) return res;
5992 sc->tempLen = sprintf(sc->tempStr, " sdata[sdataID].y = 0;\n");
5993 res = VkAppendLine(sc);
5994 if (res != VKFFT_SUCCESS) return res;
5995 }
5996 sc->tempLen = sprintf(sc->tempStr, " }\n");
5997 res = VkAppendLine(sc);
5998 if (res != VKFFT_SUCCESS) return res;
5999 }
6000 if (sc->zeropadBluestein[0]) {
6001 sc->tempLen = sprintf(sc->tempStr, " }\n");
6002 res = VkAppendLine(sc);
6003 if (res != VKFFT_SUCCESS) return res;
6004 }
6005 if ((uint64_t)ceil(sc->size[0] / (double)mult) % sc->localSize[0] != 0) {
6006 sc->tempLen = sprintf(sc->tempStr, " }\n");
6007 res = VkAppendLine(sc);
6008 if (res != VKFFT_SUCCESS) return res;
6009 }
6010 }
6011 }
6012 if (sc->zeropadBluestein[0]) sc->fftDim = sc->fft_dim_full;
6013 }
6014 else {
6015 //Not implemented
6016 }
6017 break;
6018 }
6019 case 130://DCT-III nonstrided
6020 {
6021 char shiftX[500] = "";
6022 if (sc->performWorkGroupShift[0])
6023 sprintf(shiftX, " + consts.workGroupShiftX ");
6024 char shiftY[500] = "";
6025 if (sc->performWorkGroupShift[1])
6026 sprintf(shiftY, " + consts.workGroupShiftY ");
6027 uint64_t mult = (sc->mergeSequencesR2C) ? 2 : 1;
6028 if (sc->fftDim == sc->fft_dim_full) {
6029 if (sc->zeropadBluestein[0]) {
6030 res = appendSetSMToZero(sc, floatType, floatTypeMemory, uintType, readType);
6031 if (res != VKFFT_SUCCESS) return res;
6032 res = appendBarrierVkFFT(sc, 1);
6033 if (res != VKFFT_SUCCESS) return res;
6035 }
6036 uint64_t num_in = (sc->axisSwapped) ? (uint64_t)ceil((sc->fftDim / 2 + 1) / (double)sc->localSize[1]) : (uint64_t)ceil((sc->fftDim / 2 + 1) / (double)sc->localSize[0]);
6037 for (uint64_t k = 0; k < sc->registerBoost; k++) {
6038 for (uint64_t i = 0; i < num_in; i++) {
6039
6040 if (sc->localSize[1] == 1)
6041 sc->tempLen = sprintf(sc->tempStr, " combinedID = %s + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, (i + k * num_in) * sc->localSize[0]);
6042 else
6043 sc->tempLen = sprintf(sc->tempStr, " combinedID = (%s + %" PRIu64 " * %s) + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, sc->localSize[0], sc->gl_LocalInvocationID_y, (i + k * num_in) * sc->localSize[0] * sc->localSize[1]);
6044 res = VkAppendLine(sc);
6045 if (res != VKFFT_SUCCESS) return res;
6046
6047 if (!sc->axisSwapped) { // NOTE(review): both arms of this conditional emit the exact same line — confirm whether the swapped layout was meant to use a different offset
6048 sc->tempLen = sprintf(sc->tempStr, " %s = combinedID %% %" PRIu64 " + ((combinedID/%" PRIu64 ") * %" PRIu64 ");\n", sc->inoutID, sc->fftDim / 2 + 1, sc->fftDim / 2 + 1, mult * sc->inputStride[1]);
6049 }
6050 else {
6051 sc->tempLen = sprintf(sc->tempStr, " %s = combinedID %% %" PRIu64 " + ((combinedID/%" PRIu64 ") * %" PRIu64 ");\n", sc->inoutID, sc->fftDim / 2 + 1, sc->fftDim / 2 + 1, mult * sc->inputStride[1]);
6052 }
6053 res = VkAppendLine(sc); // append the line formatted into sc->tempStr above
6054 if (res != VKFFT_SUCCESS) return res;
6055 if (sc->axisSwapped) {
6056 if (sc->size[sc->axis_id + 1] % sc->localSize[0] != 0) {
6057 sc->tempLen = sprintf(sc->tempStr, " if(combinedID / %" PRIu64 " + %s*%" PRIu64 "< %" PRIu64 "){\n", (sc->fftDim / 2 + 1), sc->gl_WorkGroupID_y, sc->localSize[0], sc->size[sc->axis_id + 1]);
6058 res = VkAppendLine(sc);
6059 if (res != VKFFT_SUCCESS) return res;
6060 }
6061 if ((1 + i + k * num_in) * sc->localSize[0] * sc->localSize[1] >= (sc->fftDim / 2 + 1) * sc->localSize[0]) {
6062 sc->tempLen = sprintf(sc->tempStr, " if(combinedID < %" PRIu64 "){\n", (sc->fftDim / 2 + 1) * sc->localSize[0]);
6063 res = VkAppendLine(sc);
6064 if (res != VKFFT_SUCCESS) return res;
6065 }
6066 }
6067 else {
6068 if (sc->size[sc->axis_id + 1] % sc->localSize[1] != 0) {
6069 sc->tempLen = sprintf(sc->tempStr, " if(combinedID / %" PRIu64 " + %s*%" PRIu64 "< %" PRIu64 "){\n", (sc->fftDim / 2 + 1), sc->gl_WorkGroupID_y, sc->localSize[1], sc->size[sc->axis_id + 1]);
6070 res = VkAppendLine(sc);
6071 if (res != VKFFT_SUCCESS) return res;
6072 }
6073 if ((1 + i + k * num_in) * sc->localSize[0] * sc->localSize[1] >= (sc->fftDim / 2 + 1) * sc->localSize[1]) {
6074 sc->tempLen = sprintf(sc->tempStr, " if(combinedID < %" PRIu64 "){\n", (sc->fftDim / 2 + 1) * sc->localSize[1]);
6075 res = VkAppendLine(sc);
6076 if (res != VKFFT_SUCCESS) return res;
6077 }
6078 }
6079 sc->tempLen = sprintf(sc->tempStr, " %s = ", sc->inoutID); // start the generated assignment "<inoutID> = "
6080 res = VkAppendLine(sc);
6081 if (res != VKFFT_SUCCESS) return res;
6082 res = indexInputVkFFT(sc, uintType, readType, sc->inoutID, 0, requestCoordinate, requestBatch); // capture the status: without the assignment the check below only re-tested the earlier VkAppendLine result
6083 if (res != VKFFT_SUCCESS) return res;
6084 sc->tempLen = sprintf(sc->tempStr, ";\n"); // terminate the generated statement
6085 res = VkAppendLine(sc);
6086 if (res != VKFFT_SUCCESS) return res;
6088 if (res != VKFFT_SUCCESS) return res;
6089 if (sc->LUT) {
6090 sc->tempLen = sprintf(sc->tempStr, " mult = twiddleLUT[%" PRIu64 " + combinedID %% %" PRIu64 "];\n", sc->startDCT3LUT, sc->fftDim / 2 + 1);
6091 res = VkAppendLine(sc);
6092 if (res != VKFFT_SUCCESS) return res;
6093 }
6094 else {
6095 sc->tempLen = sprintf(sc->tempStr, " mult.x = %s(%.17f%s * (combinedID %% %" PRIu64 ") );\n", cosDef, double_PI / 2 / sc->fftDim, LFending, sc->fftDim / 2 + 1);
6096 res = VkAppendLine(sc);
6097 if (res != VKFFT_SUCCESS) return res;
6098 sc->tempLen = sprintf(sc->tempStr, " mult.y = %s(%.17f%s * (combinedID %% %" PRIu64 ") );\n", sinDef, double_PI / 2 / sc->fftDim, LFending, sc->fftDim / 2 + 1);
6099 res = VkAppendLine(sc);
6100 if (res != VKFFT_SUCCESS) return res;
6101 }
6102 if (sc->zeropad[0]) {
6103 sc->tempLen = sprintf(sc->tempStr, " if((inoutID %% %" PRIu64 " < %" PRIu64 ")||(inoutID %% %" PRIu64 " >= %" PRIu64 ")){\n", sc->inputStride[1], sc->fft_zeropad_left_read[sc->axis_id], sc->inputStride[1], sc->fft_zeropad_right_read[sc->axis_id]);
6104 res = VkAppendLine(sc);
6105 if (res != VKFFT_SUCCESS) return res;
6106 }
6107 if (sc->inputBufferBlockNum == 1)
6108 sc->tempLen = sprintf(sc->tempStr, " %s.x = %s%s[%s]%s;\n", sc->regIDs[0], convTypeLeft, inputsStruct, sc->inoutID, convTypeRight);
6109 else
6110 sc->tempLen = sprintf(sc->tempStr, " %s.x = %sinputBlocks[%s / %" PRIu64 "]%s[%s %% %" PRIu64 "]%s;\n", sc->regIDs[0], convTypeLeft, sc->inoutID, sc->inputBufferBlockSize, inputsStruct, sc->inoutID, sc->inputBufferBlockSize, convTypeRight);
6111 res = VkAppendLine(sc);
6112 if (res != VKFFT_SUCCESS) return res;
6113
6114 if (sc->mergeSequencesR2C) {
6115 sc->tempLen = sprintf(sc->tempStr, " inoutID += %" PRIu64 ";\n", sc->inputStride[1]);
6116 res = VkAppendLine(sc);
6117 if (res != VKFFT_SUCCESS) return res;
6118
6119 if (sc->inputBufferBlockNum == 1)
6120 sc->tempLen = sprintf(sc->tempStr, " %s.y = %s%s[inoutID]%s;\n", sc->regIDs[0], convTypeLeft, inputsStruct, convTypeRight);
6121 else
6122 sc->tempLen = sprintf(sc->tempStr, " %s.y = %sinputBlocks[inoutID / %" PRIu64 "]%s[inoutID %% %" PRIu64 "]%s;\n", sc->regIDs[0], convTypeLeft, sc->inputBufferBlockSize, inputsStruct, sc->inputBufferBlockSize, convTypeRight);
6123 res = VkAppendLine(sc);
6124 if (res != VKFFT_SUCCESS) return res;
6125 }
6126 else {
6127 sc->tempLen = sprintf(sc->tempStr, " %s.y = 0;\n", sc->regIDs[0]);
6128 res = VkAppendLine(sc);
6129 if (res != VKFFT_SUCCESS) return res;
6130 }
6131 if (sc->zeropad[0]) {
6132 sc->tempLen = sprintf(sc->tempStr, " }else{\n");
6133 res = VkAppendLine(sc);
6134 if (res != VKFFT_SUCCESS) return res;
6135 sc->tempLen = sprintf(sc->tempStr, " %s.x = 0;\n", sc->regIDs[0]);
6136 res = VkAppendLine(sc);
6137 if (res != VKFFT_SUCCESS) return res;
6138 sc->tempLen = sprintf(sc->tempStr, " %s.y = 0;\n", sc->regIDs[0]);
6139 res = VkAppendLine(sc);
6140 if (res != VKFFT_SUCCESS) return res;
6141 sc->tempLen = sprintf(sc->tempStr, " }\n");
6142 res = VkAppendLine(sc);
6143 if (res != VKFFT_SUCCESS) return res;
6144 }
6145
6146 if (sc->axisSwapped) {
6147 sc->tempLen = sprintf(sc->tempStr, " sdataID = (combinedID %% %" PRIu64 ") * sharedStride + (combinedID / %" PRIu64 ");\n", sc->fftDim / 2 + 1, sc->fftDim / 2 + 1);
6148 res = VkAppendLine(sc);
6149 if (res != VKFFT_SUCCESS) return res;
6150 }
6151 else {
6152 sc->tempLen = sprintf(sc->tempStr, " sdataID = (combinedID %% %" PRIu64 ") + (combinedID / %" PRIu64 ") * sharedStride ;\n", sc->fftDim / 2 + 1, sc->fftDim / 2 + 1);
6153 res = VkAppendLine(sc);
6154 if (res != VKFFT_SUCCESS) return res;
6155 }
6156 sc->tempLen = sprintf(sc->tempStr, " if (combinedID %% %" PRIu64 " > 0){\n", sc->fftDim / 2 + 1);
6157 res = VkAppendLine(sc);
6158 if (res != VKFFT_SUCCESS) return res;
6159
6160 sc->tempLen = sprintf(sc->tempStr, " %s = (%" PRIu64 " - combinedID %% %" PRIu64 ") + ((combinedID/%" PRIu64 ") * %" PRIu64 ");\n", sc->inoutID, sc->fftDim, sc->fftDim / 2 + 1, sc->fftDim / 2 + 1, mult * sc->inputStride[1]);
6161 res = VkAppendLine(sc);
6162 if (res != VKFFT_SUCCESS) return res;
6163 if (sc->zeropad[0]) {
6164 sc->tempLen = sprintf(sc->tempStr, " if((inoutID %% %" PRIu64 " < %" PRIu64 ")||(inoutID %% %" PRIu64 " >= %" PRIu64 ")){\n", sc->inputStride[1], sc->fft_zeropad_left_read[sc->axis_id], sc->inputStride[1], sc->fft_zeropad_right_read[sc->axis_id]);
6165 res = VkAppendLine(sc);
6166 if (res != VKFFT_SUCCESS) return res;
6167 }
6168 sc->tempLen = sprintf(sc->tempStr, " %s = ", sc->inoutID);
6169 res = VkAppendLine(sc);
6170 if (res != VKFFT_SUCCESS) return res;
6171 res = indexInputVkFFT(sc, uintType, readType, sc->inoutID, 0, requestCoordinate, requestBatch);
6172 if (res != VKFFT_SUCCESS) return res;
6173 sc->tempLen = sprintf(sc->tempStr, ";\n");
6174 res = VkAppendLine(sc);
6175 if (res != VKFFT_SUCCESS) return res;
6176 if (sc->inputBufferBlockNum == 1)
6177 sc->tempLen = sprintf(sc->tempStr, " %s.x = %s%s[%s]%s;\n", sc->regIDs[1], convTypeLeft, inputsStruct, sc->inoutID, convTypeRight);
6178 else
6179 sc->tempLen = sprintf(sc->tempStr, " %s.x = %sinputBlocks[%s / %" PRIu64 "]%s[%s %% %" PRIu64 "]%s;\n", sc->regIDs[1], convTypeLeft, sc->inoutID, sc->inputBufferBlockSize, inputsStruct, sc->inoutID, sc->inputBufferBlockSize, convTypeRight);
6180 res = VkAppendLine(sc);
6181 if (res != VKFFT_SUCCESS) return res;
6182
6183 if (sc->mergeSequencesR2C) {
6184 sc->tempLen = sprintf(sc->tempStr, " inoutID += %" PRIu64 ";\n", sc->inputStride[1]);
6185 res = VkAppendLine(sc);
6186 if (res != VKFFT_SUCCESS) return res;
6187
6188 if (sc->inputBufferBlockNum == 1)
6189 sc->tempLen = sprintf(sc->tempStr, " %s.y = %s%s[inoutID]%s;\n", sc->regIDs[1], convTypeLeft, inputsStruct, convTypeRight);
6190 else
6191 sc->tempLen = sprintf(sc->tempStr, " %s.y = %sinputBlocks[inoutID / %" PRIu64 "]%s[inoutID %% %" PRIu64 "]%s;\n", sc->regIDs[1], convTypeLeft, sc->inputBufferBlockSize, inputsStruct, sc->inputBufferBlockSize, convTypeRight);
6192 res = VkAppendLine(sc);
6193 if (res != VKFFT_SUCCESS) return res;
6194 }
6195 else {
6196 sc->tempLen = sprintf(sc->tempStr, " %s.y = 0;\n", sc->regIDs[1]);
6197 res = VkAppendLine(sc);
6198 if (res != VKFFT_SUCCESS) return res;
6199 }
6200 if (sc->zeropad[0]) {
6201 sc->tempLen = sprintf(sc->tempStr, " }else{\n");
6202 res = VkAppendLine(sc);
6203 if (res != VKFFT_SUCCESS) return res;
6204 sc->tempLen = sprintf(sc->tempStr, " %s.x = 0;\n", sc->regIDs[1]);
6205 res = VkAppendLine(sc);
6206 if (res != VKFFT_SUCCESS) return res;
6207 sc->tempLen = sprintf(sc->tempStr, " %s.y = 0;\n", sc->regIDs[1]);
6208 res = VkAppendLine(sc);
6209 if (res != VKFFT_SUCCESS) return res;
6210 sc->tempLen = sprintf(sc->tempStr, " }\n");
6211 res = VkAppendLine(sc);
6212 if (res != VKFFT_SUCCESS) return res;
6213 }
6214 sc->tempLen = sprintf(sc->tempStr, " sdata[sdataID].x = ((%s.x+%s.y)*mult.x+(%s.x-%s.y)*mult.y);\n", sc->regIDs[0], sc->regIDs[1], sc->regIDs[1], sc->regIDs[0]);
6215 res = VkAppendLine(sc);
6216 if (res != VKFFT_SUCCESS) return res;
6217 sc->tempLen = sprintf(sc->tempStr, " sdata[sdataID].y = ((-%s.x+%s.y)*mult.x+(%s.x+%s.y)*mult.y);\n", sc->regIDs[1], sc->regIDs[0], sc->regIDs[0], sc->regIDs[1]);
6218 res = VkAppendLine(sc);
6219 if (res != VKFFT_SUCCESS) return res;
6220 if (sc->axisSwapped) {
6221 sc->tempLen = sprintf(sc->tempStr, " sdataID = (%" PRIu64 " - combinedID %% %" PRIu64 ") * sharedStride + (combinedID / %" PRIu64 ");\n", sc->fftDim, sc->fftDim / 2 + 1, sc->fftDim / 2 + 1);
6222 res = VkAppendLine(sc);
6223 if (res != VKFFT_SUCCESS) return res;
6224 }
6225 else {
6226 sc->tempLen = sprintf(sc->tempStr, " sdataID = (%" PRIu64 " - combinedID %% %" PRIu64 ") + (combinedID / %" PRIu64 ") * sharedStride;\n", sc->fftDim, sc->fftDim / 2 + 1, sc->fftDim / 2 + 1);
6227 res = VkAppendLine(sc);
6228 if (res != VKFFT_SUCCESS) return res;
6229 }
6230 sc->tempLen = sprintf(sc->tempStr, " sdata[sdataID].x = ((%s.x-%s.y)*mult.x+(%s.x+%s.y)*mult.y);\n", sc->regIDs[0], sc->regIDs[1], sc->regIDs[1], sc->regIDs[0]);
6231 res = VkAppendLine(sc);
6232 if (res != VKFFT_SUCCESS) return res;
6233 sc->tempLen = sprintf(sc->tempStr, " sdata[sdataID].y = ((%s.x+%s.y)*mult.x-(%s.x-%s.y)*mult.y);\n", sc->regIDs[1], sc->regIDs[0], sc->regIDs[0], sc->regIDs[1]);
6234 res = VkAppendLine(sc);
6235 if (res != VKFFT_SUCCESS) return res;
6236
6237 sc->tempLen = sprintf(sc->tempStr, " } else {\n");
6238 res = VkAppendLine(sc);
6239
6240 if (res != VKFFT_SUCCESS) return res;
6241
6242 sc->tempLen = sprintf(sc->tempStr, " sdata[sdataID].x = (%s.x*mult.x-%s.y*mult.y);\n", sc->regIDs[0], sc->regIDs[0]);
6243 res = VkAppendLine(sc);
6244 if (res != VKFFT_SUCCESS) return res;
6245 sc->tempLen = sprintf(sc->tempStr, " sdata[sdataID].y = (%s.y*mult.x+%s.x*mult.y);\n", sc->regIDs[0], sc->regIDs[0]);
6246 res = VkAppendLine(sc);
6247 if (res != VKFFT_SUCCESS) return res;
6248 sc->tempLen = sprintf(sc->tempStr, " }\n");
6249
6250 res = VkAppendLine(sc);
6251 if (res != VKFFT_SUCCESS) return res;
6253 if (res != VKFFT_SUCCESS) return res;
6254
6255 if (sc->axisSwapped) {
6256 if ((1 + i + k * num_in) * sc->localSize[0] * sc->localSize[1] >= (sc->fftDim / 2 + 1) * sc->localSize[0]) {
6257 sc->tempLen = sprintf(sc->tempStr, " }\n");
6258 res = VkAppendLine(sc);
6259 if (res != VKFFT_SUCCESS) return res;
6260 }
6261 }
6262 else {
6263 if ((1 + i + k * num_in) * sc->localSize[0] * sc->localSize[1] >= (sc->fftDim / 2 + 1) * sc->localSize[1])
6264 {
6265 sc->tempLen = sprintf(sc->tempStr, " }\n");
6266 res = VkAppendLine(sc);
6267 if (res != VKFFT_SUCCESS) return res;
6268 }
6269 }
6270 if (sc->axisSwapped) { // close the batch-bound "if" opened earlier in this iteration; the condition must mirror the opening one exactly so the generated braces stay balanced
6271 if (sc->size[sc->axis_id + 1] % sc->localSize[0] != 0) { // same expression as the opening guard (previously sc->size[1], which only matches when axis_id is 0)
6272 sc->tempLen = sprintf(sc->tempStr, " }\n");
6273 res = VkAppendLine(sc);
6274 if (res != VKFFT_SUCCESS) return res;
6275 }
6276 }
6277 else {
6278 if (sc->size[sc->axis_id + 1] % sc->localSize[1] != 0) { // same expression as the opening guard of the non-swapped path
6279 sc->tempLen = sprintf(sc->tempStr, " }\n");
6280 res = VkAppendLine(sc);
6281 if (res != VKFFT_SUCCESS) return res;
6282 }
6283 }
6284 }
6285 }
6286 if (sc->zeropadBluestein[0]) sc->fftDim = sc->fft_dim_full;
6287 }
6288 else {
6289 //Not implemented
6290 }
6291 break;
6292 }
6293 case 131://DCT-III strided
6294 {
6295 char shiftX[500] = "";
6296 if (sc->performWorkGroupShift[0])
6297 sprintf(shiftX, " + consts.workGroupShiftX ");
6298 char shiftX2[500] = "";
6299 if (sc->performWorkGroupShift[0])
6300 sprintf(shiftX2, " + consts.workGroupShiftX * %s ", sc->gl_WorkGroupSize_x);
6301 char shiftY[500] = "";
6302 if (sc->performWorkGroupShift[1])
6303 sprintf(shiftY, " + consts.workGroupShiftY ");
6304 uint64_t mult = (sc->mergeSequencesR2C) ? 2 : 1;
6305 uint64_t num_in = (uint64_t)ceil((sc->fftDim / 2 + 1) / (double)sc->localSize[1]);
6306
6307 if (sc->fftDim == sc->fft_dim_full) {
6308 if (sc->zeropadBluestein[0]) {
6309 res = appendSetSMToZero(sc, floatType, floatTypeMemory, uintType, readType);
6310 if (res != VKFFT_SUCCESS) return res;
6311 res = appendBarrierVkFFT(sc, 1);
6312 if (res != VKFFT_SUCCESS) return res;
6314 }
6315 for (uint64_t k = 0; k < sc->registerBoost; k++) {
6316 for (uint64_t i = 0; i < num_in; i++) {
6317
6318 //sc->tempLen = sprintf(sc->tempStr, " combinedID = (%s + %" PRIu64 " * %s) + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, sc->localSize[0], sc->gl_LocalInvocationID_y, (i + k * mult * sc->min_registers_per_thread) * sc->localSize[0] * sc->localSize[1]);
6319 //res = VkAppendLine(sc);
6320 //if (res != VKFFT_SUCCESS) return res;
6321
6322 if ((uint64_t)ceil(sc->size[0] / (double)mult) % sc->localSize[0] != 0) {
6323 sc->tempLen = sprintf(sc->tempStr, " if ((%s%s) < %" PRIu64 ") {\n", sc->gl_GlobalInvocationID_x, shiftX2, (uint64_t)ceil(sc->size[0] / (double)mult));
6324 res = VkAppendLine(sc);
6325 if (res != VKFFT_SUCCESS) return res;
6326 }
6327
6328 if (sc->mergeSequencesR2C)
6329 sc->tempLen = sprintf(sc->tempStr, " combinedID = (%s + %" PRIu64 ") / %" PRIu64 ";\n", sc->gl_LocalInvocationID_y, (i + k * num_in) * sc->localSize[1], mult);
6330 else
6331 sc->tempLen = sprintf(sc->tempStr, " combinedID = (%s + %" PRIu64 ");\n", sc->gl_LocalInvocationID_y, (i + k * num_in) * sc->localSize[1]);
6332 res = VkAppendLine(sc);
6333 if (res != VKFFT_SUCCESS) return res;
6334 if ((1 + i + k * num_in) * sc->localSize[1] >= (sc->fftDim / 2 + 1))
6335 {
6336 sc->tempLen = sprintf(sc->tempStr, " if(combinedID < %" PRIu64 "){\n", (sc->fftDim / 2 + 1));
6337 res = VkAppendLine(sc);
6338 if (res != VKFFT_SUCCESS) return res;
6339 }
6340
6341 sc->tempLen = sprintf(sc->tempStr, " %s = ", sc->inoutID);
6342 res = VkAppendLine(sc);
6343 if (res != VKFFT_SUCCESS) return res;
6344 if (sc->mergeSequencesR2C) {
6345 sprintf(index_x, "(%s + %" PRIu64 " * ((%s %% %" PRIu64 ") + (%s%s) * %" PRIu64 ")) %% (%" PRIu64 ")", sc->gl_LocalInvocationID_x, sc->localSize[0], sc->gl_LocalInvocationID_y, mult, sc->gl_WorkGroupID_x, shiftX, mult, sc->fft_dim_x);
6346
6347 sprintf(index_y, "(%s/%" PRIu64 " + %" PRIu64 ")", sc->gl_LocalInvocationID_y, mult, (i + k * num_in) * sc->localSize[1]);
6348 }
6349 else {
6350 sprintf(index_x, "(%s%s) %% (%" PRIu64 ")", sc->gl_GlobalInvocationID_x, shiftX2, sc->fft_dim_x);
6351 sprintf(index_y, "(%s + %" PRIu64 ")", sc->gl_LocalInvocationID_y, (i + k * num_in) * sc->localSize[1]);
6352 }
6353 res = indexInputVkFFT(sc, uintType, readType, index_x, index_y, requestCoordinate, requestBatch);
6354 if (res != VKFFT_SUCCESS) return res;
6355 sc->tempLen = sprintf(sc->tempStr, ";\n");
6356 res = VkAppendLine(sc);
6357 if (res != VKFFT_SUCCESS) return res;
6358
6360 if (res != VKFFT_SUCCESS) return res;
6361 if (sc->zeropad[0]) {
6362 sc->tempLen = sprintf(sc->tempStr, " if((%s %% %" PRIu64 " < %" PRIu64 ")||(%s %% %" PRIu64 " >= %" PRIu64 ")){\n", index_y, sc->fft_dim_full, sc->fft_zeropad_left_read[sc->axis_id], index_y, sc->fft_dim_full, sc->fft_zeropad_right_read[sc->axis_id]);
6363 res = VkAppendLine(sc);
6364 if (res != VKFFT_SUCCESS) return res;
6365 }
6366 if (sc->inputBufferBlockNum == 1)
6367 sc->tempLen = sprintf(sc->tempStr, " %s.x = %s%s[%s]%s;\n", sc->regIDs[0], convTypeLeft, inputsStruct, sc->inoutID, convTypeRight);
6368 else
6369 sc->tempLen = sprintf(sc->tempStr, " %s.x = %sinputBlocks[%s / %" PRIu64 "]%s[%s %% %" PRIu64 "]%s;\n", sc->regIDs[0], convTypeLeft, sc->inoutID, sc->inputBufferBlockSize, inputsStruct, sc->inoutID, sc->inputBufferBlockSize, convTypeRight);
6370 res = VkAppendLine(sc);
6371 if (res != VKFFT_SUCCESS) return res;
6372
6373 if (sc->zeropad[0]) {
6374 sc->tempLen = sprintf(sc->tempStr, " }else{\n");
6375 res = VkAppendLine(sc);
6376 if (res != VKFFT_SUCCESS) return res;
6377 sc->tempLen = sprintf(sc->tempStr, " %s.x = 0;\n", sc->regIDs[0]);
6378 res = VkAppendLine(sc);
6379 if (res != VKFFT_SUCCESS) return res;
6380 sc->tempLen = sprintf(sc->tempStr, " }\n");
6381 res = VkAppendLine(sc);
6382 if (res != VKFFT_SUCCESS) return res;
6383 }
6384 if (sc->mergeSequencesR2C) {
6385 }
6386 else {
6387 sc->tempLen = sprintf(sc->tempStr, " %s.y = 0;\n", sc->regIDs[0]);
6388 res = VkAppendLine(sc);
6389 if (res != VKFFT_SUCCESS) return res;
6390 }
6391
6392 if (sc->LUT) {
6393 sc->tempLen = sprintf(sc->tempStr, " mult = twiddleLUT[%" PRIu64 " + combinedID];\n", sc->startDCT3LUT);
6394 res = VkAppendLine(sc);
6395 if (res != VKFFT_SUCCESS) return res;
6396 }
6397 else {
6398 sc->tempLen = sprintf(sc->tempStr, " mult.x = %s(%.17f%s * (combinedID) );\n", cosDef, double_PI / 2 / sc->fftDim, LFending);
6399 res = VkAppendLine(sc);
6400 if (res != VKFFT_SUCCESS) return res;
6401 sc->tempLen = sprintf(sc->tempStr, " mult.y = %s(%.17f%s * (combinedID) );\n", sinDef, double_PI / 2 / sc->fftDim, LFending);
6402 res = VkAppendLine(sc);
6403 if (res != VKFFT_SUCCESS) return res;
6404 }
6405 //sc->tempLen = sprintf(sc->tempStr, " printf(\" %%f - %%f \\n\", mult.x, mult.y);\n");
6406 //res = VkAppendLine(sc);
6407 //if (res != VKFFT_SUCCESS) return res;
6408 if (sc->mergeSequencesR2C)
6409 sc->tempLen = sprintf(sc->tempStr, " //sdataID = (combinedID) * sharedStride + (%s + ((%s + %" PRIu64 ") %% %" PRIu64 ") * %" PRIu64 ") / %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, sc->gl_LocalInvocationID_y, (i + k * sc->min_registers_per_thread) * sc->localSize[1], mult, sc->localSize[0], mult);
6410 else
6411 sc->tempLen = sprintf(sc->tempStr, " sdataID = (combinedID) * sharedStride + %s;\n", sc->gl_LocalInvocationID_x);
6412
6413 res = VkAppendLine(sc);
6414 if (res != VKFFT_SUCCESS) return res;
6415
6416 sc->tempLen = sprintf(sc->tempStr, " if (combinedID > 0){\n");
6417 res = VkAppendLine(sc);
6418 if (res != VKFFT_SUCCESS) return res;
6419
6420 sc->tempLen = sprintf(sc->tempStr, " %s = ", sc->inoutID);
6421 res = VkAppendLine(sc);
6422 if (res != VKFFT_SUCCESS) return res;
6423 if (sc->mergeSequencesR2C) {
6424 sprintf(index_x, "(%s + %" PRIu64 " * ((%s %% %" PRIu64 ") + (%s%s) * %" PRIu64 ")) %% (%" PRIu64 ")", sc->gl_LocalInvocationID_x, sc->localSize[0], sc->gl_LocalInvocationID_y, mult, sc->gl_WorkGroupID_x, shiftX, mult, sc->fft_dim_x);
6425
6426 sprintf(index_y, "(%" PRIu64 " - (%s/%" PRIu64 " + %" PRIu64 "))", sc->fftDim, sc->gl_LocalInvocationID_y, mult, (i + k * num_in) * sc->localSize[1]);
6427 }
6428 else {
6429 sprintf(index_x, "(%s%s) %% (%" PRIu64 ")", sc->gl_GlobalInvocationID_x, shiftX2, sc->fft_dim_x);
6430 sprintf(index_y, "(%" PRIu64 " - (%s + %" PRIu64 "))", sc->fftDim, sc->gl_LocalInvocationID_y, (i + k * num_in) * sc->localSize[1]);
6431 }
6432 res = indexInputVkFFT(sc, uintType, readType, index_x, index_y, requestCoordinate, requestBatch);
6433 if (res != VKFFT_SUCCESS) return res;
6434 sc->tempLen = sprintf(sc->tempStr, ";\n");
6435 res = VkAppendLine(sc);
6436 if (res != VKFFT_SUCCESS) return res;
6437 if (sc->zeropad[0]) {
6438 sc->tempLen = sprintf(sc->tempStr, " if((%s %% %" PRIu64 " < %" PRIu64 ")||(%s %% %" PRIu64 " >= %" PRIu64 ")){\n", index_y, sc->fft_dim_full, sc->fft_zeropad_left_read[sc->axis_id], index_y, sc->fft_dim_full, sc->fft_zeropad_right_read[sc->axis_id]);
6439 res = VkAppendLine(sc);
6440 if (res != VKFFT_SUCCESS) return res;
6441 }
6442 if (sc->inputBufferBlockNum == 1)
6443 sc->tempLen = sprintf(sc->tempStr, " %s.x = %s%s[%s]%s;\n", sc->regIDs[1], convTypeLeft, inputsStruct, sc->inoutID, convTypeRight);
6444 else
6445 sc->tempLen = sprintf(sc->tempStr, " %s.x = %sinputBlocks[%s / %" PRIu64 "]%s[%s %% %" PRIu64 "]%s;\n", sc->regIDs[1], convTypeLeft, sc->inoutID, sc->inputBufferBlockSize, inputsStruct, sc->inoutID, sc->inputBufferBlockSize, convTypeRight);
6446 res = VkAppendLine(sc);
6447 if (res != VKFFT_SUCCESS) return res;
6448 if (sc->zeropad[0]) {
6449 sc->tempLen = sprintf(sc->tempStr, " }else{\n");
6450 res = VkAppendLine(sc);
6451 if (res != VKFFT_SUCCESS) return res;
6452 sc->tempLen = sprintf(sc->tempStr, " %s.x = 0;\n", sc->regIDs[1]);
6453 res = VkAppendLine(sc);
6454 if (res != VKFFT_SUCCESS) return res;
6455 sc->tempLen = sprintf(sc->tempStr, " }\n");
6456 res = VkAppendLine(sc);
6457 if (res != VKFFT_SUCCESS) return res;
6458 }
6459 if (sc->mergeSequencesR2C) {
6460
6461 }
6462 else {
6463 sc->tempLen = sprintf(sc->tempStr, " %s.y = 0;\n", sc->regIDs[1]);
6464 res = VkAppendLine(sc);
6465 if (res != VKFFT_SUCCESS) return res;
6466 }
6467
6468 sc->tempLen = sprintf(sc->tempStr, " sdata[sdataID].x = ((%s.x+%s.y)*mult.x-(%s.y-%s.x)*mult.y);\n", sc->regIDs[0], sc->regIDs[1], sc->regIDs[0], sc->regIDs[1]);
6469 res = VkAppendLine(sc);
6470 if (res != VKFFT_SUCCESS) return res;
6471 sc->tempLen = sprintf(sc->tempStr, " sdata[sdataID].y = ((%s.y-%s.x)*mult.x+(%s.x+%s.y)*mult.y);\n", sc->regIDs[0], sc->regIDs[1], sc->regIDs[0], sc->regIDs[1]);
6472 res = VkAppendLine(sc);
6473 if (res != VKFFT_SUCCESS) return res;
6474 sc->tempLen = sprintf(sc->tempStr, " sdataID = (%" PRIu64 " - combinedID) * sharedStride + %s;\n", sc->fftDim, sc->gl_LocalInvocationID_x);
6475 res = VkAppendLine(sc);
6476 if (res != VKFFT_SUCCESS) return res;
6477 sc->tempLen = sprintf(sc->tempStr, " sdata[sdataID].x = ((%s.x+%s.y)*mult.x-(%s.y-%s.x)*mult.y);\n", sc->regIDs[0], sc->regIDs[1], sc->regIDs[0], sc->regIDs[1]);
6478 res = VkAppendLine(sc);
6479 if (res != VKFFT_SUCCESS) return res;
6480 sc->tempLen = sprintf(sc->tempStr, " sdata[sdataID].y = -((%s.y-%s.x)*mult.x+(%s.x+%s.y)*mult.y);\n", sc->regIDs[0], sc->regIDs[1], sc->regIDs[0], sc->regIDs[1]);
6481 res = VkAppendLine(sc);
6482 if (res != VKFFT_SUCCESS) return res;
6483
6484 sc->tempLen = sprintf(sc->tempStr, " } else {\n");
6485 res = VkAppendLine(sc);
6486
6487 if (res != VKFFT_SUCCESS) return res;
6488
6489 sc->tempLen = sprintf(sc->tempStr, " sdata[sdataID].x = ((%s.x)*mult.x-(%s.y)*mult.y);\n", sc->regIDs[0], sc->regIDs[0]);
6490 res = VkAppendLine(sc);
6491 if (res != VKFFT_SUCCESS) return res;
6492 sc->tempLen = sprintf(sc->tempStr, " sdata[sdataID].y = ((%s.y)*mult.x+(%s.x)*mult.y);\n", sc->regIDs[0], sc->regIDs[0]);
6493 res = VkAppendLine(sc);
6494 if (res != VKFFT_SUCCESS) return res;
6495 sc->tempLen = sprintf(sc->tempStr, " }\n");
6496 res = VkAppendLine(sc);
6497 if (res != VKFFT_SUCCESS) return res;
6498
6500 if (res != VKFFT_SUCCESS) return res;
6501
6502 if ((uint64_t)ceil(sc->size[0] / (double)mult) % sc->localSize[0] != 0) {
6503 sc->tempLen = sprintf(sc->tempStr, " }\n");
6504 res = VkAppendLine(sc);
6505 if (res != VKFFT_SUCCESS) return res;
6506 }
6507 if ((1 + i + k * num_in) * sc->localSize[1] >= (sc->fftDim / 2 + 1))
6508 {
6509 sc->tempLen = sprintf(sc->tempStr, " }\n");
6510 res = VkAppendLine(sc);
6511 if (res != VKFFT_SUCCESS) return res;
6512 }
6513 }
6514 }
6515 if (sc->zeropadBluestein[0]) sc->fftDim = sc->fft_dim_full;
6516 }
6517 else {
6518 //Not implemented
6519 }
6520 break;
6521 }
6522 case 140://DCT-IV nonstrided cast to 8x FFT
6523 {
6524 char shiftX[500] = "";
6525 if (sc->performWorkGroupShift[0])
6526 sprintf(shiftX, " + consts.workGroupShiftX ");
6527 char shiftY[500] = "";
6528 if (sc->axisSwapped) {
6529 if (sc->performWorkGroupShift[1])
6530 sprintf(shiftY, " + consts.workGroupShiftY*%s ", sc->gl_WorkGroupSize_x);
6531 }
6532 else {
6533 if (sc->performWorkGroupShift[1])
6534 sprintf(shiftY, " + consts.workGroupShiftY*%s ", sc->gl_WorkGroupSize_y);
6535 }
6536 char shiftY2[100] = ""; // NOTE(review): declared empty and not written by the statement below
6537 if (sc->performWorkGroupShift[1])
6538 sprintf(shiftY, " + consts.workGroupShiftY "); // NOTE(review): writes shiftY, replacing the "workGroupShiftY*<size>" string built just above under the same condition; shiftY2 may have been the intended target — confirm against later uses
6539 if (sc->fftDim < sc->fft_dim_full) {
6540 if (sc->axisSwapped) {
6541 sc->tempLen = sprintf(sc->tempStr, " %s numActiveThreads = ((%s/%" PRIu64 ")==%" PRIu64 ") ? %" PRIu64 " : %" PRIu64 ";\n", uintType, sc->gl_WorkGroupID_x, sc->firstStageStartSize / sc->fftDim, ((uint64_t)floor(sc->fft_dim_full / ((double)sc->localSize[0] * sc->fftDim))) / (sc->firstStageStartSize / sc->fftDim), (sc->fft_dim_full - (sc->firstStageStartSize / sc->fftDim) * ((((uint64_t)floor(sc->fft_dim_full / ((double)sc->localSize[0] * sc->fftDim))) / (sc->firstStageStartSize / sc->fftDim)) * sc->localSize[0] * sc->fftDim)) / sc->min_registers_per_thread / (sc->firstStageStartSize / sc->fftDim), sc->localSize[0] * sc->localSize[1]);// sc->fft_dim_full, sc->gl_WorkGroupID_x, shiftX, sc->firstStageStartSize / sc->fftDim, sc->fftDim, sc->gl_WorkGroupID_x, shiftX, sc->firstStageStartSize / sc->fftDim, sc->localSize[0] * sc->firstStageStartSize, sc->fft_dim_full / (sc->localSize[0] * sc->fftDim));
6542 res = VkAppendLine(sc);
6543 if (res != VKFFT_SUCCESS) return res;
6544 sprintf(sc->disableThreadsStart, " if(%s * %" PRIu64 " + (((%s%s) %% %" PRIu64 ") * %" PRIu64 " + ((%s%s) / %" PRIu64 ") * %" PRIu64 ") < %" PRIu64 ") {\n", sc->gl_LocalInvocationID_x, sc->firstStageStartSize, sc->gl_WorkGroupID_x, shiftX, sc->firstStageStartSize / sc->fftDim, sc->fftDim, sc->gl_WorkGroupID_x, shiftX, sc->firstStageStartSize / sc->fftDim, sc->localSize[0] * sc->firstStageStartSize, sc->fft_dim_full);
6545 sc->tempLen = sprintf(sc->tempStr, " if((%s+%" PRIu64 "*%s)< numActiveThreads) {\n", sc->gl_LocalInvocationID_x, sc->localSize[0], sc->gl_LocalInvocationID_y);
6546 res = VkAppendLine(sc);
6547 if (res != VKFFT_SUCCESS) return res;
6548 sprintf(sc->disableThreadsEnd, "}");
6549 }
6550 else {
6551 sprintf(sc->disableThreadsStart, " if(%s * %" PRIu64 " + (((%s%s) %% %" PRIu64 ") * %" PRIu64 " + ((%s%s) / %" PRIu64 ") * %" PRIu64 ") < %" PRIu64 ") {\n", sc->gl_LocalInvocationID_y, sc->firstStageStartSize, sc->gl_WorkGroupID_x, shiftX, sc->firstStageStartSize / sc->fftDim, sc->fftDim, sc->gl_WorkGroupID_x, shiftX, sc->firstStageStartSize / sc->fftDim, sc->localSize[1] * sc->firstStageStartSize, sc->fft_dim_full);
6553 if (res != VKFFT_SUCCESS) return res;
6554 sprintf(sc->disableThreadsEnd, "}");
6555 }
6556 }
6557 else {
6558 sc->tempLen = sprintf(sc->tempStr, " { \n");
6559 res = VkAppendLine(sc);
6560 if (res != VKFFT_SUCCESS) return res;
6561 }
6562 sc->tempLen = sprintf(sc->tempStr, " %s.x = 0;\n", sc->regIDs[1]);
6563 res = VkAppendLine(sc);
6564 if (res != VKFFT_SUCCESS) return res;
6565 sc->tempLen = sprintf(sc->tempStr, " %s.y = 0;\n", sc->regIDs[1]);
6566 res = VkAppendLine(sc);
6567 if (res != VKFFT_SUCCESS) return res;
6568 if (sc->fftDim == sc->fft_dim_full) {
6569 for (uint64_t k = 0; k < sc->registerBoost; k++) {
6570 for (uint64_t i = 0; i < (uint64_t)ceil(sc->min_registers_per_thread / 8.0); i++) {
6571
6572 if (sc->localSize[1] == 1)
6573 sc->tempLen = sprintf(sc->tempStr, " combinedID = %s + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, (i + k * sc->min_registers_per_thread) * sc->localSize[0]);
6574 else
6575 sc->tempLen = sprintf(sc->tempStr, " combinedID = (%s + %" PRIu64 " * %s) + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, sc->localSize[0], sc->gl_LocalInvocationID_y, (i + k * sc->min_registers_per_thread) * sc->localSize[0] * sc->localSize[1]);
6576 res = VkAppendLine(sc);
6577 if (res != VKFFT_SUCCESS) return res;
6578 if (sc->inputStride[0] > 1)
6579 sc->tempLen = sprintf(sc->tempStr, " inoutID = (combinedID %% %" PRIu64 ") * %" PRIu64 " + (combinedID / %" PRIu64 ") * %" PRIu64 ";\n", sc->fftDim / 8, sc->inputStride[0], sc->fftDim / 8, sc->inputStride[1]);
6580 else
6581 sc->tempLen = sprintf(sc->tempStr, " inoutID = (combinedID %% %" PRIu64 ") + (combinedID / %" PRIu64 ") * %" PRIu64 ";\n", sc->fftDim / 8, sc->fftDim / 8, sc->inputStride[1]);
6582 res = VkAppendLine(sc);
6583 if (res != VKFFT_SUCCESS) return res;
6584 if (sc->axisSwapped) {
6585 if (sc->size[sc->axis_id + 1] % sc->localSize[0] != 0) {
6586 sc->tempLen = sprintf(sc->tempStr, " if(combinedID / %" PRIu64 " + (%s%s)*%" PRIu64 "< %" PRIu64 "){", sc->fftDim / 8, sc->gl_WorkGroupID_y, shiftY2, sc->localSize[0], sc->size[sc->axis_id + 1]);
6587 res = VkAppendLine(sc);
6588 if (res != VKFFT_SUCCESS) return res;
6589 }
6590 sc->tempLen = sprintf(sc->tempStr, " if(combinedID < %" PRIu64 "){\n", sc->fftDim / 8 * sc->localSize[0]);
6591 res = VkAppendLine(sc);
6592 if (res != VKFFT_SUCCESS) return res;
6593 }
6594 else {
6595 if (sc->size[sc->axis_id + 1] % sc->localSize[1] != 0) {
6596 sc->tempLen = sprintf(sc->tempStr, " if(combinedID / %" PRIu64 " + (%s%s)*%" PRIu64 "< %" PRIu64 "){", sc->fftDim / 8, sc->gl_WorkGroupID_y, shiftY2, sc->localSize[1], sc->size[sc->axis_id + 1]);
6597 res = VkAppendLine(sc);
6598 if (res != VKFFT_SUCCESS) return res;
6599 }
6600 sc->tempLen = sprintf(sc->tempStr, " if(combinedID < %" PRIu64 "){\n", sc->fftDim / 8 * sc->localSize[1]);
6601 res = VkAppendLine(sc);
6602 if (res != VKFFT_SUCCESS) return res;
6603 }
6604 if (sc->zeropad[0]) {
6605 sc->tempLen = sprintf(sc->tempStr, " if((inoutID %% %" PRIu64 " < %" PRIu64 ")||(inoutID %% %" PRIu64 " >= %" PRIu64 ")){\n", sc->inputStride[1], sc->fft_zeropad_left_read[sc->axis_id], sc->inputStride[1], sc->fft_zeropad_right_read[sc->axis_id]);
6606 res = VkAppendLine(sc);
6607 if (res != VKFFT_SUCCESS) return res;
6608 }
6609 sc->tempLen = sprintf(sc->tempStr, " %s = ", sc->inoutID);
6610 res = VkAppendLine(sc);
6611 if (res != VKFFT_SUCCESS) return res;
6612 res = indexInputVkFFT(sc, uintType, readType, sc->inoutID, 0, requestCoordinate, requestBatch);
6613 if (res != VKFFT_SUCCESS) return res;
6614 sc->tempLen = sprintf(sc->tempStr, ";\n");
6615 res = VkAppendLine(sc);
6616 if (res != VKFFT_SUCCESS) return res;
6618 if (res != VKFFT_SUCCESS) return res;
6619
6620 if (sc->inputBufferBlockNum == 1)
6621 sc->tempLen = sprintf(sc->tempStr, " %s.x = %s%s[%s]%s;\n", sc->regIDs[0], convTypeLeft, inputsStruct, sc->inoutID, convTypeRight);
6622 else
6623 sc->tempLen = sprintf(sc->tempStr, " %s.x = %sinputBlocks[%s / %" PRIu64 "]%s[%s %% %" PRIu64 "]%s;\n", sc->regIDs[0], convTypeLeft, sc->inoutID, sc->inputBufferBlockSize, inputsStruct, sc->inoutID, sc->inputBufferBlockSize, convTypeRight);
6624 res = VkAppendLine(sc);
6625 if (res != VKFFT_SUCCESS) return res;
6626 sc->tempLen = sprintf(sc->tempStr, " %s.y = 0;\n", sc->regIDs[0]);
6627 res = VkAppendLine(sc);
6628 if (res != VKFFT_SUCCESS) return res;
6629 if (sc->axisSwapped) {
6630 sc->tempLen = sprintf(sc->tempStr, " sdata[2*(combinedID %% %" PRIu64 ") * sharedStride + (combinedID / %" PRIu64 ")] = %s;\n", sc->fftDim / 8, sc->fftDim / 8, sc->regIDs[1]);
6631 res = VkAppendLine(sc);
6632 if (res != VKFFT_SUCCESS) return res;
6633 sc->tempLen = sprintf(sc->tempStr, " sdata[(2*(combinedID %% %" PRIu64 ")+1) * sharedStride + (combinedID / %" PRIu64 ")] = %s;\n", sc->fftDim / 8, sc->fftDim / 8, sc->regIDs[0]);
6634 res = VkAppendLine(sc);
6635 if (res != VKFFT_SUCCESS) return res;
6636 sc->tempLen = sprintf(sc->tempStr, " sdata[(%" PRIu64 " - 2*(combinedID %% %" PRIu64 ")) * sharedStride + (combinedID / %" PRIu64 ")] = %s;\n", sc->fftDim - 2, sc->fftDim / 8, sc->fftDim / 8, sc->regIDs[1]);
6637 res = VkAppendLine(sc);
6638 if (res != VKFFT_SUCCESS) return res;
6639 sc->tempLen = sprintf(sc->tempStr, " sdata[(%" PRIu64 " - 2*(combinedID %% %" PRIu64 ")) * sharedStride + (combinedID / %" PRIu64 ")] = %s;\n", sc->fftDim - 1, sc->fftDim / 8, sc->fftDim / 8, sc->regIDs[0]);
6640 res = VkAppendLine(sc);
6641 if (res != VKFFT_SUCCESS) return res;
6642 sc->tempLen = sprintf(sc->tempStr, " %s.x = - %s.x;\n", sc->regIDs[0], sc->regIDs[0]);
6643 res = VkAppendLine(sc);
6644 if (res != VKFFT_SUCCESS) return res;
6645 sc->tempLen = sprintf(sc->tempStr, " sdata[(%" PRIu64 " - 2*(combinedID %% %" PRIu64 ")) * sharedStride + (combinedID / %" PRIu64 ")] = %s;\n", sc->fftDim / 2 - 2, sc->fftDim / 8, sc->fftDim / 8, sc->regIDs[1]);
6646 res = VkAppendLine(sc);
6647 if (res != VKFFT_SUCCESS) return res;
6648 sc->tempLen = sprintf(sc->tempStr, " sdata[(%" PRIu64 " - 2*(combinedID %% %" PRIu64 ")) * sharedStride + (combinedID / %" PRIu64 ")] = %s;\n", sc->fftDim / 2 - 1, sc->fftDim / 8, sc->fftDim / 8, sc->regIDs[0]);
6649 res = VkAppendLine(sc);
6650 if (res != VKFFT_SUCCESS) return res;
6651 sc->tempLen = sprintf(sc->tempStr, " sdata[(%" PRIu64 " + 2*(combinedID %% %" PRIu64 ")) * sharedStride + (combinedID / %" PRIu64 ")] = %s;\n", sc->fftDim / 2, sc->fftDim / 8, sc->fftDim / 8, sc->regIDs[1]);
6652 res = VkAppendLine(sc);
6653 if (res != VKFFT_SUCCESS) return res;
6654 sc->tempLen = sprintf(sc->tempStr, " sdata[(%" PRIu64 " + 2*(combinedID %% %" PRIu64 ")) * sharedStride + (combinedID / %" PRIu64 ")] = %s;\n", sc->fftDim / 2 + 1, sc->fftDim / 8, sc->fftDim / 8, sc->regIDs[0]);
6655 res = VkAppendLine(sc);
6656 if (res != VKFFT_SUCCESS) return res;
6657 }
6658 else {
6659 sc->tempLen = sprintf(sc->tempStr, " sdata[2*(combinedID %% %" PRIu64 ") + (combinedID / %" PRIu64 ") * sharedStride] = %s;\n", sc->fftDim / 8, sc->fftDim / 8, sc->regIDs[1]);
6660 res = VkAppendLine(sc);
6661 if (res != VKFFT_SUCCESS) return res;
6662 sc->tempLen = sprintf(sc->tempStr, " sdata[(2*(combinedID %% %" PRIu64 ")+1) + (combinedID / %" PRIu64 ") * sharedStride] = %s;\n", sc->fftDim / 8, sc->fftDim / 8, sc->regIDs[0]);
6663 res = VkAppendLine(sc);
6664 if (res != VKFFT_SUCCESS) return res;
6665 sc->tempLen = sprintf(sc->tempStr, " sdata[(%" PRIu64 " - 2*(combinedID %% %" PRIu64 ")) + (combinedID / %" PRIu64 ") * sharedStride] = %s;\n", sc->fftDim - 2, sc->fftDim / 8, sc->fftDim / 8, sc->regIDs[1]);
6666 res = VkAppendLine(sc);
6667 if (res != VKFFT_SUCCESS) return res;
6668 sc->tempLen = sprintf(sc->tempStr, " sdata[(%" PRIu64 " - 2*(combinedID %% %" PRIu64 ")) + (combinedID / %" PRIu64 ") * sharedStride] = %s;\n", sc->fftDim - 1, sc->fftDim / 8, sc->fftDim / 8, sc->regIDs[0]);
6669 res = VkAppendLine(sc);
6670 if (res != VKFFT_SUCCESS) return res;
6671 sc->tempLen = sprintf(sc->tempStr, " %s.x = - %s.x;\n", sc->regIDs[0], sc->regIDs[0]);
6672 res = VkAppendLine(sc);
6673 if (res != VKFFT_SUCCESS) return res;
6674 sc->tempLen = sprintf(sc->tempStr, " sdata[(%" PRIu64 " - 2*(combinedID %% %" PRIu64 ")) + (combinedID / %" PRIu64 ") * sharedStride] = %s;\n", sc->fftDim / 2 - 2, sc->fftDim / 8, sc->fftDim / 8, sc->regIDs[1]);
6675 res = VkAppendLine(sc);
6676 if (res != VKFFT_SUCCESS) return res;
6677 sc->tempLen = sprintf(sc->tempStr, " sdata[(%" PRIu64 " - 2*(combinedID %% %" PRIu64 ")) + (combinedID / %" PRIu64 ") * sharedStride] = %s;\n", sc->fftDim / 2 - 1, sc->fftDim / 8, sc->fftDim / 8, sc->regIDs[0]);
6678 res = VkAppendLine(sc);
6679 if (res != VKFFT_SUCCESS) return res;
6680 sc->tempLen = sprintf(sc->tempStr, " sdata[(%" PRIu64 " + 2*(combinedID %% %" PRIu64 ")) + (combinedID / %" PRIu64 ") * sharedStride] = %s;\n", sc->fftDim / 2, sc->fftDim / 8, sc->fftDim / 8, sc->regIDs[1]);
6681 res = VkAppendLine(sc);
6682 if (res != VKFFT_SUCCESS) return res;
6683 sc->tempLen = sprintf(sc->tempStr, " sdata[(%" PRIu64 " + 2*(combinedID %% %" PRIu64 ")) + (combinedID / %" PRIu64 ") * sharedStride] = %s;\n", sc->fftDim / 2 + 1, sc->fftDim / 8, sc->fftDim / 8, sc->regIDs[0]);
6684 res = VkAppendLine(sc);
6685 if (res != VKFFT_SUCCESS) return res;
6686 }
6688 if (res != VKFFT_SUCCESS) return res;
6689 if (sc->zeropad[0]) {
6690 sc->tempLen = sprintf(sc->tempStr, " }else{\n");
6691 res = VkAppendLine(sc);
6692 if (res != VKFFT_SUCCESS) return res;
6693 if (sc->readToRegisters) {
6694 sc->tempLen = sprintf(sc->tempStr, " %s.x =0;%s.y = 0;\n", sc->regIDs[i + k * sc->registers_per_thread], sc->regIDs[i + k * sc->registers_per_thread]);
6695 res = VkAppendLine(sc);
6696 if (res != VKFFT_SUCCESS) return res;
6697 }
6698 else {
6699 if (sc->axisSwapped) {
6700 sc->tempLen = sprintf(sc->tempStr, " sdata[(combinedID %% %" PRIu64 ") * sharedStride + (combinedID / %" PRIu64 ")].x = 0;\n", sc->fftDim, sc->fftDim);
6701 res = VkAppendLine(sc);
6702 if (res != VKFFT_SUCCESS) return res;
6703 sc->tempLen = sprintf(sc->tempStr, " sdata[(combinedID %% %" PRIu64 ") * sharedStride + (combinedID / %" PRIu64 ")].y = 0;\n", sc->fftDim, sc->fftDim);
6704 res = VkAppendLine(sc);
6705 if (res != VKFFT_SUCCESS) return res;
6706 }
6707 else {
6708 sc->tempLen = sprintf(sc->tempStr, " sdata[(combinedID %% %" PRIu64 ") + (combinedID / %" PRIu64 ") * sharedStride].x = 0;\n", sc->fftDim, sc->fftDim);
6709 res = VkAppendLine(sc);
6710 if (res != VKFFT_SUCCESS) return res;
6711 sc->tempLen = sprintf(sc->tempStr, " sdata[(combinedID %% %" PRIu64 ") + (combinedID / %" PRIu64 ") * sharedStride].y = 0;\n", sc->fftDim, sc->fftDim);
6712 res = VkAppendLine(sc);
6713 if (res != VKFFT_SUCCESS) return res;
6714 }
6715 }
6716 sc->tempLen = sprintf(sc->tempStr, " }\n");
6717 res = VkAppendLine(sc);
6718 if (res != VKFFT_SUCCESS) return res;
6719 }
6720 sc->tempLen = sprintf(sc->tempStr, " }\n");
6721 res = VkAppendLine(sc);
6722 if (res != VKFFT_SUCCESS) return res;
6723 if (sc->axisSwapped) {
6724 if (sc->size[sc->axis_id + 1] % sc->localSize[0] != 0) {
6725 sc->tempLen = sprintf(sc->tempStr, " }");
6726 res = VkAppendLine(sc);
6727 if (res != VKFFT_SUCCESS) return res;
6728 }
6729 }
6730 else {
6731 if (sc->size[sc->axis_id + 1] % sc->localSize[1] != 0) {
6732 sc->tempLen = sprintf(sc->tempStr, " }");
6733 res = VkAppendLine(sc);
6734 if (res != VKFFT_SUCCESS) return res;
6735 }
6736 }
6737
6738 }
6739 }
6740 }
6741 /*else {
6742 for (uint64_t k = 0; k < sc->registerBoost; k++) {
6743 for (uint64_t i = 0; i < sc->min_registers_per_thread; i++) {
6744 if (sc->axisSwapped) {
6745 if (sc->localSize[1] == 1)
6746 sc->tempLen = sprintf(sc->tempStr, " combinedID = %s + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, (i + k * sc->min_registers_per_thread) * sc->localSize[0]);
6747 else
6748 sc->tempLen = sprintf(sc->tempStr, " combinedID = (%s + %" PRIu64 " * %s) + %" PRIu64 "*numActiveThreads;\n", sc->gl_LocalInvocationID_x, sc->localSize[0], sc->gl_LocalInvocationID_y, (i + k * sc->min_registers_per_thread));
6749 res = VkAppendLine(sc);
6750 if (res != VKFFT_SUCCESS) return res;
6751 sc->tempLen = sprintf(sc->tempStr, " inoutID = (combinedID %% %" PRIu64 ") + (combinedID / %" PRIu64 ") * %" PRIu64 " + (((%s%s) %% %" PRIu64 ") * %" PRIu64 " + ((%s%s) / %" PRIu64 ") * %" PRIu64 ");\n", sc->fftDim, sc->fftDim, sc->firstStageStartSize, sc->gl_WorkGroupID_x, shiftX, sc->firstStageStartSize / sc->fftDim, sc->fftDim, sc->gl_WorkGroupID_x, shiftX, sc->firstStageStartSize / sc->fftDim, sc->localSize[0] * sc->firstStageStartSize);
6752 res = VkAppendLine(sc);
6753 if (res != VKFFT_SUCCESS) return res;
6754 }
6755 else {
6756 sc->tempLen = sprintf(sc->tempStr, " inoutID = %s+%" PRIu64 "+%s * %" PRIu64 " + (((%s%s) %% %" PRIu64 ") * %" PRIu64 " + ((%s%s) / %" PRIu64 ") * %" PRIu64 ");\n", sc->gl_LocalInvocationID_x, (i + k * sc->min_registers_per_thread) * sc->localSize[0], sc->gl_LocalInvocationID_y, sc->firstStageStartSize, sc->gl_WorkGroupID_x, shiftX, sc->firstStageStartSize / sc->fftDim, sc->fftDim, sc->gl_WorkGroupID_x, shiftX, sc->firstStageStartSize / sc->fftDim, sc->localSize[1] * sc->firstStageStartSize);
6757 res = VkAppendLine(sc);
6758 if (res != VKFFT_SUCCESS) return res;
6759 }
6760 if (sc->zeropad[0]) {
6761 sc->tempLen = sprintf(sc->tempStr, " if((inoutID %% %" PRIu64 " < %" PRIu64 ")||(inoutID %% %" PRIu64 " >= %" PRIu64 ")){\n", sc->fft_dim_full, sc->fft_zeropad_left_read[sc->axis_id], sc->fft_dim_full, sc->fft_zeropad_right_read[sc->axis_id]);
6762 res = VkAppendLine(sc);
6763 if (res != VKFFT_SUCCESS) return res;
6764 }
6765 sc->tempLen = sprintf(sc->tempStr, " %s = ", sc->inoutID);
6766 res = VkAppendLine(sc);
6767 if (res != VKFFT_SUCCESS) return res;
6768 res = indexInputVkFFT(sc, uintType, readType, sc->inoutID, 0, requestCoordinate, requestBatch);
6769 if (res != VKFFT_SUCCESS) return res;
6770 sc->tempLen = sprintf(sc->tempStr, ";\n");
6771 res = VkAppendLine(sc);
6772 if (res != VKFFT_SUCCESS) return res;
6773 res = appendZeropadStartReadWriteStage(sc, 1);
6774 if (res != VKFFT_SUCCESS) return res;
6775 if (sc->readToRegisters) {
6776 if (sc->inputBufferBlockNum == 1)
6777 sc->tempLen = sprintf(sc->tempStr, " %s = %s%s[%s]%s;\n", sc->regIDs[i + k * sc->registers_per_thread], convTypeLeft, inputsStruct, sc->inoutID, convTypeRight);
6778 else
6779 sc->tempLen = sprintf(sc->tempStr, " %s = %sinputBlocks[%s / %" PRIu64 "]%s[%s %% %" PRIu64 "]%s;\n", sc->regIDs[i + k * sc->registers_per_thread], convTypeLeft, sc->inoutID, sc->inputBufferBlockSize, inputsStruct, sc->inoutID, sc->inputBufferBlockSize, convTypeRight);
6780 res = VkAppendLine(sc);
6781 if (res != VKFFT_SUCCESS) return res;
6782 }
6783 else {
6784 if (sc->axisSwapped) {
6785
6786 if (sc->inputBufferBlockNum == 1)
6787 sc->tempLen = sprintf(sc->tempStr, " sdata[(combinedID / %" PRIu64 ") + sharedStride*(combinedID %% %" PRIu64 ")] = %s%s[inoutID]%s;\n", sc->fftDim, sc->fftDim, convTypeLeft, inputsStruct, convTypeRight);
6788 else
6789 sc->tempLen = sprintf(sc->tempStr, " sdata[(combinedID / %" PRIu64 ") + sharedStride*(combinedID %% %" PRIu64 ")] = %sinputBlocks[inoutID / %" PRIu64 "]%s[inoutID %% %" PRIu64 "]%s;\n", sc->fftDim, sc->fftDim, convTypeLeft, sc->inputBufferBlockSize, inputsStruct, sc->inputBufferBlockSize, convTypeRight);
6790 res = VkAppendLine(sc);
6791 if (res != VKFFT_SUCCESS) return res;
6792 }
6793 else {
6794 if (sc->inputBufferBlockNum == 1)
6795 sc->tempLen = sprintf(sc->tempStr, " sdata[sharedStride*%s + (%s + %" PRIu64 ")] = %s%s[inoutID]%s;\n", sc->gl_LocalInvocationID_y, sc->gl_LocalInvocationID_x, (i + k * sc->min_registers_per_thread) * sc->localSize[0], convTypeLeft, inputsStruct, convTypeRight);
6796 else
6797 sc->tempLen = sprintf(sc->tempStr, " sdata[sharedStride*%s + (%s + %" PRIu64 ")] = %sinputBlocks[inoutID / %" PRIu64 "]%s[inoutID %% %" PRIu64 "]%s;\n", sc->gl_LocalInvocationID_y, sc->gl_LocalInvocationID_x, (i + k * sc->min_registers_per_thread) * sc->localSize[0], convTypeLeft, sc->inputBufferBlockSize, inputsStruct, sc->inputBufferBlockSize, convTypeRight);
6798 res = VkAppendLine(sc);
6799 if (res != VKFFT_SUCCESS) return res;
6800 }
6801 }
6802 res = appendZeropadEndReadWriteStage(sc);
6803 if (res != VKFFT_SUCCESS) return res;
6804 if (sc->zeropad[0]) {
6805 sc->tempLen = sprintf(sc->tempStr, " }else{\n");
6806 res = VkAppendLine(sc);
6807 if (res != VKFFT_SUCCESS) return res;
6808 if (sc->readToRegisters) {
6809 sc->tempLen = sprintf(sc->tempStr, " %s.x = 0; %s.y = 0;\n", sc->regIDs[i + k * sc->registers_per_thread], sc->regIDs[i + k * sc->registers_per_thread]);
6810 res = VkAppendLine(sc);
6811 if (res != VKFFT_SUCCESS) return res;
6812 }
6813 else {
6814 if (sc->axisSwapped) {
6815 sc->tempLen = sprintf(sc->tempStr, " sdata[(combinedID / %" PRIu64 ") + sharedStride*(combinedID %% %" PRIu64 ")].x = 0;\n", sc->fftDim, sc->fftDim);
6816 res = VkAppendLine(sc);
6817 if (res != VKFFT_SUCCESS) return res;
6818 sc->tempLen = sprintf(sc->tempStr, " sdata[(combinedID / %" PRIu64 ") + sharedStride*(combinedID %% %" PRIu64 ")].y = 0;\n", sc->fftDim, sc->fftDim);
6819 res = VkAppendLine(sc);
6820 if (res != VKFFT_SUCCESS) return res;
6821 }
6822 else {
6823 sc->tempLen = sprintf(sc->tempStr, " sdata[sharedStride*%s + (%s + %" PRIu64 ")].x = 0;\n", sc->gl_LocalInvocationID_y, sc->gl_LocalInvocationID_x, (i + k * sc->min_registers_per_thread) * sc->localSize[0]);
6824 res = VkAppendLine(sc);
6825 if (res != VKFFT_SUCCESS) return res;
6826 sc->tempLen = sprintf(sc->tempStr, " sdata[sharedStride*%s + (%s + %" PRIu64 ")].y = 0;\n", sc->gl_LocalInvocationID_y, sc->gl_LocalInvocationID_x, (i + k * sc->min_registers_per_thread) * sc->localSize[0]);
6827 res = VkAppendLine(sc);
6828 if (res != VKFFT_SUCCESS) return res;
6829 }
6830 }
6831 sc->tempLen = sprintf(sc->tempStr, " }\n");
6832 res = VkAppendLine(sc);
6833 if (res != VKFFT_SUCCESS) return res;
6834 }
6835 }
6836 }
6837 }*/
6838 sc->tempLen = sprintf(sc->tempStr, " }\n");
6839 res = VkAppendLine(sc);
6840 if (res != VKFFT_SUCCESS) return res;
6841 break;
6842 }
6843 case 141://DCT-IV strided cast to 8x FFT
6844 {
6845 char shiftX[500] = "";
6846 if (sc->performWorkGroupShift[0])
6847 sprintf(shiftX, " + consts.workGroupShiftX * %s ", sc->gl_WorkGroupSize_x);
6848 if (sc->fftDim != sc->fft_dim_full) {
6849 sprintf(sc->disableThreadsStart, " if (((%s%s) / %" PRIu64 ") %% (%" PRIu64 ")+((%s%s) / %" PRIu64 ") * (%" PRIu64 ") < %" PRIu64 ") {\n", sc->gl_GlobalInvocationID_x, shiftX, sc->fft_dim_x, sc->stageStartSize, sc->gl_GlobalInvocationID_x, shiftX, sc->fft_dim_x * sc->stageStartSize, sc->fftDim * sc->stageStartSize, sc->size[sc->axis_id]);
6851 if (res != VKFFT_SUCCESS) return res;
6852
6853 sprintf(sc->disableThreadsEnd, "}");
6854 }
6855 else {
6856 sprintf(sc->disableThreadsStart, "{\n");
6858 if (res != VKFFT_SUCCESS) return res;
6859 sprintf(sc->disableThreadsEnd, "}");
6860 }
6861 sc->tempLen = sprintf(sc->tempStr, " %s.x = 0;\n", sc->regIDs[1]);
6862 res = VkAppendLine(sc);
6863 if (res != VKFFT_SUCCESS) return res;
6864 sc->tempLen = sprintf(sc->tempStr, " %s.y = 0;\n", sc->regIDs[1]);
6865 res = VkAppendLine(sc);
6866 if (res != VKFFT_SUCCESS) return res;
6867
6868 for (uint64_t k = 0; k < sc->registerBoost; k++) {
6869 for (uint64_t i = 0; i < (uint64_t)ceil(sc->min_registers_per_thread / 8.0); i++) {
6870 if (sc->fftDim == sc->fft_dim_full)
6871 sc->tempLen = sprintf(sc->tempStr, " inoutID = (%s + %" PRIu64 ");\n", sc->gl_LocalInvocationID_y, (i + k * sc->min_registers_per_thread) * sc->localSize[1]);
6872 else
6873 sc->tempLen = sprintf(sc->tempStr, " inoutID = (%" PRIu64 " * (%s + %" PRIu64 ") + ((%s%s) / %" PRIu64 ") %% (%" PRIu64 ")+((%s%s) / %" PRIu64 ") * (%" PRIu64 "));\n", sc->stageStartSize, sc->gl_LocalInvocationID_y, (i + k * sc->min_registers_per_thread) * sc->localSize[1], sc->gl_GlobalInvocationID_x, shiftX, sc->fft_dim_x, sc->stageStartSize, sc->gl_GlobalInvocationID_x, shiftX, sc->fft_dim_x * sc->stageStartSize, sc->fftDim * sc->stageStartSize);
6874 res = VkAppendLine(sc);
6875 if (res != VKFFT_SUCCESS) return res;
6876
6877 sc->tempLen = sprintf(sc->tempStr, " if(inoutID < %" PRIu64 "){\n", sc->fftDim / 8);
6878 res = VkAppendLine(sc);
6879 if (res != VKFFT_SUCCESS) return res;
6880 if (sc->zeropad[0]) {
6881 sc->tempLen = sprintf(sc->tempStr, " if((inoutID %% %" PRIu64 " < %" PRIu64 ")||(inoutID %% %" PRIu64 " >= %" PRIu64 ")){\n", sc->fft_dim_full, sc->fft_zeropad_left_read[sc->axis_id], sc->fft_dim_full, sc->fft_zeropad_right_read[sc->axis_id]);
6882 res = VkAppendLine(sc);
6883 if (res != VKFFT_SUCCESS) return res;
6884 }
6885 sc->tempLen = sprintf(sc->tempStr, " %s = ", sc->inoutID);
6886 res = VkAppendLine(sc);
6887 if (res != VKFFT_SUCCESS) return res;
6888 sprintf(index_x, "(%s%s) %% (%" PRIu64 ")", sc->gl_GlobalInvocationID_x, shiftX, sc->fft_dim_x);
6889 res = indexInputVkFFT(sc, uintType, readType, index_x, sc->inoutID, requestCoordinate, requestBatch);
6890 if (res != VKFFT_SUCCESS) return res;
6891 sc->tempLen = sprintf(sc->tempStr, ";\n");
6892 res = VkAppendLine(sc);
6893 if (res != VKFFT_SUCCESS) return res;
6894
6895 if (sc->inputBufferBlockNum == 1)
6896 sc->tempLen = sprintf(sc->tempStr, " %s.x = %s%s[%s]%s;\n", sc->regIDs[0], convTypeLeft, inputsStruct, sc->inoutID, convTypeRight);
6897 else
6898 sc->tempLen = sprintf(sc->tempStr, " %s.x = %sinputBlocks[%s / %" PRIu64 "]%s[%s %% %" PRIu64 "]%s;\n", sc->regIDs[0], convTypeLeft, sc->inoutID, sc->inputBufferBlockSize, inputsStruct, sc->inoutID, sc->inputBufferBlockSize, convTypeRight);
6899 res = VkAppendLine(sc);
6900 if (res != VKFFT_SUCCESS) return res;
6901 sc->tempLen = sprintf(sc->tempStr, " %s.y = 0;\n", sc->regIDs[0]);
6902 res = VkAppendLine(sc);
6903 if (res != VKFFT_SUCCESS) return res;
6904 sc->tempLen = sprintf(sc->tempStr, " sdata[%s*(2*(%s+%" PRIu64 "))+%s]=%s;\n", sc->sharedStride, sc->gl_LocalInvocationID_y, (i + k * sc->min_registers_per_thread) * sc->localSize[1], sc->gl_LocalInvocationID_x, sc->regIDs[1]);
6905 res = VkAppendLine(sc);
6906 if (res != VKFFT_SUCCESS) return res;
6907 sc->tempLen = sprintf(sc->tempStr, " sdata[%s*(2*(%s+%" PRIu64 ")+1)+%s]=%s;\n", sc->sharedStride, sc->gl_LocalInvocationID_y, (i + k * sc->min_registers_per_thread) * sc->localSize[1], sc->gl_LocalInvocationID_x, sc->regIDs[0]);
6908 res = VkAppendLine(sc);
6909 if (res != VKFFT_SUCCESS) return res;
6910
6911 sc->tempLen = sprintf(sc->tempStr, " sdata[%s*(%" PRIu64 " - 2*(%s+%" PRIu64 "))+%s]=%s;\n", sc->sharedStride, sc->fftDim - 2, sc->gl_LocalInvocationID_y, (i + k * sc->min_registers_per_thread) * sc->localSize[1], sc->gl_LocalInvocationID_x, sc->regIDs[1]);
6912 res = VkAppendLine(sc);
6913 if (res != VKFFT_SUCCESS) return res;
6914 sc->tempLen = sprintf(sc->tempStr, " sdata[%s*(%" PRIu64 " - 2*(%s+%" PRIu64 "))+%s]=%s;\n", sc->sharedStride, sc->fftDim - 1, sc->gl_LocalInvocationID_y, (i + k * sc->min_registers_per_thread) * sc->localSize[1], sc->gl_LocalInvocationID_x, sc->regIDs[0]);
6915 res = VkAppendLine(sc);
6916 if (res != VKFFT_SUCCESS) return res;
6917 sc->tempLen = sprintf(sc->tempStr, " %s.x = - %s.x;\n", sc->regIDs[0], sc->regIDs[0]);
6918 res = VkAppendLine(sc);
6919 if (res != VKFFT_SUCCESS) return res;
6920 sc->tempLen = sprintf(sc->tempStr, " sdata[%s*(%" PRIu64 " - 2*(%s+%" PRIu64 "))+%s]=%s;\n", sc->sharedStride, sc->fftDim / 2 - 2, sc->gl_LocalInvocationID_y, (i + k * sc->min_registers_per_thread) * sc->localSize[1], sc->gl_LocalInvocationID_x, sc->regIDs[1]);
6921 res = VkAppendLine(sc);
6922 if (res != VKFFT_SUCCESS) return res;
6923 sc->tempLen = sprintf(sc->tempStr, " sdata[%s*(%" PRIu64 " - 2*(%s+%" PRIu64 "))+%s]=%s;\n", sc->sharedStride, sc->fftDim / 2 - 1, sc->gl_LocalInvocationID_y, (i + k * sc->min_registers_per_thread) * sc->localSize[1], sc->gl_LocalInvocationID_x, sc->regIDs[0]);
6924 res = VkAppendLine(sc);
6925 if (res != VKFFT_SUCCESS) return res;
6926
6927 sc->tempLen = sprintf(sc->tempStr, " sdata[%s*(%" PRIu64 " + 2*(%s+%" PRIu64 "))+%s]=%s;\n", sc->sharedStride, sc->fftDim / 2, sc->gl_LocalInvocationID_y, (i + k * sc->min_registers_per_thread) * sc->localSize[1], sc->gl_LocalInvocationID_x, sc->regIDs[1]);
6928 res = VkAppendLine(sc);
6929 if (res != VKFFT_SUCCESS) return res;
6930 sc->tempLen = sprintf(sc->tempStr, " sdata[%s*(%" PRIu64 " + 2*(%s+%" PRIu64 "))+%s]=%s;\n", sc->sharedStride, sc->fftDim / 2 + 1, sc->gl_LocalInvocationID_y, (i + k * sc->min_registers_per_thread) * sc->localSize[1], sc->gl_LocalInvocationID_x, sc->regIDs[0]);
6931 res = VkAppendLine(sc);
6932 if (res != VKFFT_SUCCESS) return res;
6933
6934 if (sc->zeropad[0]) {
6935 sc->tempLen = sprintf(sc->tempStr, " }else{\n");
6936 res = VkAppendLine(sc);
6937 if (res != VKFFT_SUCCESS) return res;
6938 if (sc->readToRegisters) {
6939 sc->tempLen = sprintf(sc->tempStr, " %s.x = 0; %s.y = 0;\n", sc->regIDs[i + k * sc->registers_per_thread], sc->regIDs[i + k * sc->registers_per_thread]);
6940 res = VkAppendLine(sc);
6941 if (res != VKFFT_SUCCESS) return res;
6942 }
6943 else {
6944 sc->tempLen = sprintf(sc->tempStr, " sdata[%s*(%s+%" PRIu64 ")+%s].x=0;\n", sc->sharedStride, sc->gl_LocalInvocationID_y, (i + k * sc->min_registers_per_thread) * sc->localSize[1], sc->gl_LocalInvocationID_x);
6945 res = VkAppendLine(sc);
6946 if (res != VKFFT_SUCCESS) return res;
6947 sc->tempLen = sprintf(sc->tempStr, " sdata[%s*(%s+%" PRIu64 ")+%s].y=0;\n", sc->sharedStride, sc->gl_LocalInvocationID_y, (i + k * sc->min_registers_per_thread) * sc->localSize[1], sc->gl_LocalInvocationID_x);
6948 res = VkAppendLine(sc);
6949 if (res != VKFFT_SUCCESS) return res;
6950 }
6951 sc->tempLen = sprintf(sc->tempStr, " }\n");
6952 res = VkAppendLine(sc);
6953 if (res != VKFFT_SUCCESS) return res;
6954 }
6955 sc->tempLen = sprintf(sc->tempStr, " }\n");
6956 res = VkAppendLine(sc);
6957 if (res != VKFFT_SUCCESS) return res;
6958 }
6959 }
6960 sc->tempLen = sprintf(sc->tempStr, " }\n");
6961 res = VkAppendLine(sc);
6962 if (res != VKFFT_SUCCESS) return res;
6963 break;
6964 }
6965 case 142://DCT-IV nonstrided as 2xN/2 DCT-II
6966 {
6967 char shiftX[500] = "";
6968 if (sc->performWorkGroupShift[0])
6969 sprintf(shiftX, " + consts.workGroupShiftX ");
6970 char shiftY[500] = "";
6971 if (sc->performWorkGroupShift[1])
6972 sprintf(shiftY, " + consts.workGroupShiftY ");
6973 if (sc->fftDim == sc->fft_dim_full) {
6974 if (sc->zeropadBluestein[0]) {
6975 res = appendSetSMToZero(sc, floatType, floatTypeMemory, uintType, readType);
6976 if (res != VKFFT_SUCCESS) return res;
6977 res = appendBarrierVkFFT(sc, 1);
6978 if (res != VKFFT_SUCCESS) return res;
6980 }
6981 uint64_t maxBluesteinCutOff = 1;
6982 if (sc->zeropadBluestein[0]) {
6983 if (sc->axisSwapped)
6984 maxBluesteinCutOff = 2 * sc->fftDim * sc->localSize[0];
6985 else
6986 maxBluesteinCutOff = 2 * sc->fftDim * sc->localSize[1];
6987 }
6988 for (uint64_t k = 0; k < sc->registerBoost; k++) {
6989 for (uint64_t i = 0; i < 2 * sc->min_registers_per_thread; i++) {
6990
6991 if (sc->localSize[1] == 1)
6992 sc->tempLen = sprintf(sc->tempStr, " combinedID = %s + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, (i + k * 2 * sc->min_registers_per_thread) * sc->localSize[0]);
6993 else
6994 sc->tempLen = sprintf(sc->tempStr, " combinedID = (%s + %" PRIu64 " * %s) + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, sc->localSize[0], sc->gl_LocalInvocationID_y, (i + k * 2 * sc->min_registers_per_thread) * sc->localSize[0] * sc->localSize[1]);
6995 res = VkAppendLine(sc);
6996 if (res != VKFFT_SUCCESS) return res;
6997
6998 if (sc->inputStride[0] > 1)
6999 sc->tempLen = sprintf(sc->tempStr, " inoutID = (combinedID %% %" PRIu64 ") * %" PRIu64 " + (combinedID / %" PRIu64 ") * %" PRIu64 ";\n", 2 * sc->fftDim, sc->inputStride[0], 2 * sc->fftDim, sc->inputStride[1]);
7000 else
7001 sc->tempLen = sprintf(sc->tempStr, " inoutID = (combinedID %% %" PRIu64 ") + (combinedID / %" PRIu64 ") * %" PRIu64 ";\n", 2 * sc->fftDim, 2 * sc->fftDim, sc->inputStride[1]);
7002 res = VkAppendLine(sc);
7003 if (res != VKFFT_SUCCESS) return res;
7004 if (sc->axisSwapped) {
7005 if ((uint64_t)ceil(sc->size[1]) % sc->localSize[0] != 0) {
7006 sc->tempLen = sprintf(sc->tempStr, " if(combinedID / %" PRIu64 " + (%s%s)*%" PRIu64 "< %" PRIu64 "){", 2 * sc->fftDim, sc->gl_WorkGroupID_y, shiftY, sc->localSize[0], (uint64_t)ceil(sc->size[1]));
7007 res = VkAppendLine(sc);
7008 if (res != VKFFT_SUCCESS) return res;
7009 }
7010 }
7011 else {
7012 if ((uint64_t)ceil(sc->size[1]) % sc->localSize[1] != 0) {
7013 sc->tempLen = sprintf(sc->tempStr, " if(combinedID / %" PRIu64 " + (%s%s)*%" PRIu64 "< %" PRIu64 "){", 2 * sc->fftDim, sc->gl_WorkGroupID_y, shiftY, sc->localSize[1], (uint64_t)ceil(sc->size[1]));
7014 res = VkAppendLine(sc);
7015 if (res != VKFFT_SUCCESS) return res;
7016 }
7017 }
7018 if (sc->zeropadBluestein[0]) {
7019 sc->tempLen = sprintf(sc->tempStr, " if(combinedID < %" PRIu64 "){\n", maxBluesteinCutOff);
7020 res = VkAppendLine(sc);
7021 if (res != VKFFT_SUCCESS) return res;
7022 }
7023 if (sc->zeropad[0]) {
7024 sc->tempLen = sprintf(sc->tempStr, " if((inoutID %% %" PRIu64 " < %" PRIu64 ")||(inoutID %% %" PRIu64 " >= %" PRIu64 ")){\n", sc->inputStride[1], sc->fft_zeropad_left_read[sc->axis_id], sc->inputStride[1], sc->fft_zeropad_right_read[sc->axis_id]);
7025 res = VkAppendLine(sc);
7026 if (res != VKFFT_SUCCESS) return res;
7027 }
7028 sc->tempLen = sprintf(sc->tempStr, " %s = ", sc->inoutID);
7029 res = VkAppendLine(sc);
7030 if (res != VKFFT_SUCCESS) return res;
7031 indexInputVkFFT(sc, uintType, readType, sc->inoutID, 0, requestCoordinate, requestBatch);
7032 if (res != VKFFT_SUCCESS) return res;
7033 sc->tempLen = sprintf(sc->tempStr, ";\n");
7034 res = VkAppendLine(sc);
7035 if (res != VKFFT_SUCCESS) return res;
7037 if (res != VKFFT_SUCCESS) return res;
7038#if(VKFFT_BACKEND!=3)//OpenCL is not handling barrier with thread-conditional writes to local memory - so this is a work-around
7039 if (sc->inputBufferBlockNum == 1)
7040 sc->tempLen = sprintf(sc->tempStr, " %s.x = %s%s[%s]%s;\n", sc->regIDs[0], convTypeLeft, inputsStruct, sc->inoutID, convTypeRight);
7041 else
7042 sc->tempLen = sprintf(sc->tempStr, " %s.x = %sinputBlocks[%s / %" PRIu64 "]%s[%s %% %" PRIu64 "]%s;\n", sc->regIDs[0], convTypeLeft, sc->inoutID, sc->inputBufferBlockSize, inputsStruct, sc->inoutID, sc->inputBufferBlockSize, convTypeRight);
7043 res = VkAppendLine(sc);
7044 if (res != VKFFT_SUCCESS) return res;
7045#else
7046 if (i < sc->min_registers_per_thread) {
7047 if (sc->inputBufferBlockNum == 1)
7048 sc->tempLen = sprintf(sc->tempStr, " %s.x = %s%s[%s]%s;\n", sc->regIDs[i + k * sc->registers_per_thread], convTypeLeft, inputsStruct, sc->inoutID, convTypeRight);
7049 else
7050 sc->tempLen = sprintf(sc->tempStr, " %s.x = %sinputBlocks[%s / %" PRIu64 "]%s[%s %% %" PRIu64 "]%s;\n", sc->regIDs[i + k * sc->registers_per_thread], convTypeLeft, sc->inoutID, sc->inputBufferBlockSize, inputsStruct, sc->inoutID, sc->inputBufferBlockSize, convTypeRight);
7051 res = VkAppendLine(sc);
7052 if (res != VKFFT_SUCCESS) return res;
7053 }
7054 else {
7055 if (sc->inputBufferBlockNum == 1)
7056 sc->tempLen = sprintf(sc->tempStr, " %s.y = %s%s[%s]%s;\n", sc->regIDs[i - sc->min_registers_per_thread + k * sc->registers_per_thread], convTypeLeft, inputsStruct, sc->inoutID, convTypeRight);
7057 else
7058 sc->tempLen = sprintf(sc->tempStr, " %s.y = %sinputBlocks[%s / %" PRIu64 "]%s[%s %% %" PRIu64 "]%s;\n", sc->regIDs[i - sc->min_registers_per_thread + k * sc->registers_per_thread], convTypeLeft, sc->inoutID, sc->inputBufferBlockSize, inputsStruct, sc->inoutID, sc->inputBufferBlockSize, convTypeRight);
7059 res = VkAppendLine(sc);
7060 if (res != VKFFT_SUCCESS) return res;
7061 }
7062#endif
7063#if(VKFFT_BACKEND!=3)//OpenCL is not handling barrier with thread-conditional writes to local memory - so this is a work-around: we do writes in a separate stage
7064 if (sc->axisSwapped) {
7065 //sc->tempLen = sprintf(sc->tempStr, " sdataID = (((combinedID %% %" PRIu64 ") %% 2) * %" PRIu64 " + (1-2*((combinedID %% %" PRIu64 ") %% 2)) * ((combinedID %% %" PRIu64 ")/2)) * sharedStride + (combinedID / %" PRIu64 ");\n", sc->fftDim, sc->fftDim - 1, sc->fftDim, sc->fftDim, sc->fftDim);
7066 sc->tempLen = sprintf(sc->tempStr, " sdataID = ((combinedID %% %" PRIu64 ")/2) * sharedStride + (combinedID / %" PRIu64 ");\n", 2 * sc->fftDim, 2 * sc->fftDim);
7067 }
7068 else {
7069 //sc->tempLen = sprintf(sc->tempStr, " sdataID = (((combinedID %% %" PRIu64 ") %% 2) * %" PRIu64 " + (1-2*((combinedID %% %" PRIu64 ") %% 2)) * ((combinedID %% %" PRIu64 ")/2)) + (combinedID / %" PRIu64 ") * sharedStride;\n", sc->fftDim, sc->fftDim - 1, sc->fftDim, sc->fftDim, sc->fftDim);
7070 sc->tempLen = sprintf(sc->tempStr, " sdataID = ((combinedID %% %" PRIu64 ")/2) + (combinedID / %" PRIu64 ") * sharedStride;\n", 2 * sc->fftDim, 2 * sc->fftDim);
7071 }
7072 res = VkAppendLine(sc);
7073 if (res != VKFFT_SUCCESS) return res;
7074
7075 sc->tempLen = sprintf(sc->tempStr, " if (((combinedID %% %" PRIu64 ")%%2) == 0) {\n", 2 * sc->fftDim);
7076 res = VkAppendLine(sc);
7077 if (res != VKFFT_SUCCESS) return res;
7078 sc->tempLen = sprintf(sc->tempStr, " sdata[sdataID].x = %s.x;\n", sc->regIDs[0]);
7079 res = VkAppendLine(sc);
7080 if (res != VKFFT_SUCCESS) return res;
7081 sc->tempLen = sprintf(sc->tempStr, " }\n");
7082 res = VkAppendLine(sc);
7083 if (res != VKFFT_SUCCESS) return res;
7084 sc->tempLen = sprintf(sc->tempStr, " else {\n");
7085 res = VkAppendLine(sc);
7086 if (res != VKFFT_SUCCESS) return res;
7087 sc->tempLen = sprintf(sc->tempStr, " sdata[sdataID].y = %s.x;\n", sc->regIDs[0]);
7088 res = VkAppendLine(sc);
7089 if (res != VKFFT_SUCCESS) return res;
7090 sc->tempLen = sprintf(sc->tempStr, " }\n");
7091 res = VkAppendLine(sc);
7092 if (res != VKFFT_SUCCESS) return res;
7093#endif
7095 if (res != VKFFT_SUCCESS) return res;
7096 if (sc->zeropad[0]) {
7097 sc->tempLen = sprintf(sc->tempStr, " }else{\n");
7098 res = VkAppendLine(sc);
7099 if (res != VKFFT_SUCCESS) return res;
7100
7101 if (sc->axisSwapped) {
7102 //sc->tempLen = sprintf(sc->tempStr, " sdataID = (((combinedID %% %" PRIu64 ") %% 2) * %" PRIu64 " + (1-2*((combinedID %% %" PRIu64 ") %% 2)) * ((combinedID %% %" PRIu64 ")/2)) * sharedStride + (combinedID / %" PRIu64 ");\n", sc->fftDim, sc->fftDim - 1, sc->fftDim, sc->fftDim, sc->fftDim);
7103 sc->tempLen = sprintf(sc->tempStr, " sdataID = ((combinedID %% %" PRIu64 ")/2) * sharedStride + (combinedID / %" PRIu64 ");\n", 2 * sc->fftDim, 2 * sc->fftDim);
7104 }
7105 else {
7106 //sc->tempLen = sprintf(sc->tempStr, " sdataID = (((combinedID %% %" PRIu64 ") %% 2) * %" PRIu64 " + (1-2*((combinedID %% %" PRIu64 ") %% 2)) * ((combinedID %% %" PRIu64 ")/2)) + (combinedID / %" PRIu64 ") * sharedStride;\n", sc->fftDim, sc->fftDim - 1, sc->fftDim, sc->fftDim, sc->fftDim);
7107 sc->tempLen = sprintf(sc->tempStr, " sdataID = ((combinedID %% %" PRIu64 ")/2) + (combinedID / %" PRIu64 ") * sharedStride;\n", 2 * sc->fftDim, 2 * sc->fftDim);
7108 }
7109 res = VkAppendLine(sc);
7110 if (res != VKFFT_SUCCESS) return res;
7111 sc->tempLen = sprintf(sc->tempStr, " if (((combinedID %% %" PRIu64 ")%%2) == 0) {\n", 2 * sc->fftDim);
7112 res = VkAppendLine(sc);
7113 if (res != VKFFT_SUCCESS) return res;
7114 sc->tempLen = sprintf(sc->tempStr, " sdata[sdataID].x = 0;\n");
7115 res = VkAppendLine(sc);
7116 if (res != VKFFT_SUCCESS) return res;
7117 sc->tempLen = sprintf(sc->tempStr, " }\n");
7118 res = VkAppendLine(sc);
7119 if (res != VKFFT_SUCCESS) return res;
7120 sc->tempLen = sprintf(sc->tempStr, " if (((combinedID %% %" PRIu64 ")%%2) == 1) {\n", 2 * sc->fftDim);//another OpenCL bugfix
7121 res = VkAppendLine(sc);
7122 if (res != VKFFT_SUCCESS) return res;
7123 sc->tempLen = sprintf(sc->tempStr, " sdata[sdataID].y = 0;\n");
7124 res = VkAppendLine(sc);
7125 if (res != VKFFT_SUCCESS) return res;
7126 sc->tempLen = sprintf(sc->tempStr, " }\n");
7127 res = VkAppendLine(sc);
7128 if (res != VKFFT_SUCCESS) return res;
7129 sc->tempLen = sprintf(sc->tempStr, " }\n");
7130 res = VkAppendLine(sc);
7131 if (res != VKFFT_SUCCESS) return res;
7132 }
7133 if (sc->zeropadBluestein[0]) {
7134 sc->tempLen = sprintf(sc->tempStr, " }\n");
7135 res = VkAppendLine(sc);
7136 if (res != VKFFT_SUCCESS) return res;
7137 }
7138 if (sc->axisSwapped) {
7139 if ((uint64_t)ceil(sc->size[1]) % sc->localSize[0] != 0) {
7140 sc->tempLen = sprintf(sc->tempStr, " }");
7141 res = VkAppendLine(sc);
7142 if (res != VKFFT_SUCCESS) return res;
7143 }
7144 }
7145 else {
7146 if ((uint64_t)ceil(sc->size[1]) % sc->localSize[1] != 0) {
7147 sc->tempLen = sprintf(sc->tempStr, " }");
7148 res = VkAppendLine(sc);
7149 if (res != VKFFT_SUCCESS) return res;
7150 }
7151 }
7152 }
7153 }
7154#if(VKFFT_BACKEND==3)//OpenCL is not handling barrier with thread-conditional writes to local memory - so this is a work-around
7155 for (uint64_t k = 0; k < sc->registerBoost; k++) {
7156 for (uint64_t i = 0; i < 2 * sc->min_registers_per_thread; i++) {
7157
7158 if (sc->localSize[1] == 1)
7159 sc->tempLen = sprintf(sc->tempStr, " combinedID = %s + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, (i + k * 2 * sc->min_registers_per_thread) * sc->localSize[0]);
7160 else
7161 sc->tempLen = sprintf(sc->tempStr, " combinedID = (%s + %" PRIu64 " * %s) + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, sc->localSize[0], sc->gl_LocalInvocationID_y, (i + k * 2 * sc->min_registers_per_thread) * sc->localSize[0] * sc->localSize[1]);
7162 res = VkAppendLine(sc);
7163 if (res != VKFFT_SUCCESS) return res;
7164
7165 if (sc->inputStride[0] > 1)
7166 sc->tempLen = sprintf(sc->tempStr, " inoutID = (combinedID %% %" PRIu64 ") * %" PRIu64 " + (combinedID / %" PRIu64 ") * %" PRIu64 ";\n", 2 * sc->fftDim, sc->inputStride[0], 2 * sc->fftDim, sc->inputStride[1]);
7167 else
7168 sc->tempLen = sprintf(sc->tempStr, " inoutID = (combinedID %% %" PRIu64 ") + (combinedID / %" PRIu64 ") * %" PRIu64 ";\n", 2 * sc->fftDim, 2 * sc->fftDim, sc->inputStride[1]);
7169 res = VkAppendLine(sc);
7170 if (res != VKFFT_SUCCESS) return res;
7171 if (sc->axisSwapped) {
7172 if ((uint64_t)ceil(sc->size[1]) % sc->localSize[0] != 0) {
7173 sc->tempLen = sprintf(sc->tempStr, " if(combinedID / %" PRIu64 " + (%s%s)*%" PRIu64 "< %" PRIu64 "){", 2 * sc->fftDim, sc->gl_WorkGroupID_y, shiftY, sc->localSize[0], (uint64_t)ceil(sc->size[1]));
7174 res = VkAppendLine(sc);
7175 if (res != VKFFT_SUCCESS) return res;
7176 }
7177 }
7178 else {
7179 if ((uint64_t)ceil(sc->size[1]) % sc->localSize[1] != 0) {
7180 sc->tempLen = sprintf(sc->tempStr, " if(combinedID / %" PRIu64 " + (%s%s)*%" PRIu64 "< %" PRIu64 "){", 2 * sc->fftDim, sc->gl_WorkGroupID_y, shiftY, sc->localSize[1], (uint64_t)ceil(sc->size[1]));
7181 res = VkAppendLine(sc);
7182 if (res != VKFFT_SUCCESS) return res;
7183 }
7184 }
7185 if (sc->zeropadBluestein[0]) {
7186 sc->tempLen = sprintf(sc->tempStr, " if(combinedID < %" PRIu64 "){\n", maxBluesteinCutOff);
7187 res = VkAppendLine(sc);
7188 if (res != VKFFT_SUCCESS) return res;
7189 }
7190 if (sc->zeropad[0]) {
7191 sc->tempLen = sprintf(sc->tempStr, " if((inoutID %% %" PRIu64 " < %" PRIu64 ")||(inoutID %% %" PRIu64 " >= %" PRIu64 ")){\n", sc->inputStride[1], sc->fft_zeropad_left_read[sc->axis_id], sc->inputStride[1], sc->fft_zeropad_right_read[sc->axis_id]);
7192 res = VkAppendLine(sc);
7193 if (res != VKFFT_SUCCESS) return res;
7194 }
7195 sc->tempLen = sprintf(sc->tempStr, " %s = ", sc->inoutID);
7196 res = VkAppendLine(sc);
7197 if (res != VKFFT_SUCCESS) return res;
7198 indexInputVkFFT(sc, uintType, readType, sc->inoutID, 0, requestCoordinate, requestBatch);
7199 if (res != VKFFT_SUCCESS) return res;
7200 sc->tempLen = sprintf(sc->tempStr, ";\n");
7201 res = VkAppendLine(sc);
7202 if (res != VKFFT_SUCCESS) return res;
7204 if (res != VKFFT_SUCCESS) return res;
7205 if (sc->axisSwapped) {
7206 //sc->tempLen = sprintf(sc->tempStr, " sdataID = (((combinedID %% %" PRIu64 ") %% 2) * %" PRIu64 " + (1-2*((combinedID %% %" PRIu64 ") %% 2)) * ((combinedID %% %" PRIu64 ")/2)) * sharedStride + (combinedID / %" PRIu64 ");\n", sc->fftDim, sc->fftDim - 1, sc->fftDim, sc->fftDim, sc->fftDim);
7207 sc->tempLen = sprintf(sc->tempStr, " sdataID = ((combinedID %% %" PRIu64 ")/2) * sharedStride + (combinedID / %" PRIu64 ");\n", 2 * sc->fftDim, 2 * sc->fftDim);
7208 }
7209 else {
7210 //sc->tempLen = sprintf(sc->tempStr, " sdataID = (((combinedID %% %" PRIu64 ") %% 2) * %" PRIu64 " + (1-2*((combinedID %% %" PRIu64 ") %% 2)) * ((combinedID %% %" PRIu64 ")/2)) + (combinedID / %" PRIu64 ") * sharedStride;\n", sc->fftDim, sc->fftDim - 1, sc->fftDim, sc->fftDim, sc->fftDim);
7211 sc->tempLen = sprintf(sc->tempStr, " sdataID = ((combinedID %% %" PRIu64 ")/2) + (combinedID / %" PRIu64 ") * sharedStride;\n", 2 * sc->fftDim, 2 * sc->fftDim);
7212 }
7213 res = VkAppendLine(sc);
7214 if (res != VKFFT_SUCCESS) return res;
7215 if (i < sc->min_registers_per_thread) {
7216 sc->tempLen = sprintf(sc->tempStr, " if (((combinedID %% %" PRIu64 ")%%2) == 0) {\n", 2 * sc->fftDim);
7217 res = VkAppendLine(sc);
7218 if (res != VKFFT_SUCCESS) return res;
7219 sc->tempLen = sprintf(sc->tempStr, " sdata[sdataID].x = %s.x;\n", sc->regIDs[i + k * sc->registers_per_thread]);
7220 res = VkAppendLine(sc);
7221 if (res != VKFFT_SUCCESS) return res;
7222 sc->tempLen = sprintf(sc->tempStr, " }\n");
7223 res = VkAppendLine(sc);
7224 if (res != VKFFT_SUCCESS) return res;
7225 }
7226 else {
7227 sc->tempLen = sprintf(sc->tempStr, " if (((combinedID %% %" PRIu64 ")%%2) == 0) {\n", 2 * sc->fftDim);
7228 res = VkAppendLine(sc);
7229 if (res != VKFFT_SUCCESS) return res;
7230 sc->tempLen = sprintf(sc->tempStr, " sdata[sdataID].x = %s.y;\n", sc->regIDs[i - sc->min_registers_per_thread + k * sc->registers_per_thread]);
7231 res = VkAppendLine(sc);
7232 if (res != VKFFT_SUCCESS) return res;
7233 sc->tempLen = sprintf(sc->tempStr, " }\n");
7234 res = VkAppendLine(sc);
7235 if (res != VKFFT_SUCCESS) return res;
7236 }
7238 if (res != VKFFT_SUCCESS) return res;
7239 if (sc->zeropad[0]) {
7240 sc->tempLen = sprintf(sc->tempStr, " }else{\n");
7241 res = VkAppendLine(sc);
7242 if (res != VKFFT_SUCCESS) return res;
7243
7244 if (sc->axisSwapped) {
7245 //sc->tempLen = sprintf(sc->tempStr, " sdataID = (((combinedID %% %" PRIu64 ") %% 2) * %" PRIu64 " + (1-2*((combinedID %% %" PRIu64 ") %% 2)) * ((combinedID %% %" PRIu64 ")/2)) * sharedStride + (combinedID / %" PRIu64 ");\n", sc->fftDim, sc->fftDim - 1, sc->fftDim, sc->fftDim, sc->fftDim);
7246 sc->tempLen = sprintf(sc->tempStr, " sdataID = ((combinedID %% %" PRIu64 ")/2) * sharedStride + (combinedID / %" PRIu64 ");\n", 2 * sc->fftDim, 2 * sc->fftDim);
7247 }
7248 else {
7249 //sc->tempLen = sprintf(sc->tempStr, " sdataID = (((combinedID %% %" PRIu64 ") %% 2) * %" PRIu64 " + (1-2*((combinedID %% %" PRIu64 ") %% 2)) * ((combinedID %% %" PRIu64 ")/2)) + (combinedID / %" PRIu64 ") * sharedStride;\n", sc->fftDim, sc->fftDim - 1, sc->fftDim, sc->fftDim, sc->fftDim);
7250 sc->tempLen = sprintf(sc->tempStr, " sdataID = ((combinedID %% %" PRIu64 ")/2) + (combinedID / %" PRIu64 ") * sharedStride;\n", 2 * sc->fftDim, 2 * sc->fftDim);
7251 }
7252 res = VkAppendLine(sc);
7253 if (res != VKFFT_SUCCESS) return res;
7254 sc->tempLen = sprintf(sc->tempStr, " if (((combinedID %% %" PRIu64 ")%%2) == 0) {\n", 2 * sc->fftDim);
7255 res = VkAppendLine(sc);
7256 if (res != VKFFT_SUCCESS) return res;
7257 sc->tempLen = sprintf(sc->tempStr, " sdata[sdataID].x = 0;\n");
7258 res = VkAppendLine(sc);
7259 if (res != VKFFT_SUCCESS) return res;
7260 sc->tempLen = sprintf(sc->tempStr, " }\n");
7261 res = VkAppendLine(sc);
7262 if (res != VKFFT_SUCCESS) return res;
7263 sc->tempLen = sprintf(sc->tempStr, " if (((combinedID %% %" PRIu64 ")%%2) == 1) {\n", 2 * sc->fftDim);//another OpenCL bugfix
7264 res = VkAppendLine(sc);
7265 if (res != VKFFT_SUCCESS) return res;
7266 sc->tempLen = sprintf(sc->tempStr, " sdata[sdataID].y = 0;\n");
7267 res = VkAppendLine(sc);
7268 if (res != VKFFT_SUCCESS) return res;
7269 sc->tempLen = sprintf(sc->tempStr, " }\n");
7270 res = VkAppendLine(sc);
7271 if (res != VKFFT_SUCCESS) return res;
7272 sc->tempLen = sprintf(sc->tempStr, " }\n");
7273 res = VkAppendLine(sc);
7274 if (res != VKFFT_SUCCESS) return res;
7275 }
7276 if (sc->zeropadBluestein[0]) {
7277 sc->tempLen = sprintf(sc->tempStr, " }\n");
7278 res = VkAppendLine(sc);
7279 if (res != VKFFT_SUCCESS) return res;
7280 }
7281 if (sc->axisSwapped) {
7282 if ((uint64_t)ceil(sc->size[1]) % sc->localSize[0] != 0) {
7283 sc->tempLen = sprintf(sc->tempStr, " }");
7284 res = VkAppendLine(sc);
7285 if (res != VKFFT_SUCCESS) return res;
7286 }
7287 }
7288 else {
7289 if ((uint64_t)ceil(sc->size[1]) % sc->localSize[1] != 0) {
7290 sc->tempLen = sprintf(sc->tempStr, " }");
7291 res = VkAppendLine(sc);
7292 if (res != VKFFT_SUCCESS) return res;
7293 }
7294 }
7295 }
7296 }
7297 res = appendBarrierVkFFT(sc, 1);
7298 if (res != VKFFT_SUCCESS) return res;
7299 for (uint64_t k = 0; k < sc->registerBoost; k++) {
7300 for (uint64_t i = 0; i < 2 * sc->min_registers_per_thread; i++) {
7301
7302 if (sc->localSize[1] == 1)
7303 sc->tempLen = sprintf(sc->tempStr, " combinedID = %s + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, (i + k * 2 * sc->min_registers_per_thread) * sc->localSize[0]);
7304 else
7305 sc->tempLen = sprintf(sc->tempStr, " combinedID = (%s + %" PRIu64 " * %s) + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, sc->localSize[0], sc->gl_LocalInvocationID_y, (i + k * 2 * sc->min_registers_per_thread) * sc->localSize[0] * sc->localSize[1]);
7306 res = VkAppendLine(sc);
7307 if (res != VKFFT_SUCCESS) return res;
7308
7309 if (sc->inputStride[0] > 1)
7310 sc->tempLen = sprintf(sc->tempStr, " inoutID = (combinedID %% %" PRIu64 ") * %" PRIu64 " + (combinedID / %" PRIu64 ") * %" PRIu64 ";\n", 2 * sc->fftDim, sc->inputStride[0], 2 * sc->fftDim, sc->inputStride[1]);
7311 else
7312 sc->tempLen = sprintf(sc->tempStr, " inoutID = (combinedID %% %" PRIu64 ") + (combinedID / %" PRIu64 ") * %" PRIu64 ";\n", 2 * sc->fftDim, 2 * sc->fftDim, sc->inputStride[1]);
7313 res = VkAppendLine(sc);
7314 if (res != VKFFT_SUCCESS) return res;
7315 if (sc->axisSwapped) {
7316 if ((uint64_t)ceil(sc->size[1]) % sc->localSize[0] != 0) {
7317 sc->tempLen = sprintf(sc->tempStr, " if(combinedID / %" PRIu64 " + (%s%s)*%" PRIu64 "< %" PRIu64 "){", 2 * sc->fftDim, sc->gl_WorkGroupID_y, shiftY, sc->localSize[0], (uint64_t)ceil(sc->size[1]));
7318 res = VkAppendLine(sc);
7319 if (res != VKFFT_SUCCESS) return res;
7320 }
7321 }
7322 else {
7323 if ((uint64_t)ceil(sc->size[1]) % sc->localSize[1] != 0) {
7324 sc->tempLen = sprintf(sc->tempStr, " if(combinedID / %" PRIu64 " + (%s%s)*%" PRIu64 "< %" PRIu64 "){", 2 * sc->fftDim, sc->gl_WorkGroupID_y, shiftY, sc->localSize[1], (uint64_t)ceil(sc->size[1]));
7325 res = VkAppendLine(sc);
7326 if (res != VKFFT_SUCCESS) return res;
7327 }
7328 }
7329 if (sc->zeropadBluestein[0]) {
7330 sc->tempLen = sprintf(sc->tempStr, " if(combinedID < %" PRIu64 "){\n", maxBluesteinCutOff);
7331 res = VkAppendLine(sc);
7332 if (res != VKFFT_SUCCESS) return res;
7333 }
7334 if (sc->zeropad[0]) {
7335 sc->tempLen = sprintf(sc->tempStr, " if((inoutID %% %" PRIu64 " < %" PRIu64 ")||(inoutID %% %" PRIu64 " >= %" PRIu64 ")){\n", sc->inputStride[1], sc->fft_zeropad_left_read[sc->axis_id], sc->inputStride[1], sc->fft_zeropad_right_read[sc->axis_id]);
7336 res = VkAppendLine(sc);
7337 if (res != VKFFT_SUCCESS) return res;
7338 }
7339 sc->tempLen = sprintf(sc->tempStr, " %s = ", sc->inoutID);
7340 res = VkAppendLine(sc);
7341 if (res != VKFFT_SUCCESS) return res;
7342 indexInputVkFFT(sc, uintType, readType, sc->inoutID, 0, requestCoordinate, requestBatch);
7343 if (res != VKFFT_SUCCESS) return res;
7344 sc->tempLen = sprintf(sc->tempStr, ";\n");
7345 res = VkAppendLine(sc);
7346 if (res != VKFFT_SUCCESS) return res;
7348 if (res != VKFFT_SUCCESS) return res;
7349 if (sc->axisSwapped) {
7350 //sc->tempLen = sprintf(sc->tempStr, " sdataID = (((combinedID %% %" PRIu64 ") %% 2) * %" PRIu64 " + (1-2*((combinedID %% %" PRIu64 ") %% 2)) * ((combinedID %% %" PRIu64 ")/2)) * sharedStride + (combinedID / %" PRIu64 ");\n", sc->fftDim, sc->fftDim - 1, sc->fftDim, sc->fftDim, sc->fftDim);
7351 sc->tempLen = sprintf(sc->tempStr, " sdataID = ((combinedID %% %" PRIu64 ")/2) * sharedStride + (combinedID / %" PRIu64 ");\n", 2 * sc->fftDim, 2 * sc->fftDim);
7352 }
7353 else {
7354 //sc->tempLen = sprintf(sc->tempStr, " sdataID = (((combinedID %% %" PRIu64 ") %% 2) * %" PRIu64 " + (1-2*((combinedID %% %" PRIu64 ") %% 2)) * ((combinedID %% %" PRIu64 ")/2)) + (combinedID / %" PRIu64 ") * sharedStride;\n", sc->fftDim, sc->fftDim - 1, sc->fftDim, sc->fftDim, sc->fftDim);
7355 sc->tempLen = sprintf(sc->tempStr, " sdataID = ((combinedID %% %" PRIu64 ")/2) + (combinedID / %" PRIu64 ") * sharedStride;\n", 2 * sc->fftDim, 2 * sc->fftDim);
7356 }
7357 res = VkAppendLine(sc);
7358 if (res != VKFFT_SUCCESS) return res;
7359 if (i < sc->min_registers_per_thread) {
7360 sc->tempLen = sprintf(sc->tempStr, " if (((combinedID %% %" PRIu64 ")%%2) == 1) {\n", 2 * sc->fftDim);
7361 res = VkAppendLine(sc);
7362 if (res != VKFFT_SUCCESS) return res;
7363 sc->tempLen = sprintf(sc->tempStr, " sdata[sdataID].y = %s.x;\n", sc->regIDs[i + k * sc->registers_per_thread]);
7364 res = VkAppendLine(sc);
7365 if (res != VKFFT_SUCCESS) return res;
7366 sc->tempLen = sprintf(sc->tempStr, " }\n");
7367 res = VkAppendLine(sc);
7368 if (res != VKFFT_SUCCESS) return res;
7369 }
7370 else {
7371 sc->tempLen = sprintf(sc->tempStr, " if (((combinedID %% %" PRIu64 ")%%2) == 1) {\n", 2 * sc->fftDim);
7372 res = VkAppendLine(sc);
7373 if (res != VKFFT_SUCCESS) return res;
7374 sc->tempLen = sprintf(sc->tempStr, " sdata[sdataID].y = %s.y;\n", sc->regIDs[i - sc->min_registers_per_thread + k * sc->registers_per_thread]);
7375 res = VkAppendLine(sc);
7376 if (res != VKFFT_SUCCESS) return res;
7377 sc->tempLen = sprintf(sc->tempStr, " }\n");
7378 res = VkAppendLine(sc);
7379 if (res != VKFFT_SUCCESS) return res;
7380 }
7382 if (res != VKFFT_SUCCESS) return res;
7383 if (sc->zeropad[0]) {
7384 sc->tempLen = sprintf(sc->tempStr, " }else{\n");
7385 res = VkAppendLine(sc);
7386 if (res != VKFFT_SUCCESS) return res;
7387
7388 if (sc->axisSwapped) {
7389 //sc->tempLen = sprintf(sc->tempStr, " sdataID = (((combinedID %% %" PRIu64 ") %% 2) * %" PRIu64 " + (1-2*((combinedID %% %" PRIu64 ") %% 2)) * ((combinedID %% %" PRIu64 ")/2)) * sharedStride + (combinedID / %" PRIu64 ");\n", sc->fftDim, sc->fftDim - 1, sc->fftDim, sc->fftDim, sc->fftDim);
7390 sc->tempLen = sprintf(sc->tempStr, " sdataID = ((combinedID %% %" PRIu64 ")/2) * sharedStride + (combinedID / %" PRIu64 ");\n", 2 * sc->fftDim, 2 * sc->fftDim);
7391 }
7392 else {
7393 //sc->tempLen = sprintf(sc->tempStr, " sdataID = (((combinedID %% %" PRIu64 ") %% 2) * %" PRIu64 " + (1-2*((combinedID %% %" PRIu64 ") %% 2)) * ((combinedID %% %" PRIu64 ")/2)) + (combinedID / %" PRIu64 ") * sharedStride;\n", sc->fftDim, sc->fftDim - 1, sc->fftDim, sc->fftDim, sc->fftDim);
7394 sc->tempLen = sprintf(sc->tempStr, " sdataID = ((combinedID %% %" PRIu64 ")/2) + (combinedID / %" PRIu64 ") * sharedStride;\n", 2 * sc->fftDim, 2 * sc->fftDim);
7395 }
7396 res = VkAppendLine(sc);
7397 if (res != VKFFT_SUCCESS) return res;
7398 sc->tempLen = sprintf(sc->tempStr, " if (((combinedID %% %" PRIu64 ")%%2) == 0) {\n", 2 * sc->fftDim);
7399 res = VkAppendLine(sc);
7400 if (res != VKFFT_SUCCESS) return res;
7401 sc->tempLen = sprintf(sc->tempStr, " sdata[sdataID].x = 0;\n");
7402 res = VkAppendLine(sc);
7403 if (res != VKFFT_SUCCESS) return res;
7404 sc->tempLen = sprintf(sc->tempStr, " }\n");
7405 res = VkAppendLine(sc);
7406 if (res != VKFFT_SUCCESS) return res;
7407 sc->tempLen = sprintf(sc->tempStr, " if (((combinedID %% %" PRIu64 ")%%2) == 1) {\n", 2 * sc->fftDim);//another OpenCL bugfix
7408 res = VkAppendLine(sc);
7409 if (res != VKFFT_SUCCESS) return res;
7410 sc->tempLen = sprintf(sc->tempStr, " sdata[sdataID].y = 0;\n");
7411 res = VkAppendLine(sc);
7412 if (res != VKFFT_SUCCESS) return res;
7413 sc->tempLen = sprintf(sc->tempStr, " }\n");
7414 res = VkAppendLine(sc);
7415 if (res != VKFFT_SUCCESS) return res;
7416 sc->tempLen = sprintf(sc->tempStr, " }\n");
7417 res = VkAppendLine(sc);
7418 if (res != VKFFT_SUCCESS) return res;
7419 }
7420 if (sc->zeropadBluestein[0]) {
7421 sc->tempLen = sprintf(sc->tempStr, " }\n");
7422 res = VkAppendLine(sc);
7423 if (res != VKFFT_SUCCESS) return res;
7424 }
7425 if (sc->axisSwapped) {
7426 if ((uint64_t)ceil(sc->size[1]) % sc->localSize[0] != 0) {
7427 sc->tempLen = sprintf(sc->tempStr, " }");
7428 res = VkAppendLine(sc);
7429 if (res != VKFFT_SUCCESS) return res;
7430 }
7431 }
7432 else {
7433 if ((uint64_t)ceil(sc->size[1]) % sc->localSize[1] != 0) {
7434 sc->tempLen = sprintf(sc->tempStr, " }");
7435 res = VkAppendLine(sc);
7436 if (res != VKFFT_SUCCESS) return res;
7437 }
7438 }
7439
7440 }
7441 }
7442#endif
7443 res = appendBarrierVkFFT(sc, 1);
7444 if (res != VKFFT_SUCCESS) return res;
7445 res = appendZeropadStart(sc);
7446 if (res != VKFFT_SUCCESS) return res;
7447 if (sc->zeropadBluestein[0]) {
7448 if (sc->axisSwapped)
7449 maxBluesteinCutOff = sc->fftDim * sc->localSize[0];
7450 else
7451 maxBluesteinCutOff = sc->fftDim * sc->localSize[1];
7452 }
7453 for (uint64_t k = 0; k < sc->registerBoost; k++) {
7454 for (uint64_t i = 0; i < sc->min_registers_per_thread; i++) {
7455
7456 if (sc->localSize[1] == 1)
7457 sc->tempLen = sprintf(sc->tempStr, " combinedID = %s + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, (i + k * sc->min_registers_per_thread) * sc->localSize[0]);
7458 else
7459 sc->tempLen = sprintf(sc->tempStr, " combinedID = (%s + %" PRIu64 " * %s) + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, sc->localSize[0], sc->gl_LocalInvocationID_y, (i + k * sc->min_registers_per_thread) * sc->localSize[0] * sc->localSize[1]);
7460 res = VkAppendLine(sc);
7461 if (res != VKFFT_SUCCESS) return res;
7462 if (sc->zeropadBluestein[0]) {
7463 sc->tempLen = sprintf(sc->tempStr, " if(combinedID < %" PRIu64 "){\n", maxBluesteinCutOff);
7464 res = VkAppendLine(sc);
7465 if (res != VKFFT_SUCCESS) return res;
7466 }
7467 if (sc->axisSwapped) {
7468 sc->tempLen = sprintf(sc->tempStr, " sdataID = (combinedID %% %" PRIu64 ") * sharedStride + (combinedID / %" PRIu64 ");\n", sc->fftDim, sc->fftDim);
7469 }
7470 else {
7471 sc->tempLen = sprintf(sc->tempStr, " sdataID = (combinedID %% %" PRIu64 ") + (combinedID / %" PRIu64 ") * sharedStride;\n", sc->fftDim, sc->fftDim);
7472 }
7473 res = VkAppendLine(sc);
7474 if (res != VKFFT_SUCCESS) return res;
7475 sc->tempLen = sprintf(sc->tempStr, " if((combinedID %% %" PRIu64 ")>0){\n", sc->fftDim);
7476 res = VkAppendLine(sc);
7477 if (res != VKFFT_SUCCESS) return res;
7478 if (sc->axisSwapped) {
7479 sc->tempLen = sprintf(sc->tempStr, " %s.x = sdata[sdataID-sharedStride].y;\n", sc->w);
7480 }
7481 else {
7482 sc->tempLen = sprintf(sc->tempStr, " %s.x = sdata[sdataID-1].y;\n", sc->w);
7483 }
7484 res = VkAppendLine(sc);
7485 if (res != VKFFT_SUCCESS) return res;
7486 sc->tempLen = sprintf(sc->tempStr, " %s.y = sdata[sdataID].x;\n", sc->w);
7487 res = VkAppendLine(sc);
7488 if (res != VKFFT_SUCCESS) return res;
7489 sc->tempLen = sprintf(sc->tempStr, " %s.x = %s.x+%s.y;\n", sc->regIDs[i + k * sc->registers_per_thread], sc->w, sc->w);
7490 res = VkAppendLine(sc);
7491 if (res != VKFFT_SUCCESS) return res;
7492 sc->tempLen = sprintf(sc->tempStr, " %s.y = %s.x-%s.y;\n", sc->regIDs[i + k * sc->registers_per_thread], sc->w, sc->w);
7493 res = VkAppendLine(sc);
7494 if (res != VKFFT_SUCCESS) return res;
7495 sc->tempLen = sprintf(sc->tempStr, " }else{\n");
7496 res = VkAppendLine(sc);
7497 if (res != VKFFT_SUCCESS) return res;
7498 sc->tempLen = sprintf(sc->tempStr, " %s.x = 2*sdata[sdataID].x;\n", sc->regIDs[i + k * sc->registers_per_thread]);
7499 res = VkAppendLine(sc);
7500 if (res != VKFFT_SUCCESS) return res;
7501 if (sc->axisSwapped) {
7502 sc->tempLen = sprintf(sc->tempStr, " sdataID = (%" PRIu64 ") * sharedStride + (combinedID / %" PRIu64 ");\n", sc->fftDim - 1, sc->fftDim);
7503 }
7504 else {
7505 sc->tempLen = sprintf(sc->tempStr, " sdataID = (%" PRIu64 ") + (combinedID / %" PRIu64 ") * sharedStride;\n", sc->fftDim - 1, sc->fftDim);
7506 }
7507 res = VkAppendLine(sc);
7508 if (res != VKFFT_SUCCESS) return res;
7509 sc->tempLen = sprintf(sc->tempStr, " %s.y = 2*sdata[sdataID].y;\n", sc->regIDs[i + k * sc->registers_per_thread]);
7510 res = VkAppendLine(sc);
7511 if (res != VKFFT_SUCCESS) return res;
7512 sc->tempLen = sprintf(sc->tempStr, " }\n");
7513 res = VkAppendLine(sc);
7514 if (res != VKFFT_SUCCESS) return res;
7515 if (sc->zeropadBluestein[0]) {
7516 sc->tempLen = sprintf(sc->tempStr, " }\n");
7517 res = VkAppendLine(sc);
7518 if (res != VKFFT_SUCCESS) return res;
7519 }
7520
7521 /*sc->tempLen = sprintf(sc->tempStr, " printf(\" %%f %%f %%d\\n\", %s.x, %s.y, %s);\n", sc->regIDs[i + k * sc->registers_per_thread], sc->regIDs[i + k * sc->registers_per_thread], sc->gl_LocalInvocationID_x);
7522 res = VkAppendLine(sc);
7523 if (res != VKFFT_SUCCESS) return res;*/
7524 }
7525 }
7526 res = appendZeropadEnd(sc);
7527 if (res != VKFFT_SUCCESS) return res;
7528 res = appendBarrierVkFFT(sc, 1);
7529 if (res != VKFFT_SUCCESS) return res;
7530 res = appendZeropadStart(sc);
7531 if (res != VKFFT_SUCCESS) return res;
7532 for (uint64_t k = 0; k < sc->registerBoost; k++) {
7533 for (uint64_t i = 0; i < sc->min_registers_per_thread; i++) {
7534
7535 if (sc->localSize[1] == 1)
7536 sc->tempLen = sprintf(sc->tempStr, " combinedID = %s + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, (i + k * sc->min_registers_per_thread) * sc->localSize[0]);
7537 else
7538 sc->tempLen = sprintf(sc->tempStr, " combinedID = (%s + %" PRIu64 " * %s) + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, sc->localSize[0], sc->gl_LocalInvocationID_y, (i + k * sc->min_registers_per_thread) * sc->localSize[0] * sc->localSize[1]);
7539 res = VkAppendLine(sc);
7540 if (res != VKFFT_SUCCESS) return res;
7541 if (sc->zeropadBluestein[0]) {
7542 sc->tempLen = sprintf(sc->tempStr, " if(combinedID < %" PRIu64 "){\n", maxBluesteinCutOff);
7543 res = VkAppendLine(sc);
7544 if (res != VKFFT_SUCCESS) return res;
7545 }
7546 if (sc->axisSwapped) {
7547 sc->tempLen = sprintf(sc->tempStr, " sdataID = (combinedID %% %" PRIu64 ") * sharedStride + (combinedID / %" PRIu64 ");\n", sc->fftDim, sc->fftDim);
7548 }
7549 else {
7550 sc->tempLen = sprintf(sc->tempStr, " sdataID = (combinedID %% %" PRIu64 ") + (combinedID / %" PRIu64 ") * sharedStride;\n", sc->fftDim, sc->fftDim);
7551 }
7552 res = VkAppendLine(sc);
7553 if (res != VKFFT_SUCCESS) return res;
7554 sc->tempLen = sprintf(sc->tempStr, " if((combinedID %% %" PRIu64 ")>0){\n", sc->fftDim);
7555 res = VkAppendLine(sc);
7556 if (res != VKFFT_SUCCESS) return res;
7557 sc->tempLen = sprintf(sc->tempStr, " sdata[sdataID].x = %s.x;\n", sc->regIDs[i + k * sc->registers_per_thread]);
7558 res = VkAppendLine(sc);
7559 if (res != VKFFT_SUCCESS) return res;
7560#if(VKFFT_BACKEND!=3)
7561 if (sc->axisSwapped) {
7562 sc->tempLen = sprintf(sc->tempStr, " sdataID = (%" PRIu64 " - combinedID %% %" PRIu64 ") * sharedStride + (combinedID / %" PRIu64 ");\n", sc->fftDim, sc->fftDim, sc->fftDim);
7563 }
7564 else {
7565 sc->tempLen = sprintf(sc->tempStr, " sdataID = (%" PRIu64 " - combinedID %% %" PRIu64 ") + (combinedID / %" PRIu64 ") * sharedStride;\n", sc->fftDim, sc->fftDim, sc->fftDim);
7566 }
7567 res = VkAppendLine(sc);
7568 if (res != VKFFT_SUCCESS) return res;
7569 sc->tempLen = sprintf(sc->tempStr, " sdata[sdataID].y = %s.y;\n", sc->regIDs[i + k * sc->registers_per_thread]);
7570 res = VkAppendLine(sc);
7571 if (res != VKFFT_SUCCESS) return res;
7572#endif
7573 sc->tempLen = sprintf(sc->tempStr, " }else{\n");
7574 res = VkAppendLine(sc);
7575 if (res != VKFFT_SUCCESS) return res;
7576 sc->tempLen = sprintf(sc->tempStr, " sdata[sdataID] = %s;\n", sc->regIDs[i + k * sc->registers_per_thread]);
7577 res = VkAppendLine(sc);
7578 if (res != VKFFT_SUCCESS) return res;
7579 sc->tempLen = sprintf(sc->tempStr, " }\n");
7580 res = VkAppendLine(sc);
7581 if (res != VKFFT_SUCCESS) return res;
7582 if (sc->zeropadBluestein[0]) {
7583 sc->tempLen = sprintf(sc->tempStr, " }\n");
7584 res = VkAppendLine(sc);
7585 if (res != VKFFT_SUCCESS) return res;
7586 }
7587 /*sc->tempLen = sprintf(sc->tempStr, " printf(\" %%f %%f %%d\\n\", sdata[sdataID].x, sdata[sdataID].y, %s);\n", sc->gl_LocalInvocationID_x);
7588 res = VkAppendLine(sc);
7589 if (res != VKFFT_SUCCESS) return res;*/
7590 }
7591 }
7592 res = appendZeropadEnd(sc);
7593 if (res != VKFFT_SUCCESS) return res;
7594 res = appendBarrierVkFFT(sc, 1);
7595 if (res != VKFFT_SUCCESS) return res;
7596#if(VKFFT_BACKEND==3)
7597 res = appendZeropadStart(sc);
7598 if (res != VKFFT_SUCCESS) return res;
7599 for (uint64_t k = 0; k < sc->registerBoost; k++) {
7600 for (uint64_t i = 0; i < sc->min_registers_per_thread; i++) {
7601
7602 if (sc->localSize[1] == 1)
7603 sc->tempLen = sprintf(sc->tempStr, " combinedID = %s + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, (i + k * sc->min_registers_per_thread) * sc->localSize[0]);
7604 else
7605 sc->tempLen = sprintf(sc->tempStr, " combinedID = (%s + %" PRIu64 " * %s) + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, sc->localSize[0], sc->gl_LocalInvocationID_y, (i + k * sc->min_registers_per_thread) * sc->localSize[0] * sc->localSize[1]);
7606 res = VkAppendLine(sc);
7607 if (res != VKFFT_SUCCESS) return res;
7608 if (sc->zeropadBluestein[0]) {
7609 sc->tempLen = sprintf(sc->tempStr, " if(combinedID < %" PRIu64 "){\n", maxBluesteinCutOff);
7610 res = VkAppendLine(sc);
7611 if (res != VKFFT_SUCCESS) return res;
7612 }
7613 sc->tempLen = sprintf(sc->tempStr, " if((combinedID %% %" PRIu64 ")>0){\n", sc->fftDim);
7614 res = VkAppendLine(sc);
7615 if (res != VKFFT_SUCCESS) return res;
7616 if (sc->axisSwapped) {
7617 sc->tempLen = sprintf(sc->tempStr, " sdataID = (%" PRIu64 " - combinedID %% %" PRIu64 ") * sharedStride + (combinedID / %" PRIu64 ");\n", sc->fftDim, sc->fftDim, sc->fftDim);
7618 }
7619 else {
7620 sc->tempLen = sprintf(sc->tempStr, " sdataID = (%" PRIu64 " - combinedID %% %" PRIu64 ") + (combinedID / %" PRIu64 ") * sharedStride;\n", sc->fftDim, sc->fftDim, sc->fftDim);
7621 }
7622 res = VkAppendLine(sc);
7623 if (res != VKFFT_SUCCESS) return res;
7624 sc->tempLen = sprintf(sc->tempStr, " sdata[sdataID].y = %s.y;\n", sc->regIDs[i + k * sc->registers_per_thread]);
7625 res = VkAppendLine(sc);
7626 if (res != VKFFT_SUCCESS) return res;
7627 sc->tempLen = sprintf(sc->tempStr, " }\n");
7628 res = VkAppendLine(sc);
7629 if (res != VKFFT_SUCCESS) return res;
7630 if (sc->zeropadBluestein[0]) {
7631 sc->tempLen = sprintf(sc->tempStr, " }\n");
7632 res = VkAppendLine(sc);
7633 if (res != VKFFT_SUCCESS) return res;
7634 }
7635 /*sc->tempLen = sprintf(sc->tempStr, " printf(\" %%f %%f %%d\\n\", sdata[sdataID].x, sdata[sdataID].y, %s);\n", sc->gl_LocalInvocationID_x);
7636 res = VkAppendLine(sc);
7637 if (res != VKFFT_SUCCESS) return res;*/
7638 }
7639 }
7640 res = appendZeropadEnd(sc);
7641 if (res != VKFFT_SUCCESS) return res;
7642 res = appendBarrierVkFFT(sc, 1);
7643 if (res != VKFFT_SUCCESS) return res;
7644#endif
7645 res = appendZeropadStart(sc);
7646 if (res != VKFFT_SUCCESS) return res;
7647 uint64_t num_in = (sc->axisSwapped) ? (uint64_t)ceil((sc->fftDim / 2 + 1) / (double)sc->localSize[1]) : (uint64_t)ceil((sc->fftDim / 2 + 1) / (double)sc->localSize[0]);
7648
7649 for (uint64_t k = 0; k < sc->registerBoost; k++) {
7650 for (uint64_t i = 0; i < num_in; i++) {
7651
7652 if (sc->localSize[1] == 1)
7653 sc->tempLen = sprintf(sc->tempStr, " combinedID = %s + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, (i + k * num_in) * sc->localSize[0]);
7654 else
7655 sc->tempLen = sprintf(sc->tempStr, " combinedID = (%s + %" PRIu64 " * %s) + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, sc->localSize[0], sc->gl_LocalInvocationID_y, (i + k * num_in) * sc->localSize[0] * sc->localSize[1]);
7656 res = VkAppendLine(sc);
7657 if (res != VKFFT_SUCCESS) return res;
7658 if (sc->axisSwapped) {
7659 if ((1 + i + k * num_in) * sc->localSize[0] * sc->localSize[1] >= (sc->fftDim / 2 + 1) * sc->localSize[0]) {
7660 sc->tempLen = sprintf(sc->tempStr, " if(combinedID < %" PRIu64 "){\n", (sc->fftDim / 2 + 1) * sc->localSize[0]);
7661 res = VkAppendLine(sc);
7662 if (res != VKFFT_SUCCESS) return res;
7663 }
7664 }
7665 else {
7666 if ((1 + i + k * num_in) * sc->localSize[0] * sc->localSize[1] >= (sc->fftDim / 2 + 1) * sc->localSize[1]) {
7667 sc->tempLen = sprintf(sc->tempStr, " if(combinedID < %" PRIu64 "){\n", (sc->fftDim / 2 + 1) * sc->localSize[1]);
7668 res = VkAppendLine(sc);
7669 if (res != VKFFT_SUCCESS) return res;
7670 }
7671 }
7672 if (sc->LUT) {
7673 sc->tempLen = sprintf(sc->tempStr, " mult = twiddleLUT[%" PRIu64 " + combinedID %% %" PRIu64 "];\n", sc->startDCT3LUT, sc->fftDim / 2 + 1);
7674 res = VkAppendLine(sc);
7675 if (res != VKFFT_SUCCESS) return res;
7676 }
7677 else {
7678 sc->tempLen = sprintf(sc->tempStr, " mult.x = %s(%.17f%s * (combinedID %% %" PRIu64 ") );\n", cosDef, double_PI / 2 / sc->fftDim, LFending, sc->fftDim / 2 + 1);
7679 res = VkAppendLine(sc);
7680 if (res != VKFFT_SUCCESS) return res;
7681 sc->tempLen = sprintf(sc->tempStr, " mult.y = %s(%.17f%s * (combinedID %% %" PRIu64 ") );\n", sinDef, double_PI / 2 / sc->fftDim, LFending, sc->fftDim / 2 + 1);
7682 res = VkAppendLine(sc);
7683 if (res != VKFFT_SUCCESS) return res;
7684 }
7685
7686 if (sc->axisSwapped) {
7687 sc->tempLen = sprintf(sc->tempStr, " sdataID = (combinedID %% %" PRIu64 ") * sharedStride + (combinedID / %" PRIu64 ");\n", sc->fftDim / 2 + 1, sc->fftDim / 2 + 1);
7688 res = VkAppendLine(sc);
7689 if (res != VKFFT_SUCCESS) return res;
7690 }
7691 else {
7692 sc->tempLen = sprintf(sc->tempStr, " sdataID = (combinedID %% %" PRIu64 ") + (combinedID / %" PRIu64 ") * sharedStride ;\n", sc->fftDim / 2 + 1, sc->fftDim / 2 + 1);
7693 res = VkAppendLine(sc);
7694 if (res != VKFFT_SUCCESS) return res;
7695 }
7696 sc->tempLen = sprintf(sc->tempStr, " %s = sdata[sdataID];\n", sc->regIDs[0]);
7697 res = VkAppendLine(sc);
7698 if (res != VKFFT_SUCCESS) return res;
7699
7700 sc->tempLen = sprintf(sc->tempStr, " if (combinedID %% %" PRIu64 " > 0){\n", sc->fftDim / 2 + 1);
7701 res = VkAppendLine(sc);
7702 if (res != VKFFT_SUCCESS) return res;
7703
7704 if (sc->axisSwapped) {
7705 sc->tempLen = sprintf(sc->tempStr, " inoutID = (%" PRIu64 " - combinedID %% %" PRIu64 ") * sharedStride + (combinedID / %" PRIu64 ");\n", sc->fftDim, sc->fftDim / 2 + 1, sc->fftDim / 2 + 1);
7706 res = VkAppendLine(sc);
7707 if (res != VKFFT_SUCCESS) return res;
7708 }
7709 else {
7710 sc->tempLen = sprintf(sc->tempStr, " inoutID = (%" PRIu64 " - combinedID %% %" PRIu64 ") + (combinedID / %" PRIu64 ") * sharedStride ;\n", sc->fftDim, sc->fftDim / 2 + 1, sc->fftDim / 2 + 1);
7711 res = VkAppendLine(sc);
7712 if (res != VKFFT_SUCCESS) return res;
7713 }
7714 sc->tempLen = sprintf(sc->tempStr, " %s = sdata[inoutID];\n", sc->regIDs[1]);
7715 res = VkAppendLine(sc);
7716 if (res != VKFFT_SUCCESS) return res;
7717
7718 sc->tempLen = sprintf(sc->tempStr, " sdata[sdataID].x = ((%s.x+%s.y)*mult.x+(%s.x-%s.y)*mult.y);\n", sc->regIDs[0], sc->regIDs[1], sc->regIDs[1], sc->regIDs[0]);
7719 res = VkAppendLine(sc);
7720 if (res != VKFFT_SUCCESS) return res;
7721
7722 sc->tempLen = sprintf(sc->tempStr, " sdata[sdataID].y = ((-%s.x+%s.y)*mult.x+(%s.x+%s.y)*mult.y);\n", sc->regIDs[1], sc->regIDs[0], sc->regIDs[0], sc->regIDs[1]);
7723 res = VkAppendLine(sc);
7724 if (res != VKFFT_SUCCESS) return res;
7725 sc->tempLen = sprintf(sc->tempStr, " sdata[inoutID].x = ((%s.x-%s.y)*mult.x+(%s.x+%s.y)*mult.y);\n", sc->regIDs[0], sc->regIDs[1], sc->regIDs[1], sc->regIDs[0]);
7726 res = VkAppendLine(sc);
7727 if (res != VKFFT_SUCCESS) return res;
7728 sc->tempLen = sprintf(sc->tempStr, " sdata[inoutID].y = ((%s.x+%s.y)*mult.x-(%s.x-%s.y)*mult.y);\n", sc->regIDs[1], sc->regIDs[0], sc->regIDs[0], sc->regIDs[1]);
7729 res = VkAppendLine(sc);
7730 if (res != VKFFT_SUCCESS) return res;
7731
7732 sc->tempLen = sprintf(sc->tempStr, " } \n");
7733 res = VkAppendLine(sc);
7734
7735 if (res != VKFFT_SUCCESS) return res;
7736 sc->tempLen = sprintf(sc->tempStr, " if (combinedID %% %" PRIu64 " == 0){\n", sc->fftDim / 2 + 1);
7737 res = VkAppendLine(sc);
7738 if (res != VKFFT_SUCCESS) return res;
7739 sc->tempLen = sprintf(sc->tempStr, " sdata[sdataID].x = (%s.x*mult.x-%s.y*mult.y);\n", sc->regIDs[0], sc->regIDs[0]);
7740 res = VkAppendLine(sc);
7741 if (res != VKFFT_SUCCESS) return res;
7742 sc->tempLen = sprintf(sc->tempStr, " sdata[sdataID].y = (%s.y*mult.x+%s.x*mult.y);\n", sc->regIDs[0], sc->regIDs[0]);
7743 res = VkAppendLine(sc);
7744 if (res != VKFFT_SUCCESS) return res;
7745 sc->tempLen = sprintf(sc->tempStr, " }\n");
7746 res = VkAppendLine(sc);
7747 if (res != VKFFT_SUCCESS) return res;
7748 if (sc->axisSwapped) {
7749 if ((1 + i + k * num_in) * sc->localSize[0] * sc->localSize[1] >= (sc->fftDim / 2 + 1) * sc->localSize[0]) {
7750 sc->tempLen = sprintf(sc->tempStr, " }\n");
7751 res = VkAppendLine(sc);
7752 if (res != VKFFT_SUCCESS) return res;
7753 }
7754 }
7755 else {
7756 if ((1 + i + k * num_in) * sc->localSize[0] * sc->localSize[1] >= (sc->fftDim / 2 + 1) * sc->localSize[1]) {
7757 sc->tempLen = sprintf(sc->tempStr, " }\n");
7758 res = VkAppendLine(sc);
7759 if (res != VKFFT_SUCCESS) return res;
7760 }
7761 }
7762 }
7763 }
7764 res = appendZeropadEnd(sc);
7765 if (res != VKFFT_SUCCESS) return res;
7766 if (sc->zeropadBluestein[0]) sc->fftDim = sc->fft_dim_full;
7767 }
7768 else {
7769 //Not implemented
7770 }
7771 break;
7772 }
7773 case 143://DCT-IV strided as 2xN/2 DCT-II
7774 {
7775 char shiftX[500] = "";
7776 if (sc->performWorkGroupShift[0])
7777 sprintf(shiftX, " + consts.workGroupShiftX ");
7778 char shiftX2[500] = "";
7779 if (sc->performWorkGroupShift[0])
7780 sprintf(shiftX2, " + consts.workGroupShiftX * %s ", sc->gl_WorkGroupSize_x);
7781 char shiftY[500] = "";
7782 if (sc->performWorkGroupShift[1])
7783 sprintf(shiftY, " + consts.workGroupShiftY ");
7784 if (sc->fftDim == sc->fft_dim_full) {
7785 if (sc->zeropadBluestein[0]) {
7786 res = appendSetSMToZero(sc, floatType, floatTypeMemory, uintType, readType);
7787 if (res != VKFFT_SUCCESS) return res;
7788 res = appendBarrierVkFFT(sc, 1);
7789 if (res != VKFFT_SUCCESS) return res;
7791 }
7792 for (uint64_t k = 0; k < sc->registerBoost; k++) {
7793 for (uint64_t i = 0; i < 2 * sc->min_registers_per_thread; i++) {
7794
7795 if (sc->localSize[1] == 1)
7796 sc->tempLen = sprintf(sc->tempStr, " combinedID = %s + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, (i + k * 2 * sc->min_registers_per_thread) * sc->localSize[0]);
7797 else
7798 sc->tempLen = sprintf(sc->tempStr, " combinedID = (%s + %" PRIu64 " * %s) + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, sc->localSize[0], sc->gl_LocalInvocationID_y, (i + k * 2 * sc->min_registers_per_thread) * sc->localSize[0] * sc->localSize[1]);
7799 res = VkAppendLine(sc);
7800 if (res != VKFFT_SUCCESS) return res;
7801 if (sc->zeropadBluestein[0]) {
7802 sc->tempLen = sprintf(sc->tempStr, " if(combinedID < %" PRIu64 "){\n", 2 * sc->fftDim * sc->localSize[0]);
7803 res = VkAppendLine(sc);
7804 if (res != VKFFT_SUCCESS) return res;
7805 }
7806 if ((uint64_t)ceil(sc->size[0]) % sc->localSize[0] != 0) {
7807 sc->tempLen = sprintf(sc->tempStr, " if ((%s%s) < %" PRIu64 ") {\n", sc->gl_GlobalInvocationID_x, shiftX2, (uint64_t)ceil(sc->size[0]));
7808 res = VkAppendLine(sc);
7809 if (res != VKFFT_SUCCESS) return res;
7810 }
7811 sc->tempLen = sprintf(sc->tempStr, " %s = ", sc->inoutID);
7812 res = VkAppendLine(sc);
7813 if (res != VKFFT_SUCCESS) return res;
7814 sprintf(index_x, "(%s%s) %% (%" PRIu64 ")", sc->gl_GlobalInvocationID_x, shiftX2, sc->fft_dim_x);
7815 sprintf(index_y, "(%s + %" PRIu64 ")", sc->gl_LocalInvocationID_y, (i + k * 2 * sc->min_registers_per_thread) * sc->localSize[1]);
7816 res = indexInputVkFFT(sc, uintType, readType, index_x, index_y, requestCoordinate, requestBatch);
7817 if (res != VKFFT_SUCCESS) return res;
7818 sc->tempLen = sprintf(sc->tempStr, ";\n");
7819 res = VkAppendLine(sc);
7820 if (res != VKFFT_SUCCESS) return res;
7821 if (sc->zeropad[0]) {
7822 sc->tempLen = sprintf(sc->tempStr, " if((%s %% %" PRIu64 " < %" PRIu64 ")||(%s %% %" PRIu64 " >= %" PRIu64 ")){\n", index_y, sc->fft_dim_full, sc->fft_zeropad_left_read[sc->axis_id], index_y, sc->fft_dim_full, sc->fft_zeropad_right_read[sc->axis_id]);
7823 res = VkAppendLine(sc);
7824 if (res != VKFFT_SUCCESS) return res;
7825 }
7826
7828 if (res != VKFFT_SUCCESS) return res;
7829#if(VKFFT_BACKEND!=3)//OpenCL is not handling barrier with thread-conditional writes to local memory - so this is a work-around
7830 if (sc->inputBufferBlockNum == 1)
7831 sc->tempLen = sprintf(sc->tempStr, " %s.x = %s%s[%s]%s;\n", sc->regIDs[0], convTypeLeft, inputsStruct, sc->inoutID, convTypeRight);
7832 else
7833 sc->tempLen = sprintf(sc->tempStr, " %s.x = %sinputBlocks[%s / %" PRIu64 "]%s[%s %% %" PRIu64 "]%s;\n", sc->regIDs[0], convTypeLeft, sc->inoutID, sc->inputBufferBlockSize, inputsStruct, sc->inoutID, sc->inputBufferBlockSize, convTypeRight);
7834 res = VkAppendLine(sc);
7835 if (res != VKFFT_SUCCESS) return res;
7836#else
7837 if (i < sc->min_registers_per_thread) {
7838 if (sc->inputBufferBlockNum == 1)
7839 sc->tempLen = sprintf(sc->tempStr, " %s.x = %s%s[%s]%s;\n", sc->regIDs[i + k * sc->registers_per_thread], convTypeLeft, inputsStruct, sc->inoutID, convTypeRight);
7840 else
7841 sc->tempLen = sprintf(sc->tempStr, " %s.x = %sinputBlocks[%s / %" PRIu64 "]%s[%s %% %" PRIu64 "]%s;\n", sc->regIDs[i + k * sc->registers_per_thread], convTypeLeft, sc->inoutID, sc->inputBufferBlockSize, inputsStruct, sc->inoutID, sc->inputBufferBlockSize, convTypeRight);
7842 res = VkAppendLine(sc);
7843 if (res != VKFFT_SUCCESS) return res;
7844 }
7845 else {
7846 if (sc->inputBufferBlockNum == 1)
7847 sc->tempLen = sprintf(sc->tempStr, " %s.y = %s%s[%s]%s;\n", sc->regIDs[i - sc->min_registers_per_thread + k * sc->registers_per_thread], convTypeLeft, inputsStruct, sc->inoutID, convTypeRight);
7848 else
7849 sc->tempLen = sprintf(sc->tempStr, " %s.y = %sinputBlocks[%s / %" PRIu64 "]%s[%s %% %" PRIu64 "]%s;\n", sc->regIDs[i - sc->min_registers_per_thread + k * sc->registers_per_thread], convTypeLeft, sc->inoutID, sc->inputBufferBlockSize, inputsStruct, sc->inoutID, sc->inputBufferBlockSize, convTypeRight);
7850 res = VkAppendLine(sc);
7851 if (res != VKFFT_SUCCESS) return res;
7852 }
7853#endif
7854#if(VKFFT_BACKEND!=3)//OpenCL is not handling barrier with thread-conditional writes to local memory - so this is a work-around: we do writes in a separate stage
7855 sc->tempLen = sprintf(sc->tempStr, " sdataID = ((combinedID / %" PRIu64 ")/2) * sharedStride + (combinedID %% %" PRIu64 ");\n", sc->localSize[0], sc->localSize[0]);
7856
7857 res = VkAppendLine(sc);
7858 if (res != VKFFT_SUCCESS) return res;
7859 sc->tempLen = sprintf(sc->tempStr, " if ((combinedID / %" PRIu64 ")%%2 == 0) {\n", sc->localSize[0]);
7860 res = VkAppendLine(sc);
7861 if (res != VKFFT_SUCCESS) return res;
7862 sc->tempLen = sprintf(sc->tempStr, " sdata[sdataID].x = %s.x;\n", sc->regIDs[0]);
7863 res = VkAppendLine(sc);
7864 if (res != VKFFT_SUCCESS) return res;
7865 sc->tempLen = sprintf(sc->tempStr, " } else {\n");
7866 res = VkAppendLine(sc);
7867 if (res != VKFFT_SUCCESS) return res;
7868 sc->tempLen = sprintf(sc->tempStr, " sdata[sdataID].y = %s.x;\n", sc->regIDs[0]);
7869 res = VkAppendLine(sc);
7870 if (res != VKFFT_SUCCESS) return res;
7871 sc->tempLen = sprintf(sc->tempStr, " }\n");
7872 res = VkAppendLine(sc);
7873 if (res != VKFFT_SUCCESS) return res;
7874#endif
7876 if (res != VKFFT_SUCCESS) return res;
7877 if (sc->zeropad[0]) {
7878 sc->tempLen = sprintf(sc->tempStr, " }else{\n");
7879 res = VkAppendLine(sc);
7880 if (res != VKFFT_SUCCESS) return res;
7881
7882 sc->tempLen = sprintf(sc->tempStr, " sdataID = ((combinedID / %" PRIu64 ")/2) * sharedStride + (combinedID %% %" PRIu64 ");\n", sc->localSize[0], sc->localSize[0]);
7883 res = VkAppendLine(sc);
7884 if (res != VKFFT_SUCCESS) return res;
7885 sc->tempLen = sprintf(sc->tempStr, " if ((combinedID / %" PRIu64 ")%%2 == 0) {\n", sc->localSize[0]);
7886 res = VkAppendLine(sc);
7887 if (res != VKFFT_SUCCESS) return res;
7888 sc->tempLen = sprintf(sc->tempStr, " sdata[sdataID].x = 0;\n");
7889 res = VkAppendLine(sc);
7890 if (res != VKFFT_SUCCESS) return res;
7891 sc->tempLen = sprintf(sc->tempStr, " } else {\n");
7892 res = VkAppendLine(sc);
7893 if (res != VKFFT_SUCCESS) return res;
7894 sc->tempLen = sprintf(sc->tempStr, " sdata[sdataID].y = 0;\n");
7895 res = VkAppendLine(sc);
7896 if (res != VKFFT_SUCCESS) return res;
7897 sc->tempLen = sprintf(sc->tempStr, " }\n");
7898 res = VkAppendLine(sc);
7899 if (res != VKFFT_SUCCESS) return res;
7900 sc->tempLen = sprintf(sc->tempStr, " }\n");
7901 res = VkAppendLine(sc);
7902 if (res != VKFFT_SUCCESS) return res;
7903 }
7904 if ((uint64_t)ceil(sc->size[0]) % sc->localSize[0] != 0) {
7905 sc->tempLen = sprintf(sc->tempStr, " }");
7906 res = VkAppendLine(sc);
7907 if (res != VKFFT_SUCCESS) return res;
7908 }
7909 if (sc->zeropadBluestein[0]) {
7910 sc->tempLen = sprintf(sc->tempStr, " }\n");
7911 res = VkAppendLine(sc);
7912 if (res != VKFFT_SUCCESS) return res;
7913 }
7914 }
7915 }
7916#if(VKFFT_BACKEND==3)//OpenCL is not handling barrier with thread-conditional writes to local memory - so this is a work-around
7917 for (uint64_t k = 0; k < sc->registerBoost; k++) {
7918 for (uint64_t i = 0; i < 2 * sc->min_registers_per_thread; i++) {
7919
7920 if (sc->localSize[1] == 1)
7921 sc->tempLen = sprintf(sc->tempStr, " combinedID = %s + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, (i + k * 2 * sc->min_registers_per_thread) * sc->localSize[0]);
7922 else
7923 sc->tempLen = sprintf(sc->tempStr, " combinedID = (%s + %" PRIu64 " * %s) + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, sc->localSize[0], sc->gl_LocalInvocationID_y, (i + k * 2 * sc->min_registers_per_thread) * sc->localSize[0] * sc->localSize[1]);
7924 res = VkAppendLine(sc);
7925 if (res != VKFFT_SUCCESS) return res;
7926 if (sc->zeropadBluestein[0]) {
7927 sc->tempLen = sprintf(sc->tempStr, " if(combinedID < %" PRIu64 "){\n", 2 * sc->fftDim * sc->localSize[0]);
7928 res = VkAppendLine(sc);
7929 if (res != VKFFT_SUCCESS) return res;
7930 }
7931 if ((uint64_t)ceil(sc->size[0]) % sc->localSize[0] != 0) {
7932 sc->tempLen = sprintf(sc->tempStr, " if ((%s%s) < %" PRIu64 ") {\n", sc->gl_GlobalInvocationID_x, shiftX2, (uint64_t)ceil(sc->size[0]));
7933 res = VkAppendLine(sc);
7934 if (res != VKFFT_SUCCESS) return res;
7935 }
7936 sc->tempLen = sprintf(sc->tempStr, " %s = ", sc->inoutID);
7937 res = VkAppendLine(sc);
7938 if (res != VKFFT_SUCCESS) return res;
7939 sprintf(index_x, "(%s%s) %% (%" PRIu64 ")", sc->gl_GlobalInvocationID_x, shiftX2, sc->fft_dim_x);
7940 sprintf(index_y, "(%s + %" PRIu64 ")", sc->gl_LocalInvocationID_y, (i + k * 2 * sc->min_registers_per_thread) * sc->localSize[1]);
7941 res = indexInputVkFFT(sc, uintType, readType, index_x, index_y, requestCoordinate, requestBatch);
7942 if (res != VKFFT_SUCCESS) return res;
7943 sc->tempLen = sprintf(sc->tempStr, ";\n");
7944 res = VkAppendLine(sc);
7945 if (res != VKFFT_SUCCESS) return res;
7946 if (sc->zeropad[0]) {
7947 sc->tempLen = sprintf(sc->tempStr, " if((%s %% %" PRIu64 " < %" PRIu64 ")||(%s %% %" PRIu64 " >= %" PRIu64 ")){\n", index_y, sc->fft_dim_full, sc->fft_zeropad_left_read[sc->axis_id], index_y, sc->fft_dim_full, sc->fft_zeropad_right_read[sc->axis_id]);
7948 res = VkAppendLine(sc);
7949 if (res != VKFFT_SUCCESS) return res;
7950 }
7951
7953 if (res != VKFFT_SUCCESS) return res;
7954 sc->tempLen = sprintf(sc->tempStr, " sdataID = ((combinedID / %" PRIu64 ")/2) * sharedStride + (combinedID %% %" PRIu64 ");\n", sc->localSize[0], sc->localSize[0]);
7955
7956 res = VkAppendLine(sc);
7957 if (res != VKFFT_SUCCESS) return res;
7958 if (i < sc->min_registers_per_thread) {
7959 sc->tempLen = sprintf(sc->tempStr, " if ((combinedID / %" PRIu64 ")%%2 == 0) {\n", sc->localSize[0]);
7960 res = VkAppendLine(sc);
7961 if (res != VKFFT_SUCCESS) return res;
7962 sc->tempLen = sprintf(sc->tempStr, " sdata[sdataID].x = %s.x;\n", sc->regIDs[i + k * sc->registers_per_thread]);
7963 res = VkAppendLine(sc);
7964 if (res != VKFFT_SUCCESS) return res;
7965 sc->tempLen = sprintf(sc->tempStr, " }\n");
7966 res = VkAppendLine(sc);
7967 if (res != VKFFT_SUCCESS) return res;
7968 }
7969 else {
7970 sc->tempLen = sprintf(sc->tempStr, " if ((combinedID / %" PRIu64 ")%%2 == 0) {\n", sc->localSize[0]);
7971 res = VkAppendLine(sc);
7972 if (res != VKFFT_SUCCESS) return res;
7973 sc->tempLen = sprintf(sc->tempStr, " sdata[sdataID].x = %s.y;\n", sc->regIDs[i - sc->min_registers_per_thread + k * sc->registers_per_thread]);
7974 res = VkAppendLine(sc);
7975 if (res != VKFFT_SUCCESS) return res;
7976 sc->tempLen = sprintf(sc->tempStr, " }\n");
7977 res = VkAppendLine(sc);
7978 if (res != VKFFT_SUCCESS) return res;
7979 }
7981 if (res != VKFFT_SUCCESS) return res;
7982 if (sc->zeropad[0]) {
7983 sc->tempLen = sprintf(sc->tempStr, " }else{\n");
7984 res = VkAppendLine(sc);
7985 if (res != VKFFT_SUCCESS) return res;
7986
7987 sc->tempLen = sprintf(sc->tempStr, " sdataID = ((combinedID / %" PRIu64 ")/2) * sharedStride + (combinedID %% %" PRIu64 ");\n", sc->localSize[0], sc->localSize[0]);
7988 res = VkAppendLine(sc);
7989 if (res != VKFFT_SUCCESS) return res;
7990 sc->tempLen = sprintf(sc->tempStr, " if ((combinedID / %" PRIu64 ")%%2 == 0) {\n", sc->localSize[0]);
7991 res = VkAppendLine(sc);
7992 if (res != VKFFT_SUCCESS) return res;
7993 sc->tempLen = sprintf(sc->tempStr, " sdata[sdataID].x = 0;\n");
7994 res = VkAppendLine(sc);
7995 if (res != VKFFT_SUCCESS) return res;
7996 sc->tempLen = sprintf(sc->tempStr, " } else {\n");
7997 res = VkAppendLine(sc);
7998 if (res != VKFFT_SUCCESS) return res;
7999 sc->tempLen = sprintf(sc->tempStr, " sdata[sdataID].y = 0;\n");
8000 res = VkAppendLine(sc);
8001 if (res != VKFFT_SUCCESS) return res;
8002 sc->tempLen = sprintf(sc->tempStr, " }\n");
8003 res = VkAppendLine(sc);
8004 if (res != VKFFT_SUCCESS) return res;
8005 sc->tempLen = sprintf(sc->tempStr, " }\n");
8006 res = VkAppendLine(sc);
8007 if (res != VKFFT_SUCCESS) return res;
8008 }
8009 if ((uint64_t)ceil(sc->size[0]) % sc->localSize[0] != 0) {
8010 sc->tempLen = sprintf(sc->tempStr, " }");
8011 res = VkAppendLine(sc);
8012 if (res != VKFFT_SUCCESS) return res;
8013 }
8014 if (sc->zeropadBluestein[0]) {
8015 sc->tempLen = sprintf(sc->tempStr, " }\n");
8016 res = VkAppendLine(sc);
8017 if (res != VKFFT_SUCCESS) return res;
8018 }
8019 }
8020 }
8021 res = appendBarrierVkFFT(sc, 1);
8022 if (res != VKFFT_SUCCESS) return res;
8023 for (uint64_t k = 0; k < sc->registerBoost; k++) {
8024 for (uint64_t i = 0; i < 2 * sc->min_registers_per_thread; i++) {
8025
8026 if (sc->localSize[1] == 1)
8027 sc->tempLen = sprintf(sc->tempStr, " combinedID = %s + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, (i + k * 2 * sc->min_registers_per_thread) * sc->localSize[0]);
8028 else
8029 sc->tempLen = sprintf(sc->tempStr, " combinedID = (%s + %" PRIu64 " * %s) + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, sc->localSize[0], sc->gl_LocalInvocationID_y, (i + k * 2 * sc->min_registers_per_thread) * sc->localSize[0] * sc->localSize[1]);
8030 res = VkAppendLine(sc);
8031 if (res != VKFFT_SUCCESS) return res;
8032 if (sc->zeropadBluestein[0]) {
8033 sc->tempLen = sprintf(sc->tempStr, " if(combinedID < %" PRIu64 "){\n", 2 * sc->fftDim * sc->localSize[0]);
8034 res = VkAppendLine(sc);
8035 if (res != VKFFT_SUCCESS) return res;
8036 }
8037 if ((uint64_t)ceil(sc->size[0]) % sc->localSize[0] != 0) {
8038 sc->tempLen = sprintf(sc->tempStr, " if ((%s%s) < %" PRIu64 ") {\n", sc->gl_GlobalInvocationID_x, shiftX2, (uint64_t)ceil(sc->size[0]));
8039 res = VkAppendLine(sc);
8040 if (res != VKFFT_SUCCESS) return res;
8041 }
8042 sc->tempLen = sprintf(sc->tempStr, " %s = ", sc->inoutID);
8043 res = VkAppendLine(sc);
8044 if (res != VKFFT_SUCCESS) return res;
8045 sprintf(index_x, "(%s%s) %% (%" PRIu64 ")", sc->gl_GlobalInvocationID_x, shiftX2, sc->fft_dim_x);
8046 sprintf(index_y, "(%s + %" PRIu64 ")", sc->gl_LocalInvocationID_y, (i + k * 2 * sc->min_registers_per_thread) * sc->localSize[1]);
8047 res = indexInputVkFFT(sc, uintType, readType, index_x, index_y, requestCoordinate, requestBatch);
8048 if (res != VKFFT_SUCCESS) return res;
8049 sc->tempLen = sprintf(sc->tempStr, ";\n");
8050 res = VkAppendLine(sc);
8051 if (res != VKFFT_SUCCESS) return res;
8052 if (sc->zeropad[0]) {
8053 sc->tempLen = sprintf(sc->tempStr, " if((%s %% %" PRIu64 " < %" PRIu64 ")||(%s %% %" PRIu64 " >= %" PRIu64 ")){\n", index_y, sc->fft_dim_full, sc->fft_zeropad_left_read[sc->axis_id], index_y, sc->fft_dim_full, sc->fft_zeropad_right_read[sc->axis_id]);
8054 res = VkAppendLine(sc);
8055 if (res != VKFFT_SUCCESS) return res;
8056 }
8057
8059 if (res != VKFFT_SUCCESS) return res;
8060 sc->tempLen = sprintf(sc->tempStr, " sdataID = ((combinedID / %" PRIu64 ")/2) * sharedStride + (combinedID %% %" PRIu64 ");\n", sc->localSize[0], sc->localSize[0]);
8061
8062 res = VkAppendLine(sc);
8063 if (res != VKFFT_SUCCESS) return res;
8064 if (i < sc->min_registers_per_thread) {
8065 sc->tempLen = sprintf(sc->tempStr, " if ((combinedID / %" PRIu64 ")%%2 == 1) {\n", sc->localSize[0]);
8066 res = VkAppendLine(sc);
8067 if (res != VKFFT_SUCCESS) return res;
8068 sc->tempLen = sprintf(sc->tempStr, " sdata[sdataID].y = %s.x;\n", sc->regIDs[i + k * sc->registers_per_thread]);
8069 res = VkAppendLine(sc);
8070 if (res != VKFFT_SUCCESS) return res;
8071 sc->tempLen = sprintf(sc->tempStr, " }\n");
8072 res = VkAppendLine(sc);
8073 if (res != VKFFT_SUCCESS) return res;
8074 }
8075 else {
8076 sc->tempLen = sprintf(sc->tempStr, " if ((combinedID / %" PRIu64 ")%%2 == 1) {\n", sc->localSize[0]);
8077 res = VkAppendLine(sc);
8078 if (res != VKFFT_SUCCESS) return res;
8079 sc->tempLen = sprintf(sc->tempStr, " sdata[sdataID].y = %s.y;\n", sc->regIDs[i - sc->min_registers_per_thread + k * sc->registers_per_thread]);
8080 res = VkAppendLine(sc);
8081 if (res != VKFFT_SUCCESS) return res;
8082 sc->tempLen = sprintf(sc->tempStr, " }\n");
8083 res = VkAppendLine(sc);
8084 if (res != VKFFT_SUCCESS) return res;
8085 }
8087 if (res != VKFFT_SUCCESS) return res;
8088 if (sc->zeropad[0]) {
8089 sc->tempLen = sprintf(sc->tempStr, " }else{\n");
8090 res = VkAppendLine(sc);
8091 if (res != VKFFT_SUCCESS) return res;
8092
8093 sc->tempLen = sprintf(sc->tempStr, " sdataID = ((combinedID / %" PRIu64 ")/2) * sharedStride + (combinedID %% %" PRIu64 ");\n", sc->localSize[0], sc->localSize[0]);
8094 res = VkAppendLine(sc);
8095 if (res != VKFFT_SUCCESS) return res;
8096 sc->tempLen = sprintf(sc->tempStr, " if ((combinedID / %" PRIu64 ")%%2 == 0) {\n", sc->localSize[0]);
8097 res = VkAppendLine(sc);
8098 if (res != VKFFT_SUCCESS) return res;
8099 sc->tempLen = sprintf(sc->tempStr, " sdata[sdataID].x = 0;\n");
8100 res = VkAppendLine(sc);
8101 if (res != VKFFT_SUCCESS) return res;
8102 sc->tempLen = sprintf(sc->tempStr, " } else {\n");
8103 res = VkAppendLine(sc);
8104 if (res != VKFFT_SUCCESS) return res;
8105 sc->tempLen = sprintf(sc->tempStr, " sdata[sdataID].y = 0;\n");
8106 res = VkAppendLine(sc);
8107 if (res != VKFFT_SUCCESS) return res;
8108 sc->tempLen = sprintf(sc->tempStr, " }\n");
8109 res = VkAppendLine(sc);
8110 if (res != VKFFT_SUCCESS) return res;
8111 sc->tempLen = sprintf(sc->tempStr, " }\n");
8112 res = VkAppendLine(sc);
8113 if (res != VKFFT_SUCCESS) return res;
8114 }
8115 if ((uint64_t)ceil(sc->size[0]) % sc->localSize[0] != 0) {
8116 sc->tempLen = sprintf(sc->tempStr, " }");
8117 res = VkAppendLine(sc);
8118 if (res != VKFFT_SUCCESS) return res;
8119 }
8120 if (sc->zeropadBluestein[0]) {
8121 sc->tempLen = sprintf(sc->tempStr, " }\n");
8122 res = VkAppendLine(sc);
8123 if (res != VKFFT_SUCCESS) return res;
8124 }
8125 }
8126 }
8127#endif
8128 res = appendBarrierVkFFT(sc, 1);
8129 if (res != VKFFT_SUCCESS) return res;
8130 res = appendZeropadStart(sc);
8131 if (res != VKFFT_SUCCESS) return res;
8132 for (uint64_t k = 0; k < sc->registerBoost; k++) {
8133 for (uint64_t i = 0; i < sc->min_registers_per_thread; i++) {
8134
8135 if (sc->localSize[1] == 1)
8136 sc->tempLen = sprintf(sc->tempStr, " combinedID = %s + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, (i + k * sc->min_registers_per_thread) * sc->localSize[0]);
8137 else
8138 sc->tempLen = sprintf(sc->tempStr, " combinedID = (%s + %" PRIu64 " * %s) + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, sc->localSize[0], sc->gl_LocalInvocationID_y, (i + k * sc->min_registers_per_thread) * sc->localSize[0] * sc->localSize[1]);
8139 res = VkAppendLine(sc);
8140 if (res != VKFFT_SUCCESS) return res;
8141 if (sc->zeropadBluestein[0]) {
8142 sc->tempLen = sprintf(sc->tempStr, " if(combinedID < %" PRIu64 "){\n", sc->fftDim * sc->localSize[0]);
8143 res = VkAppendLine(sc);
8144 if (res != VKFFT_SUCCESS) return res;
8145 }
8146 sc->tempLen = sprintf(sc->tempStr, " sdataID = (combinedID / %" PRIu64 ") * sharedStride + (combinedID %% %" PRIu64 ");\n", sc->localSize[0], sc->localSize[0]);
8147
8148 res = VkAppendLine(sc);
8149 if (res != VKFFT_SUCCESS) return res;
8150 sc->tempLen = sprintf(sc->tempStr, " if((combinedID / %" PRIu64 ")>0){\n", sc->localSize[0]);
8151 res = VkAppendLine(sc);
8152 if (res != VKFFT_SUCCESS) return res;
8153 sc->tempLen = sprintf(sc->tempStr, " %s.x = sdata[sdataID-sharedStride].y;\n", sc->w);
8154
8155 res = VkAppendLine(sc);
8156 if (res != VKFFT_SUCCESS) return res;
8157 sc->tempLen = sprintf(sc->tempStr, " %s.y = sdata[sdataID].x;\n", sc->w);
8158 res = VkAppendLine(sc);
8159 if (res != VKFFT_SUCCESS) return res;
8160 sc->tempLen = sprintf(sc->tempStr, " %s.x = %s.x+%s.y;\n", sc->regIDs[i + k * sc->registers_per_thread], sc->w, sc->w);
8161 res = VkAppendLine(sc);
8162 if (res != VKFFT_SUCCESS) return res;
8163 sc->tempLen = sprintf(sc->tempStr, " %s.y = %s.x-%s.y;\n", sc->regIDs[i + k * sc->registers_per_thread], sc->w, sc->w);
8164 res = VkAppendLine(sc);
8165 if (res != VKFFT_SUCCESS) return res;
8166 sc->tempLen = sprintf(sc->tempStr, " }else{\n");
8167 res = VkAppendLine(sc);
8168 if (res != VKFFT_SUCCESS) return res;
8169 sc->tempLen = sprintf(sc->tempStr, " %s.x = 2*sdata[sdataID].x;\n", sc->regIDs[i + k * sc->registers_per_thread]);
8170 res = VkAppendLine(sc);
8171 if (res != VKFFT_SUCCESS) return res;
8172 sc->tempLen = sprintf(sc->tempStr, " sdataID = (%" PRIu64 ") * sharedStride + (combinedID %% %" PRIu64 ");\n", sc->fftDim - 1, sc->localSize[0]);
8173
8174 res = VkAppendLine(sc);
8175 if (res != VKFFT_SUCCESS) return res;
8176 sc->tempLen = sprintf(sc->tempStr, " %s.y = 2*sdata[sdataID].y;\n", sc->regIDs[i + k * sc->registers_per_thread]);
8177 res = VkAppendLine(sc);
8178 if (res != VKFFT_SUCCESS) return res;
8179 sc->tempLen = sprintf(sc->tempStr, " }\n");
8180 res = VkAppendLine(sc);
8181 if (res != VKFFT_SUCCESS) return res;
8182 if (sc->zeropadBluestein[0]) {
8183 sc->tempLen = sprintf(sc->tempStr, " }\n");
8184 res = VkAppendLine(sc);
8185 if (res != VKFFT_SUCCESS) return res;
8186 }
8187 //sc->tempLen = sprintf(sc->tempStr, " printf(\" %%f %%f\\n\", %s.x, %s.y);\n", sc->regIDs[i + k * sc->registers_per_thread], sc->regIDs[i + k * sc->registers_per_thread]);
8188 //res = VkAppendLine(sc);
8189 //if (res != VKFFT_SUCCESS) return res;
8190 }
8191 }
8192 res = appendZeropadEnd(sc);
8193 if (res != VKFFT_SUCCESS) return res;
8194 res = appendBarrierVkFFT(sc, 1);
8195 if (res != VKFFT_SUCCESS) return res;
8196 res = appendZeropadStart(sc);
8197 if (res != VKFFT_SUCCESS) return res;
8198 for (uint64_t k = 0; k < sc->registerBoost; k++) {
8199 for (uint64_t i = 0; i < sc->min_registers_per_thread; i++) {
8200
8201 if (sc->localSize[1] == 1)
8202 sc->tempLen = sprintf(sc->tempStr, " combinedID = %s + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, (i + k * sc->min_registers_per_thread) * sc->localSize[0]);
8203 else
8204 sc->tempLen = sprintf(sc->tempStr, " combinedID = (%s + %" PRIu64 " * %s) + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, sc->localSize[0], sc->gl_LocalInvocationID_y, (i + k * sc->min_registers_per_thread) * sc->localSize[0] * sc->localSize[1]);
8205 res = VkAppendLine(sc);
8206 if (res != VKFFT_SUCCESS) return res;
8207 if (sc->zeropadBluestein[0]) {
8208 sc->tempLen = sprintf(sc->tempStr, " if(combinedID < %" PRIu64 "){\n", sc->fftDim * sc->localSize[0]);
8209 res = VkAppendLine(sc);
8210 if (res != VKFFT_SUCCESS) return res;
8211 }
8212 sc->tempLen = sprintf(sc->tempStr, " sdataID = (combinedID / %" PRIu64 ") * sharedStride + (combinedID %% %" PRIu64 ");\n", sc->localSize[0], sc->localSize[0]);
8213
8214 res = VkAppendLine(sc);
8215 if (res != VKFFT_SUCCESS) return res;
8216 sc->tempLen = sprintf(sc->tempStr, " if((combinedID / %" PRIu64 ")>0){\n", sc->localSize[0]);
8217 res = VkAppendLine(sc);
8218 if (res != VKFFT_SUCCESS) return res;
8219 sc->tempLen = sprintf(sc->tempStr, " sdata[sdataID].x = %s.x;\n", sc->regIDs[i + k * sc->registers_per_thread]);
8220 res = VkAppendLine(sc);
8221 if (res != VKFFT_SUCCESS) return res;
8222#if(VKFFT_BACKEND!=3)
8223 sc->tempLen = sprintf(sc->tempStr, " sdataID = (%" PRIu64 " - combinedID / %" PRIu64 ") * sharedStride + (combinedID %% %" PRIu64 ");\n", sc->fftDim, sc->localSize[0], sc->localSize[0]);
8224
8225 res = VkAppendLine(sc);
8226 if (res != VKFFT_SUCCESS) return res;
8227 sc->tempLen = sprintf(sc->tempStr, " sdata[sdataID].y = %s.y;\n", sc->regIDs[i + k * sc->registers_per_thread]);
8228 res = VkAppendLine(sc);
8229 if (res != VKFFT_SUCCESS) return res;
8230#endif
8231 sc->tempLen = sprintf(sc->tempStr, " }else{\n");
8232 res = VkAppendLine(sc);
8233 if (res != VKFFT_SUCCESS) return res;
8234 sc->tempLen = sprintf(sc->tempStr, " sdata[sdataID] = %s;\n", sc->regIDs[i + k * sc->registers_per_thread]);
8235 res = VkAppendLine(sc);
8236 if (res != VKFFT_SUCCESS) return res;
8237 sc->tempLen = sprintf(sc->tempStr, " }\n");
8238 res = VkAppendLine(sc);
8239 if (res != VKFFT_SUCCESS) return res;
8240 if (sc->zeropadBluestein[0]) {
8241 sc->tempLen = sprintf(sc->tempStr, " }\n");
8242 res = VkAppendLine(sc);
8243 if (res != VKFFT_SUCCESS) return res;
8244 }
8245 }
8246 }
8247 res = appendZeropadEnd(sc);
8248 if (res != VKFFT_SUCCESS) return res;
8249 res = appendBarrierVkFFT(sc, 1);
8250 if (res != VKFFT_SUCCESS) return res;
8251#if(VKFFT_BACKEND==3)
8252 res = appendZeropadStart(sc);
8253 if (res != VKFFT_SUCCESS) return res;
8254 for (uint64_t k = 0; k < sc->registerBoost; k++) {
8255 for (uint64_t i = 0; i < sc->min_registers_per_thread; i++) {
8256
8257 if (sc->localSize[1] == 1)
8258 sc->tempLen = sprintf(sc->tempStr, " combinedID = %s + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, (i + k * sc->min_registers_per_thread) * sc->localSize[0]);
8259 else
8260 sc->tempLen = sprintf(sc->tempStr, " combinedID = (%s + %" PRIu64 " * %s) + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, sc->localSize[0], sc->gl_LocalInvocationID_y, (i + k * sc->min_registers_per_thread) * sc->localSize[0] * sc->localSize[1]);
8261 res = VkAppendLine(sc);
8262 if (res != VKFFT_SUCCESS) return res;
8263 if (sc->zeropadBluestein[0]) {
8264 sc->tempLen = sprintf(sc->tempStr, " if(combinedID < %" PRIu64 "){\n", sc->fftDim * sc->localSize[0]);
8265 res = VkAppendLine(sc);
8266 if (res != VKFFT_SUCCESS) return res;
8267 }
8268 sc->tempLen = sprintf(sc->tempStr, " if((combinedID / %" PRIu64 ")>0){\n", sc->localSize[0]);
8269 res = VkAppendLine(sc);
8270 if (res != VKFFT_SUCCESS) return res;
8271 sc->tempLen = sprintf(sc->tempStr, " sdataID = (%" PRIu64 " - combinedID / %" PRIu64 ") * sharedStride + (combinedID %% %" PRIu64 ");\n", sc->fftDim, sc->localSize[0], sc->localSize[0]);
8272
8273 res = VkAppendLine(sc);
8274 if (res != VKFFT_SUCCESS) return res;
8275 sc->tempLen = sprintf(sc->tempStr, " sdata[sdataID].y = %s.y;\n", sc->regIDs[i + k * sc->registers_per_thread]);
8276 res = VkAppendLine(sc);
8277 if (res != VKFFT_SUCCESS) return res;
8278 sc->tempLen = sprintf(sc->tempStr, " }\n");
8279 res = VkAppendLine(sc);
8280 if (res != VKFFT_SUCCESS) return res;
8281 if (sc->zeropadBluestein[0]) {
8282 sc->tempLen = sprintf(sc->tempStr, " }\n");
8283 res = VkAppendLine(sc);
8284 if (res != VKFFT_SUCCESS) return res;
8285 }
8286 }
8287 }
8288 res = appendZeropadEnd(sc);
8289 if (res != VKFFT_SUCCESS) return res;
8290 res = appendBarrierVkFFT(sc, 1);
8291 if (res != VKFFT_SUCCESS) return res;
8292#endif
8293 res = appendZeropadStart(sc);
8294 if (res != VKFFT_SUCCESS) return res;
8295 uint64_t num_in = (uint64_t)ceil((sc->fftDim / 2 + 1) / (double)sc->localSize[1]);
8296
8297 for (uint64_t k = 0; k < sc->registerBoost; k++) {
8298 for (uint64_t i = 0; i < num_in; i++) {
8299
8300 if (sc->localSize[1] == 1)
8301 sc->tempLen = sprintf(sc->tempStr, " combinedID = %s + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, (i + k * num_in) * sc->localSize[0]);
8302 else
8303 sc->tempLen = sprintf(sc->tempStr, " combinedID = (%s + %" PRIu64 " * %s) + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, sc->localSize[0], sc->gl_LocalInvocationID_y, (i + k * num_in) * sc->localSize[0] * sc->localSize[1]);
8304 res = VkAppendLine(sc);
8305 if (res != VKFFT_SUCCESS) return res;
8306
8307 if ((1 + i + k * num_in) * sc->localSize[0] * sc->localSize[1] >= (sc->fftDim / 2 + 1) * sc->localSize[0]) {
8308 sc->tempLen = sprintf(sc->tempStr, " if(combinedID < %" PRIu64 "){\n", (sc->fftDim / 2 + 1) * sc->localSize[0]);
8309 res = VkAppendLine(sc);
8310 if (res != VKFFT_SUCCESS) return res;
8311 }
8312 if (sc->LUT) {
8313 sc->tempLen = sprintf(sc->tempStr, " mult = twiddleLUT[%" PRIu64 " + combinedID / %" PRIu64 "];\n", sc->startDCT3LUT, sc->localSize[0]);
8314 res = VkAppendLine(sc);
8315 if (res != VKFFT_SUCCESS) return res;
8316 }
8317 else {
8318 sc->tempLen = sprintf(sc->tempStr, " mult.x = %s(%.17f%s * (combinedID / %" PRIu64 ") );\n", cosDef, double_PI / 2 / sc->fftDim, LFending, sc->localSize[0]);
8319 res = VkAppendLine(sc);
8320 if (res != VKFFT_SUCCESS) return res;
8321 sc->tempLen = sprintf(sc->tempStr, " mult.y = %s(%.17f%s * (combinedID / %" PRIu64 ") );\n", sinDef, double_PI / 2 / sc->fftDim, LFending, sc->localSize[0]);
8322 res = VkAppendLine(sc);
8323 if (res != VKFFT_SUCCESS) return res;
8324 }
8325
8326 sc->tempLen = sprintf(sc->tempStr, " sdataID = (combinedID / %" PRIu64 ") * sharedStride + (combinedID %% %" PRIu64 ");\n", sc->localSize[0], sc->localSize[0]);
8327 res = VkAppendLine(sc);
8328 if (res != VKFFT_SUCCESS) return res;
8329
8330 sc->tempLen = sprintf(sc->tempStr, " %s = sdata[sdataID];\n", sc->regIDs[0]);
8331 res = VkAppendLine(sc);
8332 if (res != VKFFT_SUCCESS) return res;
8333
8334 sc->tempLen = sprintf(sc->tempStr, " if (combinedID / %" PRIu64 " > 0){\n", sc->localSize[0]);
8335 res = VkAppendLine(sc);
8336 if (res != VKFFT_SUCCESS) return res;
8337
8338 sc->tempLen = sprintf(sc->tempStr, " inoutID = (%" PRIu64 " - combinedID / %" PRIu64 ") * sharedStride + (combinedID %% %" PRIu64 ");\n", sc->fftDim, sc->localSize[0], sc->localSize[0]);
8339 res = VkAppendLine(sc);
8340 if (res != VKFFT_SUCCESS) return res;
8341 sc->tempLen = sprintf(sc->tempStr, " %s = sdata[inoutID];\n", sc->regIDs[1]);
8342 res = VkAppendLine(sc);
8343 if (res != VKFFT_SUCCESS) return res;
8344
8345 sc->tempLen = sprintf(sc->tempStr, " sdata[sdataID].x = ((%s.x+%s.y)*mult.x+(%s.x-%s.y)*mult.y);\n", sc->regIDs[0], sc->regIDs[1], sc->regIDs[1], sc->regIDs[0]);
8346 res = VkAppendLine(sc);
8347 if (res != VKFFT_SUCCESS) return res;
8348
8349 sc->tempLen = sprintf(sc->tempStr, " sdata[sdataID].y = ((-%s.x+%s.y)*mult.x+(%s.x+%s.y)*mult.y);\n", sc->regIDs[1], sc->regIDs[0], sc->regIDs[0], sc->regIDs[1]);
8350 res = VkAppendLine(sc);
8351 if (res != VKFFT_SUCCESS) return res;
8352 sc->tempLen = sprintf(sc->tempStr, " sdata[inoutID].x = ((%s.x-%s.y)*mult.x+(%s.x+%s.y)*mult.y);\n", sc->regIDs[0], sc->regIDs[1], sc->regIDs[1], sc->regIDs[0]);
8353 res = VkAppendLine(sc);
8354 if (res != VKFFT_SUCCESS) return res;
8355 sc->tempLen = sprintf(sc->tempStr, " sdata[inoutID].y = ((%s.x+%s.y)*mult.x-(%s.x-%s.y)*mult.y);\n", sc->regIDs[1], sc->regIDs[0], sc->regIDs[0], sc->regIDs[1]);
8356 res = VkAppendLine(sc);
8357 if (res != VKFFT_SUCCESS) return res;
8358
8359 sc->tempLen = sprintf(sc->tempStr, " } else {\n");
8360 res = VkAppendLine(sc);
8361
8362 if (res != VKFFT_SUCCESS) return res;
8363
8364 sc->tempLen = sprintf(sc->tempStr, " sdata[sdataID].x = (%s.x*mult.x-%s.y*mult.y);\n", sc->regIDs[0], sc->regIDs[0]);
8365 res = VkAppendLine(sc);
8366 if (res != VKFFT_SUCCESS) return res;
8367 sc->tempLen = sprintf(sc->tempStr, " sdata[sdataID].y = (%s.y*mult.x+%s.x*mult.y);\n", sc->regIDs[0], sc->regIDs[0]);
8368 res = VkAppendLine(sc);
8369 if (res != VKFFT_SUCCESS) return res;
8370 sc->tempLen = sprintf(sc->tempStr, " }\n");
8371 res = VkAppendLine(sc);
8372 if (res != VKFFT_SUCCESS) return res;
8373 if ((1 + i + k * num_in) * sc->localSize[0] * sc->localSize[1] >= (sc->fftDim / 2 + 1) * sc->localSize[0]) {
8374 sc->tempLen = sprintf(sc->tempStr, " }\n");
8375 res = VkAppendLine(sc);
8376 if (res != VKFFT_SUCCESS) return res;
8377 }
8378 }
8379 }
8380 res = appendZeropadEnd(sc);
8381 if (res != VKFFT_SUCCESS) return res;
8382 if (sc->zeropadBluestein[0]) sc->fftDim = sc->fft_dim_full;
8383 }
8384 else {
8385 //Not implemented
8386 }
8387 break;
8388 }
8389 case 144://odd DCT-IV nonstrided as N FFT
8390 {
8391 char shiftX[500] = "";
8392 if (sc->performWorkGroupShift[0])
8393 sprintf(shiftX, " + consts.workGroupShiftX ");
8394 char shiftY[500] = "";
8395 if (sc->performWorkGroupShift[1])
8396 sprintf(shiftY, " + consts.workGroupShiftY ");
8397 uint64_t mult = (sc->mergeSequencesR2C) ? 2 : 1;
8398 if (sc->fftDim == sc->fft_dim_full) {
8399 if (sc->zeropadBluestein[0]) {
8400 res = appendSetSMToZero(sc, floatType, floatTypeMemory, uintType, readType);
8401 if (res != VKFFT_SUCCESS) return res;
8402 res = appendBarrierVkFFT(sc, 1);
8403 if (res != VKFFT_SUCCESS) return res;
8405 }
8406 for (uint64_t k = 0; k < sc->registerBoost; k++) {
8407 for (uint64_t i = 0; i < sc->min_registers_per_thread; i++) {
8408
8409 if (sc->localSize[1] == 1)
8410 sc->tempLen = sprintf(sc->tempStr, " combinedID = %s + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, (i + k * sc->min_registers_per_thread) * sc->localSize[0]);
8411 else
8412 sc->tempLen = sprintf(sc->tempStr, " combinedID = (%s + %" PRIu64 " * %s) + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, sc->localSize[0], sc->gl_LocalInvocationID_y, (i + k * sc->min_registers_per_thread) * sc->localSize[0] * sc->localSize[1]);
8413 res = VkAppendLine(sc);
8414 if (res != VKFFT_SUCCESS) return res;
8415 if (sc->inputStride[0] > 1)
8416 sc->tempLen = sprintf(sc->tempStr, " inoutID = (combinedID %% %" PRIu64 ") * %" PRIu64 " + (combinedID / %" PRIu64 ") * %" PRIu64 ";\n", sc->fftDim, sc->inputStride[0], sc->fftDim, mult * sc->inputStride[1]);
8417 else
8418 sc->tempLen = sprintf(sc->tempStr, " inoutID = (combinedID %% %" PRIu64 ") + (combinedID / %" PRIu64 ") * %" PRIu64 ";\n", sc->fftDim, sc->fftDim, mult * sc->inputStride[1]);
8419 res = VkAppendLine(sc);
8420 if (res != VKFFT_SUCCESS) return res;
8421 if (sc->axisSwapped) {
8422 if ((uint64_t)ceil(sc->size[1] / (double)mult) % sc->localSize[0] != 0) {
8423 sc->tempLen = sprintf(sc->tempStr, " if(combinedID / %" PRIu64 " + (%s%s)*%" PRIu64 "< %" PRIu64 "){", sc->fftDim, sc->gl_WorkGroupID_y, shiftY, sc->localSize[0], (uint64_t)ceil(sc->size[1] / (double)mult));
8424 res = VkAppendLine(sc);
8425 if (res != VKFFT_SUCCESS) return res;
8426 }
8427 if (sc->zeropadBluestein[0]) {
8428 sc->tempLen = sprintf(sc->tempStr, " if(combinedID < %" PRIu64 "){\n", sc->fftDim * sc->localSize[0]);
8429 res = VkAppendLine(sc);
8430 if (res != VKFFT_SUCCESS) return res;
8431 }
8432 }
8433 else {
8434 if ((uint64_t)ceil(sc->size[1] / (double)mult) % sc->localSize[1] != 0) {
8435 sc->tempLen = sprintf(sc->tempStr, " if(combinedID / %" PRIu64 " + (%s%s)*%" PRIu64 "< %" PRIu64 "){", sc->fftDim, sc->gl_WorkGroupID_y, shiftY, sc->localSize[1], (uint64_t)ceil(sc->size[1] / (double)mult));
8436 res = VkAppendLine(sc);
8437 if (res != VKFFT_SUCCESS) return res;
8438 }
8439 if (sc->zeropadBluestein[0]) {
8440 sc->tempLen = sprintf(sc->tempStr, " if(combinedID < %" PRIu64 "){\n", sc->fftDim * sc->localSize[1]);
8441 res = VkAppendLine(sc);
8442 if (res != VKFFT_SUCCESS) return res;
8443 }
8444 }
8445 if (sc->zeropad[0]) {
8446 sc->tempLen = sprintf(sc->tempStr, " if((inoutID %% %" PRIu64 " < %" PRIu64 ")||(inoutID %% %" PRIu64 " >= %" PRIu64 ")){\n", sc->inputStride[1], sc->fft_zeropad_left_read[sc->axis_id], sc->inputStride[1], sc->fft_zeropad_right_read[sc->axis_id]);
8447 res = VkAppendLine(sc);
8448 if (res != VKFFT_SUCCESS) return res;
8449 }
8450 sc->tempLen = sprintf(sc->tempStr, " %s = ", sc->inoutID);
8451 res = VkAppendLine(sc);
8452 if (res != VKFFT_SUCCESS) return res;
8453 indexInputVkFFT(sc, uintType, readType, sc->inoutID, 0, requestCoordinate, requestBatch);
8454 if (res != VKFFT_SUCCESS) return res;
8455 sc->tempLen = sprintf(sc->tempStr, ";\n");
8456 res = VkAppendLine(sc);
8457 if (res != VKFFT_SUCCESS) return res;
8459 if (res != VKFFT_SUCCESS) return res;
8460 if (sc->axisSwapped) {
8461 sc->tempLen = sprintf(sc->tempStr, " sdataID = (combinedID %% %" PRIu64 ") * sharedStride + (combinedID / %" PRIu64 ");\n", sc->fftDim, sc->fftDim);
8462 res = VkAppendLine(sc);
8463 if (res != VKFFT_SUCCESS) return res;
8464
8465 if (sc->inputBufferBlockNum == 1)
8466 sc->tempLen = sprintf(sc->tempStr, " sdata[sdataID].x = %s%s[%s]%s;\n", convTypeLeft, inputsStruct, sc->inoutID, convTypeRight);
8467 else
8468 sc->tempLen = sprintf(sc->tempStr, " sdata[sdataID].x = %sinputBlocks[%s / %" PRIu64 "]%s[%s %% %" PRIu64 "]%s;\n", convTypeLeft, sc->inoutID, sc->inputBufferBlockSize, inputsStruct, sc->inoutID, sc->inputBufferBlockSize, convTypeRight);
8469 res = VkAppendLine(sc);
8470 if (res != VKFFT_SUCCESS) return res;
8471
8472 if (sc->mergeSequencesR2C) {
8473 sc->tempLen = sprintf(sc->tempStr, " inoutID += %" PRIu64 ";\n", sc->inputStride[1]);
8474 res = VkAppendLine(sc);
8475 if (res != VKFFT_SUCCESS) return res;
8476
8477 if (sc->inputBufferBlockNum == 1)
8478 sc->tempLen = sprintf(sc->tempStr, " sdata[sdataID].y = %s%s[inoutID]%s;\n", convTypeLeft, inputsStruct, convTypeRight);
8479 else
8480 sc->tempLen = sprintf(sc->tempStr, " sdata[sdataID].y = %sinputBlocks[inoutID / %" PRIu64 "]%s[inoutID %% %" PRIu64 "]%s;\n", convTypeLeft, sc->inputBufferBlockSize, inputsStruct, sc->inputBufferBlockSize, convTypeRight);
8481 res = VkAppendLine(sc);
8482 if (res != VKFFT_SUCCESS) return res;
8483 }
8484 else {
8485 if (sc->inputBufferBlockNum == 1)
8486 sc->tempLen = sprintf(sc->tempStr, " sdata[sdataID].y = 0;\n");
8487 else
8488 sc->tempLen = sprintf(sc->tempStr, " sdata[sdataID].y = 0;\n");
8489 res = VkAppendLine(sc);
8490 if (res != VKFFT_SUCCESS) return res;
8491 }
8492 }
8493 else {
8494 sc->tempLen = sprintf(sc->tempStr, " sdataID = (combinedID %% %" PRIu64 ") + (combinedID / %" PRIu64 ") * sharedStride ;\n", sc->fftDim, sc->fftDim);
8495 res = VkAppendLine(sc);
8496 if (res != VKFFT_SUCCESS) return res;
8497 if (sc->inputBufferBlockNum == 1)
8498 sc->tempLen = sprintf(sc->tempStr, " sdata[sdataID].x = %s%s[inoutID]%s;\n", convTypeLeft, inputsStruct, convTypeRight);
8499 else
8500 sc->tempLen = sprintf(sc->tempStr, " sdata[sdataID].x = %sinputBlocks[inoutID / %" PRIu64 "]%s[inoutID %% %" PRIu64 "]%s;\n", convTypeLeft, sc->inputBufferBlockSize, inputsStruct, sc->inputBufferBlockSize, convTypeRight);
8501 res = VkAppendLine(sc);
8502 if (res != VKFFT_SUCCESS) return res;
8503 if (sc->mergeSequencesR2C) {
8504 sc->tempLen = sprintf(sc->tempStr, " inoutID += %" PRIu64 ";\n", sc->inputStride[1]);
8505 res = VkAppendLine(sc);
8506 if (res != VKFFT_SUCCESS) return res;
8507 if (sc->inputBufferBlockNum == 1)
8508 sc->tempLen = sprintf(sc->tempStr, " sdata[sdataID].y = %s%s[inoutID]%s;\n", convTypeLeft, inputsStruct, convTypeRight);
8509 else
8510 sc->tempLen = sprintf(sc->tempStr, " sdata[sdataID].y = %sinputBlocks[inoutID / %" PRIu64 "]%s[inoutID %% %" PRIu64 "]%s;\n", convTypeLeft, sc->inputBufferBlockSize, inputsStruct, sc->inputBufferBlockSize, convTypeRight);
8511 res = VkAppendLine(sc);
8512 if (res != VKFFT_SUCCESS) return res;
8513 }
8514 else {
8515 if (sc->inputBufferBlockNum == 1)
8516 sc->tempLen = sprintf(sc->tempStr, " sdata[sdataID].y = 0;\n");
8517 else
8518 sc->tempLen = sprintf(sc->tempStr, " sdata[sdataID].y = 0;\n");
8519 res = VkAppendLine(sc);
8520 if (res != VKFFT_SUCCESS) return res;
8521 }
8522 }
8524 if (res != VKFFT_SUCCESS) return res;
8525 if (sc->zeropadBluestein[0]) {
8526 sc->tempLen = sprintf(sc->tempStr, " }\n");
8527 res = VkAppendLine(sc);
8528 if (res != VKFFT_SUCCESS) return res;
8529 }
8530 if (sc->zeropad[0]) {
8531 sc->tempLen = sprintf(sc->tempStr, " }else{\n");
8532 res = VkAppendLine(sc);
8533 if (res != VKFFT_SUCCESS) return res;
8534
8535 if (sc->axisSwapped) {
8536 sc->tempLen = sprintf(sc->tempStr, " sdataID = (combinedID %% %" PRIu64 ") * sharedStride + (combinedID / %" PRIu64 ");\n", sc->fftDim, sc->fftDim);
8537 res = VkAppendLine(sc);
8538 if (res != VKFFT_SUCCESS) return res;
8539 }
8540 else {
8541 sc->tempLen = sprintf(sc->tempStr, " sdataID = (combinedID %% %" PRIu64 ") + (combinedID / %" PRIu64 ") * sharedStride ;\n", sc->fftDim, sc->fftDim);
8542 res = VkAppendLine(sc);
8543 if (res != VKFFT_SUCCESS) return res;
8544 }
8545
8546 sc->tempLen = sprintf(sc->tempStr, " sdata[sdataID].x = 0;\n");
8547 res = VkAppendLine(sc);
8548 if (res != VKFFT_SUCCESS) return res;
8549 sc->tempLen = sprintf(sc->tempStr, " sdata[sdataID].y = 0;\n");
8550 res = VkAppendLine(sc);
8551 if (res != VKFFT_SUCCESS) return res;
8552 sc->tempLen = sprintf(sc->tempStr, " }\n");
8553 res = VkAppendLine(sc);
8554 if (res != VKFFT_SUCCESS) return res;
8555 }
8556 if (sc->axisSwapped) {
8557 if ((uint64_t)ceil(sc->size[1] / (double)mult) % sc->localSize[0] != 0) {
8558 sc->tempLen = sprintf(sc->tempStr, " }");
8559 res = VkAppendLine(sc);
8560 if (res != VKFFT_SUCCESS) return res;
8561 }
8562 }
8563 else {
8564 if ((uint64_t)ceil(sc->size[1] / (double)mult) % sc->localSize[1] != 0) {
8565 sc->tempLen = sprintf(sc->tempStr, " }");
8566 res = VkAppendLine(sc);
8567 if (res != VKFFT_SUCCESS) return res;
8568 }
8569 }
8570 }
8571 }
8572 res = appendBarrierVkFFT(sc, 1);
8573 if (res != VKFFT_SUCCESS) return res;
8574 res = appendZeropadStart(sc);
8575 if (res != VKFFT_SUCCESS) return res;
8576 for (uint64_t k = 0; k < sc->registerBoost; k++) {
8577 for (uint64_t i = 0; i < sc->min_registers_per_thread; i++) {
8578 if (!sc->axisSwapped) {
8579 sc->tempLen = sprintf(sc->tempStr, " combinedID = %s + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, (i + k * sc->min_registers_per_thread) * sc->localSize[0]);
8580 res = VkAppendLine(sc);
8581 if (res != VKFFT_SUCCESS) return res;
8582 if (sc->axisSwapped) {
8583 if (sc->zeropadBluestein[0]) {
8584 sc->tempLen = sprintf(sc->tempStr, " if(combinedID < %" PRIu64 "){\n", sc->fftDim * sc->localSize[0]);
8585 res = VkAppendLine(sc);
8586 if (res != VKFFT_SUCCESS) return res;
8587 }
8588 }
8589 else {
8590 if (sc->zeropadBluestein[0]) {
8591 sc->tempLen = sprintf(sc->tempStr, " if(combinedID < %" PRIu64 "){\n", sc->fftDim * sc->localSize[1]);
8592 res = VkAppendLine(sc);
8593 if (res != VKFFT_SUCCESS) return res;
8594 }
8595 }
8596 sc->tempLen = sprintf(sc->tempStr, " inoutID = %" PRIu64 " + 4 * (combinedID %% %" PRIu64 ");\n", sc->fftDim / 2, sc->fftDim);
8597 res = VkAppendLine(sc);
8598 if (res != VKFFT_SUCCESS) return res;
8599
8600 sc->tempLen = sprintf(sc->tempStr, " if (inoutID < %" PRIu64 ") sdataID = inoutID;\n", sc->fftDim);
8601 res = VkAppendLine(sc);
8602 if (res != VKFFT_SUCCESS) return res;
8603 sc->tempLen = sprintf(sc->tempStr, " if ((inoutID < %" PRIu64 ")&&(inoutID >= %" PRIu64 ")) sdataID = %" PRIu64 " - inoutID;\n", 2 * sc->fftDim, sc->fftDim, 2 * sc->fftDim - 1);
8604 res = VkAppendLine(sc);
8605 if (res != VKFFT_SUCCESS) return res;
8606 sc->tempLen = sprintf(sc->tempStr, " if ((inoutID < %" PRIu64 ")&&(inoutID >= %" PRIu64 ")) sdataID = inoutID - %" PRIu64 ";\n", 3 * sc->fftDim, 2 * sc->fftDim, 2 * sc->fftDim);
8607 res = VkAppendLine(sc);
8608 if (res != VKFFT_SUCCESS) return res;
8609 sc->tempLen = sprintf(sc->tempStr, " if ((inoutID < %" PRIu64 ")&&(inoutID >= %" PRIu64 ")) sdataID = %" PRIu64 " - inoutID;\n", 4 * sc->fftDim, 3 * sc->fftDim, 4 * sc->fftDim - 1);
8610 res = VkAppendLine(sc);
8611 if (res != VKFFT_SUCCESS) return res;
8612 sc->tempLen = sprintf(sc->tempStr, " if (inoutID >= %" PRIu64 ") sdataID = inoutID - %" PRIu64 ";\n", 4 * sc->fftDim, 4 * sc->fftDim);
8613 res = VkAppendLine(sc);
8614 if (res != VKFFT_SUCCESS) return res;
8615 sc->tempLen = sprintf(sc->tempStr, " sdataID = sdataID + %s * sharedStride;\n", sc->gl_LocalInvocationID_y);
8616 res = VkAppendLine(sc);
8617 if (res != VKFFT_SUCCESS) return res;
8618 sc->tempLen = sprintf(sc->tempStr, " %s = sdata[sdataID];\n", sc->regIDs[i + k * sc->registers_per_thread]);
8619 res = VkAppendLine(sc);
8620 if (res != VKFFT_SUCCESS) return res;
8621 sc->tempLen = sprintf(sc->tempStr, " if ((inoutID < %" PRIu64 ")&&(inoutID >= %" PRIu64 ")){ \n\
8622 %s.x = -%s.x;\n\
8623 %s.y = -%s.y;}\n", 2 * sc->fftDim, sc->fftDim, sc->regIDs[i + k * sc->registers_per_thread], sc->regIDs[i + k * sc->registers_per_thread], sc->regIDs[i + k * sc->registers_per_thread], sc->regIDs[i + k * sc->registers_per_thread]);
8624 res = VkAppendLine(sc);
8625 if (res != VKFFT_SUCCESS) return res;
8626 sc->tempLen = sprintf(sc->tempStr, " if ((inoutID < %" PRIu64 ")&&(inoutID >= %" PRIu64 ")){ \n\
8627 %s.x = -%s.x;\n\
8628 %s.y = -%s.y;}\n", 3 * sc->fftDim, 2 * sc->fftDim, sc->regIDs[i + k * sc->registers_per_thread], sc->regIDs[i + k * sc->registers_per_thread], sc->regIDs[i + k * sc->registers_per_thread], sc->regIDs[i + k * sc->registers_per_thread]);
8629 res = VkAppendLine(sc);
8630 if (res != VKFFT_SUCCESS) return res;
8631 if (sc->zeropadBluestein[0]) {
8632 sc->tempLen = sprintf(sc->tempStr, " }\n");
8633 res = VkAppendLine(sc);
8634 if (res != VKFFT_SUCCESS) return res;
8635 }
8636 }
8637 else {
8638 sc->tempLen = sprintf(sc->tempStr, " combinedID = %s + %" PRIu64 ";\n", sc->gl_LocalInvocationID_y, (i + k * sc->min_registers_per_thread) * sc->localSize[1]);
8639 res = VkAppendLine(sc);
8640 if (res != VKFFT_SUCCESS) return res;
8641 if (sc->axisSwapped) {
8642 if (sc->zeropadBluestein[0]) {
8643 sc->tempLen = sprintf(sc->tempStr, " if(combinedID < %" PRIu64 "){\n", sc->fftDim);
8644 res = VkAppendLine(sc);
8645 if (res != VKFFT_SUCCESS) return res;
8646 }
8647 }
8648 else {
8649 if (sc->zeropadBluestein[0]) {
8650 sc->tempLen = sprintf(sc->tempStr, " if(combinedID < %" PRIu64 "){\n", sc->fftDim);
8651 res = VkAppendLine(sc);
8652 if (res != VKFFT_SUCCESS) return res;
8653 }
8654 }
8655 sc->tempLen = sprintf(sc->tempStr, " inoutID = %" PRIu64 " + 4 * combinedID;\n", sc->fftDim / 2);
8656 res = VkAppendLine(sc);
8657 if (res != VKFFT_SUCCESS) return res;
8658
8659 sc->tempLen = sprintf(sc->tempStr, " if (inoutID < %" PRIu64 ") sdataID = inoutID;\n", sc->fftDim);
8660 res = VkAppendLine(sc);
8661 if (res != VKFFT_SUCCESS) return res;
8662 sc->tempLen = sprintf(sc->tempStr, " if ((inoutID < %" PRIu64 ")&&(inoutID >= %" PRIu64 ")) sdataID = %" PRIu64 " - inoutID;\n", 2 * sc->fftDim, sc->fftDim, 2 * sc->fftDim - 1);
8663 res = VkAppendLine(sc);
8664 if (res != VKFFT_SUCCESS) return res;
8665 sc->tempLen = sprintf(sc->tempStr, " if ((inoutID < %" PRIu64 ")&&(inoutID >= %" PRIu64 ")) sdataID = inoutID - %" PRIu64 ";\n", 3 * sc->fftDim, 2 * sc->fftDim, 2 * sc->fftDim);
8666 res = VkAppendLine(sc);
8667 if (res != VKFFT_SUCCESS) return res;
8668 sc->tempLen = sprintf(sc->tempStr, " if ((inoutID < %" PRIu64 ")&&(inoutID >= %" PRIu64 ")) sdataID = %" PRIu64 " - inoutID;\n", 4 * sc->fftDim, 3 * sc->fftDim, 4 * sc->fftDim - 1);
8669 res = VkAppendLine(sc);
8670 if (res != VKFFT_SUCCESS) return res;
8671 sc->tempLen = sprintf(sc->tempStr, " if (inoutID >= %" PRIu64 ") sdataID = inoutID - %" PRIu64 ";\n", 4 * sc->fftDim, 4 * sc->fftDim);
8672 res = VkAppendLine(sc);
8673 if (res != VKFFT_SUCCESS) return res;
8674 sc->tempLen = sprintf(sc->tempStr, " sdataID = sdataID * sharedStride + %s;\n", sc->gl_LocalInvocationID_x);
8675 res = VkAppendLine(sc);
8676 if (res != VKFFT_SUCCESS) return res;
8677 sc->tempLen = sprintf(sc->tempStr, " %s = sdata[sdataID];\n", sc->regIDs[i + k * sc->registers_per_thread]);
8678 res = VkAppendLine(sc);
8679 if (res != VKFFT_SUCCESS) return res;
8680 sc->tempLen = sprintf(sc->tempStr, " if ((inoutID < %" PRIu64 ")&&(inoutID >= %" PRIu64 ")){ \n\
8681 %s.x = -%s.x;\n\
8682 %s.y = -%s.y;}\n", 2 * sc->fftDim, sc->fftDim, sc->regIDs[i + k * sc->registers_per_thread], sc->regIDs[i + k * sc->registers_per_thread], sc->regIDs[i + k * sc->registers_per_thread], sc->regIDs[i + k * sc->registers_per_thread]);
8683 res = VkAppendLine(sc);
8684 if (res != VKFFT_SUCCESS) return res;
8685 sc->tempLen = sprintf(sc->tempStr, " if ((inoutID < %" PRIu64 ")&&(inoutID >= %" PRIu64 ")){ \n\
8686 %s.x = -%s.x;\n\
8687 %s.y = -%s.y;}\n", 3 * sc->fftDim, 2 * sc->fftDim, sc->regIDs[i + k * sc->registers_per_thread], sc->regIDs[i + k * sc->registers_per_thread], sc->regIDs[i + k * sc->registers_per_thread], sc->regIDs[i + k * sc->registers_per_thread]);
8688 res = VkAppendLine(sc);
8689 if (res != VKFFT_SUCCESS) return res;
8690 if (sc->zeropadBluestein[0]) {
8691 sc->tempLen = sprintf(sc->tempStr, " }\n");
8692 res = VkAppendLine(sc);
8693 if (res != VKFFT_SUCCESS) return res;
8694 }
8695 }
8696 }
8697 }
8698 res = appendZeropadEnd(sc);
8699 if (res != VKFFT_SUCCESS) return res;
8700 if (sc->zeropadBluestein[0]) sc->fftDim = sc->fft_dim_full;
8701 }
8702 else {
8703 //Not implemented
8704 }
8705 break;
8706 }
8707 case 145://odd DCT-IV strided as N FFT
8708 {
8709 char shiftX[500] = "";
8710 if (sc->performWorkGroupShift[0])
8711 sprintf(shiftX, " + consts.workGroupShiftX ");
8712 char shiftX2[500] = "";
8713 if (sc->performWorkGroupShift[0])
8714 sprintf(shiftX2, " + consts.workGroupShiftX * %s ", sc->gl_WorkGroupSize_x);
8715 char shiftY[500] = "";
8716 if (sc->performWorkGroupShift[1])
8717 sprintf(shiftY, " + consts.workGroupShiftY ");
8718 uint64_t mult = (sc->mergeSequencesR2C) ? 2 : 1;
8719 if (sc->fftDim == sc->fft_dim_full) {
8720 if (sc->zeropadBluestein[0]) {
8721 res = appendSetSMToZero(sc, floatType, floatTypeMemory, uintType, readType);
8722 if (res != VKFFT_SUCCESS) return res;
8723 res = appendBarrierVkFFT(sc, 1);
8724 if (res != VKFFT_SUCCESS) return res;
8726 }
8727 for (uint64_t k = 0; k < sc->registerBoost; k++) {
8728 for (uint64_t i = 0; i < mult * sc->min_registers_per_thread; i++) {
8729
8730 //sc->tempLen = sprintf(sc->tempStr, " combinedID = (%s + %" PRIu64 " * %s) + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, sc->localSize[0], sc->gl_LocalInvocationID_y, (i + k * mult * sc->min_registers_per_thread) * sc->localSize[0] * sc->localSize[1]);
8731 //res = VkAppendLine(sc);
8732 //if (res != VKFFT_SUCCESS) return res;
8733
8734 if ((uint64_t)ceil(sc->size[0] / (double)mult) % sc->localSize[0] != 0) {
8735 sc->tempLen = sprintf(sc->tempStr, " if ((%s%s) < %" PRIu64 ") {\n", sc->gl_GlobalInvocationID_x, shiftX2, (uint64_t)ceil(sc->size[0] / (double)mult));
8736 res = VkAppendLine(sc);
8737 if (res != VKFFT_SUCCESS) return res;
8738 }
8739 if (sc->mergeSequencesR2C)
8740 sc->tempLen = sprintf(sc->tempStr, " combinedID = (%s + %" PRIu64 ") / %" PRIu64 ";\n", sc->gl_LocalInvocationID_y, (i + k * sc->min_registers_per_thread) * sc->localSize[1], mult);
8741 else
8742 sc->tempLen = sprintf(sc->tempStr, " combinedID = (%s + %" PRIu64 ");\n", sc->gl_LocalInvocationID_y, (i + k * sc->min_registers_per_thread) * sc->localSize[1]);
8743 res = VkAppendLine(sc);
8744 if (res != VKFFT_SUCCESS) return res;
8745 if (sc->zeropadBluestein[0]) {
8746 sc->tempLen = sprintf(sc->tempStr, " if((combinedID %% %" PRIu64 ") < %" PRIu64 "){\n", sc->fft_dim_full, sc->fft_zeropad_Bluestein_left_read[sc->axis_id]);
8747 res = VkAppendLine(sc);
8748 if (res != VKFFT_SUCCESS) return res;
8749 }
8750
8751 sc->tempLen = sprintf(sc->tempStr, " sdataID = (combinedID %% %" PRIu64 ") * sharedStride + %s;\n", sc->fftDim, sc->gl_LocalInvocationID_x);
8752 res = VkAppendLine(sc);
8753 if (res != VKFFT_SUCCESS) return res;
8754 sc->tempLen = sprintf(sc->tempStr, " %s = ", sc->inoutID);
8755 res = VkAppendLine(sc);
8756 if (res != VKFFT_SUCCESS) return res;
8757 if (sc->mergeSequencesR2C) {
8758 sprintf(index_x, "(%s + %" PRIu64 " * ((%s %% %" PRIu64 ") + (%s%s) * %" PRIu64 ")) %% (%" PRIu64 ")", sc->gl_LocalInvocationID_x, sc->localSize[0], sc->gl_LocalInvocationID_y, mult, sc->gl_WorkGroupID_x, shiftX, mult, sc->fft_dim_x);
8759
8760 sprintf(index_y, "(%s/%" PRIu64 " + %" PRIu64 ")", sc->gl_LocalInvocationID_y, mult, (i + k * sc->min_registers_per_thread) * sc->localSize[1]);
8761 }
8762 else {
8763 sprintf(index_x, "(%s%s) %% (%" PRIu64 ")", sc->gl_GlobalInvocationID_x, shiftX2, sc->fft_dim_x);
8764 sprintf(index_y, "(%s + %" PRIu64 ")", sc->gl_LocalInvocationID_y, (i + k * sc->min_registers_per_thread) * sc->localSize[1]);
8765 }
8766 res = indexInputVkFFT(sc, uintType, readType, index_x, index_y, requestCoordinate, requestBatch);
8767 if (res != VKFFT_SUCCESS) return res;
8768 sc->tempLen = sprintf(sc->tempStr, ";\n");
8769 res = VkAppendLine(sc);
8770 if (res != VKFFT_SUCCESS) return res;
8772 if (res != VKFFT_SUCCESS) return res;
8773 if (sc->zeropad[0]) {
8774 sc->tempLen = sprintf(sc->tempStr, " if((%s %% %" PRIu64 " < %" PRIu64 ")||(%s %% %" PRIu64 " >= %" PRIu64 ")){\n", index_y, sc->fft_dim_full, sc->fft_zeropad_left_read[sc->axis_id], index_y, sc->fft_dim_full, sc->fft_zeropad_right_read[sc->axis_id]);
8775 res = VkAppendLine(sc);
8776 if (res != VKFFT_SUCCESS) return res;
8777 }
8778 if (sc->inputBufferBlockNum == 1)
8779 sc->tempLen = sprintf(sc->tempStr, " sdata[sdataID].x = %s%s[%s]%s;\n", convTypeLeft, inputsStruct, sc->inoutID, convTypeRight);
8780 else
8781 sc->tempLen = sprintf(sc->tempStr, " sdata[sdataID].x = %sinputBlocks[%s / %" PRIu64 "]%s[%s %% %" PRIu64 "]%s;\n", convTypeLeft, sc->inoutID, sc->inputBufferBlockSize, inputsStruct, sc->inoutID, sc->inputBufferBlockSize, convTypeRight);
8782 res = VkAppendLine(sc);
8783 if (res != VKFFT_SUCCESS) return res;
8784 sc->tempLen = sprintf(sc->tempStr, " sdata[sdataID].y = 0;\n");
8785 res = VkAppendLine(sc);
8786 if (res != VKFFT_SUCCESS) return res;
8787
8789 if (res != VKFFT_SUCCESS) return res;
8790 if (sc->zeropad[0]) {
8791 sc->tempLen = sprintf(sc->tempStr, " }else{\n");
8792 res = VkAppendLine(sc);
8793 if (res != VKFFT_SUCCESS) return res;
8794
8795 sc->tempLen = sprintf(sc->tempStr, " sdata[sdataID].x = 0;\n");
8796 res = VkAppendLine(sc);
8797 if (res != VKFFT_SUCCESS) return res;
8798 sc->tempLen = sprintf(sc->tempStr, " sdata[sdataID].y = 0;\n");
8799 res = VkAppendLine(sc);
8800 if (res != VKFFT_SUCCESS) return res;
8801 sc->tempLen = sprintf(sc->tempStr, " }\n");
8802 res = VkAppendLine(sc);
8803 if (res != VKFFT_SUCCESS) return res;
8804 }
8805 if (sc->zeropadBluestein[0]) {
8806 sc->tempLen = sprintf(sc->tempStr, " }\n");
8807 res = VkAppendLine(sc);
8808 if (res != VKFFT_SUCCESS) return res;
8809 }
8810 if ((uint64_t)ceil(sc->size[0] / (double)mult) % sc->localSize[0] != 0) {
8811 sc->tempLen = sprintf(sc->tempStr, " }\n");
8812 res = VkAppendLine(sc);
8813 if (res != VKFFT_SUCCESS) return res;
8814 }
8815 }
8816 }
8817 res = appendBarrierVkFFT(sc, 1);
8818 if (res != VKFFT_SUCCESS) return res;
8819 res = appendZeropadStart(sc);
8820 if (res != VKFFT_SUCCESS) return res;
8821 for (uint64_t k = 0; k < sc->registerBoost; k++) {
8822 for (uint64_t i = 0; i < sc->min_registers_per_thread; i++) {
8823 sc->tempLen = sprintf(sc->tempStr, " combinedID = %s + %" PRIu64 ";\n", sc->gl_LocalInvocationID_y, (i + k * sc->min_registers_per_thread) * sc->localSize[1]);
8824 res = VkAppendLine(sc);
8825 if (res != VKFFT_SUCCESS) return res;
8826 if (sc->axisSwapped) {
8827 if (sc->zeropadBluestein[0]) {
8828 sc->tempLen = sprintf(sc->tempStr, " if(combinedID < %" PRIu64 "){\n", sc->fftDim);
8829 res = VkAppendLine(sc);
8830 if (res != VKFFT_SUCCESS) return res;
8831 }
8832 }
8833 else {
8834 if (sc->zeropadBluestein[0]) {
8835 sc->tempLen = sprintf(sc->tempStr, " if(combinedID < %" PRIu64 "){\n", sc->fftDim);
8836 res = VkAppendLine(sc);
8837 if (res != VKFFT_SUCCESS) return res;
8838 }
8839 }
8840 sc->tempLen = sprintf(sc->tempStr, " inoutID = %" PRIu64 " + 4 * combinedID;\n", sc->fftDim / 2);
8841 res = VkAppendLine(sc);
8842 if (res != VKFFT_SUCCESS) return res;
8843
8844 sc->tempLen = sprintf(sc->tempStr, " if (inoutID < %" PRIu64 ") sdataID = inoutID;\n", sc->fftDim);
8845 res = VkAppendLine(sc);
8846 if (res != VKFFT_SUCCESS) return res;
8847 sc->tempLen = sprintf(sc->tempStr, " if ((inoutID < %" PRIu64 ")&&(inoutID >= %" PRIu64 ")) sdataID = %" PRIu64 " - inoutID;\n", 2 * sc->fftDim, sc->fftDim, 2 * sc->fftDim - 1);
8848 res = VkAppendLine(sc);
8849 if (res != VKFFT_SUCCESS) return res;
8850 sc->tempLen = sprintf(sc->tempStr, " if ((inoutID < %" PRIu64 ")&&(inoutID >= %" PRIu64 ")) sdataID = inoutID - %" PRIu64 ";\n", 3 * sc->fftDim, 2 * sc->fftDim, 2 * sc->fftDim);
8851 res = VkAppendLine(sc);
8852 if (res != VKFFT_SUCCESS) return res;
8853 sc->tempLen = sprintf(sc->tempStr, " if ((inoutID < %" PRIu64 ")&&(inoutID >= %" PRIu64 ")) sdataID = %" PRIu64 " - inoutID;\n", 4 * sc->fftDim, 3 * sc->fftDim, 4 * sc->fftDim - 1);
8854 res = VkAppendLine(sc);
8855 if (res != VKFFT_SUCCESS) return res;
8856 sc->tempLen = sprintf(sc->tempStr, " if (inoutID >= %" PRIu64 ") sdataID = inoutID - %" PRIu64 ";\n", 4 * sc->fftDim, 4 * sc->fftDim);
8857 res = VkAppendLine(sc);
8858 if (res != VKFFT_SUCCESS) return res;
8859 sc->tempLen = sprintf(sc->tempStr, " sdataID = sdataID * sharedStride + %s;\n", sc->gl_LocalInvocationID_x);
8860 res = VkAppendLine(sc);
8861 if (res != VKFFT_SUCCESS) return res;
8862 sc->tempLen = sprintf(sc->tempStr, " %s = sdata[sdataID];\n", sc->regIDs[i + k * sc->registers_per_thread]);
8863 res = VkAppendLine(sc);
8864 if (res != VKFFT_SUCCESS) return res;
8865 sc->tempLen = sprintf(sc->tempStr, " if ((inoutID < %" PRIu64 ")&&(inoutID >= %" PRIu64 ")){ \n\
8866 %s.x = -%s.x;\n\
8867 %s.y = -%s.y;}\n", 2 * sc->fftDim, sc->fftDim, sc->regIDs[i + k * sc->registers_per_thread], sc->regIDs[i + k * sc->registers_per_thread], sc->regIDs[i + k * sc->registers_per_thread], sc->regIDs[i + k * sc->registers_per_thread]);
8868 res = VkAppendLine(sc);
8869 if (res != VKFFT_SUCCESS) return res;
8870 sc->tempLen = sprintf(sc->tempStr, " if ((inoutID < %" PRIu64 ")&&(inoutID >= %" PRIu64 ")){ \n\
8871 %s.x = -%s.x;\n\
8872 %s.y = -%s.y;}\n", 3 * sc->fftDim, 2 * sc->fftDim, sc->regIDs[i + k * sc->registers_per_thread], sc->regIDs[i + k * sc->registers_per_thread], sc->regIDs[i + k * sc->registers_per_thread], sc->regIDs[i + k * sc->registers_per_thread]);
8873 res = VkAppendLine(sc);
8874 if (res != VKFFT_SUCCESS) return res;
8875 if (sc->zeropadBluestein[0]) {
8876 sc->tempLen = sprintf(sc->tempStr, " }\n");
8877 res = VkAppendLine(sc);
8878 if (res != VKFFT_SUCCESS) return res;
8879 }
8880 }
8881 }
8882 res = appendZeropadEnd(sc);
8883 if (res != VKFFT_SUCCESS) return res;
8884 if (sc->zeropadBluestein[0]) sc->fftDim = sc->fft_dim_full;
8885 }
8886 else {
8887 //Not implemented
8888 }
8889 break;
8890 }
8891 }
8892 return res;
8893}
8894
// Appends generated kernel source that multiplies every element handled by a thread by a
// twiddle factor `mult` on the read side of a four-step FFT stage.
//   sc          - generator state: supplies the scratch string (tempStr/tempLen), identifier names
//                 (regIDs, inoutID, gl_* names, sharedStride) and the flags/sizes tested below.
//   floatType   - compared against "float" / "double"; selects the numeric literal suffix and
//                 whether cos/sin or sincos_20 is emitted in the non-LUT path.
//   uintType    - not referenced in the visible body.
//   reorderType - 1 = grouped_c2c, 2 = single_c2c_strided; any other value emits nothing.
// Returns `res`; every failed append returns its error code immediately.
8895 static inline VkFFTResult appendReorder4StepRead(VkFFTSpecializationConstantsLayout* sc, const char* floatType, const char* uintType, uint64_t reorderType) {
// NOTE(review): `res` is used below but its declaration is not visible in this listing (the line numbered 8896 is absent) - confirm against the original header.
8897 char vecType[30];
8898 char LFending[4] = "";
8899 if (!strcmp(floatType, "float")) sprintf(LFending, "f");
// Per-backend spellings of the 2-component vector type, the cos/sin intrinsics and the double literal suffix.
// vecType is filled in here but is only referenced from commented-out code further down.
8900#if(VKFFT_BACKEND==0)
8901 if (!strcmp(floatType, "float")) sprintf(vecType, "vec2");
8902 if (!strcmp(floatType, "double")) sprintf(vecType, "dvec2");
8903 char cosDef[20] = "cos";
8904 char sinDef[20] = "sin";
8905 if (!strcmp(floatType, "double")) sprintf(LFending, "LF");
8906#elif(VKFFT_BACKEND==1)
8907 if (!strcmp(floatType, "float")) sprintf(vecType, "float2");
8908 if (!strcmp(floatType, "double")) sprintf(vecType, "double2");
8909 char cosDef[20] = "__cosf";
8910 char sinDef[20] = "__sinf";
8911 if (!strcmp(floatType, "double")) sprintf(LFending, "l");
8912#elif(VKFFT_BACKEND==2)
8913 if (!strcmp(floatType, "float")) sprintf(vecType, "float2");
8914 if (!strcmp(floatType, "double")) sprintf(vecType, "double2");
8915 char cosDef[20] = "__cosf";
8916 char sinDef[20] = "__sinf";
8917 if (!strcmp(floatType, "double")) sprintf(LFending, "l");
8918#elif(VKFFT_BACKEND==3)
8919 if (!strcmp(floatType, "float")) sprintf(vecType, "float2");
8920 if (!strcmp(floatType, "double")) sprintf(vecType, "double2");
8921 char cosDef[20] = "native_cos";
8922 char sinDef[20] = "native_sin";
8923 //if (!strcmp(floatType, "double")) sprintf(LFending, "l");
8924#endif
8925
// Register count associated with the radix of the first stage; used to map loop index i to a slot in regIDs.
8926 uint64_t logicalRegistersPerThread = sc->registers_per_thread_per_radix[sc->stageRadix[0]];// (sc->registers_per_thread % sc->stageRadix[sc->numStages - 1] == 0) ? sc->registers_per_thread : sc->min_registers_per_thread;
8927 switch (reorderType) {
8928 case 1: {//grouped_c2c
// Optional text added to the global x invocation index when a work-group shift is enabled.
8929 char shiftX[500] = "";
8930 if (sc->performWorkGroupShift[0])
8931 sprintf(shiftX, " + consts.workGroupShiftX * %s ", sc->gl_WorkGroupSize_x);
// The read-side multiplication is emitted only for inverse transforms without four-step reordering and with stageStartSize > 1.
8932 if ((sc->stageStartSize > 1) && (!sc->reorderFourStep) && (sc->inverse)) {
// A barrier is emitted first when the data is not held in registers (the sdata path below is used in that case).
8933 if (!sc->readToRegisters) {
8934 res = appendBarrierVkFFT(sc, 1);
8935 if (res != VKFFT_SUCCESS) return res;
8936 }
8937 /*if (sc->localSize[1] * sc->stageRadix[0] * (sc->registers_per_thread_per_radix[sc->stageRadix[0]] / sc->stageRadix[0]) > sc->fftDim) {
8938 res = appendBarrierVkFFT(sc, 1);
8939 if (res != VKFFT_SUCCESS) return res;
8940 sc->readToRegisters = 0;
8941 }
8942 else
8943 sc->readToRegisters = 1;*/
8944 res = appendZeropadStart(sc);
8945 if (res != VKFFT_SUCCESS) return res;
// NOTE(review): the statement this check belongs to (listing line 8946) is absent from this listing - confirm against the original header.
8947 if (res != VKFFT_SUCCESS) return res;
// One iteration per element handled by a thread along local y; the generated code is fully unrolled.
8948 for (uint64_t i = 0; i < sc->fftDim / sc->localSize[1]; i++) {
8949 uint64_t id = (i / logicalRegistersPerThread) * sc->registers_per_thread + i % logicalRegistersPerThread;
8950 if (sc->LUT) {
// LUT path: read mult from twiddleLUT at an offset of maxStageSumLUT; the y component is negated when !sc->inverse.
8951 sc->tempLen = sprintf(sc->tempStr, " mult = twiddleLUT[%" PRIu64 "+(((%s%s)/%" PRIu64 ") %% (%" PRIu64 "))+%" PRIu64 "*(%s+%" PRIu64 ")];\n", sc->maxStageSumLUT, sc->gl_GlobalInvocationID_x, shiftX, sc->fft_dim_x, sc->stageStartSize, sc->stageStartSize, sc->gl_LocalInvocationID_y, i * sc->localSize[1]);
8952 res = VkAppendLine(sc);
8953 if (res != VKFFT_SUCCESS) return res;
8954 if (!sc->inverse) {
8955 sc->tempLen = sprintf(sc->tempStr, " mult.y = -mult.y;\n");
8956 res = VkAppendLine(sc);
8957 if (res != VKFFT_SUCCESS) return res;
8958 }
8959 }
8960 else {
// Non-LUT path: emit the angle expression, divided by stageStartSize * fftDim, then derive mult from it.
8961 sc->tempLen = sprintf(sc->tempStr, " angle = 2 * loc_PI * ((((%s%s) / %" PRIu64 ") %% (%" PRIu64 ")) * (%s + %" PRIu64 ")) / %f%s;\n", sc->gl_GlobalInvocationID_x, shiftX, sc->fft_dim_x, sc->stageStartSize, sc->gl_LocalInvocationID_y, i * sc->localSize[1], (double)(sc->stageStartSize * sc->fftDim), LFending);
8962 res = VkAppendLine(sc);
8963 if (res != VKFFT_SUCCESS) return res;
8964 if (!strcmp(floatType, "float")) {
8965 sc->tempLen = sprintf(sc->tempStr, " mult.x = %s(angle);\n", cosDef);
8966 res = VkAppendLine(sc);
8967 if (res != VKFFT_SUCCESS) return res;
8968 sc->tempLen = sprintf(sc->tempStr, " mult.y = %s(angle);\n", sinDef);
8969 res = VkAppendLine(sc);
8970 if (res != VKFFT_SUCCESS) return res;
8971 //sc->tempLen = sprintf(sc->tempStr, " mult = %s(cos(angle), sin(angle));\n", vecType);
8972 }
8973 if (!strcmp(floatType, "double")) {
8974 sc->tempLen = sprintf(sc->tempStr, " mult = sincos_20(angle);\n");
8975 res = VkAppendLine(sc);
8976 if (res != VKFFT_SUCCESS) return res;
8977 }
8978 }
8979 if (sc->readToRegisters) {
// Complex multiply of register regIDs[id] by mult; w.x holds the new real part until the imaginary part has been updated.
8980 sc->tempLen = sprintf(sc->tempStr, "\
8981 w.x = %s.x * mult.x - %s.y * mult.y;\n", sc->regIDs[id], sc->regIDs[id]);
8982 res = VkAppendLine(sc);
8983 if (res != VKFFT_SUCCESS) return res;
8984 sc->tempLen = sprintf(sc->tempStr, "\
8985 %s.y = %s.y * mult.x + %s.x * mult.y;\n", sc->regIDs[id], sc->regIDs[id], sc->regIDs[id]);
8986 res = VkAppendLine(sc);
8987 if (res != VKFFT_SUCCESS) return res;
8988 sc->tempLen = sprintf(sc->tempStr, "\
8989 %s.x = w.x;\n", sc->regIDs[id]);
8990 res = VkAppendLine(sc);
8991 if (res != VKFFT_SUCCESS) return res;
8992 }
8993 else {
// Same complex multiply, performed in place on sdata[inoutID]; inoutID is computed from sharedStride and the local invocation IDs.
8994 sc->tempLen = sprintf(sc->tempStr, "\
8995 %s = %s*(%" PRIu64 "+%s) + %s;\n", sc->inoutID, sc->sharedStride, i * sc->localSize[1], sc->gl_LocalInvocationID_y, sc->gl_LocalInvocationID_x);
8996 res = VkAppendLine(sc);
8997 if (res != VKFFT_SUCCESS) return res;
8998
8999 sc->tempLen = sprintf(sc->tempStr, "\
9000 w.x = sdata[%s].x * mult.x - sdata[%s].y * mult.y;\n", sc->inoutID, sc->inoutID);
9001 res = VkAppendLine(sc);
9002 if (res != VKFFT_SUCCESS) return res;
9003
9004 sc->tempLen = sprintf(sc->tempStr, "\
9005 sdata[%s].y = sdata[%s].y * mult.x + sdata[%s].x * mult.y;\n", sc->inoutID, sc->inoutID, sc->inoutID);
9006 res = VkAppendLine(sc);
9007 if (res != VKFFT_SUCCESS) return res;
9008 sc->tempLen = sprintf(sc->tempStr, "\
9009 sdata[%s].x = w.x;\n", sc->inoutID);
9010 res = VkAppendLine(sc);
9011 if (res != VKFFT_SUCCESS) return res;
9012 }
9013 }
// NOTE(review): the statement this check belongs to (listing line 9014) is absent from this listing - confirm against the original header.
9015 if (res != VKFFT_SUCCESS) return res;
9016 res = appendZeropadEnd(sc);
9017 if (res != VKFFT_SUCCESS) return res;
9018 }
9019
9020 break;
9021 }
9022 case 2: {//single_c2c_strided
9023 char shiftX[500] = "";
9024 if (sc->performWorkGroupShift[0])
9025 sprintf(shiftX, " + consts.workGroupShiftX * %s ", sc->gl_WorkGroupSize_x);
// Same gating as case 1 except that stageStartSize is not tested here.
9026 if ((!sc->reorderFourStep) && (sc->inverse)) {
9027 if (!sc->readToRegisters) {
9028 res = appendBarrierVkFFT(sc, 1);
9029 if (res != VKFFT_SUCCESS) return res;
9030 }
9031 /*if (sc->localSize[1] * sc->stageRadix[0] * (sc->registers_per_thread_per_radix[sc->stageRadix[0]] / sc->stageRadix[0]) > sc->fftDim) {
9032 res = appendBarrierVkFFT(sc, 1);
9033 sc->readToRegisters = 0;
9034 }
9035 else
9036 sc->readToRegisters = 1;*/
9037 res = appendZeropadStart(sc);
9038 if (res != VKFFT_SUCCESS) return res;
// NOTE(review): the statement this check belongs to (listing line 9039) is absent from this listing - confirm against the original header.
9040 if (res != VKFFT_SUCCESS) return res;
9041 for (uint64_t i = 0; i < sc->fftDim / sc->localSize[1]; i++) {
9042 uint64_t id = (i / logicalRegistersPerThread) * sc->registers_per_thread + i % logicalRegistersPerThread;
9043 if (sc->LUT) {
// Unlike case 1, the LUT index and the angle below use the global x index modulo stageStartSize without dividing by fft_dim_x.
9044 sc->tempLen = sprintf(sc->tempStr, " mult = twiddleLUT[%" PRIu64 " + ((%s%s) %% (%" PRIu64 ")) + (%s + %" PRIu64 ") * %" PRIu64 "];\n", sc->maxStageSumLUT, sc->gl_GlobalInvocationID_x, shiftX, sc->stageStartSize, sc->gl_LocalInvocationID_y, i * sc->localSize[1], sc->stageStartSize);
9045 res = VkAppendLine(sc);
9046 if (res != VKFFT_SUCCESS) return res;
9047 if (!sc->inverse) {
9048 sc->tempLen = sprintf(sc->tempStr, " mult.y = -mult.y;\n");
9049 res = VkAppendLine(sc);
9050 if (res != VKFFT_SUCCESS) return res;
9051 }
9052 }
9053 else {
9054 sc->tempLen = sprintf(sc->tempStr, " angle = 2 * loc_PI * ((((%s%s) %% (%" PRIu64 ")) * (%s + %" PRIu64 ")) / %f%s);\n", sc->gl_GlobalInvocationID_x, shiftX, sc->stageStartSize, sc->gl_LocalInvocationID_y, i * sc->localSize[1], (double)(sc->stageStartSize * sc->fftDim), LFending);
9055 res = VkAppendLine(sc);
9056 if (res != VKFFT_SUCCESS) return res;
9057
9058 if (!strcmp(floatType, "float")) {
9059 sc->tempLen = sprintf(sc->tempStr, " mult.x = %s(angle);\n", cosDef);
9060 res = VkAppendLine(sc);
9061 if (res != VKFFT_SUCCESS) return res;
9062 sc->tempLen = sprintf(sc->tempStr, " mult.y = %s(angle);\n", sinDef);
9063 res = VkAppendLine(sc);
9064 if (res != VKFFT_SUCCESS) return res;
9065 //sc->tempLen = sprintf(sc->tempStr, " mult = %s(cos(angle), sin(angle));\n", vecType);
9066 }
9067 if (!strcmp(floatType, "double")) {
9068 sc->tempLen = sprintf(sc->tempStr, " mult = sincos_20(angle);\n");
9069 res = VkAppendLine(sc);
9070 if (res != VKFFT_SUCCESS) return res;
9071 }
9072 }
9073 if (sc->readToRegisters) {
// Register path: regIDs[id] *= mult (complex), using w.x as a temporary.
9074 sc->tempLen = sprintf(sc->tempStr, "\
9075 w.x = %s.x * mult.x - %s.y * mult.y;\n", sc->regIDs[id], sc->regIDs[id]);
9076 res = VkAppendLine(sc);
9077 if (res != VKFFT_SUCCESS) return res;
9078 sc->tempLen = sprintf(sc->tempStr, "\
9079 %s.y = %s.y * mult.x + %s.x * mult.y;\n", sc->regIDs[id], sc->regIDs[id], sc->regIDs[id]);
9080 res = VkAppendLine(sc);
9081 if (res != VKFFT_SUCCESS) return res;
9082 sc->tempLen = sprintf(sc->tempStr, "\
9083 %s.x = w.x;\n", sc->regIDs[id]);
9084 res = VkAppendLine(sc);
9085 if (res != VKFFT_SUCCESS) return res;
9086 }
9087 else {
// sdata path: sdata[inoutID] *= mult (complex), using w.x as a temporary.
9088 sc->tempLen = sprintf(sc->tempStr, "\
9089 %s = %s*(%" PRIu64 "+%s) + %s;\n", sc->inoutID, sc->sharedStride, i * sc->localSize[1], sc->gl_LocalInvocationID_y, sc->gl_LocalInvocationID_x);
9090 res = VkAppendLine(sc);
9091 if (res != VKFFT_SUCCESS) return res;
9092
9093 sc->tempLen = sprintf(sc->tempStr, "\
9094 w.x = sdata[%s].x * mult.x - sdata[%s].y * mult.y;\n", sc->inoutID, sc->inoutID);
9095 res = VkAppendLine(sc);
9096 if (res != VKFFT_SUCCESS) return res;
9097
9098 sc->tempLen = sprintf(sc->tempStr, "\
9099 sdata[%s].y = sdata[%s].y * mult.x + sdata[%s].x * mult.y;\n", sc->inoutID, sc->inoutID, sc->inoutID);
9100 res = VkAppendLine(sc);
9101 if (res != VKFFT_SUCCESS) return res;
9102 sc->tempLen = sprintf(sc->tempStr, "\
9103 sdata[%s].x = w.x;\n", sc->inoutID);
9104 res = VkAppendLine(sc);
9105 if (res != VKFFT_SUCCESS) return res;
9106 }
9107 }
// NOTE(review): the statement this check belongs to (listing line 9108) is absent from this listing - confirm against the original header.
9109 if (res != VKFFT_SUCCESS) return res;
9110 res = appendZeropadEnd(sc);
9111 if (res != VKFFT_SUCCESS) return res;
9112 }
9113 //appendBarrierVkFFT(sc, 1);
9114 break;
9115 }
9116 }
9117 return res;
9118}
// Appends generated kernel source that multiplies every element handled by a thread by a
// twiddle factor `mult` on the write side of a four-step FFT stage.
// The gating conditions are the logical complement of the inverse/no-reorder conditions tested
// in appendReorder4StepRead, and sc->writeFromRegisters takes the role of sc->readToRegisters.
//   sc          - generator state: supplies the scratch string (tempStr/tempLen), identifier names
//                 (regIDs, inoutID, gl_* names, sharedStride) and the flags/sizes tested below.
//   floatType   - compared against "float" / "double"; selects the numeric literal suffix and
//                 whether cos/sin or sincos_20 is emitted in the non-LUT path.
//   uintType    - not referenced in the visible body.
//   reorderType - 1 = grouped_c2c, 2 = single_c2c_strided; any other value emits nothing.
// Returns `res`; every failed append returns its error code immediately.
9119 static inline VkFFTResult appendReorder4StepWrite(VkFFTSpecializationConstantsLayout* sc, const char* floatType, const char* uintType, uint64_t reorderType) {
// NOTE(review): `res` is used below but its declaration is not visible in this listing (the line numbered 9120 is absent) - confirm against the original header.
9121 char vecType[30];
9122 char LFending[4] = "";
9123 if (!strcmp(floatType, "float")) sprintf(LFending, "f");
// Per-backend spellings of the 2-component vector type, the cos/sin intrinsics and the double literal suffix.
// vecType is filled in here but is only referenced from commented-out code further down.
9124#if(VKFFT_BACKEND==0)
9125 if (!strcmp(floatType, "float")) sprintf(vecType, "vec2");
9126 if (!strcmp(floatType, "double")) sprintf(vecType, "dvec2");
9127 char cosDef[20] = "cos";
9128 char sinDef[20] = "sin";
9129 if (!strcmp(floatType, "double")) sprintf(LFending, "LF");
9130#elif(VKFFT_BACKEND==1)
9131 if (!strcmp(floatType, "float")) sprintf(vecType, "float2");
9132 if (!strcmp(floatType, "double")) sprintf(vecType, "double2");
9133 char cosDef[20] = "__cosf";
9134 char sinDef[20] = "__sinf";
9135 if (!strcmp(floatType, "double")) sprintf(LFending, "l");
9136#elif(VKFFT_BACKEND==2)
9137 if (!strcmp(floatType, "float")) sprintf(vecType, "float2");
9138 if (!strcmp(floatType, "double")) sprintf(vecType, "double2");
9139 char cosDef[20] = "__cosf";
9140 char sinDef[20] = "__sinf";
9141 if (!strcmp(floatType, "double")) sprintf(LFending, "l");
9142#elif(VKFFT_BACKEND==3)
9143 if (!strcmp(floatType, "float")) sprintf(vecType, "float2");
9144 if (!strcmp(floatType, "double")) sprintf(vecType, "double2");
9145 char cosDef[20] = "native_cos";
9146 char sinDef[20] = "native_sin";
9147 //if (!strcmp(floatType, "double")) sprintf(LFending, "l");
9148#endif
9149
// Register count associated with the radix of the last stage; used to map loop index i to a slot in regIDs.
9150 uint64_t logicalRegistersPerThread = sc->registers_per_thread_per_radix[sc->stageRadix[sc->numStages - 1]];// (sc->registers_per_thread % sc->stageRadix[sc->numStages - 1] == 0) ? sc->registers_per_thread : sc->min_registers_per_thread;
9151 switch (reorderType) {
9152 case 1: {//grouped_c2c
// Optional text added to the global x invocation index when a work-group shift is enabled.
9153 char shiftX[500] = "";
9154 if (sc->performWorkGroupShift[0])
9155 sprintf(shiftX, " + consts.workGroupShiftX * %s ", sc->gl_WorkGroupSize_x);
// Emitted when stageStartSize > 1 and the (no-reorder, inverse) combination does NOT hold.
9156 if ((sc->stageStartSize > 1) && (!((sc->stageStartSize > 1) && (!sc->reorderFourStep) && (sc->inverse)))) {
// A barrier is emitted first when the data is not held in registers (the sdata path below is used in that case).
9157 if (!sc->writeFromRegisters) {
9158 res = appendBarrierVkFFT(sc, 1);
9159 if (res != VKFFT_SUCCESS) return res;
9160 }
9161 /*if (sc->localSize[1] * sc->stageRadix[sc->numStages - 1] * (sc->registers_per_thread_per_radix[sc->stageRadix[sc->numStages - 1]] / sc->stageRadix[sc->numStages - 1]) > sc->fftDim) {
9162 res = appendBarrierVkFFT(sc, 1);
9163 if (res != VKFFT_SUCCESS) return res;
9164 sc->writeFromRegisters = 0;
9165 }
9166 else
9167 sc->writeFromRegisters = 1;*/
9168 res = appendZeropadStart(sc);
9169 if (res != VKFFT_SUCCESS) return res;
// NOTE(review): the statement this check belongs to (listing line 9170) is absent from this listing - confirm against the original header.
9171 if (res != VKFFT_SUCCESS) return res;
// One iteration per element handled by a thread along local y; the generated code is fully unrolled.
9172 for (uint64_t i = 0; i < sc->fftDim / sc->localSize[1]; i++) {
9173 uint64_t id = (i / logicalRegistersPerThread) * sc->registers_per_thread + i % logicalRegistersPerThread;
9174 if (sc->LUT) {
// LUT path: read mult from twiddleLUT at an offset of maxStageSumLUT; the y component is negated when !sc->inverse.
9175 sc->tempLen = sprintf(sc->tempStr, " mult = twiddleLUT[%" PRIu64 "+(((%s%s)/%" PRIu64 ") %% (%" PRIu64 "))+%" PRIu64 "*(%s+%" PRIu64 ")];\n", sc->maxStageSumLUT, sc->gl_GlobalInvocationID_x, shiftX, sc->fft_dim_x, sc->stageStartSize, sc->stageStartSize, sc->gl_LocalInvocationID_y, i * sc->localSize[1]);
9176 res = VkAppendLine(sc);
9177 if (res != VKFFT_SUCCESS) return res;
9178 if (!sc->inverse) {
9179 sc->tempLen = sprintf(sc->tempStr, " mult.y = -mult.y;\n");
9180 res = VkAppendLine(sc);
9181 if (res != VKFFT_SUCCESS) return res;
9182 }
9183 }
9184 else {
// Non-LUT path: emit the angle expression, then mult = (cos, sin) for inverse or (cos, -sin) otherwise.
9185 sc->tempLen = sprintf(sc->tempStr, " angle = 2 * loc_PI * ((((%s%s) / %" PRIu64 ") %% (%" PRIu64 ")) * (%s + %" PRIu64 ")) / %f%s;\n", sc->gl_GlobalInvocationID_x, shiftX, sc->fft_dim_x, sc->stageStartSize, sc->gl_LocalInvocationID_y, i * sc->localSize[1], (double)(sc->stageStartSize * sc->fftDim), LFending);
9186 res = VkAppendLine(sc);
9187 if (res != VKFFT_SUCCESS) return res;
9188 if (sc->inverse) {
9189 if (!strcmp(floatType, "float")) {
9190 sc->tempLen = sprintf(sc->tempStr, " mult.x = %s(angle);\n", cosDef);
9191 res = VkAppendLine(sc);
9192 if (res != VKFFT_SUCCESS) return res;
9193 sc->tempLen = sprintf(sc->tempStr, " mult.y = %s(angle);\n", sinDef);
9194 res = VkAppendLine(sc);
9195 if (res != VKFFT_SUCCESS) return res;
9196 //sc->tempLen = sprintf(sc->tempStr, " mult = %s(cos(angle), sin(angle));\n", vecType);
9197 }
9198 if (!strcmp(floatType, "double")) {
9199 sc->tempLen = sprintf(sc->tempStr, " mult = sincos_20(angle);\n");
9200 res = VkAppendLine(sc);
9201 if (res != VKFFT_SUCCESS) return res;
9202 }
9203 }
9204 else {
// Forward direction: the sine term is negated (float) or the angle is negated before sincos_20 (double).
9205 if (!strcmp(floatType, "float")) {
9206 sc->tempLen = sprintf(sc->tempStr, " mult.x = %s(angle);\n", cosDef);
9207 res = VkAppendLine(sc);
9208 if (res != VKFFT_SUCCESS) return res;
9209 sc->tempLen = sprintf(sc->tempStr, " mult.y = -%s(angle);\n", sinDef);
9210 res = VkAppendLine(sc);
9211 if (res != VKFFT_SUCCESS) return res;
9212 //sc->tempLen = sprintf(sc->tempStr, " mult = %s(cos(angle), sin(angle));\n", vecType);
9213 }
9214 if (!strcmp(floatType, "double")) {
9215 sc->tempLen = sprintf(sc->tempStr, " mult = sincos_20(-angle);\n");
9216 res = VkAppendLine(sc);
9217 if (res != VKFFT_SUCCESS) return res;
9218 }
9219 }
9220 }
9221 if (sc->writeFromRegisters) {
// Complex multiply of register regIDs[id] by mult; w.x holds the new real part until the imaginary part has been updated.
9222 sc->tempLen = sprintf(sc->tempStr, "\
9223 w.x = %s.x * mult.x - %s.y * mult.y;\n", sc->regIDs[id], sc->regIDs[id]);
9224 res = VkAppendLine(sc);
9225 if (res != VKFFT_SUCCESS) return res;
9226 sc->tempLen = sprintf(sc->tempStr, "\
9227 %s.y = %s.y * mult.x + %s.x * mult.y;\n", sc->regIDs[id], sc->regIDs[id], sc->regIDs[id]);
9228 res = VkAppendLine(sc);
9229 if (res != VKFFT_SUCCESS) return res;
9230 sc->tempLen = sprintf(sc->tempStr, "\
9231 %s.x = w.x;\n", sc->regIDs[id]);
9232 res = VkAppendLine(sc);
9233 if (res != VKFFT_SUCCESS) return res;
9234 }
9235 else {
// Same complex multiply, performed in place on sdata[inoutID]; inoutID is computed from sharedStride and the local invocation IDs.
9236 sc->tempLen = sprintf(sc->tempStr, "\
9237 %s = %s*(%" PRIu64 "+%s) + %s;\n", sc->inoutID, sc->sharedStride, i * sc->localSize[1], sc->gl_LocalInvocationID_y, sc->gl_LocalInvocationID_x);
9238 res = VkAppendLine(sc);
9239 if (res != VKFFT_SUCCESS) return res;
9240
9241 sc->tempLen = sprintf(sc->tempStr, "\
9242 w.x = sdata[%s].x * mult.x - sdata[%s].y * mult.y;\n", sc->inoutID, sc->inoutID);
9243 res = VkAppendLine(sc);
9244 if (res != VKFFT_SUCCESS) return res;
9245
9246 sc->tempLen = sprintf(sc->tempStr, "\
9247 sdata[%s].y = sdata[%s].y * mult.x + sdata[%s].x * mult.y;\n", sc->inoutID, sc->inoutID, sc->inoutID);
9248 res = VkAppendLine(sc);
9249 if (res != VKFFT_SUCCESS) return res;
9250 sc->tempLen = sprintf(sc->tempStr, "\
9251 sdata[%s].x = w.x;\n", sc->inoutID);
9252 res = VkAppendLine(sc);
9253 if (res != VKFFT_SUCCESS) return res;
9254 }
9255 }
// NOTE(review): the statement this check belongs to (listing line 9256) is absent from this listing - confirm against the original header.
9257 if (res != VKFFT_SUCCESS) return res;
9258 res = appendZeropadEnd(sc);
9259 if (res != VKFFT_SUCCESS) return res;
9260 }
9261 break;
9262 }
9263 case 2: {//single_c2c_strided
9264 char shiftX[500] = "";
9265 if (sc->performWorkGroupShift[0])
9266 sprintf(shiftX, " + consts.workGroupShiftX * %s ", sc->gl_WorkGroupSize_x);
// Emitted whenever the (no-reorder, inverse) combination does NOT hold; stageStartSize is not tested here.
9267 if (!((!sc->reorderFourStep) && (sc->inverse))) {
9268 if (!sc->writeFromRegisters) {
9269 res = appendBarrierVkFFT(sc, 1);
9270 if (res != VKFFT_SUCCESS) return res;
9271 }
9272 /*if (sc->localSize[1] * sc->stageRadix[sc->numStages - 1] * (sc->registers_per_thread_per_radix[sc->stageRadix[sc->numStages - 1]] / sc->stageRadix[sc->numStages - 1]) > sc->fftDim) {
9273 res = appendBarrierVkFFT(sc, 1);
9274 if (res != VKFFT_SUCCESS) return res;
9275 sc->writeFromRegisters = 0;
9276 }
9277 else
9278 sc->writeFromRegisters = 1;*/
9279 res = appendZeropadStart(sc);
9280 if (res != VKFFT_SUCCESS) return res;
// NOTE(review): the statement this check belongs to (listing line 9281) is absent from this listing - confirm against the original header.
9282 if (res != VKFFT_SUCCESS) return res;
9283 for (uint64_t i = 0; i < sc->fftDim / sc->localSize[1]; i++) {
9284 uint64_t id = (i / logicalRegistersPerThread) * sc->registers_per_thread + i % logicalRegistersPerThread;
9285 if (sc->LUT) {
// Unlike case 1, the LUT index and the angle below use the global x index modulo stageStartSize without dividing by fft_dim_x.
9286 sc->tempLen = sprintf(sc->tempStr, " mult = twiddleLUT[%" PRIu64 " + ((%s%s) %% (%" PRIu64 ")) + (%s + %" PRIu64 ") * %" PRIu64 "];\n", sc->maxStageSumLUT, sc->gl_GlobalInvocationID_x, shiftX, sc->stageStartSize, sc->gl_LocalInvocationID_y, i * sc->localSize[1], sc->stageStartSize);
9287 res = VkAppendLine(sc);
9288 if (res != VKFFT_SUCCESS) return res;
9289 if (!sc->inverse) {
9290 sc->tempLen = sprintf(sc->tempStr, " mult.y = -mult.y;\n");
9291 res = VkAppendLine(sc);
9292 if (res != VKFFT_SUCCESS) return res;
9293 }
9294 }
9295 else {
9296 sc->tempLen = sprintf(sc->tempStr, " angle = 2 * loc_PI * ((((%s%s) %% (%" PRIu64 ")) * (%s + %" PRIu64 ")) / %f%s);\n", sc->gl_GlobalInvocationID_x, shiftX, sc->stageStartSize, sc->gl_LocalInvocationID_y, i * sc->localSize[1], (double)(sc->stageStartSize * sc->fftDim), LFending);
9297 res = VkAppendLine(sc);
9298 if (res != VKFFT_SUCCESS) return res;
9299 if (sc->inverse) {
9300 if (!strcmp(floatType, "float")) {
9301 sc->tempLen = sprintf(sc->tempStr, " mult.x = %s(angle);\n", cosDef);
9302 res = VkAppendLine(sc);
9303 if (res != VKFFT_SUCCESS) return res;
9304 sc->tempLen = sprintf(sc->tempStr, " mult.y = %s(angle);\n", sinDef);
9305 res = VkAppendLine(sc);
9306 if (res != VKFFT_SUCCESS) return res;
9307 //sc->tempLen = sprintf(sc->tempStr, " mult = %s(cos(angle), sin(angle));\n", vecType);
9308 }
9309 if (!strcmp(floatType, "double")) {
9310 sc->tempLen = sprintf(sc->tempStr, " mult = sincos_20(angle);\n");
9311 res = VkAppendLine(sc);
9312 if (res != VKFFT_SUCCESS) return res;
9313 }
9314 }
9315 else {
// Forward direction: the sine term is negated (float) or the angle is negated before sincos_20 (double).
9316 if (!strcmp(floatType, "float")) {
9317 sc->tempLen = sprintf(sc->tempStr, " mult.x = %s(angle);\n", cosDef);
9318 res = VkAppendLine(sc);
9319 if (res != VKFFT_SUCCESS) return res;
9320 sc->tempLen = sprintf(sc->tempStr, " mult.y = -%s(angle);\n", sinDef);
9321 res = VkAppendLine(sc);
9322 if (res != VKFFT_SUCCESS) return res;
9323 //sc->tempLen = sprintf(sc->tempStr, " mult = %s(cos(angle), sin(angle));\n", vecType);
9324 }
9325 if (!strcmp(floatType, "double")) {
9326 sc->tempLen = sprintf(sc->tempStr, " mult = sincos_20(-angle);\n");
9327 res = VkAppendLine(sc);
9328 if (res != VKFFT_SUCCESS) return res;
9329 }
9330 }
9331 }
9332 if (sc->writeFromRegisters) {
// Register path: regIDs[id] *= mult (complex), using w.x as a temporary.
9333 sc->tempLen = sprintf(sc->tempStr, "\
9334 w.x = %s.x * mult.x - %s.y * mult.y;\n", sc->regIDs[id], sc->regIDs[id]);
9335 res = VkAppendLine(sc);
9336 if (res != VKFFT_SUCCESS) return res;
9337 sc->tempLen = sprintf(sc->tempStr, "\
9338 %s.y = %s.y * mult.x + %s.x * mult.y;\n", sc->regIDs[id], sc->regIDs[id], sc->regIDs[id]);
9339 res = VkAppendLine(sc);
9340 if (res != VKFFT_SUCCESS) return res;
9341 sc->tempLen = sprintf(sc->tempStr, "\
9342 %s.x = w.x;\n", sc->regIDs[id]);
9343 res = VkAppendLine(sc);
9344 if (res != VKFFT_SUCCESS) return res;
9345 }
9346 else {
// sdata path: sdata[inoutID] *= mult (complex), using w.x as a temporary.
9347 sc->tempLen = sprintf(sc->tempStr, "\
9348 %s = %s*(%" PRIu64 "+%s) + %s;\n", sc->inoutID, sc->sharedStride, i * sc->localSize[1], sc->gl_LocalInvocationID_y, sc->gl_LocalInvocationID_x);
9349 res = VkAppendLine(sc);
9350 if (res != VKFFT_SUCCESS) return res;
9351
9352 sc->tempLen = sprintf(sc->tempStr, "\
9353 w.x = sdata[%s].x * mult.x - sdata[%s].y * mult.y;\n", sc->inoutID, sc->inoutID);
9354 res = VkAppendLine(sc);
9355 if (res != VKFFT_SUCCESS) return res;
9356
9357 sc->tempLen = sprintf(sc->tempStr, "\
9358 sdata[%s].y = sdata[%s].y * mult.x + sdata[%s].x * mult.y;\n", sc->inoutID, sc->inoutID, sc->inoutID);
9359 res = VkAppendLine(sc);
9360 if (res != VKFFT_SUCCESS) return res;
9361 sc->tempLen = sprintf(sc->tempStr, "\
9362 sdata[%s].x = w.x;\n", sc->inoutID);
9363 res = VkAppendLine(sc);
9364 if (res != VKFFT_SUCCESS) return res;
9365 }
9366 }
// NOTE(review): the statement this check belongs to (listing line 9367) is absent from this listing - confirm against the original header.
9368 if (res != VKFFT_SUCCESS) return res;
9369 res = appendZeropadEnd(sc);
9370 if (res != VKFFT_SUCCESS) return res;
9371 }
9372 //appendBarrierVkFFT(sc, 1);
9373 break;
9374 }
9375 }
9376 return res;
9377}
9378
// Appends generated kernel source that multiplies each per-thread element by an entry `w`
// read from the array named "BluesteinMultiplication".
//   sc                         - generator state: scratch string, identifier names, sizes and flags.
//   floatType                  - compared against "float" / "double"; only feeds LFending/vecType,
//                                neither of which is referenced by the emitted code in the visible body.
//   uintType                   - not referenced in the visible body.
//   strideType                 - selects how the index into the multiplication array and the
//                                sdata address are formed (see the switch and the lists below).
//   pre_or_post_multiplication - 0 is paired with sc->readToRegisters, zeropadBluestein[0] and the
//                                "left_read" bound; 1 is paired with sc->writeFromRegisters,
//                                zeropadBluestein[1] and the "left_write" bound.
// Returns `res`; every failed append returns its error code immediately.
9379 static inline VkFFTResult appendBluesteinMultiplication(VkFFTSpecializationConstantsLayout* sc, const char* floatType, const char* uintType, uint64_t strideType, uint64_t pre_or_post_multiplication) {
// NOTE(review): `res` is used below but its declaration is not visible in this listing (the line numbered 9380 is absent) - confirm against the original header.
9381 char vecType[30];
9382 char LFending[4] = "";
9383 if (!strcmp(floatType, "float")) sprintf(LFending, "f");
// Per-backend type/intrinsic names; cosDef and sinDef are declared but not referenced in the visible body of this function.
9384#if(VKFFT_BACKEND==0)
9385 if (!strcmp(floatType, "float")) sprintf(vecType, "vec2");
9386 if (!strcmp(floatType, "double")) sprintf(vecType, "dvec2");
9387 char cosDef[20] = "cos";
9388 char sinDef[20] = "sin";
9389 if (!strcmp(floatType, "double")) sprintf(LFending, "LF");
9390#elif(VKFFT_BACKEND==1)
9391 if (!strcmp(floatType, "float")) sprintf(vecType, "float2");
9392 if (!strcmp(floatType, "double")) sprintf(vecType, "double2");
9393 char cosDef[20] = "__cosf";
9394 char sinDef[20] = "__sinf";
9395 if (!strcmp(floatType, "double")) sprintf(LFending, "l");
9396#elif(VKFFT_BACKEND==2)
9397 if (!strcmp(floatType, "float")) sprintf(vecType, "float2");
9398 if (!strcmp(floatType, "double")) sprintf(vecType, "double2");
9399 char cosDef[20] = "__cosf";
9400 char sinDef[20] = "__sinf";
9401 if (!strcmp(floatType, "double")) sprintf(LFending, "l");
9402#elif(VKFFT_BACKEND==3)
9403 if (!strcmp(floatType, "float")) sprintf(vecType, "float2");
9404 if (!strcmp(floatType, "double")) sprintf(vecType, "double2");
9405 char cosDef[20] = "native_cos";
9406 char sinDef[20] = "native_sin";
9407 //if (!strcmp(floatType, "double")) sprintf(LFending, "l");
9408#endif
// Optional text added to the global x invocation index when a work-group shift is enabled.
9409 char shiftX[500] = "";
9410 if (sc->performWorkGroupShift[0])
9411 sprintf(shiftX, " + consts.workGroupShiftX * %s ", sc->gl_WorkGroupSize_x);
// requestCoordinate and requestBatch are only referenced from a commented-out line below;
// index_y and separateRegisterStore are not referenced in the visible body.
9412 char requestCoordinate[100] = "";
9413
9414 char index_x[2000] = "";
9415 char index_y[2000] = "";
9416 char requestBatch[100] = "";
9417 char separateRegisterStore[100] = "";
9418 char kernelName[100] = "";
9419 sprintf(kernelName, "BluesteinMultiplication");
// A barrier is emitted unless the data for this phase is held in registers
// (readToRegisters for phase 0, writeFromRegisters for phase 1).
9420 if (!((sc->readToRegisters && (pre_or_post_multiplication == 0)) || (sc->writeFromRegisters && (pre_or_post_multiplication == 1)))) {
9421 res = appendBarrierVkFFT(sc, 1);
9422 if (res != VKFFT_SUCCESS) return res;
9423 }
// NOTE(review): the statement this check belongs to (listing line 9424) is absent from this listing - confirm against the original header.
9425 if (res != VKFFT_SUCCESS) return res;
// One unrolled multiplication per register slot i.
9426 for (uint64_t i = 0; i < sc->min_registers_per_thread; i++) {
// Step 1: emit the assignment of inoutID, the index used to read the multiplication array.
9427 switch (strideType) {
9428 case 0: case 2: case 5: case 6: case 110: case 120: case 130: case 140: case 142: case 144:
9429 {
9430 if (sc->fftDim == sc->fft_dim_full) {
// Whole transform fits in one pass: index is the local x invocation ID plus i * localSize[0].
9431 sc->tempLen = sprintf(sc->tempStr, " %s = %s + %" PRIu64 ";\n", sc->inoutID, sc->gl_LocalInvocationID_x, i * sc->localSize[0]);
9432
9433 res = VkAppendLine(sc);
9434 if (res != VKFFT_SUCCESS) return res;
9435 }
9436 else {
// Multi-pass case: index is assembled from the global x ID (plus shift), stageStartSize and the local y ID.
9437 sprintf(index_x, " (%s%s) %% (%" PRIu64 ") + %" PRIu64 " * (%s + %" PRIu64 ") + ((%s%s) / %" PRIu64 ") * (%" PRIu64 ")", sc->gl_GlobalInvocationID_x, shiftX, sc->stageStartSize, sc->stageStartSize, sc->gl_LocalInvocationID_y, (i)*sc->localSize[1], sc->gl_GlobalInvocationID_x, shiftX, sc->stageStartSize, sc->stageStartSize * sc->fftDim);
9438 sc->tempLen = sprintf(sc->tempStr, " %s = %s;\n", sc->inoutID, index_x);
9439 res = VkAppendLine(sc);
9440 if (res != VKFFT_SUCCESS) return res;
9441 //sc->tempLen = sprintf(sc->tempStr, " inoutID = indexInput(%s+%" PRIu64 "+%s * %" PRIu64 " + (((%s%s) %% %" PRIu64 ") * %" PRIu64 " + ((%s%s) / %" PRIu64 ") * %" PRIu64 ")%s%s);\n", sc->gl_LocalInvocationID_x, i * sc->localSize[0], sc->gl_LocalInvocationID_y, sc->firstStageStartSize, sc->gl_WorkGroupID_x, shiftX, sc->firstStageStartSize / sc->fftDim, sc->fftDim, sc->gl_WorkGroupID_x, shiftX, sc->firstStageStartSize / sc->fftDim, sc->localSize[1] * sc->firstStageStartSize, requestCoordinate, requestBatch);
9442 }
9443 break;
9444 }
9445 case 1: case 111: case 121: case 131: case 141: case 143: case 145:
9446 {
9447 if (sc->fftDim == sc->fft_dim_full) {
// Whole transform fits in one pass: index is the local y invocation ID plus i * localSize[1].
9448 sc->tempLen = sprintf(sc->tempStr, " %s = %s + %" PRIu64 ";\n", sc->inoutID, sc->gl_LocalInvocationID_y, i * sc->localSize[1]);
9449 res = VkAppendLine(sc);
9450 if (res != VKFFT_SUCCESS) return res;
9451 }
9452 else {
9453 sc->tempLen = sprintf(sc->tempStr, " %s = (%" PRIu64 " * (%s + %" PRIu64 ") + ((%s%s) / %" PRIu64 ") %% (%" PRIu64 ")+((%s%s) / %" PRIu64 ") * (%" PRIu64 "));\n", sc->inoutID, sc->stageStartSize, sc->gl_LocalInvocationID_y, (i)*sc->localSize[1], sc->gl_GlobalInvocationID_x, shiftX, sc->fft_dim_x, sc->stageStartSize, sc->gl_GlobalInvocationID_x, shiftX, sc->fft_dim_x * sc->stageStartSize, sc->fftDim * sc->stageStartSize);
9454 res = VkAppendLine(sc);
9455 if (res != VKFFT_SUCCESS) return res;
9456 }
9457 break;
9458 }
9459 }
// Step 2: when zero padding is enabled for this phase, open a guard so the multiplication only runs
// for indices whose remainder modulo fft_dim_full is below the configured bound.
9460 if ((sc->zeropadBluestein[0]) && (pre_or_post_multiplication == 0)) {
9461 sc->tempLen = sprintf(sc->tempStr, " if((%s %% %" PRIu64 ") < %" PRIu64 "){\n", sc->inoutID, sc->fft_dim_full, sc->fft_zeropad_Bluestein_left_read[sc->axis_id]);
9462 res = VkAppendLine(sc);
9463 if (res != VKFFT_SUCCESS) return res;
9464 }
9465 if ((sc->zeropadBluestein[1]) && (pre_or_post_multiplication == 1)) {
9466 sc->tempLen = sprintf(sc->tempStr, " if((%s %% %" PRIu64 ") < %" PRIu64 "){\n", sc->inoutID, sc->fft_dim_full, sc->fft_zeropad_Bluestein_left_write[sc->axis_id]);
9467 res = VkAppendLine(sc);
9468 if (res != VKFFT_SUCCESS) return res;
9469 }
// Step 3: load the multiplier into w.
9470 sc->tempLen = sprintf(sc->tempStr, " w = %s[%s];\n", kernelName, sc->inoutID);
9471 res = VkAppendLine(sc);
9472 if (res != VKFFT_SUCCESS) return res;
// k is declared but not referenced in the visible body.
9473 uint64_t k = 0;
// Step 4: when the data is not already in registers for this phase, load regIDs[i] from sdata.
9474 if (!((sc->readToRegisters && (pre_or_post_multiplication == 0)) || (sc->writeFromRegisters && (pre_or_post_multiplication == 1)))) {
// NOTE(review): strideType 2 shares the index computation with type 0 in the switch above but is not in this list,
// so it takes the else-branch addressing here and in the store below - confirm this is intended.
9475 if ((strideType == 0) || (strideType == 5) || (strideType == 6) || (strideType == 110) || (strideType == 120) || (strideType == 130) || (strideType == 140) || (strideType == 142) || (strideType == 144)) {
9476 sc->tempLen = sprintf(sc->tempStr, "\
9477 %s = sdata[sharedStride * %s + %s + %" PRIu64 " * %s];\n", sc->regIDs[i], sc->gl_LocalInvocationID_y, sc->gl_LocalInvocationID_x, i, sc->gl_WorkGroupSize_x);
9478 res = VkAppendLine(sc);
9479 if (res != VKFFT_SUCCESS) return res;
9480 }
9481 else {
9482 sc->tempLen = sprintf(sc->tempStr, "\
9483 %s = sdata[%s + (%s + %" PRIu64 " * %s)*sharedStride];\n", sc->regIDs[i], sc->gl_LocalInvocationID_x, sc->gl_LocalInvocationID_y, i, sc->gl_WorkGroupSize_y);
9484 res = VkAppendLine(sc);
9485 if (res != VKFFT_SUCCESS) return res;
9486 }
9487 }
9488
// Step 5: regIDs[i] = regIDs[i] * w, via VkMulComplex when sc->inverseBluestein is set and VkMulComplexConj otherwise.
9489 if (sc->inverseBluestein)
9490 res = VkMulComplex(sc, sc->regIDs[i], sc->regIDs[i], "w", sc->temp);
9491 else
9492 res = VkMulComplexConj(sc, sc->regIDs[i], sc->regIDs[i], "w", sc->temp);
9493 if (res != VKFFT_SUCCESS) return res;
9494
// Step 6: mirror of step 4 - store the product back to the same sdata location.
9495 if (!((sc->readToRegisters && (pre_or_post_multiplication == 0)) || (sc->writeFromRegisters && (pre_or_post_multiplication == 1)))) {
9496 if ((strideType == 0) || (strideType == 5) || (strideType == 6) || (strideType == 110) || (strideType == 120) || (strideType == 130) || (strideType == 140) || (strideType == 142) || (strideType == 144)) {
9497 sc->tempLen = sprintf(sc->tempStr, "\
9498 sdata[sharedStride * %s + %s + %" PRIu64 " * %s] = %s;\n", sc->gl_LocalInvocationID_y, sc->gl_LocalInvocationID_x, i, sc->gl_WorkGroupSize_x, sc->regIDs[i]);
9499 res = VkAppendLine(sc);
9500 if (res != VKFFT_SUCCESS) return res;
9501 }
9502 else {
9503 sc->tempLen = sprintf(sc->tempStr, "\
9504 sdata[%s + (%s + %" PRIu64 " * %s)*sharedStride] = %s;\n", sc->gl_LocalInvocationID_x, sc->gl_LocalInvocationID_y, i, sc->gl_WorkGroupSize_y, sc->regIDs[i]);
9505 res = VkAppendLine(sc);
9506 if (res != VKFFT_SUCCESS) return res;
9507 }
9508 }
// Step 7: close the zero-padding guard opened in step 2 (same conditions).
9509 if ((sc->zeropadBluestein[0]) && (pre_or_post_multiplication == 0)) {
9510 sc->tempLen = sprintf(sc->tempStr, " }\n");
9511 res = VkAppendLine(sc);
9512 if (res != VKFFT_SUCCESS) return res;
9513 }
9514 if ((sc->zeropadBluestein[1]) && (pre_or_post_multiplication == 1)) {
9515 sc->tempLen = sprintf(sc->tempStr, " }\n");
9516 res = VkAppendLine(sc);
9517 if (res != VKFFT_SUCCESS) return res;
9518 }
9519 }
// NOTE(review): the statement this check belongs to (listing line 9520) is absent from this listing - confirm against the original header.
9521 if (res != VKFFT_SUCCESS) return res;
9522 return res;
9523}
9524
9525static inline VkFFTResult appendRadixStageNonStrided(VkFFTSpecializationConstantsLayout* sc, const char* floatType, const char* uintType, uint64_t stageSize, uint64_t stageSizeSum, double stageAngle, uint64_t stageRadix) {
9527 char vecType[30];
9528 char LFending[4] = "";
9529 if (!strcmp(floatType, "float")) sprintf(LFending, "f");
9530#if(VKFFT_BACKEND==0)
9531 if (!strcmp(floatType, "float")) sprintf(vecType, "vec2");
9532 if (!strcmp(floatType, "double")) sprintf(vecType, "dvec2");
9533 if (!strcmp(floatType, "double")) sprintf(LFending, "LF");
9534#elif(VKFFT_BACKEND==1)
9535 if (!strcmp(floatType, "float")) sprintf(vecType, "float2");
9536 if (!strcmp(floatType, "double")) sprintf(vecType, "double2");
9537 if (!strcmp(floatType, "double")) sprintf(LFending, "l");
9538#elif(VKFFT_BACKEND==2)
9539 if (!strcmp(floatType, "float")) sprintf(vecType, "float2");
9540 if (!strcmp(floatType, "double")) sprintf(vecType, "double2");
9541 if (!strcmp(floatType, "double")) sprintf(LFending, "l");
9542#elif(VKFFT_BACKEND==3)
9543 if (!strcmp(floatType, "float")) sprintf(vecType, "float2");
9544 if (!strcmp(floatType, "double")) sprintf(vecType, "double2");
9545 //if (!strcmp(floatType, "double")) sprintf(LFending, "l");
9546#endif
9547
9548 char convolutionInverse[10] = "";
9549 if (sc->convolutionStep) {
9550 if (stageAngle < 0)
9551 sprintf(convolutionInverse, ", 0");
9552 else
9553 sprintf(convolutionInverse, ", 1");
9554 }
9555 uint64_t logicalStoragePerThread = sc->registers_per_thread_per_radix[stageRadix] * sc->registerBoost;// (sc->registers_per_thread % stageRadix == 0) ? sc->registers_per_thread * sc->registerBoost : sc->min_registers_per_thread * sc->registerBoost;
9556 uint64_t logicalRegistersPerThread = sc->registers_per_thread_per_radix[stageRadix];// (sc->registers_per_thread % stageRadix == 0) ? sc->registers_per_thread : sc->min_registers_per_thread;
9557 uint64_t logicalGroupSize = sc->fftDim / logicalStoragePerThread;
9558 if ((!((sc->readToRegisters == 1) && (stageSize == 1) && (!(((sc->convolutionStep) || (sc->useBluesteinFFT && sc->BluesteinConvolutionStep)) && (stageAngle > 0) && ((sc->matrixConvolution > 1) || (sc->numKernels > 1)))))) && ((sc->localSize[0] * logicalStoragePerThread > sc->fftDim) || (stageSize > 1) || ((sc->localSize[1] > 1) && (!(sc->performR2C && (sc->actualInverse)))) || ((sc->convolutionStep) && ((sc->matrixConvolution > 1) || (sc->numKernels > 1)) && (stageAngle > 0)) || (sc->performDCT)))
9559 {
9560 res = appendBarrierVkFFT(sc, 1);
9561 if (res != VKFFT_SUCCESS) return res;
9562 }
9563 res = appendZeropadStart(sc);
9564 if (res != VKFFT_SUCCESS) return res;
9566 if (res != VKFFT_SUCCESS) return res;
9567
9568 if (sc->localSize[0] * logicalStoragePerThread > sc->fftDim) {
9569 sc->tempLen = sprintf(sc->tempStr, "\
9570 if (%s * %" PRIu64 " < %" PRIu64 ") {\n", sc->gl_LocalInvocationID_x, logicalStoragePerThread, sc->fftDim);
9571 res = VkAppendLine(sc);
9572 if (res != VKFFT_SUCCESS) return res;
9573 }
9574 for (uint64_t k = 0; k < sc->registerBoost; k++) {
9575 for (uint64_t j = 0; j < logicalRegistersPerThread / stageRadix; j++) {
9576 sc->tempLen = sprintf(sc->tempStr, "\
9577 %s = (%s+ %" PRIu64 ") %% (%" PRIu64 ");\n", sc->stageInvocationID, sc->gl_LocalInvocationID_x, (j + k * logicalRegistersPerThread / stageRadix) * logicalGroupSize, stageSize);
9578 res = VkAppendLine(sc);
9579 if (res != VKFFT_SUCCESS) return res;
9580 if (sc->LUT)
9581 sc->tempLen = sprintf(sc->tempStr, " LUTId = stageInvocationID + %" PRIu64 ";\n", stageSizeSum);
9582 else
9583 sc->tempLen = sprintf(sc->tempStr, " angle = stageInvocationID * %.17f%s;\n", stageAngle, LFending);
9584 res = VkAppendLine(sc);
9585 if (res != VKFFT_SUCCESS) return res;
9586 if ((!((sc->readToRegisters == 1) && (stageSize==1) && (!(((sc->convolutionStep) || (sc->useBluesteinFFT && sc->BluesteinConvolutionStep)) && (stageAngle > 0) && ((sc->matrixConvolution > 1) || (sc->numKernels > 1)))))) && ((sc->registerBoost == 1) && ((sc->localSize[0] * logicalStoragePerThread > sc->fftDim) || (stageSize > 1) || ((sc->localSize[1] > 1) && (!(sc->performR2C && (sc->actualInverse)))) || ((sc->convolutionStep) && ((sc->matrixConvolution > 1) || (sc->numKernels > 1)) && (stageAngle > 0)) || (sc->performDCT)))) {
9587 //if(sc->readToRegisters==0){
9588 for (uint64_t i = 0; i < stageRadix; i++) {
9589 uint64_t id = j + i * logicalRegistersPerThread / stageRadix;
9590 id = (id / logicalRegistersPerThread) * sc->registers_per_thread + id % logicalRegistersPerThread;
9591
9592 sc->tempLen = sprintf(sc->tempStr, "\
9593 %s = %s + %" PRIu64 ";\n", sc->sdataID, sc->gl_LocalInvocationID_x, j * logicalGroupSize + i * sc->fftDim / stageRadix);
9594 res = VkAppendLine(sc);
9595 if (res != VKFFT_SUCCESS) return res;
9596
9597 if (sc->resolveBankConflictFirstStages == 1) {
9598 sc->tempLen = sprintf(sc->tempStr, "\
9599 %s = (%s / %" PRIu64 ") * %" PRIu64 " + %s %% %" PRIu64 ";", sc->sdataID, sc->sdataID, sc->numSharedBanks / 2, sc->numSharedBanks / 2 + 1, sc->sdataID, sc->numSharedBanks / 2);
9600 res = VkAppendLine(sc);
9601 if (res != VKFFT_SUCCESS) return res;
9602 }
9603
9604 if (sc->localSize[1] > 1) {
9605 sc->tempLen = sprintf(sc->tempStr, "\
9606 %s = %s + sharedStride * %s;\n", sc->sdataID, sc->sdataID, sc->gl_LocalInvocationID_y);
9607 res = VkAppendLine(sc);
9608 if (res != VKFFT_SUCCESS) return res;
9609 }
9610 sc->tempLen = sprintf(sc->tempStr, "\
9611 %s = sdata[%s];\n", sc->regIDs[id], sc->sdataID);
9612 res = VkAppendLine(sc);
9613 if (res != VKFFT_SUCCESS) return res;
9614 }
9615 }
9616 char** regID = (char**)malloc(sizeof(char*) * stageRadix);
9617 if (regID) {
9618 for (uint64_t i = 0; i < stageRadix; i++) {
9619 regID[i] = (char*)malloc(sizeof(char) * 50);
9620 if (!regID[i]) {
9621 for (uint64_t j = 0; j < i; j++) {
9622 free(regID[j]);
9623 regID[j] = 0;
9624 }
9625 free(regID);
9626 regID = 0;
9628 }
9629 uint64_t id = j + k * logicalRegistersPerThread / stageRadix + i * logicalStoragePerThread / stageRadix;
9630 id = (id / logicalRegistersPerThread) * sc->registers_per_thread + id % logicalRegistersPerThread;
9631 sprintf(regID[i], "%s", sc->regIDs[id]);
9632 /*if(j + i * logicalStoragePerThread / stageRadix < logicalRegistersPerThread)
9633 sprintf(regID[i], "%s", sc->regIDs[j + i * logicalStoragePerThread / stageRadix]);
9634 else
9635 sprintf(regID[i], "%" PRIu64 "[%" PRIu64 "]", (j + i * logicalStoragePerThread / stageRadix)/ logicalRegistersPerThread, (j + i * logicalStoragePerThread / stageRadix) % logicalRegistersPerThread);*/
9636
9637 }
9638 res = inlineRadixKernelVkFFT(sc, floatType, uintType, stageRadix, stageSize, stageAngle, regID);
9639 if (res != VKFFT_SUCCESS) return res;
9640 for (uint64_t i = 0; i < stageRadix; i++) {
9641 uint64_t id = j + k * logicalRegistersPerThread / stageRadix + i * logicalStoragePerThread / stageRadix;
9642 id = (id / logicalRegistersPerThread) * sc->registers_per_thread + id % logicalRegistersPerThread;
9643 sprintf(sc->regIDs[id], "%s", regID[i]);
9644 }
9645 for (uint64_t i = 0; i < stageRadix; i++) {
9646 free(regID[i]);
9647 regID[i] = 0;
9648 }
9649 free(regID);
9650 regID = 0;
9651 }
9652 else
9654 }
9655 if ((stageSize == 1) && (sc->cacheShuffle)) {
9656 for (uint64_t i = 0; i < logicalRegistersPerThread; i++) {
9657 uint64_t id = i + k * logicalRegistersPerThread;
9658 id = (id / logicalRegistersPerThread) * sc->registers_per_thread + id % logicalRegistersPerThread;
9659 sc->tempLen = sprintf(sc->tempStr, "\
9660 shuffle[%" PRIu64 "]=%s;\n", i, sc->regIDs[id]);
9661 res = VkAppendLine(sc);
9662 if (res != VKFFT_SUCCESS) return res;
9663 }
9664 for (uint64_t i = 0; i < logicalRegistersPerThread; i++) {
9665 uint64_t id = i + k * logicalRegistersPerThread;
9666 id = (id / logicalRegistersPerThread) * sc->registers_per_thread + id % logicalRegistersPerThread;
9667 sc->tempLen = sprintf(sc->tempStr, "\
9668 %s=shuffle[(%" PRIu64 "+tshuffle)%%(%" PRIu64 ")];\n", sc->regIDs[id], i, logicalRegistersPerThread);
9669 res = VkAppendLine(sc);
9670 if (res != VKFFT_SUCCESS) return res;
9671 }
9672 }
9673 }
9674 if (sc->localSize[0] * logicalStoragePerThread > sc->fftDim) {
9675 sc->tempLen = sprintf(sc->tempStr, " }\n");
9676 res = VkAppendLine(sc);
9677 if (res != VKFFT_SUCCESS) return res;
9678 }
9680 if (res != VKFFT_SUCCESS) return res;
9681 res = appendZeropadEnd(sc);
9682 if (res != VKFFT_SUCCESS) return res;
9683 return res;
9684}
// Generates the text of one radix stage for the "strided" layout: every index
// emitted here is built from sc->gl_LocalInvocationID_y, and shared-memory
// reads have the form sdata[sharedStride*(y+offset)+x].
// Each fragment is formatted into sc->tempStr with sprintf and then passed to
// VkAppendLine(sc) (presumably appending it to the kernel being generated).
//
// sc           - generator state; sizes and identifier strings are read from
//                it, and the names in sc->regIDs[] are rewritten after each
//                call to inlineRadixKernelVkFFT.
// floatType    - compared against "float" / "double" to pick the numeric
//                literal suffix (LFending); also forwarded to the radix kernel.
// uintType     - only forwarded to inlineRadixKernelVkFFT.
// stageSize    - modulus used for stageInvocationID; stageSize > 1 always
//                satisfies the barrier condition below, and stageSize == 1
//                triggers the sharedStride assignment at the end.
// stageSizeSum - offset added to stageInvocationID to form LUTId when sc->LUT
//                is set.
// stageAngle   - printed into "angle = stageInvocationID * ..." when sc->LUT is
//                not set; its sign is also tested in the barrier/reload
//                conditions.
// stageRadix   - indexes sc->registers_per_thread_per_radix and sizes the
//                temporary regID array.
// Returns the first non-VKFFT_SUCCESS result reported by a helper, otherwise
// the value of res at the end.
//
// NOTE(review): the line numbers embedded in this listing skip 9686, 9725,
// 9767, 9793 and 9801, so some statements are not visible here (among them the
// declaration of `res`). Confirm against the original header before editing.
9685static inline VkFFTResult appendRadixStageStrided(VkFFTSpecializationConstantsLayout* sc, const char* floatType, const char* uintType, uint64_t stageSize, uint64_t stageSizeSum, double stageAngle, uint64_t stageRadix) {
// NOTE(review): vecType and convolutionInverse are filled in below but are not
// referenced by any other visible line of this function.
9687 char vecType[30];
// Suffix placed directly after the %.17f angle constant further down.
9688 char LFending[4] = "";
9689 if (!strcmp(floatType, "float")) sprintf(LFending, "f");
// The vector type name and the double-precision suffix depend on the backend
// selected at compile time through VKFFT_BACKEND.
9690#if(VKFFT_BACKEND==0)
9691 if (!strcmp(floatType, "float")) sprintf(vecType, "vec2");
9692 if (!strcmp(floatType, "double")) sprintf(vecType, "dvec2");
9693 if (!strcmp(floatType, "double")) sprintf(LFending, "LF");
9694#elif(VKFFT_BACKEND==1)
9695 if (!strcmp(floatType, "float")) sprintf(vecType, "float2");
9696 if (!strcmp(floatType, "double")) sprintf(vecType, "double2");
9697 if (!strcmp(floatType, "double")) sprintf(LFending, "l");
9698#elif(VKFFT_BACKEND==2)
9699 if (!strcmp(floatType, "float")) sprintf(vecType, "float2");
9700 if (!strcmp(floatType, "double")) sprintf(vecType, "double2");
9701 if (!strcmp(floatType, "double")) sprintf(LFending, "l");
9702#elif(VKFFT_BACKEND==3)
9703 if (!strcmp(floatType, "float")) sprintf(vecType, "float2");
9704 if (!strcmp(floatType, "double")) sprintf(vecType, "double2");
9705 //if (!strcmp(floatType, "double")) sprintf(LFending, "l");
9706#endif
9707
9708 char convolutionInverse[10] = "";
9709 if (sc->convolutionStep) {
9710 if (stageAngle < 0)
9711 sprintf(convolutionInverse, ", 0");
9712 else
9713 sprintf(convolutionInverse, ", 1");
9714 }
// Per-radix sizes: storage per thread is the per-radix register count times
// registerBoost, and the group size is fftDim divided by that storage.
9715 uint64_t logicalStoragePerThread = sc->registers_per_thread_per_radix[stageRadix] * sc->registerBoost;// (sc->registers_per_thread % stageRadix == 0) ? sc->registers_per_thread * sc->registerBoost : sc->min_registers_per_thread * sc->registerBoost;
9716 uint64_t logicalRegistersPerThread = sc->registers_per_thread_per_radix[stageRadix];// (sc->registers_per_thread % stageRadix == 0) ? sc->registers_per_thread : sc->min_registers_per_thread;
9717 uint64_t logicalGroupSize = sc->fftDim / logicalStoragePerThread;
// appendBarrierVkFFT(sc, 1) is skipped when readToRegisters == 1 on the
// stageSize == 1 stage, unless a convolution/Bluestein convolution step with
// stageAngle > 0 and more than one kernel or matrix entry is active. Otherwise
// it is called when any of the listed conditions holds: first upload of axis 0
// (excluding inverse R2C), more y-threads than needed, stageSize > 1,
// multi-kernel convolution with stageAngle > 0, or DCT.
9718 if ((!((sc->readToRegisters == 1) && (stageSize == 1) && (!(((sc->convolutionStep) || (sc->useBluesteinFFT && sc->BluesteinConvolutionStep)) && (stageAngle > 0) && ((sc->matrixConvolution > 1) || (sc->numKernels > 1)))))) && (((sc->axis_id == 0) && (sc->axis_upload_id == 0) && (!(sc->performR2C && (sc->actualInverse)))) || (sc->localSize[1] * logicalStoragePerThread > sc->fftDim) || (stageSize > 1) || ((sc->convolutionStep) && ((sc->matrixConvolution > 1) || (sc->numKernels > 1)) && (stageAngle > 0)) || (sc->performDCT)))
9719 {
9720 res = appendBarrierVkFFT(sc, 1);
9721 if (res != VKFFT_SUCCESS) return res;
9722 }
9723 res = appendZeropadStart(sc);
9724 if (res != VKFFT_SUCCESS) return res;
// NOTE(review): listing line 9725 is not visible; `res` is checked again right
// below, so an assignment from another helper call is presumably missing here.
9726 if (res != VKFFT_SUCCESS) return res;
// When the workgroup has more y-threads than the stage needs, open a guard so
// that only threads with y * logicalStoragePerThread < fftDim run the stage.
// The matching " }" is emitted after the loops.
9727 if (sc->localSize[1] * logicalStoragePerThread > sc->fftDim) {
9728 sc->tempLen = sprintf(sc->tempStr, "\
9729 if (%s * %" PRIu64 " < %" PRIu64 ") {\n", sc->gl_LocalInvocationID_y, logicalStoragePerThread, sc->fftDim);
9730 res = VkAppendLine(sc);
9731 if (res != VKFFT_SUCCESS) return res;
9732 }
// k walks the registerBoost copies, j walks the butterflies handled by one
// thread (logicalRegistersPerThread / stageRadix of them).
9733 for (uint64_t k = 0; k < sc->registerBoost; k++) {
9734 for (uint64_t j = 0; j < logicalRegistersPerThread / stageRadix; j++) {
// stageInvocationID = (y + butterfly offset) % stageSize.
9735 sc->tempLen = sprintf(sc->tempStr, "\
9736 %s = (%s+ %" PRIu64 ") %% (%" PRIu64 ");\n", sc->stageInvocationID, sc->gl_LocalInvocationID_y, (j + k * logicalRegistersPerThread / stageRadix) * logicalGroupSize, stageSize);
9737 res = VkAppendLine(sc);
9738 if (res != VKFFT_SUCCESS) return res;
// Either emit a lookup-table index (offset by stageSizeSum) or an angle
// computed from stageAngle, depending on sc->LUT.
9739 if (sc->LUT)
9740 sc->tempLen = sprintf(sc->tempStr, " LUTId = stageInvocationID + %" PRIu64 ";\n", stageSizeSum);
9741 else
9742 sc->tempLen = sprintf(sc->tempStr, " angle = stageInvocationID * %.17f%s;\n", stageAngle, LFending);
9743 res = VkAppendLine(sc);
9744 if (res != VKFFT_SUCCESS) return res;
// Same condition as the barrier above, additionally requiring
// registerBoost == 1: emit the loads that refill this butterfly's registers
// from sdata.
9745 if ((!((sc->readToRegisters == 1) && (stageSize == 1) && (!(((sc->convolutionStep) || (sc->useBluesteinFFT && sc->BluesteinConvolutionStep)) && (stageAngle > 0) && ((sc->matrixConvolution > 1) || (sc->numKernels > 1)))))) && ((sc->registerBoost == 1) && (((sc->axis_id == 0) && (sc->axis_upload_id == 0) && (!(sc->performR2C && (sc->actualInverse)))) || (sc->localSize[1] * logicalStoragePerThread > sc->fftDim) || (stageSize > 1) || ((sc->convolutionStep) && ((sc->matrixConvolution > 1) || (sc->numKernels > 1)) && (stageAngle > 0)) || (sc->performDCT)))) {
9746 for (uint64_t i = 0; i < stageRadix; i++) {
// Convert the logical register index (rows of logicalRegistersPerThread) into
// an index into sc->regIDs, whose rows are sc->registers_per_thread wide.
9747 uint64_t id = j + i * logicalRegistersPerThread / stageRadix;
9748 id = (id / logicalRegistersPerThread) * sc->registers_per_thread + id % logicalRegistersPerThread;
9749 sc->tempLen = sprintf(sc->tempStr, "\
9750 %s = sdata[%s*(%s+%" PRIu64 ")+%s];\n", sc->regIDs[id], sc->sharedStride, sc->gl_LocalInvocationID_y, j * logicalGroupSize + i * sc->fftDim / stageRadix, sc->gl_LocalInvocationID_x);
9751 res = VkAppendLine(sc);
9752 if (res != VKFFT_SUCCESS) return res;
9753 }
9754 }
9755
// Temporary array of stageRadix name buffers (50 chars each). The register
// names for this butterfly are copied in, handed to inlineRadixKernelVkFFT,
// and copied back into sc->regIDs afterwards.
9756 char** regID = (char**)malloc(sizeof(char*) * stageRadix);
9757 if (regID) {
9758 for (uint64_t i = 0; i < stageRadix; i++) {
9759 regID[i] = (char*)malloc(sizeof(char) * 50);
9760 if (!regID[i]) {
// Allocation failed part-way: release everything allocated so far.
9761 for (uint64_t j = 0; j < i; j++) {
9762 free(regID[j]);
9763 regID[j] = 0;
9764 }
9765 free(regID);
9766 regID = 0;
// NOTE(review): listing line 9767 is not visible; presumably an early return
// follows here, since the code below would otherwise dereference regID == 0.
9768 }
9769 uint64_t id = j + k * logicalRegistersPerThread / stageRadix + i * logicalStoragePerThread / stageRadix;
9770 id = (id / logicalRegistersPerThread) * sc->registers_per_thread + id % logicalRegistersPerThread;
9771 sprintf(regID[i], "%s", sc->regIDs[id]);
9772 /*if (j + i * logicalStoragePerThread / stageRadix < logicalRegistersPerThread)
9773 sprintf(regID[i], "_%" PRIu64 "", j + i * logicalStoragePerThread / stageRadix);
9774 else
9775 sprintf(regID[i], "%" PRIu64 "[%" PRIu64 "]", (j + i * logicalStoragePerThread / stageRadix) / logicalRegistersPerThread, (j + i * logicalStoragePerThread / stageRadix) % logicalRegistersPerThread);*/
9776
9777 }
9778 res = inlineRadixKernelVkFFT(sc, floatType, uintType, stageRadix, stageSize, stageAngle, regID);
// NOTE(review): this early return skips the free() calls below, so regID and
// its elements are not released when inlineRadixKernelVkFFT fails.
9779 if (res != VKFFT_SUCCESS) return res;
// Write the (possibly permuted) names back to the same sc->regIDs slots.
9780 for (uint64_t i = 0; i < stageRadix; i++) {
9781 uint64_t id = j + k * logicalRegistersPerThread / stageRadix + i * logicalStoragePerThread / stageRadix;
9782 id = (id / logicalRegistersPerThread) * sc->registers_per_thread + id % logicalRegistersPerThread;
9783 sprintf(sc->regIDs[id], "%s", regID[i]);
9784 }
9785 for (uint64_t i = 0; i < stageRadix; i++) {
9786 free(regID[i]);
9787 regID[i] = 0;
9788 }
9789 free(regID);
9790 regID = 0;
9791 }
// NOTE(review): the body of this else (listing line 9793) is not visible;
// presumably it reports the failed allocation of regID.
9792 else
9794 }
9795 }
// Close the thread guard opened before the loops (same condition).
9796 if (sc->localSize[1] * logicalStoragePerThread > sc->fftDim) {
9797 sc->tempLen = sprintf(sc->tempStr, " }\n");
9798 res = VkAppendLine(sc);
9799 if (res != VKFFT_SUCCESS) return res;
9800 }
// NOTE(review): listing line 9801 is not visible; as with 9725, a helper call
// assigning `res` is presumably missing before this check.
9802 if (res != VKFFT_SUCCESS) return res;
9803 res = appendZeropadEnd(sc);
9804 if (res != VKFFT_SUCCESS) return res;
// On the stageSize == 1 stage, emit an assignment that sets the sharedStride
// variable to localSize[0].
9805 if (stageSize == 1) {
9806 sc->tempLen = sprintf(sc->tempStr, " %s = %" PRIu64 ";\n", sc->sharedStride, sc->localSize[0]);
9807 res = VkAppendLine(sc);
9808 if (res != VKFFT_SUCCESS) return res;
9809 }
9810 return res;
9811}
// Dispatches one radix stage to the non-strided or strided generator based on
// shuffleType. All other arguments are forwarded unchanged.
//
// shuffleType 0, 5, 6, 110, 120, 130, 140, 142, 144 -> appendRadixStageNonStrided
// shuffleType 1, 2, 111, 121, 131, 141, 143, 145    -> appendRadixStageStrided
// Any other value matches no case (there is no default label), so neither
// generator is called and res is returned as it was.
//
// Returns the callee's result; a non-VKFFT_SUCCESS result is returned
// immediately.
//
// NOTE(review): the embedded listing numbering skips 9813, so the declaration
// of `res` is not visible here; confirm its initial value against the original
// header.
9812static inline VkFFTResult appendRadixStage(VkFFTSpecializationConstantsLayout* sc, const char* floatType, const char* uintType, uint64_t stageSize, uint64_t stageSizeSum, double stageAngle, uint64_t stageRadix, uint64_t shuffleType) {
9814 switch (shuffleType) {
9815 case 0: case 5: case 6: case 110: case 120: case 130: case 140: case 142: case 144: {
9816 res = appendRadixStageNonStrided(sc, floatType, uintType, stageSize, stageSizeSum, stageAngle, stageRadix);
9817 if (res != VKFFT_SUCCESS) return res;
9818 //appendBarrierVkFFT(sc, 1);
9819 break;
9820 }
9821 case 1: case 2: case 111: case 121: case 131: case 141: case 143: case 145: {
9822 res = appendRadixStageStrided(sc, floatType, uintType, stageSize, stageSizeSum, stageAngle, stageRadix);
9823 if (res != VKFFT_SUCCESS) return res;
9824 //appendBarrierVkFFT(sc, 1);
9825 break;
9826 }
9827 }
9828 return res;
9829}
9830
// Placeholder for a register-boost normalization step. In the visible lines
// the whole body is commented out, so the only live statement is `return res;`
// and none of the parameters are used.
// The disabled code would have divided every register by a normalization
// factor built from stageRadixPrev * stageRadix (with extra factors for DCT
// and Bluestein) via VkDivComplexNumber.
//
// NOTE(review): the embedded listing numbering skips 9832, so the declaration
// of `res` is not visible here; presumably it is initialised to VKFFT_SUCCESS,
// but confirm against the original header.
9831static inline VkFFTResult appendRegisterBoostShuffle(VkFFTSpecializationConstantsLayout* sc, const char* floatType, uint64_t stageSize, uint64_t stageRadixPrev, uint64_t stageRadix, double stageAngle) {
9833 /*if (((sc->actualInverse) && (sc->normalize)) || ((sc->convolutionStep || sc->useBluesteinFFT) && (stageAngle > 0))) {
9834 uint64_t bluesteinInverseNormalize = 1;
9835 if ((sc->useBluesteinFFT) && (stageAngle > 0) && (stageSize == 1) && (sc->normalize) && (sc->axis_upload_id == 0)) bluesteinInverseNormalize = sc->bluesteinNormalizeSize;
9836 char stageNormalization[50] = "";
9837 if ((stageSize == 1) && (sc->performDCT) && (sc->actualInverse)) {
9838 if (sc->performDCT == 4)
9839 sprintf(stageNormalization, "%" PRIu64 "", stageRadixPrev * stageRadix * 4 * bluesteinInverseNormalize);
9840 else
9841 sprintf(stageNormalization, "%" PRIu64 "", stageRadixPrev * stageRadix * 2 * bluesteinInverseNormalize);
9842 }
9843 else
9844 sprintf(stageNormalization, "%" PRIu64 "", stageRadixPrev * stageRadix * bluesteinInverseNormalize);
9845 uint64_t logicalRegistersPerThread = sc->registers_per_thread_per_radix[stageRadix];// (sc->registers_per_thread % stageRadix == 0) ? sc->registers_per_thread : sc->min_registers_per_thread;
9846 for (uint64_t k = 0; k < sc->registerBoost; ++k) {
9847 for (uint64_t i = 0; i < logicalRegistersPerThread; i++) {
9848 res = VkDivComplexNumber(sc, sc->regIDs[i + k * sc->registers_per_thread], sc->regIDs[i + k * sc->registers_per_thread], stageNormalization);
9849 if (res != VKFFT_SUCCESS) return res;
9850 }
9851 }
9852 }*/
9853 return res;
9854}
9855
9856static inline VkFFTResult appendRadixShuffleNonStrided(VkFFTSpecializationConstantsLayout* sc, const char* floatType, const char* uintType, uint64_t stageSize, uint64_t stageSizeSum, double stageAngle, uint64_t stageRadix, uint64_t stageRadixNext) {
9858 char vecType[30];
9859 char LFending[4] = "";
9860 if (!strcmp(floatType, "float")) sprintf(LFending, "f");
9861#if(VKFFT_BACKEND==0)
9862 if (!strcmp(floatType, "float")) sprintf(vecType, "vec2");
9863 if (!strcmp(floatType, "double")) sprintf(vecType, "dvec2");
9864 if (!strcmp(floatType, "double")) sprintf(LFending, "LF");
9865#elif(VKFFT_BACKEND==1)
9866 if (!strcmp(floatType, "float")) sprintf(vecType, "float2");
9867 if (!strcmp(floatType, "double")) sprintf(vecType, "double2");
9868 if (!strcmp(floatType, "double")) sprintf(LFending, "l");
9869#elif(VKFFT_BACKEND==2)
9870 if (!strcmp(floatType, "float")) sprintf(vecType, "float2");
9871 if (!strcmp(floatType, "double")) sprintf(vecType, "double2");
9872 if (!strcmp(floatType, "double")) sprintf(LFending, "l");
9873#elif(VKFFT_BACKEND==3)
9874 if (!strcmp(floatType, "float")) sprintf(vecType, "float2");
9875 if (!strcmp(floatType, "double")) sprintf(vecType, "double2");
9876#endif
9877 char stageNormalization[50] = "";
9878 uint64_t normalizationValue = 1;
9879 if ((((sc->actualInverse) && (sc->normalize)) || (sc->convolutionStep && (stageAngle > 0))) && (stageSize == 1) && (sc->axis_upload_id == 0) && (!(sc->useBluesteinFFT && (stageAngle < 0)))) {
9880 if ((sc->performDCT) && (sc->actualInverse)) {
9881 if (sc->performDCT == 1)
9882 normalizationValue = (sc->sourceFFTSize-1) * 2;
9883 else
9884 normalizationValue = sc->sourceFFTSize * 2;
9885 }
9886 else
9887 normalizationValue = sc->sourceFFTSize;
9888 }
9889 if (sc->useBluesteinFFT && (stageAngle > 0) && (stageSize == 1) && (sc->axis_upload_id == 0)) {
9890 normalizationValue *= sc->fft_dim_full;
9891 }
9892 if (normalizationValue != 1) {
9893 sprintf(stageNormalization, "%.17f%s", 1.0 / (double)(normalizationValue), LFending);
9894 }
9895 char tempNum[50] = "";
9896
9897 uint64_t logicalStoragePerThread = sc->registers_per_thread_per_radix[stageRadix] * sc->registerBoost;// (sc->registers_per_thread % stageRadix == 0) ? sc->registers_per_thread * sc->registerBoost : sc->min_registers_per_thread * sc->registerBoost;
9898 uint64_t logicalStoragePerThreadNext = sc->registers_per_thread_per_radix[stageRadixNext] * sc->registerBoost;// (sc->registers_per_thread % stageRadixNext == 0) ? sc->registers_per_thread * sc->registerBoost : sc->min_registers_per_thread * sc->registerBoost;
9899 uint64_t logicalRegistersPerThread = sc->registers_per_thread_per_radix[stageRadix];// (sc->registers_per_thread % stageRadix == 0) ? sc->registers_per_thread : sc->min_registers_per_thread;
9900 uint64_t logicalRegistersPerThreadNext = sc->registers_per_thread_per_radix[stageRadixNext];// (sc->registers_per_thread % stageRadixNext == 0) ? sc->registers_per_thread : sc->min_registers_per_thread;
9901
9902 uint64_t logicalGroupSize = sc->fftDim / logicalStoragePerThread;
9903 uint64_t logicalGroupSizeNext = sc->fftDim / logicalStoragePerThreadNext;
9904 if ((!((sc->writeFromRegisters == 1) && (stageSize == sc->fftDim / stageRadix) && (!(((sc->convolutionStep) || (sc->useBluesteinFFT && sc->BluesteinConvolutionStep))&&(stageAngle<0) && ((sc->matrixConvolution > 1) || (sc->numKernels > 1)))))) && (((sc->registerBoost == 1) && ((sc->localSize[0] * logicalStoragePerThread > sc->fftDim) || (stageSize < sc->fftDim / stageRadix) || ((sc->reorderFourStep) && (sc->fftDim < sc->fft_dim_full) && (sc->localSize[1] > 1)) || (sc->localSize[1] > 1) || ((sc->performR2C) && (!sc->actualInverse) && (sc->axis_id == 0)) || ((sc->convolutionStep) && ((sc->matrixConvolution > 1) || (sc->numKernels > 1)) && (stageAngle < 0)))) || (sc->performDCT)))
9905 {
9906 res = appendBarrierVkFFT(sc, 1);
9907 if (res != VKFFT_SUCCESS) return res;
9908 }
9909 //if ((sc->localSize[0] * logicalStoragePerThread > sc->fftDim) || (stageSize < sc->fftDim / stageRadix) || ((sc->reorderFourStep) && (sc->fftDim < sc->fft_dim_full) && (sc->localSize[1] > 1)) || (sc->localSize[1] > 1) || ((sc->performR2C) && (!sc->actualInverse) && (sc->axis_id == 0)) || ((sc->convolutionStep) && ((sc->matrixConvolution > 1) || (sc->numKernels > 1)) && (stageAngle < 0)) || (sc->registerBoost > 1) || (sc->performDCT)) {
9910 if ((!((sc->writeFromRegisters == 1) && (stageSize == sc->fftDim / stageRadix) && (!(((sc->convolutionStep) || (sc->useBluesteinFFT && sc->BluesteinConvolutionStep)) && (stageAngle < 0) && ((sc->matrixConvolution > 1) || (sc->numKernels > 1)))))) && ((sc->localSize[0] * logicalStoragePerThread > sc->fftDim) || (stageSize < sc->fftDim / stageRadix) || ((sc->reorderFourStep) && (sc->fftDim < sc->fft_dim_full) && (sc->localSize[1] > 1)) || (sc->localSize[1] > 1) || ((sc->performR2C) && (!sc->actualInverse) && (sc->axis_id == 0)) || ((sc->convolutionStep) && ((sc->matrixConvolution > 1) || (sc->numKernels > 1)) && (stageAngle < 0)) || (sc->registerBoost > 1) || (sc->performDCT))) {
9911 if (!((sc->registerBoost > 1) && (stageSize * stageRadix == sc->fftDim / sc->stageRadix[sc->numStages - 1]) && (sc->stageRadix[sc->numStages - 1] == sc->registerBoost))) {
9912 char** tempID;
9913 tempID = (char**)malloc(sizeof(char*) * sc->registers_per_thread * sc->registerBoost);
9914 if (tempID) {
9915 for (uint64_t i = 0; i < sc->registers_per_thread * sc->registerBoost; i++) {
9916 tempID[i] = (char*)malloc(sizeof(char) * 50);
9917 if (!tempID[i]) {
9918 for (uint64_t j = 0; j < i; j++) {
9919 free(tempID[j]);
9920 tempID[j] = 0;
9921 }
9922 free(tempID);
9923 tempID = 0;
9925 }
9926 }
9927 res = appendZeropadStart(sc);
9928 if (res != VKFFT_SUCCESS) return res;
9930 if (res != VKFFT_SUCCESS) return res;
9931 if (sc->localSize[0] * logicalStoragePerThread > sc->fftDim) {
9932 sc->tempLen = sprintf(sc->tempStr, "\
9933 if (%s * %" PRIu64 " < %" PRIu64 ") {\n", sc->gl_LocalInvocationID_x, logicalStoragePerThread, sc->fftDim);
9934 res = VkAppendLine(sc);
9935 if (res != VKFFT_SUCCESS) return res;
9936 }
9937 for (uint64_t k = 0; k < sc->registerBoost; ++k) {
9938 uint64_t t = 0;
9939 if (k > 0) {
9940 res = appendBarrierVkFFT(sc, 2);
9941 if (res != VKFFT_SUCCESS) return res;
9942 res = appendZeropadStart(sc);
9943 if (res != VKFFT_SUCCESS) return res;
9945 if (res != VKFFT_SUCCESS) return res;
9946 if (sc->localSize[0] * logicalStoragePerThread > sc->fftDim) {
9947 sc->tempLen = sprintf(sc->tempStr, "\
9948 if (%s * %" PRIu64 " < %" PRIu64 ") {\n", sc->gl_LocalInvocationID_x, logicalStoragePerThread, sc->fftDim);
9949 res = VkAppendLine(sc);
9950 if (res != VKFFT_SUCCESS) return res;
9951 }
9952 }
9953 for (uint64_t j = 0; j < logicalRegistersPerThread / stageRadix; j++) {
9954 sprintf(tempNum, "%" PRIu64 "", j * logicalGroupSize);
9955 res = VkAddReal(sc, sc->stageInvocationID, sc->gl_LocalInvocationID_x, tempNum);
9956 if (res != VKFFT_SUCCESS) return res;
9957 res = VkMovReal(sc, sc->blockInvocationID, sc->stageInvocationID);
9958 if (res != VKFFT_SUCCESS) return res;
9959 sprintf(tempNum, "%" PRIu64 "", stageSize);
9960 res = VkModReal(sc, sc->stageInvocationID, sc->stageInvocationID, tempNum);
9961 if (res != VKFFT_SUCCESS) return res;
9963 if (res != VKFFT_SUCCESS) return res;
9964 sprintf(tempNum, "%" PRIu64 "", stageRadix);
9965 res = VkMulReal(sc, sc->inoutID, sc->blockInvocationID, tempNum);
9966 if (res != VKFFT_SUCCESS) return res;
9967 res = VkAddReal(sc, sc->inoutID, sc->inoutID, sc->stageInvocationID);
9968 if (res != VKFFT_SUCCESS) return res;
9969 /*sc->tempLen = sprintf(sc->tempStr, "\
9970 stageInvocationID = (gl_LocalInvocationID.x + %" PRIu64 ") %% (%" PRIu64 ");\n\
9971 blockInvocationID = (gl_LocalInvocationID.x + %" PRIu64 ") - stageInvocationID;\n\
9972 inoutID = stageInvocationID + blockInvocationID * %" PRIu64 ";\n", j * logicalGroupSize, stageSize, j * logicalGroupSize, stageRadix);*/
9973 if ((stageSize == 1) && (sc->cacheShuffle)) {
9974 for (uint64_t i = 0; i < stageRadix; i++) {
9975 uint64_t id = j + k * logicalRegistersPerThread / stageRadix + i * logicalStoragePerThread / stageRadix;
9976 id = (id / logicalRegistersPerThread) * sc->registers_per_thread + id % logicalRegistersPerThread;
9977 sprintf(tempID[t + k * sc->registers_per_thread], "%s", sc->regIDs[id]);
9978 t++;
9979 sprintf(tempNum, "%" PRIu64 "", i);
9980 res = VkAddReal(sc, sc->sdataID, tempNum, sc->tshuffle);
9981 if (res != VKFFT_SUCCESS) return res;
9982 sprintf(tempNum, "%" PRIu64 "", logicalRegistersPerThread);
9983 res = VkModReal(sc, sc->sdataID, sc->sdataID, tempNum);
9984 if (res != VKFFT_SUCCESS) return res;
9985 sprintf(tempNum, "%" PRIu64 "", stageSize);
9986 res = VkMulReal(sc, sc->sdataID, sc->sdataID, tempNum);
9987 if (res != VKFFT_SUCCESS) return res;
9988 if (sc->localSize[1] > 1) {
9990 if (res != VKFFT_SUCCESS) return res;
9991 res = VkAddReal(sc, sc->sdataID, sc->sdataID, sc->combinedID);
9992 if (res != VKFFT_SUCCESS) return res;
9993 }
9994 res = VkAddReal(sc, sc->sdataID, sc->sdataID, sc->inoutID);
9995 if (res != VKFFT_SUCCESS) return res;
9996
9997 //sprintf(sc->sdataID, "sharedStride * gl_LocalInvocationID.y + inoutID + ((%" PRIu64 "+tshuffle) %% (%" PRIu64 "))*%" PRIu64 "", i, logicalRegistersPerThread, stageSize);
9998 if (strcmp(stageNormalization, "")) {
9999 res = VkMulComplexNumber(sc, sc->regIDs[id], sc->regIDs[id], stageNormalization);
10000 if (res != VKFFT_SUCCESS) return res;
10001 }
10002 res = VkSharedStore(sc, sc->sdataID, sc->regIDs[id]);
10003 if (res != VKFFT_SUCCESS) return res;
10004 /*sc->tempLen = sprintf(sc->tempStr, "\
10005 sdata[sharedStride * gl_LocalInvocationID.y + inoutID + ((%" PRIu64 "+tshuffle) %% (%" PRIu64 "))*%" PRIu64 "] = temp%s%s;\n", i, logicalRegistersPerThread, stageSize, sc->regIDs[id], stageNormalization);*/
10006 }
10007 }
10008 else {
10009 for (uint64_t i = 0; i < stageRadix; i++) {
10010 uint64_t id = j + k * logicalRegistersPerThread / stageRadix + i * logicalStoragePerThread / stageRadix;
10011 id = (id / logicalRegistersPerThread) * sc->registers_per_thread + id % logicalRegistersPerThread;
10012 sprintf(tempID[t + k * sc->registers_per_thread], "%s", sc->regIDs[id]);
10013 t++;
10014 sprintf(tempNum, "%" PRIu64 "", i * stageSize);
10015 res = VkAddReal(sc, sc->sdataID, sc->inoutID, tempNum);
10016 if (res != VKFFT_SUCCESS) return res;
10017 if ((stageSize <= sc->numSharedBanks / 2) && (sc->fftDim > sc->numSharedBanks / 2) && (sc->sharedStrideBankConflictFirstStages != sc->fftDim / sc->registerBoost) && ((sc->fftDim & (sc->fftDim - 1)) == 0) && (stageSize * stageRadix != sc->fftDim)) {
10018 if (sc->resolveBankConflictFirstStages == 0) {
10020 sc->tempLen = sprintf(sc->tempStr, "\
10021 %s = %" PRIu64 ";", sc->sharedStride, sc->sharedStrideBankConflictFirstStages);
10022 res = VkAppendLine(sc);
10023 if (res != VKFFT_SUCCESS) return res;
10024 }
10025 sc->tempLen = sprintf(sc->tempStr, "\
10026 %s = (%s / %" PRIu64 ") * %" PRIu64 " + %s %% %" PRIu64 ";", sc->sdataID, sc->sdataID, sc->numSharedBanks / 2, sc->numSharedBanks / 2 + 1, sc->sdataID, sc->numSharedBanks / 2);
10027 res = VkAppendLine(sc);
10028 if (res != VKFFT_SUCCESS) return res;
10029
10030 }
10031 else {
10032 if (sc->resolveBankConflictFirstStages == 1) {
10034 sc->tempLen = sprintf(sc->tempStr, "\
10035 %s = %" PRIu64 ";", sc->sharedStride, sc->sharedStrideReadWriteConflict);
10036 res = VkAppendLine(sc);
10037 if (res != VKFFT_SUCCESS) return res;
10038 }
10039 }
10040 if (sc->localSize[1] > 1) {
10042 if (res != VKFFT_SUCCESS) return res;
10043 res = VkAddReal(sc, sc->sdataID, sc->sdataID, sc->combinedID);
10044 if (res != VKFFT_SUCCESS) return res;
10045 }
10046 //sprintf(sc->sdataID, "sharedStride * gl_LocalInvocationID.y + inoutID + %" PRIu64 "", i * stageSize);
10047 if (strcmp(stageNormalization, "")) {
10048 res = VkMulComplexNumber(sc, sc->regIDs[id], sc->regIDs[id], stageNormalization);
10049 if (res != VKFFT_SUCCESS) return res;
10050 }
10051 res = VkSharedStore(sc, sc->sdataID, sc->regIDs[id]);
10052 if (res != VKFFT_SUCCESS) return res;
10053 /*sc->tempLen = sprintf(sc->tempStr, "\
10054 sdata[sharedStride * gl_LocalInvocationID.y + inoutID + %" PRIu64 "] = temp%s%s;\n", i * stageSize, sc->regIDs[id], stageNormalization);*/
10055 }
10056 }
10057 }
10058 for (uint64_t j = logicalRegistersPerThread; j < sc->registers_per_thread; j++) {
10059 sprintf(tempID[t + k * sc->registers_per_thread], "%s", sc->regIDs[t + k * sc->registers_per_thread]);
10060 t++;
10061 }
10062 t = 0;
10063 if (sc->registerBoost > 1) {
10064 if (sc->localSize[0] * logicalStoragePerThread > sc->fftDim)
10065 {
10066 sc->tempLen = sprintf(sc->tempStr, " }\n");
10067 res = VkAppendLine(sc);
10068 if (res != VKFFT_SUCCESS) return res;
10069 }
10071 if (res != VKFFT_SUCCESS) return res;
10072 res = appendZeropadEnd(sc);
10073 if (res != VKFFT_SUCCESS) return res;
10074 res = appendBarrierVkFFT(sc, 2);
10075 if (res != VKFFT_SUCCESS) return res;
10076 res = appendZeropadStart(sc);
10077 if (res != VKFFT_SUCCESS) return res;
10079 if (res != VKFFT_SUCCESS) return res;
10080 if (sc->localSize[0] * logicalStoragePerThreadNext > sc->fftDim) {
10081 sc->tempLen = sprintf(sc->tempStr, "\
10082 if (%s * %" PRIu64 " < %" PRIu64 ") {\n", sc->gl_LocalInvocationID_x, logicalStoragePerThreadNext, sc->fftDim);
10083 res = VkAppendLine(sc);
10084 if (res != VKFFT_SUCCESS) return res;
10085 }
10086 for (uint64_t j = 0; j < logicalRegistersPerThreadNext / stageRadixNext; j++) {
10087 for (uint64_t i = 0; i < stageRadixNext; i++) {
10088 uint64_t id = j + k * logicalRegistersPerThreadNext / stageRadixNext + i * logicalStoragePerThreadNext / stageRadixNext;
10089 id = (id / logicalRegistersPerThreadNext) * sc->registers_per_thread + id % logicalRegistersPerThreadNext;
10090 //resID[t + k * sc->registers_per_thread] = sc->regIDs[id];
10091 sprintf(tempNum, "%" PRIu64 "", t * logicalGroupSizeNext);
10092 res = VkAddReal(sc, sc->sdataID, sc->gl_LocalInvocationID_x, tempNum);
10093 if (res != VKFFT_SUCCESS) return res;
10094 if (sc->localSize[1] > 1) {
10096 if (res != VKFFT_SUCCESS) return res;
10097 res = VkAddReal(sc, sc->sdataID, sc->sdataID, sc->combinedID);
10098 if (res != VKFFT_SUCCESS) return res;
10099 }
10100 //sprintf(sc->sdataID, "sharedStride * gl_LocalInvocationID.y + gl_LocalInvocationID.x + %" PRIu64 "", t * logicalGroupSizeNext);
10101 res = VkSharedLoad(sc, tempID[t + k * sc->registers_per_thread], sc->sdataID);
10102 if (res != VKFFT_SUCCESS) return res;
10103 /*sc->tempLen = sprintf(sc->tempStr, "\
10104 temp%s = sdata[sharedStride * gl_LocalInvocationID.y + gl_LocalInvocationID.x + %" PRIu64 "];\n", tempID[t + k * sc->registers_per_thread], t * logicalGroupSizeNext);*/
10105 t++;
10106 }
10107
10108 }
10109 if (sc->localSize[0] * logicalStoragePerThreadNext > sc->fftDim)
10110 {
10111 sc->tempLen = sprintf(sc->tempStr, " }\n");
10112 res = VkAppendLine(sc);
10113 if (res != VKFFT_SUCCESS) return res;
10114 }
10116 if (res != VKFFT_SUCCESS) return res;
10117 res = appendZeropadEnd(sc);
10118 if (res != VKFFT_SUCCESS) return res;
10119 }
10120 else {
10121 if (sc->localSize[0] * logicalStoragePerThread > sc->fftDim)
10122 {
10123 sc->tempLen = sprintf(sc->tempStr, " }\n");
10124 res = VkAppendLine(sc);
10125 if (res != VKFFT_SUCCESS) return res;
10126 }
10128 if (res != VKFFT_SUCCESS) return res;
10129 res = appendZeropadEnd(sc);
10130 if (res != VKFFT_SUCCESS) return res;
10131 }
10132 }
10133 for (uint64_t i = 0; i < sc->registers_per_thread * sc->registerBoost; i++) {
10134 //printf("0 - %s\n", resID[i]);
10135 sprintf(sc->regIDs[i], "%s", tempID[i]);
10136 //sprintf(resID[i], "%s", tempID[i]);
10137 //printf("1 - %s\n", resID[i]);
10138 }
10139 for (uint64_t i = 0; i < sc->registers_per_thread * sc->registerBoost; i++) {
10140 free(tempID[i]);
10141 tempID[i] = 0;
10142 }
10143 free(tempID);
10144 tempID = 0;
10145 }
10146 else
10148 }
10149 else {
10150 char** tempID;
10151 tempID = (char**)malloc(sizeof(char*) * sc->registers_per_thread * sc->registerBoost);
10152 if (tempID) {
10153 //resID = (char**)malloc(sizeof(char*) * sc->registers_per_thread * sc->registerBoost);
10154 for (uint64_t i = 0; i < sc->registers_per_thread * sc->registerBoost; i++) {
10155 tempID[i] = (char*)malloc(sizeof(char) * 50);
10156 if (!tempID[i]) {
10157 for (uint64_t j = 0; j < i; j++) {
10158 free(tempID[j]);
10159 tempID[j] = 0;
10160 }
10161 free(tempID);
10162 tempID = 0;
10164 }
10165 }
10166 for (uint64_t k = 0; k < sc->registerBoost; ++k) {
10167 for (uint64_t j = 0; j < logicalRegistersPerThread / stageRadix; j++) {
10168 for (uint64_t i = 0; i < stageRadix; i++) {
10169 uint64_t id = j + k * logicalRegistersPerThread / stageRadix + i * logicalStoragePerThread / stageRadix;
10170 id = (id / logicalRegistersPerThread) * sc->registers_per_thread + id % logicalRegistersPerThread;
10171 sprintf(tempID[j + i * logicalRegistersPerThread / stageRadix + k * sc->registers_per_thread], "%s", sc->regIDs[id]);
10172 }
10173 }
10174 for (uint64_t j = logicalRegistersPerThread; j < sc->registers_per_thread; j++) {
10175 sprintf(tempID[j + k * sc->registers_per_thread], "%s", sc->regIDs[j + k * sc->registers_per_thread]);
10176 }
10177 }
10178 for (uint64_t i = 0; i < sc->registers_per_thread * sc->registerBoost; i++) {
10179 sprintf(sc->regIDs[i], "%s", tempID[i]);
10180 }
10181 for (uint64_t i = 0; i < sc->registers_per_thread * sc->registerBoost; i++) {
10182 free(tempID[i]);
10183 tempID[i] = 0;
10184 }
10185 free(tempID);
10186 tempID = 0;
10187 }
10188 else
10190 }
10191 }
10192 else {
10193 res = appendZeropadStart(sc);
10194 if (res != VKFFT_SUCCESS) return res;
10196 if (res != VKFFT_SUCCESS) return res;
10197 if (sc->localSize[0] * logicalStoragePerThread > sc->fftDim) {
10198 sc->tempLen = sprintf(sc->tempStr, "\
10199 if (%s * %" PRIu64 " < %" PRIu64 ") {\n", sc->gl_LocalInvocationID_x, logicalStoragePerThread, sc->fftDim);
10200 res = VkAppendLine(sc);
10201 if (res != VKFFT_SUCCESS) return res;
10202 }
10203 if (((sc->actualInverse) && (sc->normalize)) || ((sc->convolutionStep || sc->useBluesteinFFT) && (stageAngle > 0))) {
10204 for (uint64_t i = 0; i < logicalStoragePerThread; i++) {
10205 if (strcmp(stageNormalization, "")) {
10206 res = VkMulComplexNumber(sc, sc->regIDs[(i / logicalRegistersPerThread) * sc->registers_per_thread + i % logicalRegistersPerThread], sc->regIDs[(i / logicalRegistersPerThread) * sc->registers_per_thread + i % logicalRegistersPerThread], stageNormalization);
10207 }
10208 if (res != VKFFT_SUCCESS) return res;
10209 /*sc->tempLen = sprintf(sc->tempStr, "\
10210 temp%s = temp%s%s;\n", sc->regIDs[(i / logicalRegistersPerThread) * sc->registers_per_thread + i % logicalRegistersPerThread], sc->regIDs[(i / logicalRegistersPerThread) * sc->registers_per_thread + i % logicalRegistersPerThread], stageNormalization);*/
10211 }
10212 }
10213 if (sc->localSize[0] * logicalStoragePerThread > sc->fftDim)
10214 {
10215 sc->tempLen = sprintf(sc->tempStr, " }\n");
10216 res = VkAppendLine(sc);
10217 if (res != VKFFT_SUCCESS) return res;
10218 }
10220 if (res != VKFFT_SUCCESS) return res;
10221 res = appendZeropadEnd(sc);
10222 if (res != VKFFT_SUCCESS) return res;
10223 }
10224 return res;
10225}
// appendRadixShuffleStrided
// Appends, through the Vk* helper calls and VkAppendLine, the text that exchanges register values
// via sdata after a radix-`stageRadix` stage, and permutes the names held in sc->regIDs to match.
// In this variant the per-thread index is taken from sc->gl_LocalInvocationID_y and every sdata
// index is formed as sc->sharedStride * index + sc->gl_LocalInvocationID_x.
//
// Parameters:
//   sc             - generator state; receives the emitted text and the updated regIDs.
//   floatType      - compared against "float" / "double" to pick vecType and the literal suffix.
//   uintType       - not referenced in the visible body.
//   stageSize      - used in the emitted modulo and in the i * stageSize offsets.
//   stageSizeSum   - not referenced in the visible body.
//   stageAngle     - only its sign is tested (normalization and convolution conditions).
//   stageRadix     - radix of the stage just processed; indexes registers_per_thread_per_radix.
//   stageRadixNext - radix of the following stage; used for the read-back when registerBoost > 1.
// Returns `res`: any non-VKFFT_SUCCESS code from a helper is returned immediately.
//
// NOTE(review): the numbering embedded in this listing skips lines (10227, 10301, 10306, 10321,
// 10339, 10385, 10393, 10426, 10438, 10455, 10471, 10497, 10503, 10525). The declaration of `res`
// and the statements that should sit at those positions are not visible here - confirm against
// the original file before relying on this text.
10226static inline VkFFTResult appendRadixShuffleStrided(VkFFTSpecializationConstantsLayout* sc, const char* floatType, const char* uintType, uint64_t stageSize, uint64_t stageSizeSum, double stageAngle, uint64_t stageRadix, uint64_t stageRadixNext) {
 // vecType is filled in below but is not read again in the visible body.
 // LFending is the suffix appended to the normalization literal further down.
10228 char vecType[30];
10229 char LFending[4] = "";
10230 if (!strcmp(floatType, "float")) sprintf(LFending, "f");
10231#if(VKFFT_BACKEND==0)
10232 if (!strcmp(floatType, "float")) sprintf(vecType, "vec2");
10233 if (!strcmp(floatType, "double")) sprintf(vecType, "dvec2");
10234 if (!strcmp(floatType, "double")) sprintf(LFending, "LF");
10235#elif(VKFFT_BACKEND==1)
10236 if (!strcmp(floatType, "float")) sprintf(vecType, "float2");
10237 if (!strcmp(floatType, "double")) sprintf(vecType, "double2");
10238 if (!strcmp(floatType, "double")) sprintf(LFending, "l");
10239#elif(VKFFT_BACKEND==2)
10240 if (!strcmp(floatType, "float")) sprintf(vecType, "float2");
10241 if (!strcmp(floatType, "double")) sprintf(vecType, "double2");
10242 if (!strcmp(floatType, "double")) sprintf(LFending, "l");
10243#elif(VKFFT_BACKEND==3)
10244 if (!strcmp(floatType, "float")) sprintf(vecType, "float2");
10245 if (!strcmp(floatType, "double")) sprintf(vecType, "double2");
10246#endif
10247
 // Scratch buffer for integer operands passed as text to the Vk* helpers.
10248 char tempNum[50] = "";
10249
 // Register counts come from sc->registers_per_thread_per_radix for the current and the next radix;
 // the "Storage" values are the same counts multiplied by sc->registerBoost.
10250 uint64_t logicalStoragePerThread = sc->registers_per_thread_per_radix[stageRadix] * sc->registerBoost;// (sc->registers_per_thread % stageRadix == 0) ? sc->registers_per_thread * sc->registerBoost : sc->min_registers_per_thread * sc->registerBoost;
10251 uint64_t logicalStoragePerThreadNext = sc->registers_per_thread_per_radix[stageRadixNext] * sc->registerBoost;//(sc->registers_per_thread % stageRadixNext == 0) ? sc->registers_per_thread * sc->registerBoost : sc->min_registers_per_thread * sc->registerBoost;
10252 uint64_t logicalRegistersPerThread = sc->registers_per_thread_per_radix[stageRadix];//(sc->registers_per_thread % stageRadix == 0) ? sc->registers_per_thread : sc->min_registers_per_thread;
10253 uint64_t logicalRegistersPerThreadNext = sc->registers_per_thread_per_radix[stageRadixNext];//(sc->registers_per_thread % stageRadixNext == 0) ? sc->registers_per_thread : sc->min_registers_per_thread;
10254
 // Group sizes: sc->fftDim divided by the storage per thread for this and the next radix.
10255 uint64_t logicalGroupSize = sc->fftDim / logicalStoragePerThread;
10256 uint64_t logicalGroupSizeNext = sc->fftDim / logicalStoragePerThreadNext;
 // stageNormalization stays empty unless normalizationValue ends up different from 1; an empty
 // string disables the VkMulComplexNumber calls guarded by strcmp(stageNormalization, "") below.
10257 char stageNormalization[50] = "";
10258 uint64_t normalizationValue = 1;
10259 if ((((sc->actualInverse) && (sc->normalize)) || (sc->convolutionStep && (stageAngle > 0))) && (stageSize == 1) && (sc->axis_upload_id == 0) && (!(sc->useBluesteinFFT && (stageAngle < 0)))) {
10260 if ((sc->performDCT) && (sc->actualInverse)) {
10261 if (sc->performDCT == 1)
10262 normalizationValue = (sc->sourceFFTSize-1) * 2;
10263 else
10264 normalizationValue = sc->sourceFFTSize * 2;
10265 }
10266 else
10267 normalizationValue = sc->sourceFFTSize;
10268 }
10269 if (sc->useBluesteinFFT && (stageAngle > 0) && (stageSize == 1) && (sc->axis_upload_id == 0)) {
10270 normalizationValue *= sc->fft_dim_full;
10271 }
10272 if (normalizationValue != 1) {
10273 sprintf(stageNormalization, "%.17f%s", 1.0 / (double)(normalizationValue), LFending);
10274 }
 // A barrier is emitted first; the condition appears to be the same one that selects the
 // sdata path a few lines below (it is written out twice).
10275 if ((!((sc->writeFromRegisters == 1) && (stageSize == sc->fftDim / stageRadix) && (!(((sc->convolutionStep) || (sc->useBluesteinFFT && sc->BluesteinConvolutionStep)) && (stageAngle < 0) && ((sc->matrixConvolution > 1) || (sc->numKernels > 1)))))) && (((sc->axis_id == 0) && (sc->axis_upload_id == 0)) || (sc->localSize[1] * logicalStoragePerThread > sc->fftDim) || (stageSize < sc->fftDim / stageRadix) || ((sc->convolutionStep) && ((sc->matrixConvolution > 1) || (sc->numKernels > 1)) && (stageAngle < 0)) || (sc->performDCT)))
10276 {
10277 res = appendBarrierVkFFT(sc, 2);
10278 if (res != VKFFT_SUCCESS) return res;
10279 }
 // When stageSize == sc->fftDim / stageRadix, emit an assignment of
 // sc->sharedStrideReadWriteConflict to the variable named by sc->sharedStride.
10280 if (stageSize == sc->fftDim / stageRadix) {
10281 sc->tempLen = sprintf(sc->tempStr, " %s = %" PRIu64 ";\n", sc->sharedStride, sc->sharedStrideReadWriteConflict);
10282 res = VkAppendLine(sc);
10283 if (res != VKFFT_SUCCESS) return res;
10284 }
10285 if ((!((sc->writeFromRegisters == 1)&&(stageSize == sc->fftDim / stageRadix) && (!(((sc->convolutionStep) || (sc->useBluesteinFFT && sc->BluesteinConvolutionStep)) && (stageAngle < 0) && ((sc->matrixConvolution > 1) || (sc->numKernels > 1))))))&&(((sc->axis_id == 0) && (sc->axis_upload_id == 0)) || (sc->localSize[1] * logicalStoragePerThread > sc->fftDim) || (stageSize < sc->fftDim / stageRadix) || ((sc->convolutionStep) && ((sc->matrixConvolution > 1) || (sc->numKernels > 1)) && (stageAngle < 0)) || (sc->performDCT))) {
10286 //if (sc->writeFromRegisters == 0) {
10287 //appendBarrierVkFFT(sc, 2);
 // First branch: stores go through sdata. The else branch further down only renames regIDs.
10288 if (!((sc->registerBoost > 1) && (stageSize * stageRadix == sc->fftDim / sc->stageRadix[sc->numStages - 1]) && (sc->stageRadix[sc->numStages - 1] == sc->registerBoost))) {
 // tempID: heap array of registers_per_thread * registerBoost 50-byte strings that collects
 // the new register-name order before it is copied back into sc->regIDs.
 // NOTE(review): every `return res` between this allocation and the frees near the end of this
 // branch leaves tempID and its entries allocated.
10289 char** tempID;
10290 tempID = (char**)malloc(sizeof(char*) * sc->registers_per_thread * sc->registerBoost);
10291 if (tempID) {
10292 for (uint64_t i = 0; i < sc->registers_per_thread * sc->registerBoost; i++) {
10293 tempID[i] = (char*)malloc(sizeof(char) * 50);
10294 if (!tempID[i]) {
10295 for (uint64_t j = 0; j < i; j++) {
10296 free(tempID[j]);
10297 tempID[j] = 0;
10298 }
10299 free(tempID);
10300 tempID = 0;
 // NOTE(review): listing number 10301 is absent. As shown, the loop would continue with
 // tempID == 0; presumably an error return belongs here - confirm.
10302 }
10303 }
10304 res = appendZeropadStart(sc);
10305 if (res != VKFFT_SUCCESS) return res;
 // NOTE(review): listing number 10306 is absent; the check below re-tests an unchanged `res`.
10307 if (res != VKFFT_SUCCESS) return res;
 // Emit a guard on gl_LocalInvocationID_y only when localSize[1] * logicalStoragePerThread
 // exceeds fftDim; the matching " }" is emitted later under the same test.
10308 if (sc->localSize[1] * logicalStoragePerThread > sc->fftDim) {
10309 sc->tempLen = sprintf(sc->tempStr, "\
10310 if (%s * %" PRIu64 " < %" PRIu64 ") {\n", sc->gl_LocalInvocationID_y, logicalStoragePerThread, sc->fftDim);
10311 res = VkAppendLine(sc);
10312 if (res != VKFFT_SUCCESS) return res;
10313 }
 // One pass per register-boost index k; t counts the tempID slots filled within the pass.
10314 for (uint64_t k = 0; k < sc->registerBoost; ++k) {
10315 uint64_t t = 0;
 // For k > 0 the barrier, zeropad start and guard are emitted again before the stores.
10316 if (k > 0) {
10317 res = appendBarrierVkFFT(sc, 2);
10318 if (res != VKFFT_SUCCESS) return res;
10319 res = appendZeropadStart(sc);
10320 if (res != VKFFT_SUCCESS) return res;
 // NOTE(review): listing number 10321 is absent.
10322 if (res != VKFFT_SUCCESS) return res;
10323 if (sc->localSize[1] * logicalStoragePerThread > sc->fftDim) {
10324 sc->tempLen = sprintf(sc->tempStr, "\
10325 if (%s * %" PRIu64 " < %" PRIu64 ") {\n", sc->gl_LocalInvocationID_y, logicalStoragePerThread, sc->fftDim);
10326 res = VkAppendLine(sc);
10327 if (res != VKFFT_SUCCESS) return res;
10328 }
10329 }
10330 for (uint64_t j = 0; j < logicalRegistersPerThread / stageRadix; j++) {
 // Helper sequence that builds inoutID from gl_LocalInvocationID_y + j * logicalGroupSize,
 // stageSize and stageRadix; the commented-out sprintf below shows the intended expressions.
10331 sprintf(tempNum, "%" PRIu64 "", j * logicalGroupSize);
10332 res = VkAddReal(sc, sc->stageInvocationID, sc->gl_LocalInvocationID_y, tempNum);
10333 if (res != VKFFT_SUCCESS) return res;
10334 res = VkMovReal(sc, sc->blockInvocationID, sc->stageInvocationID);
10335 if (res != VKFFT_SUCCESS) return res;
10336 sprintf(tempNum, "%" PRIu64 "", stageSize);
10337 res = VkModReal(sc, sc->stageInvocationID, sc->stageInvocationID, tempNum);
10338 if (res != VKFFT_SUCCESS) return res;
 // NOTE(review): listing number 10339 is absent; going by the commented-out text below, a
 // subtraction producing blockInvocationID would be expected here - confirm.
10340 if (res != VKFFT_SUCCESS) return res;
10341 sprintf(tempNum, "%" PRIu64 "", stageRadix);
10342 res = VkMulReal(sc, sc->inoutID, sc->blockInvocationID, tempNum);
10343 if (res != VKFFT_SUCCESS) return res;
10344 res = VkAddReal(sc, sc->inoutID, sc->inoutID, sc->stageInvocationID);
10345 if (res != VKFFT_SUCCESS) return res;
10346 /*sc->tempLen = sprintf(sc->tempStr, "\
10347 stageInvocationID = (gl_LocalInvocationID.y + %" PRIu64 ") %% (%" PRIu64 ");\n\
10348 blockInvocationID = (gl_LocalInvocationID.y + %" PRIu64 ") - stageInvocationID;\n\
10349 inoutID = stageInvocationID + blockInvocationID * %" PRIu64 ";\n", j * logicalGroupSize, stageSize, j * logicalGroupSize, stageRadix);*/
10350 for (uint64_t i = 0; i < stageRadix; i++) {
 // id: position in sc->regIDs of the register holding output i of butterfly j in pass k,
 // remapped from logicalRegistersPerThread-sized rows to registers_per_thread-sized rows.
10351 uint64_t id = j + k * logicalRegistersPerThread / stageRadix + i * logicalStoragePerThread / stageRadix;
10352 id = (id / logicalRegistersPerThread) * sc->registers_per_thread + id % logicalRegistersPerThread;
10353 sprintf(tempID[t + k * sc->registers_per_thread], "%s", sc->regIDs[id]);
10354 t++;
 // sdataID = sharedStride * (inoutID + i * stageSize) + gl_LocalInvocationID_x
10355 sprintf(tempNum, "%" PRIu64 "", i * stageSize);
10356 res = VkAddReal(sc, sc->sdataID, sc->inoutID, tempNum);
10357 if (res != VKFFT_SUCCESS) return res;
10358 res = VkMulReal(sc, sc->sdataID, sc->sharedStride, sc->sdataID);
10359 if (res != VKFFT_SUCCESS) return res;
10360 res = VkAddReal(sc, sc->sdataID, sc->sdataID, sc->gl_LocalInvocationID_x);
10361 if (res != VKFFT_SUCCESS) return res;
10362 //sprintf(sc->sdataID, "sharedStride * gl_LocalInvocationID.y + inoutID + %" PRIu64 "", i * stageSize);
 // Multiply by the normalization literal (only when one was produced), then store.
10363 if (strcmp(stageNormalization, "")) {
10364 res = VkMulComplexNumber(sc, sc->regIDs[id], sc->regIDs[id], stageNormalization);
10365 if (res != VKFFT_SUCCESS) return res;
10366 }
10367 res = VkSharedStore(sc, sc->sdataID, sc->regIDs[id]);
10368 if (res != VKFFT_SUCCESS) return res;
10369 /*sc->tempLen = sprintf(sc->tempStr, "\
10370 sdata[gl_WorkGroupSize.x*(inoutID+%" PRIu64 ")+gl_LocalInvocationID.x] = temp%s%s;\n", i * stageSize, sc->regIDs[id], stageNormalization);*/
10371 }
10372 }
 // Slots from logicalRegistersPerThread up to registers_per_thread keep their current names.
10373 for (uint64_t j = logicalRegistersPerThread; j < sc->registers_per_thread; j++) {
10374 sprintf(tempID[t + k * sc->registers_per_thread], "%s", sc->regIDs[t + k * sc->registers_per_thread]);
10375 t++;
10376 }
10377 t = 0;
 // With registerBoost > 1 the values are read back from sdata inside the same pass, using the
 // next radix's sizes; otherwise only the guard and zeropad are closed.
10378 if (sc->registerBoost > 1) {
10379 if (sc->localSize[1] * logicalStoragePerThread > sc->fftDim)
10380 {
10381 sc->tempLen = sprintf(sc->tempStr, " }\n");
10382 res = VkAppendLine(sc);
10383 if (res != VKFFT_SUCCESS) return res;
10384 }
 // NOTE(review): listing number 10385 is absent.
10386 if (res != VKFFT_SUCCESS) return res;
10387 res = appendZeropadEnd(sc);
10388 if (res != VKFFT_SUCCESS) return res;
10389 res = appendBarrierVkFFT(sc, 2);
10390 if (res != VKFFT_SUCCESS) return res;
10391 res = appendZeropadStart(sc);
10392 if (res != VKFFT_SUCCESS) return res;
 // NOTE(review): listing number 10393 is absent.
10394 if (res != VKFFT_SUCCESS) return res;
10395 if (sc->localSize[1] * logicalStoragePerThreadNext > sc->fftDim) {
10396 sc->tempLen = sprintf(sc->tempStr, "\
10397 if (%s * %" PRIu64 " < %" PRIu64 ") {\n", sc->gl_LocalInvocationID_y, logicalStoragePerThreadNext, sc->fftDim);
10398 res = VkAppendLine(sc);
10399 if (res != VKFFT_SUCCESS) return res;
10400 }
10401 for (uint64_t j = 0; j < logicalRegistersPerThreadNext / stageRadixNext; j++) {
10402 for (uint64_t i = 0; i < stageRadixNext; i++) {
 // `id` is computed here but the load below targets tempID[t + k * registers_per_thread].
10403 uint64_t id = j + k * logicalRegistersPerThreadNext / stageRadixNext + i * logicalRegistersPerThreadNext / stageRadixNext;
10404 id = (id / logicalRegistersPerThreadNext) * sc->registers_per_thread + id % logicalRegistersPerThreadNext;
 // sdataID = sharedStride * (gl_LocalInvocationID_y + t * logicalGroupSizeNext) + gl_LocalInvocationID_x
10405 sprintf(tempNum, "%" PRIu64 "", t * logicalGroupSizeNext);
10406 res = VkAddReal(sc, sc->sdataID, sc->gl_LocalInvocationID_y, tempNum);
10407 if (res != VKFFT_SUCCESS) return res;
10408 res = VkMulReal(sc, sc->sdataID, sc->sharedStride, sc->sdataID);
10409 if (res != VKFFT_SUCCESS) return res;
10410 res = VkAddReal(sc, sc->sdataID, sc->sdataID, sc->gl_LocalInvocationID_x);
10411 if (res != VKFFT_SUCCESS) return res;
10412 //sprintf(sc->sdataID, "sharedStride * gl_LocalInvocationID.y + gl_LocalInvocationID.x + %" PRIu64 "", t * logicalGroupSizeNext);
10413 res = VkSharedLoad(sc, tempID[t + k * sc->registers_per_thread], sc->sdataID);
10414 if (res != VKFFT_SUCCESS) return res;
10415 /*sc->tempLen = sprintf(sc->tempStr, "\
10416 temp%s = sdata[gl_WorkGroupSize.x*(gl_LocalInvocationID.y+%" PRIu64 ")+gl_LocalInvocationID.x];\n", tempID[t + k * sc->registers_per_thread], t * logicalGroupSizeNext);*/
10417 t++;
10418 }
10419 }
10420 if (sc->localSize[1] * logicalStoragePerThreadNext > sc->fftDim)
10421 {
10422 sc->tempLen = sprintf(sc->tempStr, " }\n");
10423 res = VkAppendLine(sc);
10424 if (res != VKFFT_SUCCESS) return res;
10425 }
 // NOTE(review): listing number 10426 is absent.
10427 if (res != VKFFT_SUCCESS) return res;
10428 res = appendZeropadEnd(sc);
10429 if (res != VKFFT_SUCCESS) return res;
10430 }
10431 else {
10432 if (sc->localSize[1] * logicalStoragePerThread > sc->fftDim)
10433 {
10434 sc->tempLen = sprintf(sc->tempStr, " }\n");
10435 res = VkAppendLine(sc);
10436 if (res != VKFFT_SUCCESS) return res;
10437 }
 // NOTE(review): listing number 10438 is absent.
10439 if (res != VKFFT_SUCCESS) return res;
10440 res = appendZeropadEnd(sc);
10441 if (res != VKFFT_SUCCESS) return res;
10442 }
10443 }
 // Publish the new name order into sc->regIDs, then release the scratch strings.
10444 for (uint64_t i = 0; i < sc->registers_per_thread * sc->registerBoost; i++) {
10445 sprintf(sc->regIDs[i], "%s", tempID[i]);
10446 }
10447 for (uint64_t i = 0; i < sc->registers_per_thread * sc->registerBoost; i++) {
10448 free(tempID[i]);
10449 tempID[i] = 0;
10450 }
10451 free(tempID);
10452 tempID = 0;
10453 }
 // NOTE(review): listing number 10455 (the body of this else, reached when the outer malloc
 // returned 0) is absent.
10454 else
10456 }
10457 else {
 // Rename-only branch: no text is emitted here; sc->regIDs is permuted through tempID using the
 // same id formula as above.
10458 char** tempID;
10459 tempID = (char**)malloc(sizeof(char*) * sc->registers_per_thread * sc->registerBoost);
10460 if (tempID) {
10461 //resID = (char**)malloc(sizeof(char*) * sc->registers_per_thread * sc->registerBoost);
10462 for (uint64_t i = 0; i < sc->registers_per_thread * sc->registerBoost; i++) {
10463 tempID[i] = (char*)malloc(sizeof(char) * 50);
10464 if (!tempID[i]) {
10465 for (uint64_t j = 0; j < i; j++) {
10466 free(tempID[j]);
10467 tempID[j] = 0;
10468 }
10469 free(tempID);
10470 tempID = 0;
 // NOTE(review): listing number 10471 is absent (same situation as the cleanup path above).
10472 }
10473 }
10474 for (uint64_t k = 0; k < sc->registerBoost; ++k) {
10475 for (uint64_t j = 0; j < logicalRegistersPerThread / stageRadix; j++) {
10476 for (uint64_t i = 0; i < stageRadix; i++) {
10477 uint64_t id = j + k * logicalRegistersPerThread / stageRadix + i * logicalStoragePerThread / stageRadix;
10478 id = (id / logicalRegistersPerThread) * sc->registers_per_thread + id % logicalRegistersPerThread;
10479 sprintf(tempID[j + i * logicalRegistersPerThread / stageRadix + k * sc->registers_per_thread], "%s", sc->regIDs[id]);
10480 }
10481 }
10482 for (uint64_t j = logicalRegistersPerThread; j < sc->registers_per_thread; j++) {
10483 sprintf(tempID[j + k * sc->registers_per_thread], "%s", sc->regIDs[j + k * sc->registers_per_thread]);
10484 }
10485 }
10486 for (uint64_t i = 0; i < sc->registers_per_thread * sc->registerBoost; i++) {
10487 sprintf(sc->regIDs[i], "%s", tempID[i]);
10488 }
10489 for (uint64_t i = 0; i < sc->registers_per_thread * sc->registerBoost; i++) {
10490 free(tempID[i]);
10491 tempID[i] = 0;
10492 }
10493 free(tempID);
10494 tempID = 0;
10495 }
 // NOTE(review): listing number 10497 (the body of this else) is absent.
10496 else
10498 }
10499 }
10500 else {
 // No sdata exchange in this branch: only the optional normalization multiply is emitted,
 // wrapped in the zeropad start/end calls and a thread guard.
10501 res = appendZeropadStart(sc);
10502 if (res != VKFFT_SUCCESS) return res;
 // NOTE(review): listing number 10503 is absent.
10504 if (res != VKFFT_SUCCESS) return res;
10505 if (sc->localSize[1] * logicalStoragePerThread > sc->fftDim) {
10506 sc->tempLen = sprintf(sc->tempStr, "\
10507 if (%s * %" PRIu64 " < %" PRIu64 ") {\n", sc->gl_LocalInvocationID_y, logicalStoragePerThread, sc->fftDim);
10508 res = VkAppendLine(sc);
10509 if (res != VKFFT_SUCCESS) return res;
10510 }
10511 if (((sc->actualInverse) && (sc->normalize)) || ((sc->convolutionStep || sc->useBluesteinFFT) && (stageAngle > 0))) {
 // NOTE(review): the bound is logicalRegistersPerThread, so i / logicalRegistersPerThread in
 // the index below is always 0 - confirm whether logicalStoragePerThread was intended.
10512 for (uint64_t i = 0; i < logicalRegistersPerThread; i++) {
10513 if (strcmp(stageNormalization, "")) {
10514 res = VkMulComplexNumber(sc, sc->regIDs[(i / logicalRegistersPerThread) * sc->registers_per_thread + i % logicalRegistersPerThread], sc->regIDs[(i / logicalRegistersPerThread) * sc->registers_per_thread + i % logicalRegistersPerThread], stageNormalization);
10515 }
10516 if (res != VKFFT_SUCCESS) return res;
10517 }
10518 }
 // NOTE(review): the opening guard above tests logicalStoragePerThread while this closing test
 // uses logicalRegistersPerThread; the two values differ whenever sc->registerBoost != 1, in
 // which case the emitted "{" and "}" may not pair up - confirm.
10519 if (sc->localSize[1] * logicalRegistersPerThread > sc->fftDim)
10520 {
10521 sc->tempLen = sprintf(sc->tempStr, " }\n");
10522 res = VkAppendLine(sc);
10523 if (res != VKFFT_SUCCESS) return res;
10524 }
 // NOTE(review): listing number 10525 is absent.
10526 if (res != VKFFT_SUCCESS) return res;
10527 res = appendZeropadEnd(sc);
10528 if (res != VKFFT_SUCCESS) return res;
10529 }
10530 return res;
10531}
10532static inline VkFFTResult appendRadixShuffle(VkFFTSpecializationConstantsLayout* sc, const char* floatType, const char* uintType, uint64_t stageSize, uint64_t stageSizeSum, double stageAngle, uint64_t stageRadix, uint64_t stageRadixNext, uint64_t shuffleType) {
10534 switch (shuffleType) {
10535 case 0: case 5: case 6: case 110: case 120: case 130: case 140: case 142: case 144: {
10536 res = appendRadixShuffleNonStrided(sc, floatType, uintType, stageSize, stageSizeSum, stageAngle, stageRadix, stageRadixNext);
10537 if (res != VKFFT_SUCCESS) return res;
10538 //appendBarrierVkFFT(sc, 1);
10539 break;
10540 }
10541 case 1: case 2: case 111: case 121: case 131: case 141: case 143: case 145: {
10542 res = appendRadixShuffleStrided(sc, floatType, uintType, stageSize, stageSizeSum, stageAngle, stageRadix, stageRadixNext);
10543 if (res != VKFFT_SUCCESS) return res;
10544 //appendBarrierVkFFT(sc, 1);
10545 break;
10546 }
10547 }
10548 return res;
10549}
10550
// appendBoostThreadDataReorder
// When sc->registerBoost > 1 and logicalStoragePerThread differs from
// sc->min_registers_per_thread * sc->registerBoost, emits for every boost index k a round trip of
// register values through sdata: a block of stores, appendBarrierVkFFT(sc, 2), then a block of
// loads. One side of the round trip steps by logicalGroupSize inside an emitted thread guard, the
// other steps by sc->localSize[0] (first case group) or sc->localSize[1] (second case group)
// without a guard; `start` selects which side uses which stepping. Otherwise nothing is emitted.
//
// Parameters:
//   sc          - generator state; receives the emitted text.
//   floatType   - not referenced in the visible body.
//   uintType    - not referenced in the visible body.
//   shuffleType - selects the case group: the first indexes sdata with gl_LocalInvocationID_x only,
//                 the second with gl_LocalInvocationID_x + sharedStride * (gl_LocalInvocationID_y + offset).
//   start       - 1: register count taken from sc->stageRadix[0]; otherwise from the last stage radix.
// Returns `res`: any non-VKFFT_SUCCESS code from a helper is returned immediately.
//
// NOTE(review): the numbering embedded in this listing skips lines (10552, 10571, 10597, 10605,
// 10630, 10656, 10682, 10690, 10715). The declaration of `res` and the statements that should sit
// at those positions are not visible here - confirm against the original file.
10551static inline VkFFTResult appendBoostThreadDataReorder(VkFFTSpecializationConstantsLayout* sc, const char* floatType, const char* uintType, uint64_t shuffleType, uint64_t start) {
10553 switch (shuffleType) {
10554 case 0: case 5: case 6: case 110: case 120: case 130: case 140: case 142: case 144: {
10555 uint64_t logicalStoragePerThread;
10556 if (start == 1) {
10557 logicalStoragePerThread = sc->registers_per_thread_per_radix[sc->stageRadix[0]] * sc->registerBoost;// (sc->registers_per_thread % sc->stageRadix[0] == 0) ? sc->registers_per_thread * sc->registerBoost : sc->min_registers_per_thread * sc->registerBoost;
10558 }
10559 else {
10560 logicalStoragePerThread = sc->registers_per_thread_per_radix[sc->stageRadix[sc->numStages - 1]] * sc->registerBoost;// (sc->registers_per_thread % sc->stageRadix[sc->numStages - 1] == 0) ? sc->registers_per_thread * sc->registerBoost : sc->min_registers_per_thread * sc->registerBoost;
10561 }
10562 uint64_t logicalGroupSize = sc->fftDim / logicalStoragePerThread;
10563 if ((sc->registerBoost > 1) && (logicalStoragePerThread != sc->min_registers_per_thread * sc->registerBoost)) {
10564 for (uint64_t k = 0; k < sc->registerBoost; k++) {
 // Passes after the first begin with a barrier before sdata is written again.
10565 if (k > 0) {
10566 res = appendBarrierVkFFT(sc, 2);
10567 if (res != VKFFT_SUCCESS) return res;
10568 }
10569 res = appendZeropadStart(sc);
10570 if (res != VKFFT_SUCCESS) return res;
 // NOTE(review): listing number 10571 is absent; the check below re-tests an unchanged `res`.
10572 if (res != VKFFT_SUCCESS) return res;
 // Store phase. start == 0: guarded stores stepping by logicalGroupSize;
 // otherwise: unguarded stores of min_registers_per_thread values stepping by localSize[0].
10573 if (start == 0) {
10574 sc->tempLen = sprintf(sc->tempStr, "\
10575 if (%s * %" PRIu64 " < %" PRIu64 ") {\n", sc->gl_LocalInvocationID_x, logicalStoragePerThread, sc->fftDim);
10576 res = VkAppendLine(sc);
10577 if (res != VKFFT_SUCCESS) return res;
10578 for (uint64_t i = 0; i < logicalStoragePerThread / sc->registerBoost; i++) {
10579 sc->tempLen = sprintf(sc->tempStr, "\
10580 sdata[%s + %" PRIu64 "] = %s;\n", sc->gl_LocalInvocationID_x, i * logicalGroupSize, sc->regIDs[i + k * sc->registers_per_thread]);
10581 res = VkAppendLine(sc);
10582 if (res != VKFFT_SUCCESS) return res;
10583 }
10584 sc->tempLen = sprintf(sc->tempStr, " }\n");
10585 res = VkAppendLine(sc);
10586 if (res != VKFFT_SUCCESS) return res;
10587 }
10588 else
10589 {
10590 for (uint64_t i = 0; i < sc->min_registers_per_thread; i++) {
10591 sc->tempLen = sprintf(sc->tempStr, "\
10592 sdata[%s + %" PRIu64 "] = %s;\n", sc->gl_LocalInvocationID_x, i * sc->localSize[0], sc->regIDs[i + k * sc->registers_per_thread]);
10593 res = VkAppendLine(sc);
10594 if (res != VKFFT_SUCCESS) return res;
10595 }
10596 }
 // NOTE(review): listing number 10597 is absent.
10598 if (res != VKFFT_SUCCESS) return res;
10599 res = appendZeropadEnd(sc);
10600 if (res != VKFFT_SUCCESS) return res;
 // Barrier between the store phase and the load phase.
10601 res = appendBarrierVkFFT(sc, 2);
10602 if (res != VKFFT_SUCCESS) return res;
10603 res = appendZeropadStart(sc);
10604 if (res != VKFFT_SUCCESS) return res;
 // NOTE(review): listing number 10605 is absent.
10606 if (res != VKFFT_SUCCESS) return res;
 // Load phase. start == 1: guarded loads stepping by logicalGroupSize;
 // otherwise: unguarded loads of min_registers_per_thread values stepping by localSize[0].
10607 if (start == 1) {
10608 sc->tempLen = sprintf(sc->tempStr, "\
10609 if (%s * %" PRIu64 " < %" PRIu64 ") {\n", sc->gl_LocalInvocationID_x, logicalStoragePerThread, sc->fftDim);
10610 res = VkAppendLine(sc);
10611 if (res != VKFFT_SUCCESS) return res;
10612 for (uint64_t i = 0; i < logicalStoragePerThread / sc->registerBoost; i++) {
10613 sc->tempLen = sprintf(sc->tempStr, "\
10614 %s = sdata[%s + %" PRIu64 "];\n", sc->regIDs[i + k * sc->registers_per_thread], sc->gl_LocalInvocationID_x, i * logicalGroupSize);
10615 res = VkAppendLine(sc);
10616 if (res != VKFFT_SUCCESS) return res;
10617 }
10618 sc->tempLen = sprintf(sc->tempStr, " }\n");
10619 res = VkAppendLine(sc);
10620 if (res != VKFFT_SUCCESS) return res;
10621 }
10622 else {
10623 for (uint64_t i = 0; i < sc->min_registers_per_thread; i++) {
10624 sc->tempLen = sprintf(sc->tempStr, "\
10625 %s = sdata[%s + %" PRIu64 "];\n", sc->regIDs[i + k * sc->registers_per_thread], sc->gl_LocalInvocationID_x, i * sc->localSize[0]);
10626 res = VkAppendLine(sc);
10627 if (res != VKFFT_SUCCESS) return res;
10628 }
10629 }
 // NOTE(review): listing number 10630 is absent.
10631 if (res != VKFFT_SUCCESS) return res;
10632 res = appendZeropadEnd(sc);
10633 if (res != VKFFT_SUCCESS) return res;
10634 }
10635 }
10636
10637 break;
10638 }
 // Second case group: same structure as the first, but the guard is on gl_LocalInvocationID_y,
 // sdata is indexed as gl_LocalInvocationID_x + sharedStride * (gl_LocalInvocationID_y + offset),
 // and the unguarded side steps by localSize[1].
10639 case 1: case 2: case 111: case 121: case 131: case 141: case 143: case 145: {
10640 uint64_t logicalStoragePerThread;
10641 if (start == 1) {
10642 logicalStoragePerThread = sc->registers_per_thread_per_radix[sc->stageRadix[0]] * sc->registerBoost;// (sc->registers_per_thread % sc->stageRadix[0] == 0) ? sc->registers_per_thread * sc->registerBoost : sc->min_registers_per_thread * sc->registerBoost;
10643 }
10644 else {
10645 logicalStoragePerThread = sc->registers_per_thread_per_radix[sc->stageRadix[sc->numStages - 1]] * sc->registerBoost;// (sc->registers_per_thread % sc->stageRadix[sc->numStages - 1] == 0) ? sc->registers_per_thread * sc->registerBoost : sc->min_registers_per_thread * sc->registerBoost;
10646 }
10647 uint64_t logicalGroupSize = sc->fftDim / logicalStoragePerThread;
10648 if ((sc->registerBoost > 1) && (logicalStoragePerThread != sc->min_registers_per_thread * sc->registerBoost)) {
10649 for (uint64_t k = 0; k < sc->registerBoost; k++) {
10650 if (k > 0) {
10651 res = appendBarrierVkFFT(sc, 2);
10652 if (res != VKFFT_SUCCESS) return res;
10653 }
10654 res = appendZeropadStart(sc);
10655 if (res != VKFFT_SUCCESS) return res;
 // NOTE(review): listing number 10656 is absent.
10657 if (res != VKFFT_SUCCESS) return res;
 // Store phase (see the first case group for the start == 0 / otherwise split).
10658 if (start == 0) {
10659 sc->tempLen = sprintf(sc->tempStr, "\
10660 if (%s * %" PRIu64 " < %" PRIu64 ") {\n", sc->gl_LocalInvocationID_y, logicalStoragePerThread, sc->fftDim);
10661 res = VkAppendLine(sc);
10662 if (res != VKFFT_SUCCESS) return res;
10663 for (uint64_t i = 0; i < logicalStoragePerThread / sc->registerBoost; i++) {
10664 sc->tempLen = sprintf(sc->tempStr, "\
10665 sdata[%s + %s * (%s + %" PRIu64 ")] = %s;\n", sc->gl_LocalInvocationID_x, sc->sharedStride, sc->gl_LocalInvocationID_y, i * logicalGroupSize, sc->regIDs[i + k * sc->registers_per_thread]);
10666 res = VkAppendLine(sc);
10667 if (res != VKFFT_SUCCESS) return res;
10668 }
10669 sc->tempLen = sprintf(sc->tempStr, " }\n");
10670 res = VkAppendLine(sc);
10671 if (res != VKFFT_SUCCESS) return res;
10672 }
10673 else
10674 {
10675 for (uint64_t i = 0; i < sc->min_registers_per_thread; i++) {
10676 sc->tempLen = sprintf(sc->tempStr, "\
10677 sdata[%s + %s * (%s + %" PRIu64 ")] = %s;\n", sc->gl_LocalInvocationID_x, sc->sharedStride, sc->gl_LocalInvocationID_y, i * sc->localSize[1], sc->regIDs[i + k * sc->registers_per_thread]);
10678 res = VkAppendLine(sc);
10679 if (res != VKFFT_SUCCESS) return res;
10680 }
10681 }
 // NOTE(review): listing number 10682 is absent.
10683 if (res != VKFFT_SUCCESS) return res;
10684 res = appendZeropadEnd(sc);
10685 if (res != VKFFT_SUCCESS) return res;
 // Barrier between the store phase and the load phase.
10686 res = appendBarrierVkFFT(sc, 2);
10687 if (res != VKFFT_SUCCESS) return res;
10688 res = appendZeropadStart(sc);
10689 if (res != VKFFT_SUCCESS) return res;
 // NOTE(review): listing number 10690 is absent.
10691 if (res != VKFFT_SUCCESS) return res;
 // Load phase (see the first case group for the start == 1 / otherwise split).
10692 if (start == 1) {
10693 sc->tempLen = sprintf(sc->tempStr, "\
10694 if (%s * %" PRIu64 " < %" PRIu64 ") {\n", sc->gl_LocalInvocationID_y, logicalStoragePerThread, sc->fftDim);
10695 res = VkAppendLine(sc);
10696 if (res != VKFFT_SUCCESS) return res;
10697 for (uint64_t i = 0; i < logicalStoragePerThread / sc->registerBoost; i++) {
10698 sc->tempLen = sprintf(sc->tempStr, "\
10699 %s = sdata[%s + %s * (%s + %" PRIu64 ")];\n", sc->regIDs[i + k * sc->registers_per_thread], sc->gl_LocalInvocationID_x, sc->sharedStride, sc->gl_LocalInvocationID_y, i * logicalGroupSize);
10700 res = VkAppendLine(sc);
10701 if (res != VKFFT_SUCCESS) return res;
10702 }
10703 sc->tempLen = sprintf(sc->tempStr, " }\n");
10704 res = VkAppendLine(sc);
10705 if (res != VKFFT_SUCCESS) return res;
10706 }
10707 else {
10708 for (uint64_t i = 0; i < sc->min_registers_per_thread; i++) {
10709 sc->tempLen = sprintf(sc->tempStr, "\
10710 %s = sdata[%s + %s * (%s + %" PRIu64 ")];\n", sc->regIDs[i + k * sc->registers_per_thread], sc->gl_LocalInvocationID_x, sc->sharedStride, sc->gl_LocalInvocationID_y, i * sc->localSize[1]);
10711 res = VkAppendLine(sc);
10712 if (res != VKFFT_SUCCESS) return res;
10713 }
10714 }
 // NOTE(review): listing number 10715 is absent.
10716 if (res != VKFFT_SUCCESS) return res;
10717 res = appendZeropadEnd(sc);
10718 if (res != VKFFT_SUCCESS) return res;
10719 }
10720 }
10721
10722 break;
10723 }
10724 }
10725 return res;
10726}
10727
10730 if ((!sc->writeFromRegisters) || ((sc->convolutionStep) && ((sc->matrixConvolution > 1) || (sc->numKernels > 1)))) {
10731 switch (readType) {
10732 case 0: case 5: case 6: case 110: case 120: case 130: case 140: case 142: case 144://single_c2c
10733 {
10734 res = appendBarrierVkFFT(sc, 1);
10735 if (res != VKFFT_SUCCESS) return res;
10736 res = appendZeropadStart(sc);
10737 if (res != VKFFT_SUCCESS) return res;
10739 if (res != VKFFT_SUCCESS) return res;
10740 if (sc->matrixConvolution == 1) {
10741 sc->tempLen = sprintf(sc->tempStr, "\
10742 %s = sdata[sharedStride * %s + %s];\n", sc->regIDs[0], sc->gl_LocalInvocationID_y, sc->gl_LocalInvocationID_x);
10743 res = VkAppendLine(sc);
10744 if (res != VKFFT_SUCCESS) return res;
10745 for (uint64_t i = 1; i < sc->min_registers_per_thread; i++) {
10746 sc->tempLen = sprintf(sc->tempStr, "\
10747 %s = sdata[sharedStride * %s + %s + %" PRIu64 " * %s];\n", sc->regIDs[i], sc->gl_LocalInvocationID_y, sc->gl_LocalInvocationID_x, i, sc->gl_WorkGroupSize_x);
10748 res = VkAppendLine(sc);
10749 if (res != VKFFT_SUCCESS) return res;
10750 }
10751 //appendBarrierVkFFT(sc, 3);
10752 }
10753 else {
10754 sc->tempLen = sprintf(sc->tempStr, "\
10755 switch (coordinate) {\n\
10756 case 0:\n");
10757 res = VkAppendLine(sc);
10758 if (res != VKFFT_SUCCESS) return res;
10759 sc->tempLen = sprintf(sc->tempStr, "\
10760 %s = sdata[sharedStride * %s + %s];\n", sc->regIDs[0], sc->gl_LocalInvocationID_y, sc->gl_LocalInvocationID_x);
10761 res = VkAppendLine(sc);
10762 if (res != VKFFT_SUCCESS) return res;
10763 for (uint64_t i = 1; i < sc->min_registers_per_thread; i++) {
10764 sc->tempLen = sprintf(sc->tempStr, "\
10765 %s = sdata[sharedStride * %s + %s + %" PRIu64 " * %s];\n", sc->regIDs[i], sc->gl_LocalInvocationID_y, sc->gl_LocalInvocationID_x, i, sc->gl_WorkGroupSize_x);
10766 res = VkAppendLine(sc);
10767 if (res != VKFFT_SUCCESS) return res;
10768 }
10769 //appendBarrierVkFFT(sc, 3);
10770 sc->tempLen = sprintf(sc->tempStr, " break;\n");
10771 res = VkAppendLine(sc);
10772 if (res != VKFFT_SUCCESS) return res;
10773 for (uint64_t i = 1; i < sc->matrixConvolution; i++) {
10774 sc->tempLen = sprintf(sc->tempStr, "\
10775 case %" PRIu64 ":\n", i);
10776 res = VkAppendLine(sc);
10777 if (res != VKFFT_SUCCESS) return res;
10778 sc->tempLen = sprintf(sc->tempStr, "\
10779 %s_%" PRIu64 " = sdata[sharedStride * %s + %s];\n", sc->regIDs[0], i, sc->gl_LocalInvocationID_y, sc->gl_LocalInvocationID_x);
10780 res = VkAppendLine(sc);
10781 if (res != VKFFT_SUCCESS) return res;
10782 for (uint64_t j = 1; j < sc->min_registers_per_thread; j++) {
10783 sc->tempLen = sprintf(sc->tempStr, "\
10784 %s_%" PRIu64 " = sdata[sharedStride * %s + %s + %" PRIu64 " * %s];\n", sc->regIDs[j], i, sc->gl_LocalInvocationID_y, sc->gl_LocalInvocationID_x, j, sc->gl_WorkGroupSize_x);
10785 res = VkAppendLine(sc);
10786 if (res != VKFFT_SUCCESS) return res;
10787 }
10788 //appendBarrierVkFFT(sc, 3);
10789 sc->tempLen = sprintf(sc->tempStr, " break;\n");
10790 res = VkAppendLine(sc);
10791 if (res != VKFFT_SUCCESS) return res;
10792 }
10793 sc->tempLen = sprintf(sc->tempStr, " }\n");
10794 res = VkAppendLine(sc);
10795 if (res != VKFFT_SUCCESS) return res;
10796 }
10798 if (res != VKFFT_SUCCESS) return res;
10799 res = appendZeropadEnd(sc);
10800 if (res != VKFFT_SUCCESS) return res;
10801 break;
10802 }
10803 case 1: case 111: case 121: case 131: case 141: case 143: case 145://grouped_c2c
10804 {
10805 res = appendBarrierVkFFT(sc, 1);
10806 if (res != VKFFT_SUCCESS) return res;
10807 res = appendZeropadStart(sc);
10808 if (res != VKFFT_SUCCESS) return res;
10810 if (res != VKFFT_SUCCESS) return res;
10811 if (sc->matrixConvolution == 1) {
10812 sc->tempLen = sprintf(sc->tempStr, "\
10813 %s = sdata[%s*(%s)+%s];\n", sc->regIDs[0], sc->sharedStride, sc->gl_LocalInvocationID_y, sc->gl_LocalInvocationID_x);
10814 res = VkAppendLine(sc);
10815 if (res != VKFFT_SUCCESS) return res;
10816 for (uint64_t i = 1; i < sc->min_registers_per_thread; i++) {
10817 sc->tempLen = sprintf(sc->tempStr, "\
10818 %s = sdata[%s*(%s+%" PRIu64 "*%s)+%s];\n", sc->regIDs[i], sc->sharedStride, sc->gl_LocalInvocationID_y, i, sc->gl_WorkGroupSize_y, sc->gl_LocalInvocationID_x);
10819 res = VkAppendLine(sc);
10820 if (res != VKFFT_SUCCESS) return res;
10821 }
10822 //appendBarrierVkFFT(sc, 3);
10823 }
10824 else {
10825 sc->tempLen = sprintf(sc->tempStr, "\
10826 switch (coordinate) {\n\
10827 case 0:\n");
10828 res = VkAppendLine(sc);
10829 if (res != VKFFT_SUCCESS) return res;
10830 sc->tempLen = sprintf(sc->tempStr, "\
10831 %s = sdata[%s*(%s)+%s];\n", sc->regIDs[0], sc->sharedStride, sc->gl_LocalInvocationID_y, sc->gl_LocalInvocationID_x);
10832 res = VkAppendLine(sc);
10833 if (res != VKFFT_SUCCESS) return res;
10834 for (uint64_t i = 1; i < sc->min_registers_per_thread; i++) {
10835 sc->tempLen = sprintf(sc->tempStr, "\
10836 %s = sdata[%s*(%s+%" PRIu64 "*%s)+%s];\n", sc->regIDs[i], sc->sharedStride, sc->gl_LocalInvocationID_y, i, sc->gl_WorkGroupSize_y, sc->gl_LocalInvocationID_x);
10837 res = VkAppendLine(sc);
10838 if (res != VKFFT_SUCCESS) return res;
10839 }
10840 //appendBarrierVkFFT(sc, 3);
10841 sc->tempLen = sprintf(sc->tempStr, " break;\n");
10842 res = VkAppendLine(sc);
10843 if (res != VKFFT_SUCCESS) return res;
10844 for (uint64_t i = 1; i < sc->matrixConvolution; i++) {
10845 sc->tempLen = sprintf(sc->tempStr, "\
10846 case %" PRIu64 ":\n", i);
10847 res = VkAppendLine(sc);
10848 if (res != VKFFT_SUCCESS) return res;
10849 sc->tempLen = sprintf(sc->tempStr, "\
10850 %s_%" PRIu64 " = sdata[%s*(%s)+%s];\n", sc->regIDs[0], i, sc->sharedStride, sc->gl_LocalInvocationID_y, sc->gl_LocalInvocationID_x);
10851 res = VkAppendLine(sc);
10852 if (res != VKFFT_SUCCESS) return res;
10853 for (uint64_t j = 1; j < sc->min_registers_per_thread; j++) {
10854 sc->tempLen = sprintf(sc->tempStr, "\
10855 %s_%" PRIu64 " = sdata[%s*(%s+%" PRIu64 "*%s)+%s];\n", sc->regIDs[j], i, sc->sharedStride, sc->gl_LocalInvocationID_y, j, sc->gl_WorkGroupSize_y, sc->gl_LocalInvocationID_x);
10856 res = VkAppendLine(sc);
10857 if (res != VKFFT_SUCCESS) return res;
10858 }
10859 //appendBarrierVkFFT(sc, 3);
10860 sc->tempLen = sprintf(sc->tempStr, " break;\n");
10861 res = VkAppendLine(sc);
10862 if (res != VKFFT_SUCCESS) return res;
10863 }
10864 sc->tempLen = sprintf(sc->tempStr, " }\n");
10865 res = VkAppendLine(sc);
10866 if (res != VKFFT_SUCCESS) return res;
10867 }
10869 if (res != VKFFT_SUCCESS) return res;
10870 res = appendZeropadEnd(sc);
10871 if (res != VKFFT_SUCCESS) return res;
10872 break;
10873 }
10874 }
10875 }
10876 return res;
10877}
// NOTE(review): the signature line of this function and the declaration of `res` are not
// present in this listing (the embedded line numbers jump from 10877 to 10880). Judging by
// the body it is presumably appendRegistersToShared(sc, ..., readType) - confirm against the
// upstream header.
// NOTE(review): four `if (res != VKFFT_SUCCESS) return res;` checks below directly follow
// another identical check (listing numbers 10888, 10947, 10959 and 11018 are skipped); the
// call each of them guards is missing from this listing.
//
// Purpose, as far as the visible body shows: emit shader text that assigns every register
// identifier sc->regIDs[i] to one element of the `sdata` array, bracketed by a barrier and
// by the zero-padding start/end helpers.
//
// Nothing is emitted unless readToRegisters is off, or this is a convolution step that
// works on more than one coordinate or more than one kernel.
10880 if ((!sc->readToRegisters) || ((sc->convolutionStep) && ((sc->matrixConvolution > 1) || (sc->numKernels > 1)))) {
10881 switch (readType) {
// First layout: element index is sharedStride * localID_y + localID_x + i * workGroupSize_x.
10882 case 0: case 5: case 6: case 110: case 120: case 130: case 140: case 142: case 144://single_c2c
10883 {
10884 res = appendBarrierVkFFT(sc, 1);
10885 if (res != VKFFT_SUCCESS) return res;
10886 res = appendZeropadStart(sc);
10887 if (res != VKFFT_SUCCESS) return res;
10889 if (res != VKFFT_SUCCESS) return res;
// Single coordinate: store the plain register names, register 0 first (no offset term).
10890 if (sc->matrixConvolution == 1) {
10891 sc->tempLen = sprintf(sc->tempStr, "\
10892 sdata[sharedStride * %s + %s] = %s;\n", sc->gl_LocalInvocationID_y, sc->gl_LocalInvocationID_x, sc->regIDs[0]);
10893 res = VkAppendLine(sc);
10894 if (res != VKFFT_SUCCESS) return res;
10895 for (uint64_t i = 1; i < sc->min_registers_per_thread; i++) {
10896 sc->tempLen = sprintf(sc->tempStr, "\
10897 sdata[sharedStride * %s + %s + %" PRIu64 " * %s] = %s;\n", sc->gl_LocalInvocationID_y, sc->gl_LocalInvocationID_x, i, sc->gl_WorkGroupSize_x, sc->regIDs[i]);
10898 res = VkAppendLine(sc);
10899 if (res != VKFFT_SUCCESS) return res;
10900 }
10901 //appendBarrierVkFFT(sc, 3);
10902 }
10903 else {
// Several coordinates: the emitted shader selects the register set at run time with
// `switch (coordinate)`. Case 0 uses the plain register names.
10904 sc->tempLen = sprintf(sc->tempStr, "\
10905 switch (coordinate) {\n\
10906 case 0:\n");
10907 res = VkAppendLine(sc);
10908 if (res != VKFFT_SUCCESS) return res;
10909 sc->tempLen = sprintf(sc->tempStr, "\
10910 sdata[sharedStride * %s + %s] = %s;\n", sc->gl_LocalInvocationID_y, sc->gl_LocalInvocationID_x, sc->regIDs[0]);
10911 res = VkAppendLine(sc);
10912 if (res != VKFFT_SUCCESS) return res;
10913 for (uint64_t i = 1; i < sc->min_registers_per_thread; i++) {
10914 sc->tempLen = sprintf(sc->tempStr, "\
10915 sdata[sharedStride * %s + %s + %" PRIu64 " * %s] = %s;\n", sc->gl_LocalInvocationID_y, sc->gl_LocalInvocationID_x, i, sc->gl_WorkGroupSize_x, sc->regIDs[i]);
10916 res = VkAppendLine(sc);
10917 if (res != VKFFT_SUCCESS) return res;
10918 }
10919 //appendBarrierVkFFT(sc, 3);
10920 sc->tempLen = sprintf(sc->tempStr, " break;\n");
10921 res = VkAppendLine(sc);
10922 if (res != VKFFT_SUCCESS) return res;
// Cases 1..matrixConvolution-1 store the registers named `<reg>_<i>`.
10923 for (uint64_t i = 1; i < sc->matrixConvolution; i++) {
10924 sc->tempLen = sprintf(sc->tempStr, "\
10925 case %" PRIu64 ":\n", i);
10926 res = VkAppendLine(sc);
10927 if (res != VKFFT_SUCCESS) return res;
10928 sc->tempLen = sprintf(sc->tempStr, "\
10929 sdata[sharedStride * %s + %s] = %s_%" PRIu64 ";\n", sc->gl_LocalInvocationID_y, sc->gl_LocalInvocationID_x, sc->regIDs[0], i);
10930 res = VkAppendLine(sc);
10931 if (res != VKFFT_SUCCESS) return res;
10932 for (uint64_t j = 1; j < sc->min_registers_per_thread; j++) {
10933 sc->tempLen = sprintf(sc->tempStr, "\
10934 sdata[sharedStride * %s + %s + %" PRIu64 " * %s] = %s_%" PRIu64 ";\n", sc->gl_LocalInvocationID_y, sc->gl_LocalInvocationID_x, j, sc->gl_WorkGroupSize_x, sc->regIDs[j], i);
10935 res = VkAppendLine(sc);
10936 if (res != VKFFT_SUCCESS) return res;
10937 }
10938 //appendBarrierVkFFT(sc, 3);
10939 sc->tempLen = sprintf(sc->tempStr, " break;\n");
10940 res = VkAppendLine(sc);
10941 if (res != VKFFT_SUCCESS) return res;
10942 }
// Close the emitted `switch (coordinate)`.
10943 sc->tempLen = sprintf(sc->tempStr, " }\n");
10944 res = VkAppendLine(sc);
10945 if (res != VKFFT_SUCCESS) return res;
10946 }
10948 if (res != VKFFT_SUCCESS) return res;
10949 res = appendZeropadEnd(sc);
10950 if (res != VKFFT_SUCCESS) return res;
10951 break;
10952 }
// Second layout: element index is <sc->sharedStride> * (localID_y + i * workGroupSize_y) + localID_x.
// Here the stride text comes from sc->sharedStride instead of the literal name `sharedStride`.
10953 case 1: case 111: case 121: case 131: case 141: case 143: case 145://grouped_c2c
10954 {
10955 res = appendBarrierVkFFT(sc, 1);
10956 if (res != VKFFT_SUCCESS) return res;
10957 res = appendZeropadStart(sc);
10958 if (res != VKFFT_SUCCESS) return res;
10960 if (res != VKFFT_SUCCESS) return res;
// Single coordinate: plain register names only.
10961 if (sc->matrixConvolution == 1) {
10962 sc->tempLen = sprintf(sc->tempStr, "\
10963 sdata[%s*(%s)+%s] = %s;\n", sc->sharedStride, sc->gl_LocalInvocationID_y, sc->gl_LocalInvocationID_x, sc->regIDs[0]);
10964 res = VkAppendLine(sc);
10965 if (res != VKFFT_SUCCESS) return res;
10966 for (uint64_t i = 1; i < sc->min_registers_per_thread; i++) {
10967 sc->tempLen = sprintf(sc->tempStr, "\
10968 sdata[%s*(%s+%" PRIu64 "*%s)+%s] = %s;\n", sc->sharedStride, sc->gl_LocalInvocationID_y, i, sc->gl_WorkGroupSize_y, sc->gl_LocalInvocationID_x, sc->regIDs[i]);
10969 res = VkAppendLine(sc);
10970 if (res != VKFFT_SUCCESS) return res;
10971 }
10972 //appendBarrierVkFFT(sc, 3);
10973 }
10974 else {
// Several coordinates: same `switch (coordinate)` structure as in the first layout.
10975 sc->tempLen = sprintf(sc->tempStr, "\
10976 switch (coordinate) {\n\
10977 case 0:\n");
10978 res = VkAppendLine(sc);
10979 if (res != VKFFT_SUCCESS) return res;
10980 sc->tempLen = sprintf(sc->tempStr, "\
10981 sdata[%s*(%s)+%s] = %s;\n", sc->sharedStride, sc->gl_LocalInvocationID_y, sc->gl_LocalInvocationID_x, sc->regIDs[0]);
10982 res = VkAppendLine(sc);
10983 if (res != VKFFT_SUCCESS) return res;
10984 for (uint64_t i = 1; i < sc->min_registers_per_thread; i++) {
10985 sc->tempLen = sprintf(sc->tempStr, "\
10986 sdata[%s*(%s+%" PRIu64 "*%s)+%s] = %s;\n", sc->sharedStride, sc->gl_LocalInvocationID_y, i, sc->gl_WorkGroupSize_y, sc->gl_LocalInvocationID_x, sc->regIDs[i]);
10987 res = VkAppendLine(sc);
10988 if (res != VKFFT_SUCCESS) return res;
10989 }
10990 //appendBarrierVkFFT(sc, 3);
10991 sc->tempLen = sprintf(sc->tempStr, " break;\n");
10992 res = VkAppendLine(sc);
10993 if (res != VKFFT_SUCCESS) return res;
// Cases 1..matrixConvolution-1 store the registers named `<reg>_<i>`.
10994 for (uint64_t i = 1; i < sc->matrixConvolution; i++) {
10995 sc->tempLen = sprintf(sc->tempStr, "\
10996 case %" PRIu64 ":\n", i);
10997 res = VkAppendLine(sc);
10998 if (res != VKFFT_SUCCESS) return res;
10999 sc->tempLen = sprintf(sc->tempStr, "\
11000 sdata[%s*(%s)+%s] = %s_%" PRIu64 ";\n", sc->sharedStride, sc->gl_LocalInvocationID_y, sc->gl_LocalInvocationID_x, sc->regIDs[0], i);
11001 res = VkAppendLine(sc);
11002 if (res != VKFFT_SUCCESS) return res;
11003 for (uint64_t j = 1; j < sc->min_registers_per_thread; j++) {
11004 sc->tempLen = sprintf(sc->tempStr, "\
11005 sdata[%s*(%s+%" PRIu64 "*%s)+%s] = %s_%" PRIu64 ";\n", sc->sharedStride, sc->gl_LocalInvocationID_y, j, sc->gl_WorkGroupSize_y, sc->gl_LocalInvocationID_x, sc->regIDs[j], i);
11006 res = VkAppendLine(sc);
11007 if (res != VKFFT_SUCCESS) return res;
11008 }
11009 //appendBarrierVkFFT(sc, 3);
11010 sc->tempLen = sprintf(sc->tempStr, " break;\n");
11011 res = VkAppendLine(sc);
11012 if (res != VKFFT_SUCCESS) return res;
11013 }
// Close the emitted `switch (coordinate)`.
11014 sc->tempLen = sprintf(sc->tempStr, " }\n");
11015 res = VkAppendLine(sc);
11016 if (res != VKFFT_SUCCESS) return res;
11017 }
11019 if (res != VKFFT_SUCCESS) return res;
11020 res = appendZeropadEnd(sc);
11021 if (res != VKFFT_SUCCESS) return res;
11022 break;
11023 }
11024 }
11025 }
// Any readType not listed above emits nothing; `res` is returned as it was last set.
11026 return res;
11027}
11028static inline VkFFTResult appendPreparationBatchedKernelConvolution(VkFFTSpecializationConstantsLayout* sc, const char* floatType, const char* floatTypeMemory, const char* uintType, uint64_t dataType) {
11030 char vecType[30];
11031#if(VKFFT_BACKEND==0)
11032 if (!strcmp(floatType, "float")) sprintf(vecType, "vec2");
11033 if (!strcmp(floatType, "double")) sprintf(vecType, "dvec2");
11034#elif(VKFFT_BACKEND==1)
11035 if (!strcmp(floatType, "float")) sprintf(vecType, "float2");
11036 if (!strcmp(floatType, "double")) sprintf(vecType, "double2");
11037#elif(VKFFT_BACKEND==2)
11038 if (!strcmp(floatType, "float")) sprintf(vecType, "float2");
11039 if (!strcmp(floatType, "double")) sprintf(vecType, "double2");
11040#elif(VKFFT_BACKEND==3)
11041 if (!strcmp(floatType, "float")) sprintf(vecType, "float2");
11042 if (!strcmp(floatType, "double")) sprintf(vecType, "double2");
11043#endif
11044 char separateRegisterStore[100] = "_store";
11045
11046 for (uint64_t i = 0; i < sc->registers_per_thread; i++) {
11047 sc->tempLen = sprintf(sc->tempStr, " %s %s%s;\n", vecType, sc->regIDs[i], separateRegisterStore);
11048 res = VkAppendLine(sc);
11049 if (res != VKFFT_SUCCESS) return res;
11050 for (uint64_t j = 1; j < sc->matrixConvolution; j++) {
11051 sc->tempLen = sprintf(sc->tempStr, " %s %s_%" PRIu64 "%s;\n", vecType, sc->regIDs[i], j, separateRegisterStore);
11052 res = VkAppendLine(sc);
11053 if (res != VKFFT_SUCCESS) return res;
11054 }
11055 }
11056 for (uint64_t i = 0; i < sc->registers_per_thread; i++) {
11057 //sc->tempLen = sprintf(sc->tempStr, " temp%s[i]=temp[i];\n", separateRegisterStore);
11058 sc->tempLen = sprintf(sc->tempStr, " %s%s=%s;\n", sc->regIDs[i], separateRegisterStore, sc->regIDs[i]);
11059 res = VkAppendLine(sc);
11060 if (res != VKFFT_SUCCESS) return res;
11061 for (uint64_t j = 1; j < sc->matrixConvolution; j++) {
11062 sc->tempLen = sprintf(sc->tempStr, " %s_%" PRIu64 "%s=%s_%" PRIu64 ";\n", sc->regIDs[i], j, separateRegisterStore, sc->regIDs[i], j);
11063 res = VkAppendLine(sc);
11064 if (res != VKFFT_SUCCESS) return res;
11065 }
11066 }
11067 sc->tempLen = sprintf(sc->tempStr, " for (%s batchID=0; batchID < %" PRIu64 "; batchID++){\n", uintType, sc->numKernels);
11068 res = VkAppendLine(sc);
11069 if (res != VKFFT_SUCCESS) return res;
11070 return res;
11071}
// Emits shader text that multiplies every register by an entry of the
// `BluesteinConvolutionKernel` array.
//
// For each of the sc->min_registers_per_thread registers the generated text
//   1. assigns an element index to the variable named by sc->inoutID (formula depends on dataType),
//   2. computes a complex product into temp_real0 / temp_imag0,
//   3. copies temp_real0 / temp_imag0 back into the register's .x / .y.
// When sc->inverseBluestein is set and sc->fftDim == sc->fft_dim_full the signs of the
// kernel's .y terms are flipped, i.e. the register is multiplied by the complex conjugate
// of the kernel entry.
//
// Parameters:
//   sc              - generator state; lines are formatted into sc->tempStr / sc->tempLen and
//                     appended with VkAppendLine().
//   floatType       - scalar type name used for the emitted temp_real<j> / temp_imag<j> declarations.
//   floatTypeMemory - not referenced in this function.
//   uintType        - not referenced in this function.
//   dataType        - selects which index formula is emitted; values outside the two case lists
//                     emit no index assignment.
// Returns VKFFT_SUCCESS or the first non-success code from a helper.
//
// NOTE(review): the declaration of `res` is not present in this listing (number 11073 is
// skipped). Likewise, the two `if (res != VKFFT_SUCCESS) return res;` checks that directly
// follow another check (numbers 11095 and 11160 are skipped) guard calls that are missing here.
11072static inline VkFFTResult appendBluesteinConvolution(VkFFTSpecializationConstantsLayout* sc, const char* floatType, const char* floatTypeMemory, const char* uintType, uint64_t dataType) {
// Optional additive work-group shift, spliced into the index expressions below.
11074 char shiftX[500] = "";
11075 if (sc->performWorkGroupShift[0])
11076 sprintf(shiftX, " + consts.workGroupShiftX * %s ", sc->gl_WorkGroupSize_x);
// requestCoordinate, index_x, index_y and requestBatch are only referenced by the
// commented-out sprintf further down; they do not influence the emitted text.
11077 char requestCoordinate[100] = "";
11078 if (sc->convolutionStep) {
11079 if (sc->matrixConvolution > 1) {
11080 sprintf(requestCoordinate, "0");
11081 }
11082 }
11083 char index_x[2000] = "";
11084 char index_y[2000] = "";
11085 char requestBatch[100] = "";
// With several kernels in a convolution step the multiply reads the `<reg>_store` names
// instead of the plain register names.
11086 char separateRegisterStore[100] = "";
11087 if (sc->convolutionStep) {
11088 if (sc->numKernels > 1) {
11089 sprintf(requestBatch, "batchID");
11090 sprintf(separateRegisterStore, "_store");
11091 }
11092 }
11093 res = appendZeropadStart(sc);
11094 if (res != VKFFT_SUCCESS) return res;
11096 if (res != VKFFT_SUCCESS) return res;
// Declare one zero-initialised temp_real<j> / temp_imag<j> pair per coordinate.
// Only the pair with j == 0 is used by the multiply lines emitted below.
11097 for (uint64_t j = 0; j < sc->matrixConvolution; j++) {
11098 sc->tempLen = sprintf(sc->tempStr, " %s temp_real%" PRIu64 " = 0;\n", floatType, j);
11099 res = VkAppendLine(sc);
11100 if (res != VKFFT_SUCCESS) return res;
11101 sc->tempLen = sprintf(sc->tempStr, " %s temp_imag%" PRIu64 " = 0;\n", floatType, j);
11102 res = VkAppendLine(sc);
11103 if (res != VKFFT_SUCCESS) return res;
11104 }
11105 for (uint64_t i = 0; i < sc->min_registers_per_thread; i++) {
// Step 1: emit the index assignment for register i.
11106 switch (dataType) {
11107 case 0: case 5: case 6: case 110: case 120: case 130: case 140: case 142: case 144:
11108 {
// Full-dimension FFT: index is simply localID_x + i * localSize[0].
11109 if (sc->fftDim == sc->fft_dim_full) {
11110 sc->tempLen = sprintf(sc->tempStr, " %s = %s + %" PRIu64 ";\n", sc->inoutID, sc->gl_LocalInvocationID_x, i * sc->localSize[0]);
11111 res = VkAppendLine(sc);
11112 if (res != VKFFT_SUCCESS) return res;
11113 }
11114 else {
// Partial FFT: index also depends on localID_y, the work-group ID and firstStageStartSize.
// Unlike the other index lines, this format string does not end with a newline.
11115 sc->tempLen = sprintf(sc->tempStr, " %s = %s+%" PRIu64 "+%s * %" PRIu64 " + (((%s%s) %% %" PRIu64 ") * %" PRIu64 " + ((%s%s) / %" PRIu64 ") * %" PRIu64 ");", sc->inoutID, sc->gl_LocalInvocationID_x, i * sc->localSize[0], sc->gl_LocalInvocationID_y, sc->firstStageStartSize, sc->gl_WorkGroupID_x, shiftX, sc->firstStageStartSize / sc->fftDim, sc->fftDim, sc->gl_WorkGroupID_x, shiftX, sc->firstStageStartSize / sc->fftDim, sc->localSize[1] * sc->firstStageStartSize);
11116 res = VkAppendLine(sc);
11117 if (res != VKFFT_SUCCESS) return res;
11118 //sc->tempLen = sprintf(sc->tempStr, " inoutID = indexInput(%s+%" PRIu64 "+%s * %" PRIu64 " + (((%s%s) %% %" PRIu64 ") * %" PRIu64 " + ((%s%s) / %" PRIu64 ") * %" PRIu64 ")%s%s);\n", sc->gl_LocalInvocationID_x, i * sc->localSize[0], sc->gl_LocalInvocationID_y, sc->firstStageStartSize, sc->gl_WorkGroupID_x, shiftX, sc->firstStageStartSize / sc->fftDim, sc->fftDim, sc->gl_WorkGroupID_x, shiftX, sc->firstStageStartSize / sc->fftDim, sc->localSize[1] * sc->firstStageStartSize, requestCoordinate, requestBatch);
11119 }
11120 break;
11121 }
11122 case 1: case 111: case 121: case 131: case 141: case 143: case 145:
11123 {
// Full-dimension FFT: index is localID_y + i * localSize[1].
11124 if (sc->fftDim == sc->fft_dim_full) {
11125 sc->tempLen = sprintf(sc->tempStr, " %s = %s + %" PRIu64 ";\n", sc->inoutID, sc->gl_LocalInvocationID_y, i * sc->localSize[1]);
11126 res = VkAppendLine(sc);
11127 if (res != VKFFT_SUCCESS) return res;
11128 }
11129 else {
// Partial FFT: index is built from localID_y, the global x ID, fft_dim_x and stageStartSize.
11130 sc->tempLen = sprintf(sc->tempStr, " %s = (%" PRIu64 " * (%s + %" PRIu64 ") + ((%s%s) / %" PRIu64 ") %% (%" PRIu64 ")+((%s%s) / %" PRIu64 ") * (%" PRIu64 "));\n", sc->inoutID, sc->stageStartSize, sc->gl_LocalInvocationID_y, (i)*sc->localSize[1], sc->gl_GlobalInvocationID_x, shiftX, sc->fft_dim_x, sc->stageStartSize, sc->gl_GlobalInvocationID_x, shiftX, sc->fft_dim_x * sc->stageStartSize, sc->fftDim * sc->stageStartSize);
11131 res = VkAppendLine(sc);
11132 if (res != VKFFT_SUCCESS) return res;
11133 }
11134 break;
11135 }
11136 }
// Step 2: emit the complex multiply.
// NOTE(review): the multiply lines hard-code the index name `inoutID`, while the index above
// is assigned to the name held in sc->inoutID - presumably the two are the same; confirm.
11137 char kernelName[100] = "";
11138 sprintf(kernelName, "BluesteinConvolutionKernel");
// Real part: k.x*r.x + k.y*r.y for the conjugated (inverse, full-dimension) case, otherwise k.x*r.x - k.y*r.y.
11139 if ((sc->inverseBluestein) && (sc->fftDim == sc->fft_dim_full))
11140 sc->tempLen = sprintf(sc->tempStr, " temp_real0 = %s[inoutID].x * %s%s.x + %s[inoutID].y * %s%s.y;\n", kernelName, sc->regIDs[i], separateRegisterStore, kernelName, sc->regIDs[i], separateRegisterStore);
11141 else
11142 sc->tempLen = sprintf(sc->tempStr, " temp_real0 = %s[inoutID].x * %s%s.x - %s[inoutID].y * %s%s.y;\n", kernelName, sc->regIDs[i], separateRegisterStore, kernelName, sc->regIDs[i], separateRegisterStore);
11143
11144 res = VkAppendLine(sc);
11145 if (res != VKFFT_SUCCESS) return res;
11146
// Imaginary part: k.x*r.y - k.y*r.x for the conjugated case, otherwise k.x*r.y + k.y*r.x.
11147 if ((sc->inverseBluestein) && (sc->fftDim == sc->fft_dim_full))
11148 sc->tempLen = sprintf(sc->tempStr, " temp_imag0 = %s[inoutID].x * %s%s.y - %s[inoutID].y * %s%s.x;\n", kernelName, sc->regIDs[i], separateRegisterStore, kernelName, sc->regIDs[i], separateRegisterStore);
11149 else
11150 sc->tempLen = sprintf(sc->tempStr, " temp_imag0 = %s[inoutID].x * %s%s.y + %s[inoutID].y * %s%s.x;\n", kernelName, sc->regIDs[i], separateRegisterStore, kernelName, sc->regIDs[i], separateRegisterStore);
11151 res = VkAppendLine(sc);
11152 if (res != VKFFT_SUCCESS) return res;
// Step 3: write the product back into the live register (always the name without the `_store` suffix).
11153 sc->tempLen = sprintf(sc->tempStr, " %s.x = temp_real0;\n", sc->regIDs[i]);
11154 res = VkAppendLine(sc);
11155 if (res != VKFFT_SUCCESS) return res;
11156 sc->tempLen = sprintf(sc->tempStr, " %s.y = temp_imag0;\n", sc->regIDs[i]);
11157 res = VkAppendLine(sc);
11158 if (res != VKFFT_SUCCESS) return res;
11159 }
11161 if (res != VKFFT_SUCCESS) return res;
11162 res = appendZeropadEnd(sc);
11163 if (res != VKFFT_SUCCESS) return res;
11164 return res;
11165}
11166
11167static inline VkFFTResult appendKernelConvolution(VkFFTSpecializationConstantsLayout* sc, const char* floatType, const char* floatTypeMemory, const char* uintType, uint64_t dataType) {
11169 char shiftX[500] = "";
11170 if (sc->performWorkGroupShift[0])
11171 sprintf(shiftX, " + consts.workGroupShiftX * %s ", sc->gl_WorkGroupSize_x);
11172 char requestCoordinate[100] = "";
11173 if (sc->convolutionStep) {
11174 if (sc->matrixConvolution > 1) {
11175 sprintf(requestCoordinate, "0");
11176 }
11177 }
11178 char index_x[2000] = "";
11179 char index_y[2000] = "";
11180 char requestBatch[100] = "";
11181 char separateRegisterStore[100] = "";
11182 if (sc->convolutionStep) {
11183 if (sc->numKernels > 1) {
11184 sprintf(requestBatch, "batchID");
11185 sprintf(separateRegisterStore, "_store");
11186 }
11187 }
11188 res = appendZeropadStart(sc);
11189 if (res != VKFFT_SUCCESS) return res;
11191 if (res != VKFFT_SUCCESS) return res;
11192 for (uint64_t j = 0; j < sc->matrixConvolution; j++) {
11193 sc->tempLen = sprintf(sc->tempStr, " %s temp_real%" PRIu64 " = 0;\n", floatType, j);
11194 res = VkAppendLine(sc);
11195 if (res != VKFFT_SUCCESS) return res;
11196 sc->tempLen = sprintf(sc->tempStr, " %s temp_imag%" PRIu64 " = 0;\n", floatType, j);
11197 res = VkAppendLine(sc);
11198 if (res != VKFFT_SUCCESS) return res;
11199 }
11200 for (uint64_t i = 0; i < sc->min_registers_per_thread; i++) {
11201 if (i > 0) {
11202 for (uint64_t j = 0; j < sc->matrixConvolution; j++) {
11203 sc->tempLen = sprintf(sc->tempStr, " temp_real%" PRIu64 " = 0;\n", j);
11204 res = VkAppendLine(sc);
11205 if (res != VKFFT_SUCCESS) return res;
11206 sc->tempLen = sprintf(sc->tempStr, " temp_imag%" PRIu64 " = 0;\n", j);
11207 res = VkAppendLine(sc);
11208 if (res != VKFFT_SUCCESS) return res;
11209 }
11210 }
11211 switch (dataType) {
11212 case 0:
11213 {
11214 if (sc->fftDim == sc->fft_dim_full) {
11215 if (sc->localSize[1] == 1)
11216 sc->tempLen = sprintf(sc->tempStr, " combinedID = %s + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, i * sc->localSize[0]);
11217 else
11218 sc->tempLen = sprintf(sc->tempStr, " combinedID = (%s + %" PRIu64 " * %s) + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, sc->localSize[0], sc->gl_LocalInvocationID_y, i * sc->localSize[0] * sc->localSize[1]);
11219 res = VkAppendLine(sc);
11220 if (res != VKFFT_SUCCESS) return res;
11221
11222 if (sc->inputStride[0] > 1) {
11223 sc->tempLen = sprintf(sc->tempStr, " %s = ", sc->inoutID);
11224 res = VkAppendLine(sc);
11225 if (res != VKFFT_SUCCESS) return res;
11226 sprintf(index_x, "(combinedID %% %" PRIu64 ") * %" PRIu64 " + (combinedID / %" PRIu64 ") * %" PRIu64 "", sc->fftDim, sc->inputStride[0], sc->fftDim, sc->inputStride[1]);
11227 uint64_t tempSaveInputOffset = sc->inputOffset;
11228 uint64_t tempSaveInputNumberByteSize = sc->inputNumberByteSize;
11229 sc->inputOffset = sc->kernelOffset;
11231 res = indexInputVkFFT(sc, uintType, dataType, index_x, 0, requestCoordinate, requestBatch);
11232 if (res != VKFFT_SUCCESS) return res;
11233 sc->inputOffset = tempSaveInputOffset;
11234 sc->inputNumberByteSize = tempSaveInputNumberByteSize;
11235 sc->tempLen = sprintf(sc->tempStr, ";\n");
11236 res = VkAppendLine(sc);
11237 if (res != VKFFT_SUCCESS) return res;
11238 //sc->tempLen = sprintf(sc->tempStr, " inoutID = indexInput((combinedID %% %" PRIu64 ") * %" PRIu64 " + (combinedID / %" PRIu64 ") * %" PRIu64 "%s%s);\n", sc->fftDim, sc->inputStride[0], sc->fftDim, sc->inputStride[1], requestCoordinate, requestBatch);
11239 }
11240 else {
11241 sc->tempLen = sprintf(sc->tempStr, " %s = ", sc->inoutID);
11242 res = VkAppendLine(sc);
11243 if (res != VKFFT_SUCCESS) return res;
11244 sprintf(index_x, "(combinedID %% %" PRIu64 ") + (combinedID / %" PRIu64 ") * %" PRIu64 "", sc->fftDim, sc->fftDim, sc->inputStride[1]);
11245 uint64_t tempSaveInputOffset = sc->inputOffset;
11246 uint64_t tempSaveInputNumberByteSize = sc->inputNumberByteSize;
11247 sc->inputOffset = sc->kernelOffset;
11249 res = indexInputVkFFT(sc, uintType, dataType, index_x, 0, requestCoordinate, requestBatch);
11250 if (res != VKFFT_SUCCESS) return res;
11251 sc->inputOffset = tempSaveInputOffset;
11252 sc->inputNumberByteSize = tempSaveInputNumberByteSize;
11253 sc->tempLen = sprintf(sc->tempStr, ";\n");
11254 res = VkAppendLine(sc);
11255 if (res != VKFFT_SUCCESS) return res;
11256 //sc->tempLen = sprintf(sc->tempStr, " inoutID = indexInput((combinedID %% %" PRIu64 ") + (combinedID / %" PRIu64 ") * %" PRIu64 "%s%s);\n", sc->fftDim, sc->fftDim, sc->inputStride[1], requestCoordinate, requestBatch);
11257 }
11258 }
11259 else {
11260 sc->tempLen = sprintf(sc->tempStr, " %s = ", sc->inoutID);
11261 res = VkAppendLine(sc);
11262 if (res != VKFFT_SUCCESS) return res;
11263 sprintf(index_x, "%s+%" PRIu64 "+%s * %" PRIu64 " + (((%s%s) %% %" PRIu64 ") * %" PRIu64 " + ((%s%s) / %" PRIu64 ") * %" PRIu64 ")", sc->gl_LocalInvocationID_x, i * sc->localSize[0], sc->gl_LocalInvocationID_y, sc->firstStageStartSize, sc->gl_WorkGroupID_x, shiftX, sc->firstStageStartSize / sc->fftDim, sc->fftDim, sc->gl_WorkGroupID_x, shiftX, sc->firstStageStartSize / sc->fftDim, sc->localSize[1] * sc->firstStageStartSize);
11264 uint64_t tempSaveInputOffset = sc->inputOffset;
11265 uint64_t tempSaveInputNumberByteSize = sc->inputNumberByteSize;
11266 sc->inputOffset = sc->kernelOffset;
11268 res = indexInputVkFFT(sc, uintType, dataType, index_x, 0, requestCoordinate, requestBatch);
11269 if (res != VKFFT_SUCCESS) return res;
11270 sc->inputOffset = tempSaveInputOffset;
11271 sc->inputNumberByteSize = tempSaveInputNumberByteSize;
11272 sc->tempLen = sprintf(sc->tempStr, ";\n");
11273 res = VkAppendLine(sc);
11274 if (res != VKFFT_SUCCESS) return res;
11275 //sc->tempLen = sprintf(sc->tempStr, " inoutID = indexInput(%s+%" PRIu64 "+%s * %" PRIu64 " + (((%s%s) %% %" PRIu64 ") * %" PRIu64 " + ((%s%s) / %" PRIu64 ") * %" PRIu64 ")%s%s);\n", sc->gl_LocalInvocationID_x, i * sc->localSize[0], sc->gl_LocalInvocationID_y, sc->firstStageStartSize, sc->gl_WorkGroupID_x, shiftX, sc->firstStageStartSize / sc->fftDim, sc->fftDim, sc->gl_WorkGroupID_x, shiftX, sc->firstStageStartSize / sc->fftDim, sc->localSize[1] * sc->firstStageStartSize, requestCoordinate, requestBatch);
11276 }
11277 break;
11278 }
11279 case 1:
11280 {
11281 sc->tempLen = sprintf(sc->tempStr, " %s = ", sc->inoutID);
11282 res = VkAppendLine(sc);
11283 if (res != VKFFT_SUCCESS) return res;
11284 sprintf(index_x, "(%s%s) %% (%" PRIu64 ")", sc->gl_GlobalInvocationID_x, shiftX, sc->fft_dim_x);
11285 sprintf(index_y, "(%s+%" PRIu64 ")+((%s%s)/%" PRIu64 ")%%(%" PRIu64 ")+((%s%s)/%" PRIu64 ")*(%" PRIu64 ")", sc->gl_LocalInvocationID_y, i * sc->localSize[1], sc->gl_GlobalInvocationID_x, shiftX, sc->fft_dim_x, sc->stageStartSize, sc->gl_GlobalInvocationID_x, shiftX, sc->fft_dim_x * sc->stageStartSize, sc->fftDim);
11286 uint64_t tempSaveInputOffset = sc->inputOffset;
11287 uint64_t tempSaveInputNumberByteSize = sc->inputNumberByteSize;
11288 sc->inputOffset = sc->kernelOffset;
11290 res = indexInputVkFFT(sc, uintType, dataType, index_x, index_y, requestCoordinate, requestBatch);
11291 if (res != VKFFT_SUCCESS) return res;
11292 sc->inputOffset = tempSaveInputOffset;
11293 sc->inputNumberByteSize = tempSaveInputNumberByteSize;
11294 sc->tempLen = sprintf(sc->tempStr, ";\n");
11295 res = VkAppendLine(sc);
11296 if (res != VKFFT_SUCCESS) return res;
11297 //sc->tempLen = sprintf(sc->tempStr, " inoutID = indexInput((%s%s) %% (%" PRIu64 "), (%s+%" PRIu64 ")+((%s%s)/%" PRIu64 ")%%(%" PRIu64 ")+((%s%s)/%" PRIu64 ")*(%" PRIu64 ")%s%s);\n", sc->gl_GlobalInvocationID_x, shiftX, sc->fft_dim_x, sc->gl_LocalInvocationID_y, i * sc->localSize[1], sc->gl_GlobalInvocationID_x, shiftX, sc->fft_dim_x, sc->stageStartSize, sc->gl_GlobalInvocationID_x, shiftX, sc->fft_dim_x * sc->stageStartSize, sc->fftDim, requestCoordinate, requestBatch);
11298 break;
11299 }
11300 }
11301 char kernelName[100] = "";
11302 sprintf(kernelName, "kernel_obj");
11303 if ((sc->kernelBlockNum == 1) || (sc->useBluesteinFFT)) {
11304 for (uint64_t j = 0; j < sc->matrixConvolution; j++) {
11305 for (uint64_t l = 0; l < sc->matrixConvolution; l++) {
11306 uint64_t k = 0;
11307 if (sc->symmetricKernel) {
11308 k = (l < j) ? (l * sc->matrixConvolution - l * l + j) : (j * sc->matrixConvolution - j * j + l);
11309 }
11310 else {
11311 k = (j * sc->matrixConvolution + l);
11312 }
11313 if (sc->conjugateConvolution == 0) {
11314 if (l == 0)
11315 sc->tempLen = sprintf(sc->tempStr, " temp_real%" PRIu64 " += %s[inoutID+%" PRIu64 "].x * %s%s.x - %s[inoutID+%" PRIu64 "].y * %s%s.y;\n", j, kernelName, k * sc->inputStride[3], sc->regIDs[i], separateRegisterStore, kernelName, k * sc->inputStride[3], sc->regIDs[i], separateRegisterStore);
11316 else
11317 sc->tempLen = sprintf(sc->tempStr, " temp_real%" PRIu64 " += %s[inoutID+%" PRIu64 "].x * %s_%" PRIu64 "%s.x - %s[inoutID+%" PRIu64 "].y * %s_%" PRIu64 "%s.y;\n", j, kernelName, k * sc->inputStride[3], sc->regIDs[i], l, separateRegisterStore, kernelName, k * sc->inputStride[3], sc->regIDs[i], l, separateRegisterStore);
11318 }
11319 else {
11320 if (l == 0)
11321 sc->tempLen = sprintf(sc->tempStr, " temp_real%" PRIu64 " += %s[inoutID+%" PRIu64 "].x * %s%s.x + %s[inoutID+%" PRIu64 "].y * %s%s.y;\n", j, kernelName, k * sc->inputStride[3], sc->regIDs[i], separateRegisterStore, kernelName, k * sc->inputStride[3], sc->regIDs[i], separateRegisterStore);
11322 else
11323 sc->tempLen = sprintf(sc->tempStr, " temp_real%" PRIu64 " += %s[inoutID+%" PRIu64 "].x * %s_%" PRIu64 "%s.x + %s[inoutID+%" PRIu64 "].y * %s_%" PRIu64 "%s.y;\n", j, kernelName, k * sc->inputStride[3], sc->regIDs[i], l, separateRegisterStore, kernelName, k * sc->inputStride[3], sc->regIDs[i], l, separateRegisterStore);
11324 }
11325 res = VkAppendLine(sc);
11326 if (res != VKFFT_SUCCESS) return res;
11327 }
11328 for (uint64_t l = 0; l < sc->matrixConvolution; l++) {
11329 uint64_t k = 0;
11330 if (sc->symmetricKernel) {
11331 k = (l < j) ? (l * sc->matrixConvolution - l * l + j) : (j * sc->matrixConvolution - j * j + l);
11332 }
11333 else {
11334 k = (j * sc->matrixConvolution + l);
11335 }
11336 if (sc->conjugateConvolution == 0) {
11337 if (l == 0)
11338 sc->tempLen = sprintf(sc->tempStr, " temp_imag%" PRIu64 " += %s[inoutID+%" PRIu64 "].x * %s%s.y + %s[inoutID+%" PRIu64 "].y * %s%s.x;\n", j, kernelName, k * sc->inputStride[3], sc->regIDs[i], separateRegisterStore, kernelName, k * sc->inputStride[3], sc->regIDs[i], separateRegisterStore);
11339 else
11340 sc->tempLen = sprintf(sc->tempStr, " temp_imag%" PRIu64 " += %s[inoutID+%" PRIu64 "].x * %s_%" PRIu64 "%s.y + %s[inoutID+%" PRIu64 "].y * %s_%" PRIu64 "%s.x;\n", j, kernelName, k * sc->inputStride[3], sc->regIDs[i], l, separateRegisterStore, kernelName, k * sc->inputStride[3], sc->regIDs[i], l, separateRegisterStore);
11341 }
11342 else {
11343 if (sc->conjugateConvolution == 1) {
11344 if (l == 0)
11345 sc->tempLen = sprintf(sc->tempStr, " temp_imag%" PRIu64 " += %s[inoutID+%" PRIu64 "].y * %s%s.x - %s[inoutID+%" PRIu64 "].x * %s%s.y ;\n", j, kernelName, k * sc->inputStride[3], sc->regIDs[i], separateRegisterStore, kernelName, k * sc->inputStride[3], sc->regIDs[i], separateRegisterStore);
11346 else
11347 sc->tempLen = sprintf(sc->tempStr, " temp_imag%" PRIu64 " += %s[inoutID+%" PRIu64 "].y * %s_%" PRIu64 "%s.x - %s[inoutID+%" PRIu64 "].x * %s_%" PRIu64 "%s.y;\n", j, kernelName, k * sc->inputStride[3], sc->regIDs[i], l, separateRegisterStore, kernelName, k * sc->inputStride[3], sc->regIDs[i], l, separateRegisterStore);
11348 }
11349 else {
11350 if (l == 0)
11351 sc->tempLen = sprintf(sc->tempStr, " temp_imag%" PRIu64 " += %s[inoutID+%" PRIu64 "].x * %s%s.y - %s[inoutID+%" PRIu64 "].y * %s%s.x;\n", j, kernelName, k * sc->inputStride[3], sc->regIDs[i], separateRegisterStore, kernelName, k * sc->inputStride[3], sc->regIDs[i], separateRegisterStore);
11352 else
11353 sc->tempLen = sprintf(sc->tempStr, " temp_imag%" PRIu64 " += %s[inoutID+%" PRIu64 "].x * %s_%" PRIu64 "%s.y - %s[inoutID+%" PRIu64 "].y * %s_%" PRIu64 "%s.x;\n", j, kernelName, k * sc->inputStride[3], sc->regIDs[i], l, separateRegisterStore, kernelName, k * sc->inputStride[3], sc->regIDs[i], l, separateRegisterStore);
11354 }
11355 }
11356 res = VkAppendLine(sc);
11357 if (res != VKFFT_SUCCESS) return res;
11358
11359 }
11360 }
11362#if(VKFFT_BACKEND==0)
11363 sc->tempLen = sprintf(sc->tempStr, " w.x = inversesqrt(temp_real0*temp_real0+temp_imag0*temp_imag0);\n");
11364#elif(VKFFT_BACKEND==1)
11365 sc->tempLen = sprintf(sc->tempStr, " w.x = rsqrt(temp_real0*temp_real0+temp_imag0*temp_imag0);\n");
11366#elif(VKFFT_BACKEND==2)
11367 sc->tempLen = sprintf(sc->tempStr, " w.x = rsqrt(temp_real0*temp_real0+temp_imag0*temp_imag0);\n");
11368#elif(VKFFT_BACKEND==3)
11369 sc->tempLen = sprintf(sc->tempStr, " w.x = rsqrt(temp_real0*temp_real0+temp_imag0*temp_imag0);\n");
11370#endif
11371 res = VkAppendLine(sc);
11372 if (res != VKFFT_SUCCESS) return res;
11373 sc->tempLen = sprintf(sc->tempStr, " %s.x = temp_real0 * w.x;\n", sc->regIDs[i]);
11374 res = VkAppendLine(sc);
11375 if (res != VKFFT_SUCCESS) return res;
11376 sc->tempLen = sprintf(sc->tempStr, " %s.y = temp_imag0 * w.x;\n", sc->regIDs[i]);
11377 res = VkAppendLine(sc);
11378 if (res != VKFFT_SUCCESS) return res;
11379 }
11380 else {
11381 sc->tempLen = sprintf(sc->tempStr, " %s.x = temp_real0;\n", sc->regIDs[i]);
11382 res = VkAppendLine(sc);
11383 if (res != VKFFT_SUCCESS) return res;
11384 sc->tempLen = sprintf(sc->tempStr, " %s.y = temp_imag0;\n", sc->regIDs[i]);
11385 res = VkAppendLine(sc);
11386 if (res != VKFFT_SUCCESS) return res;
11387 }
11388 for (uint64_t l = 1; l < sc->matrixConvolution; l++) {
11390#if(VKFFT_BACKEND==0)
11391 sc->tempLen = sprintf(sc->tempStr, " w.x = inversesqrt(temp_real%" PRIu64 "*temp_real%" PRIu64 "+temp_imag%" PRIu64 "*temp_imag%" PRIu64 ");\n", l, l, l, l);
11392#elif(VKFFT_BACKEND==1)
11393 sc->tempLen = sprintf(sc->tempStr, " w.x = rsqrt(temp_real%" PRIu64 "*temp_real%" PRIu64 "+temp_imag%" PRIu64 "*temp_imag%" PRIu64 ");\n", l, l, l, l);
11394#elif(VKFFT_BACKEND==2)
11395 sc->tempLen = sprintf(sc->tempStr, " w.x = rsqrt(temp_real%" PRIu64 "*temp_real%" PRIu64 "+temp_imag%" PRIu64 "*temp_imag%" PRIu64 ");\n", l, l, l, l);
11396#elif(VKFFT_BACKEND==3)
11397 sc->tempLen = sprintf(sc->tempStr, " w.x = rsqrt(temp_real%" PRIu64 "*temp_real%" PRIu64 "+temp_imag%" PRIu64 "*temp_imag%" PRIu64 ");\n", l, l, l, l);
11398#endif
11399 res = VkAppendLine(sc);
11400 if (res != VKFFT_SUCCESS) return res;
11401 sc->tempLen = sprintf(sc->tempStr, " %s_%" PRIu64 ".x = temp_real%" PRIu64 " * w.x;\n", sc->regIDs[i], l, l);
11402 res = VkAppendLine(sc);
11403 if (res != VKFFT_SUCCESS) return res;
11404 sc->tempLen = sprintf(sc->tempStr, " %s_%" PRIu64 ".y = temp_imag%" PRIu64 " * w.x;\n", sc->regIDs[i], l, l);
11405 res = VkAppendLine(sc);
11406 if (res != VKFFT_SUCCESS) return res;
11407 }
11408 else {
11409 sc->tempLen = sprintf(sc->tempStr, " %s_%" PRIu64 ".x = temp_real%" PRIu64 ";\n", sc->regIDs[i], l, l);
11410 res = VkAppendLine(sc);
11411 if (res != VKFFT_SUCCESS) return res;
11412 sc->tempLen = sprintf(sc->tempStr, " %s_%" PRIu64 ".y = temp_imag%" PRIu64 ";\n", sc->regIDs[i], l, l);
11413 res = VkAppendLine(sc);
11414 if (res != VKFFT_SUCCESS) return res;
11415 }
11416 }
11417 }
11418 else {
11419 for (uint64_t j = 0; j < sc->matrixConvolution; j++) {
11420
11421 sc->tempLen = sprintf(sc->tempStr, " %s temp_real%" PRIu64 " = 0;\n", floatType, j);
11422 res = VkAppendLine(sc);
11423 if (res != VKFFT_SUCCESS) return res;
11424 for (uint64_t l = 0; l < sc->matrixConvolution; l++) {
11425 uint64_t k = 0;
11426 if (sc->symmetricKernel) {
11427 k = (l < j) ? (l * sc->matrixConvolution - l * l + j) : (j * sc->matrixConvolution - j * j + l);
11428 }
11429 else {
11430 k = (j * sc->matrixConvolution + l);
11431 }
11432 if (l == 0)
11433 sc->tempLen = sprintf(sc->tempStr, " temp_real%" PRIu64 " += kernelBlocks[(inoutID+%" PRIu64 ")/%" PRIu64 "].%s[(inoutID+%" PRIu64 ") %% %" PRIu64 "].x * %s%s.x - kernelBlocks[(inoutID+%" PRIu64 ")/%" PRIu64 "].%s[(inoutID+%" PRIu64 ") %% %" PRIu64 "].y * %s%s.y;\n", j, k * sc->inputStride[3], sc->kernelBlockSize, kernelName, k * sc->inputStride[3], sc->kernelBlockSize, sc->regIDs[i], separateRegisterStore, k * sc->inputStride[3], sc->kernelBlockSize, kernelName, k * sc->inputStride[3], sc->kernelBlockSize, sc->regIDs[i], separateRegisterStore);
11434 else
11435 sc->tempLen = sprintf(sc->tempStr, " temp_real%" PRIu64 " += kernelBlocks[(inoutID+%" PRIu64 ")/%" PRIu64 "].%s[(inoutID+%" PRIu64 ") %% %" PRIu64 "].x * %s_%" PRIu64 "%s.x - kernelBlocks[(inoutID+%" PRIu64 ")/%" PRIu64 "].%s[(inoutID+%" PRIu64 ") %% %" PRIu64 "].y * %s_%" PRIu64 "%s.y;\n", j, k * sc->inputStride[3], sc->kernelBlockSize, kernelName, k * sc->inputStride[3], sc->kernelBlockSize, sc->regIDs[i], l, separateRegisterStore, k * sc->inputStride[3], sc->kernelBlockSize, kernelName, k * sc->inputStride[3], sc->kernelBlockSize, sc->regIDs[i], l, separateRegisterStore);
11436 res = VkAppendLine(sc);
11437 if (res != VKFFT_SUCCESS) return res;
11438
11439 }
11440
11441 sc->tempLen = sprintf(sc->tempStr, " %s temp_imag%" PRIu64 " = 0;\n", floatType, j);
11442 res = VkAppendLine(sc);
11443 if (res != VKFFT_SUCCESS) return res;
11444 for (uint64_t l = 0; l < sc->matrixConvolution; l++) {
11445 uint64_t k = 0;
11446 if (sc->symmetricKernel) {
11447 k = (l < j) ? (l * sc->matrixConvolution - l * l + j) : (j * sc->matrixConvolution - j * j + l);
11448 }
11449 else {
11450 k = (j * sc->matrixConvolution + l);
11451 }
11452 if (l == 0)
11453 sc->tempLen = sprintf(sc->tempStr, " temp_imag%" PRIu64 " += kernelBlocks[(inoutID+%" PRIu64 ")/%" PRIu64 "].%s[(inoutID+%" PRIu64 ") %% %" PRIu64 "].x * %s%s.y + kernelBlocks[(inoutID+%" PRIu64 ")/%" PRIu64 "].%s[(inoutID+%" PRIu64 ") %% %" PRIu64 "].y * %s%s.x;\n", j, k * sc->inputStride[3], sc->kernelBlockSize, kernelName, k * sc->inputStride[3], sc->kernelBlockSize, sc->regIDs[i], separateRegisterStore, k * sc->inputStride[3], sc->kernelBlockSize, kernelName, k * sc->inputStride[3], sc->kernelBlockSize, sc->regIDs[i], separateRegisterStore);
11454 else
11455 sc->tempLen = sprintf(sc->tempStr, " temp_imag%" PRIu64 " += kernelBlocks[(inoutID+%" PRIu64 ")/%" PRIu64 "].%s[(inoutID+%" PRIu64 ") %% %" PRIu64 "].x * %s_%" PRIu64 "%s.y + kernelBlocks[(inoutID+%" PRIu64 ")/%" PRIu64 "].%s[(inoutID+%" PRIu64 ") %% %" PRIu64 "].y * %s_%" PRIu64 "%s.x;\n", j, k * sc->inputStride[3], sc->kernelBlockSize, kernelName, k * sc->inputStride[3], sc->kernelBlockSize, sc->regIDs[i], l, separateRegisterStore, k * sc->inputStride[3], sc->kernelBlockSize, kernelName, k * sc->inputStride[3], sc->kernelBlockSize, sc->regIDs[i], l, separateRegisterStore);
11456 res = VkAppendLine(sc);
11457 if (res != VKFFT_SUCCESS) return res;
11458 }
11459 }
11460 sc->tempLen = sprintf(sc->tempStr, " %s.x = temp_real0;\n", sc->regIDs[i]);
11461 res = VkAppendLine(sc);
11462 if (res != VKFFT_SUCCESS) return res;
11463 sc->tempLen = sprintf(sc->tempStr, " %s.y = temp_imag0;\n", sc->regIDs[i]);
11464 res = VkAppendLine(sc);
11465 if (res != VKFFT_SUCCESS) return res;
11466 for (uint64_t l = 1; l < sc->matrixConvolution; l++) {
11467 sc->tempLen = sprintf(sc->tempStr, " %s_%" PRIu64 ".x = temp_real%" PRIu64 ";\n", sc->regIDs[i], l, l);
11468 res = VkAppendLine(sc);
11469 if (res != VKFFT_SUCCESS) return res;
11470 sc->tempLen = sprintf(sc->tempStr, " %s_%" PRIu64 ".y = temp_imag%" PRIu64 ";\n", sc->regIDs[i], l, l);
11471 res = VkAppendLine(sc);
11472 if (res != VKFFT_SUCCESS) return res;
11473 }
11474 }
11475 }
11477 if (res != VKFFT_SUCCESS) return res;
11478 res = appendZeropadEnd(sc);
11479 if (res != VKFFT_SUCCESS) return res;
11480 return res;
11481}
// Select, per writeType, the value of sc->writeFromRegisters:
//   1 - the write code that follows emits stores that take their values from sc->regIDs[...],
//   0 - it emits stores that take their values from sdata[...] instead.
// Only the flag (and, for case 1, one appended barrier) is produced here.
// NOTE(review): the enclosing function's signature and the declaration of `res` are outside
// this excerpt; `res` is presumably initialised to VKFFT_SUCCESS - confirm.
// There is no default label: any writeType not listed below leaves sc->writeFromRegisters unchanged.
11484 switch (writeType) {
11485 case 0: //single_c2c
11486 {
// Flag is cleared when localSize[1] > 1, or when
// localSize[0] * lastRadix * (registers_per_thread_per_radix[lastRadix] / lastRadix) exceeds fftDim,
// where lastRadix = stageRadix[numStages - 1]; otherwise it is set.
11487 if ((sc->localSize[1] > 1) || (sc->localSize[0] * sc->stageRadix[sc->numStages - 1] * (sc->registers_per_thread_per_radix[sc->stageRadix[sc->numStages - 1]] / sc->stageRadix[sc->numStages - 1]) > sc->fftDim)) {
11488 sc->writeFromRegisters = 0;
11489 }
11490 else
11491 sc->writeFromRegisters = 1;
11492 break;
11493 }
11494 case 1: //grouped_c2c
11495 {
// Same product test as case 0, but built from localSize[1] and without the localSize[1] > 1 check.
// This is the only case that also appends a barrier (appendBarrierVkFFT(sc, 1)) when the flag is
// cleared; a failure from that call is returned immediately.
11496 if (sc->localSize[1] * sc->stageRadix[sc->numStages - 1] * (sc->registers_per_thread_per_radix[sc->stageRadix[sc->numStages - 1]] / sc->stageRadix[sc->numStages - 1]) > sc->fftDim) {
11497 sc->writeFromRegisters = 0;
11498 res = appendBarrierVkFFT(sc, 1);
11499 if (res != VKFFT_SUCCESS) return res;
11500 }
11501 else
11502 sc->writeFromRegisters = 1;
11503 break;
11504 }
11505 case 2: //single_c2c_strided
11506 {
// Identical condition to case 1 (localSize[1]-based product vs fftDim); no barrier is appended here.
11507 if (sc->localSize[1] * sc->stageRadix[sc->numStages - 1] * (sc->registers_per_thread_per_radix[sc->stageRadix[sc->numStages - 1]] / sc->stageRadix[sc->numStages - 1]) > sc->fftDim) {
11508 sc->writeFromRegisters = 0;
11509 }
11510 else
11511 sc->writeFromRegisters = 1;
11512 break;
11513 }
11514 case 5://single_r2c
11515 {
// Flag is cleared unconditionally for this write type.
11516 sc->writeFromRegisters = 0;
11517 break;
11518 }
11519 case 6: //single_c2r
11520 {
// Same test as case 0, with sc->axisSwapped as an additional reason to clear the flag.
11521 if ((sc->axisSwapped) || (sc->localSize[1] > 1) || (sc->localSize[0] * sc->stageRadix[sc->numStages - 1] * (sc->registers_per_thread_per_radix[sc->stageRadix[sc->numStages - 1]] / sc->stageRadix[sc->numStages - 1]) > sc->fftDim)) {
11522 sc->writeFromRegisters = 0;
11523 }
11524 else
11525 sc->writeFromRegisters = 1;
11526 break;
11527 }
// Write types 110-145 listed below: flag is cleared unconditionally.
11528 case 110: case 111: case 120: case 121: case 130: case 131: case 140: case 141: case 142: case 143: case 144: case 145:
11529 {
11530 sc->writeFromRegisters = 0;
11531 break;
11532 }
11533 }
// Reached on every path except the early return in case 1; `res` holds whatever it held on entry
// to the switch, or the result of the barrier append in case 1.
11534 return res;
11535}
11536static inline VkFFTResult appendWriteDataVkFFT(VkFFTSpecializationConstantsLayout* sc, const char* floatType, const char* floatTypeMemory, const char* uintType, uint64_t writeType) {
11538 double double_PI = 3.1415926535897932384626433832795;
11539 char vecType[30];
11540 char outputsStruct[20] = "";
11541 char LFending[4] = "";
11542 if (!strcmp(floatType, "float")) sprintf(LFending, "f");
11543#if(VKFFT_BACKEND==0)
11544 if (!strcmp(floatType, "float")) sprintf(vecType, "vec2");
11545 if (!strcmp(floatType, "double")) sprintf(vecType, "dvec2");
11546 if (sc->outputBufferBlockNum == 1)
11547 sprintf(outputsStruct, "outputs");
11548 else
11549 sprintf(outputsStruct, ".outputs");
11550 if (!strcmp(floatType, "double")) sprintf(LFending, "LF");
11551 char cosDef[20] = "cos";
11552 char sinDef[20] = "sin";
11553#elif(VKFFT_BACKEND==1)
11554 if (!strcmp(floatType, "float")) sprintf(vecType, "float2");
11555 if (!strcmp(floatType, "double")) sprintf(vecType, "double2");
11556 sprintf(outputsStruct, "outputs");
11557 if (!strcmp(floatType, "double")) sprintf(LFending, "l");
11558 char cosDef[20] = "__cosf";
11559 char sinDef[20] = "__sinf";
11560#elif(VKFFT_BACKEND==2)
11561 if (!strcmp(floatType, "float")) sprintf(vecType, "float2");
11562 if (!strcmp(floatType, "double")) sprintf(vecType, "double2");
11563 sprintf(outputsStruct, "outputs");
11564 if (!strcmp(floatType, "double")) sprintf(LFending, "l");
11565 char cosDef[20] = "__cosf";
11566 char sinDef[20] = "__sinf";
11567#elif(VKFFT_BACKEND==3)
11568 if (!strcmp(floatType, "float")) sprintf(vecType, "float2");
11569 if (!strcmp(floatType, "double")) sprintf(vecType, "double2");
11570 sprintf(outputsStruct, "outputs");
11571 //if (!strcmp(floatType, "double")) sprintf(LFending, "l");
11572 char cosDef[20] = "native_cos";
11573 char sinDef[20] = "native_sin";
11574#endif
11575 char convTypeLeft[20] = "";
11576 char convTypeRight[20] = "";
11577 if ((!strcmp(floatTypeMemory, "half")) && (strcmp(floatType, "half"))) {
11578 if ((writeType == 6) || (writeType == 110) || (writeType == 111) || (writeType == 120) || (writeType == 121) || (writeType == 130) || (writeType == 131) || (writeType == 140) || (writeType == 141) || (writeType == 142) || (writeType == 143) || (writeType == 144) || (writeType == 145)) {
11579 sprintf(convTypeLeft, "float16_t(");
11580 sprintf(convTypeRight, ")");
11581 }
11582 else {
11583 sprintf(convTypeLeft, "f16vec2(");
11584 sprintf(convTypeRight, ")");
11585 }
11586 }
11587 if ((!strcmp(floatTypeMemory, "float")) && (strcmp(floatType, "float"))) {
11588 if ((writeType == 6) || (writeType == 110) || (writeType == 111) || (writeType == 120) || (writeType == 121) || (writeType == 130) || (writeType == 131) || (writeType == 140) || (writeType == 141) || (writeType == 142) || (writeType == 143) || (writeType == 144) || (writeType == 145)) {
11589#if(VKFFT_BACKEND==0)
11590 sprintf(convTypeLeft, "float(");
11591 sprintf(convTypeRight, ")");
11592#elif(VKFFT_BACKEND==1)
11593 sprintf(convTypeLeft, "(float)");
11594 //sprintf(convTypeRight, "");
11595#elif(VKFFT_BACKEND==2)
11596 sprintf(convTypeLeft, "(float)");
11597 //sprintf(convTypeRight, "");
11598#elif(VKFFT_BACKEND==3)
11599 sprintf(convTypeLeft, "(float)");
11600 //sprintf(convTypeRight, "");
11601#endif
11602 }
11603 else {
11604#if(VKFFT_BACKEND==0)
11605 sprintf(convTypeLeft, "vec2(");
11606 sprintf(convTypeRight, ")");
11607#elif(VKFFT_BACKEND==1)
11608 sprintf(convTypeLeft, "conv_float2(");
11609 sprintf(convTypeRight, ")");
11610#elif(VKFFT_BACKEND==2)
11611 sprintf(convTypeLeft, "conv_float2(");
11612 sprintf(convTypeRight, ")");
11613#elif(VKFFT_BACKEND==3)
11614 sprintf(convTypeLeft, "conv_float2(");
11615 sprintf(convTypeRight, ")");
11616#endif
11617 }
11618 }
11619 if ((!strcmp(floatTypeMemory, "double")) && (strcmp(floatType, "double"))) {
11620 if ((writeType == 6) || (writeType == 110) || (writeType == 111) || (writeType == 120) || (writeType == 121) || (writeType == 130) || (writeType == 131) || (writeType == 140) || (writeType == 141) || (writeType == 142) || (writeType == 143) || (writeType == 144) || (writeType == 145)) {
11621#if(VKFFT_BACKEND==0)
11622 sprintf(convTypeLeft, "double(");
11623 sprintf(convTypeRight, ")");
11624#elif(VKFFT_BACKEND==1)
11625 sprintf(convTypeLeft, "(double)");
11626 //sprintf(convTypeRight, "");
11627#elif(VKFFT_BACKEND==2)
11628 sprintf(convTypeLeft, "(double)");
11629 //sprintf(convTypeRight, "");
11630#elif(VKFFT_BACKEND==3)
11631 sprintf(convTypeLeft, "(double)");
11632 //sprintf(convTypeRight, "");
11633#endif
11634 }
11635 else {
11636#if(VKFFT_BACKEND==0)
11637 sprintf(convTypeLeft, "dvec2(");
11638 sprintf(convTypeRight, ")");
11639#elif(VKFFT_BACKEND==1)
11640 sprintf(convTypeLeft, "conv_double2(");
11641 sprintf(convTypeRight, ")");
11642#elif(VKFFT_BACKEND==2)
11643 sprintf(convTypeLeft, "conv_double2(");
11644 sprintf(convTypeRight, ")");
11645#elif(VKFFT_BACKEND==3)
11646 sprintf(convTypeLeft, "conv_double2(");
11647 sprintf(convTypeRight, ")");
11648#endif
11649 }
11650 }
11651
11652 char index_x[2000] = "";
11653 char index_y[2000] = "";
11654 char requestCoordinate[100] = "";
11655 if (sc->convolutionStep) {
11656 if (sc->matrixConvolution > 1) {
11657 sprintf(requestCoordinate, "coordinate");
11658 }
11659 }
11660 char requestBatch[100] = "";
11661 if (sc->convolutionStep) {
11662 if (sc->numKernels > 1) {
11663 sprintf(requestBatch, "batchID");//if one buffer - multiple kernel convolution
11664 }
11665 }
11666 switch (writeType) {
11667 case 0: //single_c2c
11668 {
11669 if (!sc->writeFromRegisters) {
11670 res = appendBarrierVkFFT(sc, 1);
11671 if (res != VKFFT_SUCCESS) return res;
11672 }
11673 //res = appendZeropadStart(sc);
11674 //if (res != VKFFT_SUCCESS) return res;
11675 char shiftX[500] = "";
11676 if (sc->performWorkGroupShift[0])
11677 sprintf(shiftX, " + consts.workGroupShiftX ");
11678 char shiftY[500] = "";
11679 if (sc->axisSwapped) {
11680 if (sc->performWorkGroupShift[1])
11681 sprintf(shiftY, " + consts.workGroupShiftY*%s ", sc->gl_WorkGroupSize_x);
11682 }
11683 else {
11684 if (sc->performWorkGroupShift[1])
11685 sprintf(shiftY, " + consts.workGroupShiftY*%s ", sc->gl_WorkGroupSize_y);
11686 }
11687
11688 char shiftY2[100] = "";
11689 if (sc->performWorkGroupShift[1])
11690 sprintf(shiftY, " + consts.workGroupShiftY ");
11691 if (sc->fftDim < sc->fft_dim_full) {
11692 if (sc->axisSwapped) {
11693 if (!sc->reorderFourStep) {
11694 sc->tempLen = sprintf(sc->tempStr, " if((%s+%" PRIu64 "*%s)< numActiveThreads) {\n", sc->gl_LocalInvocationID_x, sc->localSize[0], sc->gl_LocalInvocationID_y);
11695 res = VkAppendLine(sc);
11696 if (res != VKFFT_SUCCESS) return res;
11697 }
11698 else {
11699 sc->tempLen = sprintf(sc->tempStr, " if (((%s + %" PRIu64 " * %s) %% %" PRIu64 " + ((%s%s) / %" PRIu64 ")*%" PRIu64 " < %" PRIu64 ")){\n", sc->gl_LocalInvocationID_x, sc->localSize[0], sc->gl_LocalInvocationID_y, sc->localSize[0], sc->gl_WorkGroupID_x, shiftX, sc->firstStageStartSize / sc->fftDim, sc->localSize[0], sc->fft_dim_full / sc->firstStageStartSize);
11700 res = VkAppendLine(sc);
11701 if (res != VKFFT_SUCCESS) return res;
11702 }
11703 }
11704 else {
11705 if (!sc->reorderFourStep) {
11707 }
11708 else {
11709 sc->tempLen = sprintf(sc->tempStr, " if (((%s + %" PRIu64 " * %s) %% %" PRIu64 " + ((%s%s) / %" PRIu64 ")*%" PRIu64 " < %" PRIu64 ")){\n", sc->gl_LocalInvocationID_x, sc->localSize[0], sc->gl_LocalInvocationID_y, sc->localSize[1], sc->gl_WorkGroupID_x, shiftX, sc->firstStageStartSize / sc->fftDim, sc->localSize[1], sc->fft_dim_full / sc->firstStageStartSize);
11710 res = VkAppendLine(sc);
11711 }
11712 if (res != VKFFT_SUCCESS) return res;
11713 }
11714
11715 }
11716 else {
11717 sc->tempLen = sprintf(sc->tempStr, " { \n");
11718 res = VkAppendLine(sc);
11719 if (res != VKFFT_SUCCESS) return res;
11720 }
11721 if (sc->reorderFourStep) {
11722 if (sc->fftDim == sc->fft_dim_full) {
11723 for (uint64_t k = 0; k < sc->registerBoost; k++) {
11724 for (uint64_t i = 0; i < sc->min_registers_per_thread; i++) {
11725 if (sc->localSize[1] == 1)
11726 sc->tempLen = sprintf(sc->tempStr, " combinedID = %s + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, (i + k * sc->min_registers_per_thread) * sc->localSize[0]);
11727 else
11728 sc->tempLen = sprintf(sc->tempStr, " combinedID = (%s + %" PRIu64 " * %s) + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, sc->localSize[0], sc->gl_LocalInvocationID_y, (i + k * sc->min_registers_per_thread) * sc->localSize[0] * sc->localSize[1]);
11729 res = VkAppendLine(sc);
11730 if (res != VKFFT_SUCCESS) return res;
11731
11732 if (sc->outputStride[0] > 1)
11733 sc->tempLen = sprintf(sc->tempStr, " inoutID = (combinedID %% %" PRIu64 ") * %" PRIu64 " + (combinedID / %" PRIu64 ") * %" PRIu64 ";\n", sc->fftDim, sc->outputStride[0], sc->fftDim, sc->outputStride[1]);
11734 else
11735 sc->tempLen = sprintf(sc->tempStr, " inoutID = (combinedID %% %" PRIu64 ") + (combinedID / %" PRIu64 ") * %" PRIu64 ";\n", sc->fftDim, sc->fftDim, sc->outputStride[1]);
11736 res = VkAppendLine(sc);
11737 if (res != VKFFT_SUCCESS) return res;
11738 if (sc->axisSwapped) {
11739 if (sc->size[sc->axis_id + 1] % sc->localSize[0] != 0) {
11740 sc->tempLen = sprintf(sc->tempStr, " if(combinedID / %" PRIu64 " + (%s%s)*%" PRIu64 "< %" PRIu64 "){", sc->fftDim, sc->gl_WorkGroupID_y, shiftY2, sc->localSize[0], sc->size[sc->axis_id + 1]);
11741 res = VkAppendLine(sc);
11742 if (res != VKFFT_SUCCESS) return res;
11743 }
11744 }
11745 else {
11746 if (sc->size[sc->axis_id + 1] % sc->localSize[1] != 0) {
11747 sc->tempLen = sprintf(sc->tempStr, " if(combinedID / %" PRIu64 " + (%s%s)*%" PRIu64 "< %" PRIu64 "){", sc->fftDim, sc->gl_WorkGroupID_y, shiftY2, sc->localSize[1], sc->size[sc->axis_id + 1]);
11748 res = VkAppendLine(sc);
11749 if (res != VKFFT_SUCCESS) return res;
11750 }
11751 }
11752 if (sc->zeropadBluestein[1]) {
11753 sc->tempLen = sprintf(sc->tempStr, " if((combinedID %% %" PRIu64 ") < %" PRIu64 "){\n", sc->fft_dim_full, sc->fft_zeropad_Bluestein_left_write[sc->axis_id]);
11754 res = VkAppendLine(sc);
11755 if (res != VKFFT_SUCCESS) return res;
11756 }
11757 if (sc->zeropad[1]) {
11758 sc->tempLen = sprintf(sc->tempStr, " if((inoutID %% %" PRIu64 " < %" PRIu64 ")||(inoutID %% %" PRIu64 " >= %" PRIu64 ")){\n", sc->outputStride[1], sc->fft_zeropad_left_write[sc->axis_id], sc->outputStride[1], sc->fft_zeropad_right_write[sc->axis_id]);
11759 res = VkAppendLine(sc);
11760 if (res != VKFFT_SUCCESS) return res;
11761 }
11762 sc->tempLen = sprintf(sc->tempStr, " %s = ", sc->inoutID);
11763 res = VkAppendLine(sc);
11764 if (res != VKFFT_SUCCESS) return res;
11765 res = indexOutputVkFFT(sc, uintType, writeType, sc->inoutID, 0, requestCoordinate, requestBatch);
11766 if (res != VKFFT_SUCCESS) return res;
11767 sc->tempLen = sprintf(sc->tempStr, ";\n");
11768 res = VkAppendLine(sc);
11769 if (res != VKFFT_SUCCESS) return res;
11771 if (res != VKFFT_SUCCESS) return res;
11772 if (sc->writeFromRegisters) {
11773 if (sc->outputBufferBlockNum == 1)
11774 sc->tempLen = sprintf(sc->tempStr, " %s[%s] = %s%s%s;\n", outputsStruct, sc->inoutID, convTypeLeft, sc->regIDs[i + k * sc->registers_per_thread], convTypeRight);
11775 else
11776 sc->tempLen = sprintf(sc->tempStr, " outputBlocks[%s / %" PRIu64 "]%s[%s %% %" PRIu64 "] = %s%s%s;\n", sc->inoutID, sc->outputBufferBlockSize, outputsStruct, sc->inoutID, sc->outputBufferBlockSize, convTypeLeft, sc->regIDs[i + k * sc->registers_per_thread], convTypeRight);
11777 res = VkAppendLine(sc);
11778 if (res != VKFFT_SUCCESS) return res;
11779 }
11780 else {
11781 if (sc->axisSwapped) {
11782 if (sc->outputBufferBlockNum == 1)
11783 sc->tempLen = sprintf(sc->tempStr, " %s[%s] = %ssdata[(combinedID %% %" PRIu64 ") * sharedStride + (combinedID / %" PRIu64 ")]%s;\n", outputsStruct, sc->inoutID, convTypeLeft, sc->fftDim, sc->fftDim, convTypeRight);
11784 else
11785 sc->tempLen = sprintf(sc->tempStr, " outputBlocks[%s / %" PRIu64 "]%s[%s %% %" PRIu64 "] = %ssdata[(combinedID %% %" PRIu64 ") * sharedStride + (combinedID / %" PRIu64 ")]%s;\n", sc->inoutID, sc->outputBufferBlockSize, outputsStruct, sc->inoutID, sc->outputBufferBlockSize, convTypeLeft, sc->fftDim, sc->fftDim, convTypeRight);
11786 res = VkAppendLine(sc);
11787 if (res != VKFFT_SUCCESS) return res;
11788 }
11789 else {
11790 if (sc->outputBufferBlockNum == 1)
11791 sc->tempLen = sprintf(sc->tempStr, " %s[%s] = %ssdata[(combinedID %% %" PRIu64 ") + (combinedID / %" PRIu64 ") * sharedStride]%s;\n", outputsStruct, sc->inoutID, convTypeLeft, sc->fftDim, sc->fftDim, convTypeRight);
11792 else
11793 sc->tempLen = sprintf(sc->tempStr, " outputBlocks[%s / %" PRIu64 "]%s[%s %% %" PRIu64 "] = %ssdata[(combinedID %% %" PRIu64 ") + (combinedID / %" PRIu64 ") * sharedStride]%s;\n", sc->inoutID, sc->outputBufferBlockSize, outputsStruct, sc->inoutID, sc->outputBufferBlockSize, convTypeLeft, sc->fftDim, sc->fftDim, convTypeRight);
11794 res = VkAppendLine(sc);
11795 if (res != VKFFT_SUCCESS) return res;
11796 }
11797 }
11799 if (res != VKFFT_SUCCESS) return res;
11800 if (sc->zeropad[1]) {
11801 sc->tempLen = sprintf(sc->tempStr, " }\n");
11802 res = VkAppendLine(sc);
11803 if (res != VKFFT_SUCCESS) return res;
11804 }
11805 if (sc->zeropadBluestein[1]) {
11806 sc->tempLen = sprintf(sc->tempStr, " }\n");
11807 res = VkAppendLine(sc);
11808 if (res != VKFFT_SUCCESS) return res;
11809 }
11810 if (sc->axisSwapped) {
11811 if (sc->size[sc->axis_id + 1] % sc->localSize[0] != 0) {
11812 sc->tempLen = sprintf(sc->tempStr, " }");
11813 res = VkAppendLine(sc);
11814 if (res != VKFFT_SUCCESS) return res;
11815 }
11816 }
11817 else {
11818 if (sc->size[sc->axis_id + 1] % sc->localSize[1] != 0) {
11819 sc->tempLen = sprintf(sc->tempStr, " }");
11820 res = VkAppendLine(sc);
11821 if (res != VKFFT_SUCCESS) return res;
11822 }
11823 }
11824 }
11825 }
11826 }
11827 else {
11828 for (uint64_t k = 0; k < sc->registerBoost; k++) {
11829 for (uint64_t i = 0; i < sc->min_registers_per_thread; i++) {
11830 if (sc->localSize[1] == 1)
11831 sc->tempLen = sprintf(sc->tempStr, " combinedID = %s + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, (i + k * sc->min_registers_per_thread) * sc->localSize[0]);
11832 else
11833 sc->tempLen = sprintf(sc->tempStr, " combinedID = (%s + %" PRIu64 " * %s) + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, sc->localSize[0], sc->gl_LocalInvocationID_y, (i + k * sc->min_registers_per_thread) * sc->localSize[0] * sc->localSize[1]);
11834 res = VkAppendLine(sc);
11835 if (res != VKFFT_SUCCESS) return res;
11836 if (sc->axisSwapped) {
11837 sc->tempLen = sprintf(sc->tempStr, " inoutID = combinedID %% %" PRIu64 " + ((%s%s) / %" PRIu64 ")*%" PRIu64 " + ((combinedID/%" PRIu64 ") * %" PRIu64 ")+ ((%s%s) %% %" PRIu64 ") * %" PRIu64 ";\n", sc->localSize[0], sc->gl_WorkGroupID_x, shiftX, sc->firstStageStartSize / sc->fftDim, sc->localSize[0], sc->localSize[0], sc->fft_dim_full / sc->fftDim, sc->gl_WorkGroupID_x, shiftX, sc->firstStageStartSize / sc->fftDim, sc->fft_dim_full / sc->firstStageStartSize);
11838 res = VkAppendLine(sc);
11839 if (res != VKFFT_SUCCESS) return res;
11840 }
11841 else {
11842 if (sc->localSize[1] == 1)
11843 sc->tempLen = sprintf(sc->tempStr, " inoutID = (%s%s)/%" PRIu64 "+ (combinedID * %" PRIu64 ")+ ((%s%s) %% %" PRIu64 ") * %" PRIu64 ";\n", sc->gl_WorkGroupID_x, shiftX, sc->firstStageStartSize / sc->fftDim, sc->fft_dim_full / sc->fftDim, sc->gl_WorkGroupID_x, shiftX, sc->firstStageStartSize / sc->fftDim, sc->fft_dim_full / sc->firstStageStartSize);
11844 else
11845 sc->tempLen = sprintf(sc->tempStr, " inoutID = combinedID %% %" PRIu64 " + ((%s%s) / %" PRIu64 ")*%" PRIu64 " + ((combinedID/%" PRIu64 ") * %" PRIu64 ")+ ((%s%s) %% %" PRIu64 ") * %" PRIu64 ";\n", sc->localSize[1], sc->gl_WorkGroupID_x, shiftX, sc->firstStageStartSize / sc->fftDim, sc->localSize[1], sc->localSize[1], sc->fft_dim_full / sc->fftDim, sc->gl_WorkGroupID_x, shiftX, sc->firstStageStartSize / sc->fftDim, sc->fft_dim_full / sc->firstStageStartSize);
11846 res = VkAppendLine(sc);
11847 if (res != VKFFT_SUCCESS) return res;
11848 }
11849 if (sc->zeropad[1]) {
11850 sc->tempLen = sprintf(sc->tempStr, " if((inoutID %% %" PRIu64 " < %" PRIu64 ")||(inoutID %% %" PRIu64 " >= %" PRIu64 ")){\n", sc->fft_dim_full, sc->fft_zeropad_left_write[sc->axis_id], sc->fft_dim_full, sc->fft_zeropad_right_write[sc->axis_id]);
11851 res = VkAppendLine(sc);
11852 if (res != VKFFT_SUCCESS) return res;
11853 }
11854 sc->tempLen = sprintf(sc->tempStr, " %s = ", sc->inoutID);
11855 res = VkAppendLine(sc);
11856 if (res != VKFFT_SUCCESS) return res;
11857 res = indexOutputVkFFT(sc, uintType, writeType, sc->inoutID, 0, requestCoordinate, requestBatch);
11858 if (res != VKFFT_SUCCESS) return res;
11859 sc->tempLen = sprintf(sc->tempStr, ";\n");
11860 res = VkAppendLine(sc);
11861 if (res != VKFFT_SUCCESS) return res;
11863 if (res != VKFFT_SUCCESS) return res;
11864 if (sc->writeFromRegisters) {
11865 if (sc->outputBufferBlockNum == 1)
11866 sc->tempLen = sprintf(sc->tempStr, " %s[%s] = %s%s%s;\n", outputsStruct, sc->inoutID, convTypeLeft, sc->regIDs[i + k * sc->registers_per_thread], convTypeRight);
11867 else
11868 sc->tempLen = sprintf(sc->tempStr, " outputBlocks[%s / %" PRIu64 "]%s[%s %% %" PRIu64 "] = %s%s%s;\n", sc->inoutID, sc->outputBufferBlockSize, outputsStruct, sc->inoutID, sc->outputBufferBlockSize, convTypeLeft, sc->regIDs[i + k * sc->registers_per_thread], convTypeRight);
11869 res = VkAppendLine(sc);
11870 if (res != VKFFT_SUCCESS) return res;
11871 }
11872 else {
11873 if (sc->axisSwapped) {
11874 if (sc->outputBufferBlockNum == 1)
11875 sc->tempLen = sprintf(sc->tempStr, " %s[%s] = %ssdata[(combinedID %% %s)+(combinedID/%s)*sharedStride]%s;\n", outputsStruct, sc->inoutID, convTypeLeft, sc->gl_WorkGroupSize_x, sc->gl_WorkGroupSize_x, convTypeRight);
11876 else
11877 sc->tempLen = sprintf(sc->tempStr, " outputBlocks[%s / %" PRIu64 "]%s[%s %% %" PRIu64 "] = %ssdata[(combinedID %% %s)+(combinedID/%s)*sharedStride]%s;\n", sc->inoutID, sc->outputBufferBlockSize, outputsStruct, sc->inoutID, sc->outputBufferBlockSize, convTypeLeft, sc->gl_WorkGroupSize_x, sc->gl_WorkGroupSize_x, convTypeRight);
11878 res = VkAppendLine(sc);
11879 if (res != VKFFT_SUCCESS) return res;
11880 }
11881 else {
11882 if (sc->outputBufferBlockNum == 1)
11883 sc->tempLen = sprintf(sc->tempStr, " %s[%s] = %ssdata[(combinedID %% %s)*sharedStride+combinedID/%s]%s;\n", outputsStruct, sc->inoutID, convTypeLeft, sc->gl_WorkGroupSize_y, sc->gl_WorkGroupSize_y, convTypeRight);
11884 else
11885 sc->tempLen = sprintf(sc->tempStr, " outputBlocks[%s / %" PRIu64 "]%s[%s %% %" PRIu64 "] = %ssdata[(combinedID %% %s)*sharedStride+combinedID/%s]%s;\n", sc->inoutID, sc->outputBufferBlockSize, outputsStruct, sc->inoutID, sc->outputBufferBlockSize, convTypeLeft, sc->gl_WorkGroupSize_y, sc->gl_WorkGroupSize_y, convTypeRight);
11886 res = VkAppendLine(sc);
11887 if (res != VKFFT_SUCCESS) return res;
11888 }
11889 }
11891 if (res != VKFFT_SUCCESS) return res;
11892 /*
11893 if (sc->outputBufferBlockNum == 1)
11894 if (sc->localSize[1] == 1)
11895 sc->tempLen = sprintf(sc->tempStr, " %s[indexOutput(inoutID%s%s)] = %stemp_%" PRIu64 "%s;\n", requestCoordinate, requestBatch, convTypeLeft, i, convTypeRight);
11896 else
11897 sc->tempLen = sprintf(sc->tempStr, " %s[indexOutput(inoutID%s%s)] = %stemp_%" PRIu64 "%s;\n", requestCoordinate, requestBatch, convTypeLeft, i, convTypeRight);
11898 else
11899 if (sc->localSize[1] == 1)
11900 sc->tempLen = sprintf(sc->tempStr, " outputBlocks[indexOutput(inoutID%s%s) / %" PRIu64 "]%s[indexOutput(inoutID%s%s) %% %" PRIu64 "] = %stemp_%" PRIu64 "%s;\n", requestCoordinate, requestBatch, sc->outputBufferBlockSize, outputsStruct, requestCoordinate, requestBatch, sc->outputBufferBlockSize, convTypeLeft, i, convTypeRight);
11901 else
11902 sc->tempLen = sprintf(sc->tempStr, " outputBlocks[indexOutput(inoutID%s%s) / %" PRIu64 "]%s[indexOutput(inoutID%s%s) %% %" PRIu64 "] = %stemp_%" PRIu64 "%s;\n", requestCoordinate, requestBatch, sc->outputBufferBlockSize, outputsStruct, requestCoordinate, requestBatch, sc->outputBufferBlockSize, convTypeLeft, i, convTypeRight);
11903 */
11904 if (sc->zeropad[1]) {
11905 sc->tempLen = sprintf(sc->tempStr, " }");
11906 res = VkAppendLine(sc);
11907 if (res != VKFFT_SUCCESS) return res;
11908 }
11909 }
11910 }
11911 }
11912 }
11913 else {
11914 if (sc->fftDim == sc->fft_dim_full) {
11915 for (uint64_t k = 0; k < sc->registerBoost; k++) {
11916 for (uint64_t i = 0; i < sc->min_registers_per_thread; i++) {
11917 if (sc->localSize[1] == 1)
11918 sc->tempLen = sprintf(sc->tempStr, " combinedID = %s + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, (i + k * sc->min_registers_per_thread) * sc->localSize[0]);
11919 else
11920 sc->tempLen = sprintf(sc->tempStr, " combinedID = (%s + %" PRIu64 " * %s) + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, sc->localSize[0], sc->gl_LocalInvocationID_y, (i + k * sc->min_registers_per_thread) * sc->localSize[0] * sc->localSize[1]);
11921 res = VkAppendLine(sc);
11922 if (res != VKFFT_SUCCESS) return res;
11923
11924 if (sc->outputStride[0] > 1)
11925 sc->tempLen = sprintf(sc->tempStr, " inoutID = (combinedID %% %" PRIu64 ") * %" PRIu64 " + (combinedID / %" PRIu64 ") * %" PRIu64 ";\n", sc->fftDim, sc->outputStride[0], sc->fftDim, sc->outputStride[1]);
11926 else
11927 sc->tempLen = sprintf(sc->tempStr, " inoutID = (combinedID %% %" PRIu64 ") + (combinedID / %" PRIu64 ") * %" PRIu64 ";\n", sc->fftDim, sc->fftDim, sc->outputStride[1]);
11928 res = VkAppendLine(sc);
11929 if (res != VKFFT_SUCCESS) return res;
11930 if (sc->axisSwapped) {
11931 if (sc->size[sc->axis_id + 1] % sc->localSize[0] != 0) {
11932 sc->tempLen = sprintf(sc->tempStr, " if(combinedID / %" PRIu64 " + %s*%" PRIu64 "< %" PRIu64 "){", sc->fftDim, sc->gl_WorkGroupID_y, sc->localSize[0], sc->size[sc->axis_id + 1]);
11933 res = VkAppendLine(sc);
11934 if (res != VKFFT_SUCCESS) return res;
11935 }
11936 }
11937 else {
11938 if (sc->size[sc->axis_id + 1] % sc->localSize[1] != 0) {
11939 sc->tempLen = sprintf(sc->tempStr, " if(combinedID / %" PRIu64 " + %s*%" PRIu64 "< %" PRIu64 "){", sc->fftDim, sc->gl_WorkGroupID_y, sc->localSize[1], sc->size[sc->axis_id + 1]);
11940 res = VkAppendLine(sc);
11941 if (res != VKFFT_SUCCESS) return res;
11942 }
11943 }
11944 if (sc->zeropadBluestein[1]) {
11945 sc->tempLen = sprintf(sc->tempStr, " if((combinedID %% %" PRIu64 ") < %" PRIu64 "){\n", sc->fft_dim_full, sc->fft_zeropad_Bluestein_left_write[sc->axis_id]);
11946 res = VkAppendLine(sc);
11947 if (res != VKFFT_SUCCESS) return res;
11948 }
11949 if (sc->zeropad[1]) {
11950 sc->tempLen = sprintf(sc->tempStr, " if((inoutID %% %" PRIu64 " < %" PRIu64 ")||(inoutID %% %" PRIu64 " >= %" PRIu64 ")){\n", sc->outputStride[1], sc->fft_zeropad_left_write[sc->axis_id], sc->outputStride[1], sc->fft_zeropad_right_write[sc->axis_id]);
11951 res = VkAppendLine(sc);
11952 if (res != VKFFT_SUCCESS) return res;
11953 }
11954 sc->tempLen = sprintf(sc->tempStr, " %s = ", sc->inoutID);
11955 res = VkAppendLine(sc);
11956 if (res != VKFFT_SUCCESS) return res;
11957 res = indexOutputVkFFT(sc, uintType, writeType, sc->inoutID, 0, requestCoordinate, requestBatch);
11958 if (res != VKFFT_SUCCESS) return res;
11959 sc->tempLen = sprintf(sc->tempStr, ";\n");
11960 res = VkAppendLine(sc);
11961 if (res != VKFFT_SUCCESS) return res;
11963 if (res != VKFFT_SUCCESS) return res;
11964 if (sc->writeFromRegisters) {
11965 if (sc->outputBufferBlockNum == 1)
11966 sc->tempLen = sprintf(sc->tempStr, " %s[%s] = %s%s%s;\n", outputsStruct, sc->inoutID, convTypeLeft, sc->regIDs[i + k * sc->registers_per_thread], convTypeRight);
11967 else
11968 sc->tempLen = sprintf(sc->tempStr, " outputBlocks[%s / %" PRIu64 "]%s[%s %% %" PRIu64 "] = %s%s%s;\n", sc->inoutID, sc->outputBufferBlockSize, outputsStruct, sc->inoutID, sc->outputBufferBlockSize, convTypeLeft, sc->regIDs[i + k * sc->registers_per_thread], convTypeRight);
11969 res = VkAppendLine(sc);
11970 if (res != VKFFT_SUCCESS) return res;
11971 }
11972 else {
11973 if (sc->axisSwapped) {
11974 if (sc->outputBufferBlockNum == 1)
11975 sc->tempLen = sprintf(sc->tempStr, " %s[%s] = %ssdata[(combinedID %% %" PRIu64 ") * sharedStride + (combinedID / %" PRIu64 ")]%s;\n", outputsStruct, sc->inoutID, convTypeLeft, sc->fftDim, sc->fftDim, convTypeRight);
11976 else
11977 sc->tempLen = sprintf(sc->tempStr, " outputBlocks[%s / %" PRIu64 "]%s[%s %% %" PRIu64 "] = %ssdata[(combinedID %% %" PRIu64 ") * sharedStride + (combinedID / %" PRIu64 ")]%s;\n", sc->inoutID, sc->outputBufferBlockSize, outputsStruct, sc->inoutID, sc->outputBufferBlockSize, convTypeLeft, sc->fftDim, sc->fftDim, convTypeRight);
11978 res = VkAppendLine(sc);
11979 if (res != VKFFT_SUCCESS) return res;
11980 }
11981 else {
11982 if (sc->outputBufferBlockNum == 1)
11983 sc->tempLen = sprintf(sc->tempStr, " %s[%s] = %ssdata[(combinedID %% %" PRIu64 ") + (combinedID / %" PRIu64 ") * sharedStride]%s;\n", outputsStruct, sc->inoutID, convTypeLeft, sc->fftDim, sc->fftDim, convTypeRight);
11984 else
11985 sc->tempLen = sprintf(sc->tempStr, " outputBlocks[%s / %" PRIu64 "]%s[%s %% %" PRIu64 "] = %ssdata[(combinedID %% %" PRIu64 ") + (combinedID / %" PRIu64 ") * sharedStride]%s;\n", sc->inoutID, sc->outputBufferBlockSize, outputsStruct, sc->inoutID, sc->outputBufferBlockSize, convTypeLeft, sc->fftDim, sc->fftDim, convTypeRight);
11986 res = VkAppendLine(sc);
11987 if (res != VKFFT_SUCCESS) return res;
11988 }
11989 }
11991 if (res != VKFFT_SUCCESS) return res;
11992 if (sc->zeropad[1]) {
11993 sc->tempLen = sprintf(sc->tempStr, " }\n");
11994 res = VkAppendLine(sc);
11995 if (res != VKFFT_SUCCESS) return res;
11996 }
11997 if (sc->zeropadBluestein[1]) {
11998 sc->tempLen = sprintf(sc->tempStr, " }\n");
11999 res = VkAppendLine(sc);
12000 if (res != VKFFT_SUCCESS) return res;
12001 }
12002 if (sc->axisSwapped) {
12003 if (sc->size[sc->axis_id + 1] % sc->localSize[0] != 0) {
12004 sc->tempLen = sprintf(sc->tempStr, " }");
12005 res = VkAppendLine(sc);
12006 if (res != VKFFT_SUCCESS) return res;
12007 }
12008 }
12009 else {
12010 if (sc->size[sc->axis_id + 1] % sc->localSize[1] != 0) {
12011 sc->tempLen = sprintf(sc->tempStr, " }");
12012 res = VkAppendLine(sc);
12013 if (res != VKFFT_SUCCESS) return res;
12014 }
12015 }
12016 }
12017 }
12018 }
12019 else {
12020 for (uint64_t k = 0; k < sc->registerBoost; k++) {
12021 for (uint64_t i = 0; i < sc->min_registers_per_thread; i++) {
12022 if (sc->localSize[1] == 1)
12023 sc->tempLen = sprintf(sc->tempStr, " combinedID = %s + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, (i + k * sc->min_registers_per_thread) * sc->localSize[0]);
12024 else {
12025 if (!sc->axisSwapped)
12026 sc->tempLen = sprintf(sc->tempStr, " combinedID = (%s + %" PRIu64 " * %s) + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, sc->localSize[0], sc->gl_LocalInvocationID_y, (i + k * sc->min_registers_per_thread) * sc->localSize[0] * sc->localSize[1]);
12027 else
12028 sc->tempLen = sprintf(sc->tempStr, " combinedID = (%s + %" PRIu64 " * %s) + %" PRIu64 " * numActiveThreads;\n", sc->gl_LocalInvocationID_x, sc->localSize[0], sc->gl_LocalInvocationID_y, (i + k * sc->min_registers_per_thread));
12029 }
12030 res = VkAppendLine(sc);
12031 if (res != VKFFT_SUCCESS) return res;
12032 if (sc->axisSwapped) {
12033 sc->tempLen = sprintf(sc->tempStr, " inoutID = (combinedID %% %" PRIu64 ")+(combinedID / %" PRIu64 ") * %" PRIu64 " + (((%s%s) %% %" PRIu64 ") * %" PRIu64 " + ((%s%s) / %" PRIu64 ") * %" PRIu64 ");", sc->fftDim, sc->fftDim, sc->firstStageStartSize, sc->gl_WorkGroupID_x, shiftX, sc->firstStageStartSize / sc->fftDim, sc->fftDim, sc->gl_WorkGroupID_x, shiftX, sc->firstStageStartSize / sc->fftDim, sc->localSize[0] * sc->firstStageStartSize);
12034 res = VkAppendLine(sc);
12035 if (res != VKFFT_SUCCESS) return res;
12036 }
12037 else {
12038 sc->tempLen = sprintf(sc->tempStr, " inoutID = %s+%" PRIu64 "+%s * %" PRIu64 " + (((%s%s) %% %" PRIu64 ") * %" PRIu64 " + ((%s%s) / %" PRIu64 ") * %" PRIu64 ");", sc->gl_LocalInvocationID_x, (i + k * sc->min_registers_per_thread) * sc->localSize[0], sc->gl_LocalInvocationID_y, sc->firstStageStartSize, sc->gl_WorkGroupID_x, shiftX, sc->firstStageStartSize / sc->fftDim, sc->fftDim, sc->gl_WorkGroupID_x, shiftX, sc->firstStageStartSize / sc->fftDim, sc->localSize[1] * sc->firstStageStartSize);
12039 res = VkAppendLine(sc);
12040 if (res != VKFFT_SUCCESS) return res;
12041 }
12042 if (sc->zeropad[1]) {
12043 sc->tempLen = sprintf(sc->tempStr, " if((inoutID %% %" PRIu64 " < %" PRIu64 ")||(inoutID %% %" PRIu64 " >= %" PRIu64 ")){\n", sc->fft_dim_full, sc->fft_zeropad_left_write[sc->axis_id], sc->fft_dim_full, sc->fft_zeropad_right_write[sc->axis_id]);
12044 res = VkAppendLine(sc);
12045 if (res != VKFFT_SUCCESS) return res;
12046 }
12047 sc->tempLen = sprintf(sc->tempStr, " %s = ", sc->inoutID);
12048 res = VkAppendLine(sc);
12049 if (res != VKFFT_SUCCESS) return res;
12050 res = indexOutputVkFFT(sc, uintType, writeType, sc->inoutID, 0, requestCoordinate, requestBatch);
12051 if (res != VKFFT_SUCCESS) return res;
12052 sc->tempLen = sprintf(sc->tempStr, ";\n");
12053 res = VkAppendLine(sc);
12054 if (res != VKFFT_SUCCESS) return res;
12055 //sc->tempLen = sprintf(sc->tempStr, " inoutID = indexOutput(%s+i*%" PRIu64 "+%s * %" PRIu64 " + (((%s%s) %% %" PRIu64 ") * %" PRIu64 " + ((%s%s) / %" PRIu64 ") * %" PRIu64 ")%s%s);\n", sc->gl_LocalInvocationID_x, sc->localSize[0], sc->gl_LocalInvocationID_y, sc->firstStageStartSize, sc->gl_WorkGroupID_x, shiftX, sc->firstStageStartSize / sc->fftDim, sc->fftDim, sc->gl_WorkGroupID_x, shiftX, sc->firstStageStartSize / sc->fftDim, sc->localSize[1] * sc->firstStageStartSize, requestCoordinate, requestBatch);
12057 if (res != VKFFT_SUCCESS) return res;
12058 if (sc->writeFromRegisters) {
12059 if (sc->outputBufferBlockNum == 1)
12060 sc->tempLen = sprintf(sc->tempStr, " %s[inoutID]=%s%s%s;\n", outputsStruct, convTypeLeft, sc->regIDs[i + k * sc->registers_per_thread], convTypeRight);
12061 else
12062 sc->tempLen = sprintf(sc->tempStr, " outputBlocks[inoutID / %" PRIu64 "]%s[inoutID %% %" PRIu64 "] = %s%s%s;\n", sc->outputBufferBlockSize, outputsStruct, sc->outputBufferBlockSize, convTypeLeft, sc->regIDs[i + k * sc->registers_per_thread], convTypeRight);
12063 res = VkAppendLine(sc);
12064 if (res != VKFFT_SUCCESS) return res;
12065 }
12066 else {
12067 if (sc->axisSwapped) {
12068 if (sc->outputBufferBlockNum == 1)
12069 sc->tempLen = sprintf(sc->tempStr, " %s[inoutID]=%ssdata[%s + sharedStride*(%s + %" PRIu64 ")]%s;\n", outputsStruct, convTypeLeft, sc->gl_LocalInvocationID_x, sc->gl_LocalInvocationID_y, (i + k * sc->min_registers_per_thread) * sc->localSize[1], convTypeRight);
12070 else
12071 sc->tempLen = sprintf(sc->tempStr, " outputBlocks[inoutID / %" PRIu64 "]%s[inoutID %% %" PRIu64 "] = %ssdata[%s + sharedStride*(%s + %" PRIu64 ")]%s;\n", sc->outputBufferBlockSize, outputsStruct, sc->outputBufferBlockSize, convTypeLeft, sc->gl_LocalInvocationID_x, sc->gl_LocalInvocationID_y, (i + k * sc->min_registers_per_thread) * sc->localSize[1], convTypeRight);
12072 res = VkAppendLine(sc);
12073 if (res != VKFFT_SUCCESS) return res;
12074 }
12075 else {
12076 if (sc->outputBufferBlockNum == 1)
12077 sc->tempLen = sprintf(sc->tempStr, " %s[inoutID]=%ssdata[sharedStride*%s + (%s + %" PRIu64 ")]%s;\n", outputsStruct, convTypeLeft, sc->gl_LocalInvocationID_y, sc->gl_LocalInvocationID_x, (i + k * sc->min_registers_per_thread) * sc->localSize[0], convTypeRight);
12078 else
12079 sc->tempLen = sprintf(sc->tempStr, " outputBlocks[inoutID / %" PRIu64 "]%s[inoutID %% %" PRIu64 "] = %ssdata[sharedStride*%s + (%s + %" PRIu64 ")]%s;\n", sc->outputBufferBlockSize, outputsStruct, sc->outputBufferBlockSize, convTypeLeft, sc->gl_LocalInvocationID_y, sc->gl_LocalInvocationID_x, (i + k * sc->min_registers_per_thread) * sc->localSize[0], convTypeRight);
12080 res = VkAppendLine(sc);
12081 if (res != VKFFT_SUCCESS) return res;
12082 }
12083 }
12085 if (res != VKFFT_SUCCESS) return res;
12086 if (sc->zeropad[1]) {
12087 sc->tempLen = sprintf(sc->tempStr, " }\n");
12088 res = VkAppendLine(sc);
12089 if (res != VKFFT_SUCCESS) return res;
12090 }
12091 }
12092 }
12093 }
12094 }
12095 sc->tempLen = sprintf(sc->tempStr, " }\n");
12096 res = VkAppendLine(sc);
12097 if (res != VKFFT_SUCCESS) return res;
12098 break;
12099 }
12100 case 1: //grouped_c2c
12101 {
12102 if (!sc->writeFromRegisters) {
12103 res = appendBarrierVkFFT(sc, 1);
12104 if (res != VKFFT_SUCCESS) return res;
12105 }
12106 //res = appendZeropadStart(sc);
12107 //if (res != VKFFT_SUCCESS) return res;
12108 char shiftX[500] = "";
12109 if (sc->performWorkGroupShift[0])
12110 sprintf(shiftX, " + consts.workGroupShiftX * %s ", sc->gl_WorkGroupSize_x);
12111 sc->tempLen = sprintf(sc->tempStr, " if (((%s%s) / %" PRIu64 ") %% (%" PRIu64 ")+((%s%s) / %" PRIu64 ") * (%" PRIu64 ") < %" PRIu64 ") {\n", sc->gl_GlobalInvocationID_x, shiftX, sc->fft_dim_x, sc->stageStartSize, sc->gl_GlobalInvocationID_x, shiftX, sc->fft_dim_x * sc->stageStartSize, sc->fftDim * sc->stageStartSize, sc->size[sc->axis_id]);
12112 res = VkAppendLine(sc);
12113 if (res != VKFFT_SUCCESS) return res;
12114 if ((sc->reorderFourStep) && (sc->stageStartSize == 1)) {
12115 for (uint64_t k = 0; k < sc->registerBoost; k++) {
12116 for (uint64_t i = 0; i < sc->min_registers_per_thread; i++) {
12117 sc->tempLen = sprintf(sc->tempStr, " inoutID = (%s + %" PRIu64 ") * (%" PRIu64 ") + (((%s%s) / %" PRIu64 ") %% (%" PRIu64 ")) * (%" PRIu64 ") + ((%s%s) / %" PRIu64 ");\n", sc->gl_LocalInvocationID_y, (i + k * sc->min_registers_per_thread) * sc->localSize[1], sc->fft_dim_full / sc->fftDim, sc->gl_GlobalInvocationID_x, shiftX, sc->fft_dim_x, sc->firstStageStartSize / sc->fftDim, sc->fft_dim_full / sc->firstStageStartSize, sc->gl_GlobalInvocationID_x, shiftX, sc->fft_dim_x * (sc->firstStageStartSize / sc->fftDim));
12118 res = VkAppendLine(sc);
12119 if (res != VKFFT_SUCCESS) return res;
12120
12121 if (sc->zeropad[1]) {
12122 sc->tempLen = sprintf(sc->tempStr, " if((inoutID %% %" PRIu64 " < %" PRIu64 ")||(inoutID %% %" PRIu64 " >= %" PRIu64 ")){\n", sc->fft_dim_full, sc->fft_zeropad_left_write[sc->axis_id], sc->fft_dim_full, sc->fft_zeropad_right_write[sc->axis_id]);
12123 res = VkAppendLine(sc);
12124 if (res != VKFFT_SUCCESS) return res;
12125 }
12126 sc->tempLen = sprintf(sc->tempStr, " %s = ", sc->inoutID);
12127 res = VkAppendLine(sc);
12128 if (res != VKFFT_SUCCESS) return res;
12129 sprintf(index_x, "(%s%s) %% (%" PRIu64 ")", sc->gl_GlobalInvocationID_x, shiftX, sc->fft_dim_x);
12130 res = indexOutputVkFFT(sc, uintType, writeType, index_x, sc->inoutID, requestCoordinate, requestBatch);
12131 if (res != VKFFT_SUCCESS) return res;
12132 sc->tempLen = sprintf(sc->tempStr, ";\n");
12133 res = VkAppendLine(sc);
12134 if (res != VKFFT_SUCCESS) return res;
12136 if (res != VKFFT_SUCCESS) return res;
12137 if (sc->writeFromRegisters) {
12138 if (sc->outputBufferBlockNum == 1)
12139 sc->tempLen = sprintf(sc->tempStr, " %s[%s] = %s%s%s;\n", outputsStruct, sc->inoutID, convTypeLeft, sc->regIDs[i + k * sc->registers_per_thread], convTypeRight);
12140 else
12141 sc->tempLen = sprintf(sc->tempStr, " outputBlocks[%s / %" PRIu64 "]%s[%s %% %" PRIu64 "] = %s%s%s;\n", sc->inoutID, sc->outputBufferBlockSize, outputsStruct, sc->inoutID, sc->outputBufferBlockSize, convTypeLeft, sc->regIDs[i + k * sc->registers_per_thread], convTypeRight);
12142 res = VkAppendLine(sc);
12143 if (res != VKFFT_SUCCESS) return res;
12144 }
12145 else {
12146 if (sc->outputBufferBlockNum == 1)
12147 sc->tempLen = sprintf(sc->tempStr, " %s[%s] = %ssdata[%s*(%s+%" PRIu64 ") + %s]%s;\n", outputsStruct, sc->inoutID, convTypeLeft, sc->sharedStride, sc->gl_LocalInvocationID_y, (i + k * sc->min_registers_per_thread) * sc->localSize[1], sc->gl_LocalInvocationID_x, convTypeRight);
12148 else
12149 sc->tempLen = sprintf(sc->tempStr, " outputBlocks[%s / %" PRIu64 "]%s[%s %% %" PRIu64 "] = %ssdata[%s*(%s+%" PRIu64 ") + %s]%s;\n", sc->inoutID, sc->outputBufferBlockSize, outputsStruct, sc->inoutID, sc->outputBufferBlockSize, convTypeLeft, sc->sharedStride, sc->gl_LocalInvocationID_y, (i + k * sc->min_registers_per_thread) * sc->localSize[1], sc->gl_LocalInvocationID_x, convTypeRight);
12150 res = VkAppendLine(sc);
12151 if (res != VKFFT_SUCCESS) return res;
12152
12153 }
12155 if (res != VKFFT_SUCCESS) return res;
12156 if (sc->zeropad[1]) {
12157 sc->tempLen = sprintf(sc->tempStr, " }\n");
12158 res = VkAppendLine(sc);
12159 if (res != VKFFT_SUCCESS) return res;
12160 }
12161
12162 }
12163 }
12164 }
12165 else {
12166 for (uint64_t k = 0; k < sc->registerBoost; k++) {
12167 for (uint64_t i = 0; i < sc->min_registers_per_thread; i++) {
12168 if (sc->zeropadBluestein[1]) {
12169 sc->tempLen = sprintf(sc->tempStr, " inoutID = (%s + %" PRIu64 ") * %" PRIu64 " + ((%s%s) / %" PRIu64 ") %% (%" PRIu64 ")+((%s%s) / %" PRIu64 ") * (%" PRIu64 ");\n", sc->gl_LocalInvocationID_y, (i + k * sc->min_registers_per_thread) * sc->localSize[1], sc->stageStartSize, sc->gl_GlobalInvocationID_x, shiftX, sc->fft_dim_x, sc->stageStartSize, sc->gl_GlobalInvocationID_x, shiftX, sc->fft_dim_x * sc->stageStartSize, sc->stageStartSize * sc->fftDim);
12170 res = VkAppendLine(sc);
12171 if (res != VKFFT_SUCCESS) return res;
12172 sc->tempLen = sprintf(sc->tempStr, " if((inoutID %% %" PRIu64 ") < %" PRIu64 "){\n", sc->fft_dim_full, sc->fft_zeropad_Bluestein_left_write[sc->axis_id]);
12173 res = VkAppendLine(sc);
12174 if (res != VKFFT_SUCCESS) return res;
12175 }
12176 if (sc->zeropad[1]) {
12177 if (!sc->zeropadBluestein[1]) {
12178 sc->tempLen = sprintf(sc->tempStr, " inoutID = (%s + %" PRIu64 ") * %" PRIu64 " + ((%s%s) / %" PRIu64 ") %% (%" PRIu64 ")+((%s%s) / %" PRIu64 ") * (%" PRIu64 ");\n", sc->gl_LocalInvocationID_y, (i + k * sc->min_registers_per_thread) * sc->localSize[1], sc->stageStartSize, sc->gl_GlobalInvocationID_x, shiftX, sc->fft_dim_x, sc->stageStartSize, sc->gl_GlobalInvocationID_x, shiftX, sc->fft_dim_x * sc->stageStartSize, sc->stageStartSize * sc->fftDim);
12179 res = VkAppendLine(sc);
12180 if (res != VKFFT_SUCCESS) return res;
12181 }
12182 sc->tempLen = sprintf(sc->tempStr, " if((inoutID %% %" PRIu64 " < %" PRIu64 ")||(inoutID %% %" PRIu64 " >= %" PRIu64 ")){\n", sc->fft_dim_full, sc->fft_zeropad_left_write[sc->axis_id], sc->fft_dim_full, sc->fft_zeropad_right_write[sc->axis_id]);
12183 res = VkAppendLine(sc);
12184 if (res != VKFFT_SUCCESS) return res;
12185 }
12186 sc->tempLen = sprintf(sc->tempStr, " %s = ", sc->inoutID);
12187 res = VkAppendLine(sc);
12188 if (res != VKFFT_SUCCESS) return res;
12189 sprintf(index_x, "(%s%s) %% (%" PRIu64 ")", sc->gl_GlobalInvocationID_x, shiftX, sc->fft_dim_x);
12190 sprintf(index_y, "%" PRIu64 " * (%s + %" PRIu64 ") + ((%s%s) / %" PRIu64 ") %% (%" PRIu64 ")+((%s%s) / %" PRIu64 ") * (%" PRIu64 ")", sc->stageStartSize, sc->gl_LocalInvocationID_y, (i + k * sc->min_registers_per_thread) * sc->localSize[1], sc->gl_GlobalInvocationID_x, shiftX, sc->fft_dim_x, sc->stageStartSize, sc->gl_GlobalInvocationID_x, shiftX, sc->fft_dim_x * sc->stageStartSize, sc->stageStartSize * sc->fftDim);
12191 res = indexOutputVkFFT(sc, uintType, writeType, index_x, index_y, requestCoordinate, requestBatch);
12192 if (res != VKFFT_SUCCESS) return res;
12193 sc->tempLen = sprintf(sc->tempStr, ";\n");
12194 res = VkAppendLine(sc);
12195 if (res != VKFFT_SUCCESS) return res;
12197 if (res != VKFFT_SUCCESS) return res;
12198 //sc->tempLen = sprintf(sc->tempStr, " inoutID = indexOutput((%s%s) %% (%" PRIu64 "), %" PRIu64 " * (%s + %" PRIu64 ") + ((%s%s) / %" PRIu64 ") %% (%" PRIu64 ")+((%s%s) / %" PRIu64 ") * (%" PRIu64 ")%s%s);\n", sc->gl_GlobalInvocationID_x, shiftX, sc->fft_dim_x, sc->stageStartSize, sc->gl_LocalInvocationID_y, (i + k * sc->min_registers_per_thread) * sc->localSize[1], sc->gl_GlobalInvocationID_x, shiftX, sc->fft_dim_x, sc->stageStartSize, sc->gl_GlobalInvocationID_x, shiftX, sc->fft_dim_x * sc->stageStartSize, sc->stageStartSize * sc->fftDim, requestCoordinate, requestBatch);
12199 if (sc->writeFromRegisters) {
12200 if (sc->outputBufferBlockNum == 1)
12201 sc->tempLen = sprintf(sc->tempStr, " %s[inoutID] = %s%s%s;\n", outputsStruct, convTypeLeft, sc->regIDs[i + k * sc->registers_per_thread], convTypeRight);
12202 else
12203 sc->tempLen = sprintf(sc->tempStr, " outputBlocks[inoutID / %" PRIu64 "]%s[inoutID %% %" PRIu64 "] = %s%s%s;\n", sc->outputBufferBlockSize, outputsStruct, sc->outputBufferBlockSize, convTypeLeft, sc->regIDs[i + k * sc->registers_per_thread], convTypeRight);
12204 res = VkAppendLine(sc);
12205 if (res != VKFFT_SUCCESS) return res;
12206 }
12207 else {
12208 if (sc->outputBufferBlockNum == 1)
12209 sc->tempLen = sprintf(sc->tempStr, " %s[inoutID] = %ssdata[%s*(%s+%" PRIu64 ") + %s]%s;\n", outputsStruct, convTypeLeft, sc->sharedStride, sc->gl_LocalInvocationID_y, (i + k * sc->min_registers_per_thread) * sc->localSize[1], sc->gl_LocalInvocationID_x, convTypeRight);
12210 else
12211 sc->tempLen = sprintf(sc->tempStr, " outputBlocks[inoutID / %" PRIu64 "]%s[inoutID %% %" PRIu64 "] = %ssdata[%s*(%s+%" PRIu64 ") + %s]%s;\n", sc->outputBufferBlockSize, outputsStruct, sc->outputBufferBlockSize, convTypeLeft, sc->sharedStride, sc->gl_LocalInvocationID_y, (i + k * sc->min_registers_per_thread) * sc->localSize[1], sc->gl_LocalInvocationID_x, convTypeRight);
12212 res = VkAppendLine(sc);
12213 if (res != VKFFT_SUCCESS) return res;
12214 }
12216 if (res != VKFFT_SUCCESS) return res;
12217 if (sc->zeropad[1]) {
12218 sc->tempLen = sprintf(sc->tempStr, " }\n");
12219 res = VkAppendLine(sc);
12220 if (res != VKFFT_SUCCESS) return res;
12221 }
12222 if (sc->zeropadBluestein[1]) {
12223 sc->tempLen = sprintf(sc->tempStr, " }\n");
12224 res = VkAppendLine(sc);
12225 if (res != VKFFT_SUCCESS) return res;
12226 }
12227 }
12228 }
12229 }
12230 sc->tempLen = sprintf(sc->tempStr, " }\n");
12231 res = VkAppendLine(sc);
12232 if (res != VKFFT_SUCCESS) return res;
12233 break;
12234
12235 }
12236 case 2: //single_c2c_strided
12237 {
12238 if (!sc->writeFromRegisters) {
12239 res = appendBarrierVkFFT(sc, 1);
12240 if (res != VKFFT_SUCCESS) return res;
12241 }
12242 //res = appendZeropadStart(sc);
12243 //if (res != VKFFT_SUCCESS) return res;
12244 char shiftX[500] = "";
12245 if (sc->performWorkGroupShift[0])
12246 sprintf(shiftX, " + consts.workGroupShiftX * %s ", sc->gl_WorkGroupSize_x);
12247 sc->tempLen = sprintf(sc->tempStr, " if (((%s%s) / %" PRIu64 ") * (%" PRIu64 ") < %" PRIu64 ") {\n", sc->gl_GlobalInvocationID_x, shiftX, sc->stageStartSize, sc->stageStartSize * sc->fftDim, sc->fft_dim_full);
12248 res = VkAppendLine(sc);
12249 if (res != VKFFT_SUCCESS) return res;
12250 for (uint64_t k = 0; k < sc->registerBoost; k++) {
12251 for (uint64_t i = 0; i < sc->min_registers_per_thread; i++) {
12252 sc->tempLen = sprintf(sc->tempStr, " inoutID = (%s%s) %% (%" PRIu64 ") + %" PRIu64 " * (%s + %" PRIu64 ") + ((%s%s) / %" PRIu64 ") * (%" PRIu64 ");\n", sc->gl_GlobalInvocationID_x, shiftX, sc->stageStartSize, sc->stageStartSize, sc->gl_LocalInvocationID_y, (i + k * sc->min_registers_per_thread) * sc->localSize[1], sc->gl_GlobalInvocationID_x, shiftX, sc->stageStartSize, sc->stageStartSize * sc->fftDim);
12253 res = VkAppendLine(sc);
12254 if (res != VKFFT_SUCCESS) return res;
12255 if (sc->zeropadBluestein[1]) {
12256 sc->tempLen = sprintf(sc->tempStr, " if((inoutID %% %" PRIu64 ") < %" PRIu64 "){\n", sc->fft_dim_full, sc->fft_zeropad_Bluestein_left_write[sc->axis_id]);
12257 res = VkAppendLine(sc);
12258 if (res != VKFFT_SUCCESS) return res;
12259 }
12260 if (sc->zeropad[1]) {
12261 sc->tempLen = sprintf(sc->tempStr, " if((inoutID %% %" PRIu64 " < %" PRIu64 ")||(inoutID %% %" PRIu64 " >= %" PRIu64 ")){\n", sc->fft_dim_full, sc->fft_zeropad_left_write[sc->axis_id], sc->fft_dim_full, sc->fft_zeropad_right_write[sc->axis_id]);
12262 res = VkAppendLine(sc);
12263 if (res != VKFFT_SUCCESS) return res;
12264 }
12265 sc->tempLen = sprintf(sc->tempStr, " %s = ", sc->inoutID);
12266 res = VkAppendLine(sc);
12267 if (res != VKFFT_SUCCESS) return res;
12268 res = indexOutputVkFFT(sc, uintType, writeType, sc->inoutID, 0, requestCoordinate, requestBatch);
12269 if (res != VKFFT_SUCCESS) return res;
12270 sc->tempLen = sprintf(sc->tempStr, ";\n");
12271 res = VkAppendLine(sc);
12272 if (res != VKFFT_SUCCESS) return res;
12274 if (res != VKFFT_SUCCESS) return res;
12275 if (sc->writeFromRegisters) {
12276 if (sc->outputBufferBlockNum == 1)
12277 sc->tempLen = sprintf(sc->tempStr, " %s[inoutID] = %s%s%s;\n", outputsStruct, convTypeLeft, sc->regIDs[i + k * sc->registers_per_thread], convTypeRight);
12278 else
12279 sc->tempLen = sprintf(sc->tempStr, " outputBlocks[inoutID / %" PRIu64 "]%s[inoutID %% %" PRIu64 "] = %s%s%s;\n", sc->outputBufferBlockSize, outputsStruct, sc->outputBufferBlockSize, convTypeLeft, sc->regIDs[i + k * sc->registers_per_thread], convTypeRight);
12280 res = VkAppendLine(sc);
12281 if (res != VKFFT_SUCCESS) return res;
12282 }
12283 else {
12284 if (sc->outputBufferBlockNum == 1)
12285 sc->tempLen = sprintf(sc->tempStr, " %s[inoutID] = %ssdata[%s*(%s+%" PRIu64 ") + %s]%s;\n", outputsStruct, convTypeLeft, sc->sharedStride, sc->gl_LocalInvocationID_y, (i + k * sc->min_registers_per_thread) * sc->localSize[1], sc->gl_LocalInvocationID_x, convTypeRight);
12286 else
12287 sc->tempLen = sprintf(sc->tempStr, " outputBlocks[inoutID / %" PRIu64 "]%s[inoutID %% %" PRIu64 "] = %ssdata[%s*(%s+%" PRIu64 ") + %s]%s;\n", sc->outputBufferBlockSize, outputsStruct, sc->outputBufferBlockSize, convTypeLeft, sc->sharedStride, sc->gl_LocalInvocationID_y, (i + k * sc->min_registers_per_thread) * sc->localSize[1], sc->gl_LocalInvocationID_x, convTypeRight);
12288 res = VkAppendLine(sc);
12289 if (res != VKFFT_SUCCESS) return res;
12290 }
12292 if (res != VKFFT_SUCCESS) return res;
12293 if (sc->zeropad[1]) {
12294 sc->tempLen = sprintf(sc->tempStr, " }\n");
12295 res = VkAppendLine(sc);
12296 if (res != VKFFT_SUCCESS) return res;
12297 }
12298 if (sc->zeropadBluestein[1]) {
12299 sc->tempLen = sprintf(sc->tempStr, " }\n");
12300 res = VkAppendLine(sc);
12301 if (res != VKFFT_SUCCESS) return res;
12302 }
12303 }
12304 }
12305 sc->tempLen = sprintf(sc->tempStr, " }\n");
12306 res = VkAppendLine(sc);
12307 if (res != VKFFT_SUCCESS) return res;
12308 break;
12309
12310 }
12311 case 5://single_r2c
12312 {
12313 if (!sc->writeFromRegisters) {
12314 res = appendBarrierVkFFT(sc, 1);
12315 if (res != VKFFT_SUCCESS) return res;
12316 }
12317 //res = appendZeropadStart(sc);
12318 //if (res != VKFFT_SUCCESS) return res;
12319 char shiftX[500] = "";
12320 if (sc->performWorkGroupShift[0])
12321 sprintf(shiftX, " + consts.workGroupShiftX ");
12322 char shiftY[500] = "";
12323 if (sc->performWorkGroupShift[1])
12324 sprintf(shiftY, " + consts.workGroupShiftY*%s ", sc->gl_WorkGroupSize_y);
12325 if (sc->performWorkGroupShift[1])
12326 sprintf(shiftY, " + consts.workGroupShiftY ");
12327 uint64_t mult = (sc->mergeSequencesR2C) ? 2 : 1;
12328 if (sc->reorderFourStep) {
12329 //Not implemented
12330 }
12331 else {
12332 //appendBarrierVkFFT(sc, 1);
12333 //appendZeropadStart(sc);
12334 if (sc->fftDim == sc->fft_dim_full) {
12336 for (uint64_t k = 0; k < sc->registerBoost; k++) {
12337 if (sc->mergeSequencesR2C) {
12338 if (sc->axisSwapped) {
12339 sc->tempLen = sprintf(sc->tempStr, "\
12340 if (%s==0)\n\
12341 {\n\
12342 sdata[%s + %" PRIu64 "* sharedStride] = sdata[%s];\n\
12344 res = VkAppendLine(sc);
12345 if (res != VKFFT_SUCCESS) return res;
12346 //res = appendZeropadEnd(sc);
12347 //if (res != VKFFT_SUCCESS) return res;
12348 res = appendBarrierVkFFT(sc, 1);
12349 if (res != VKFFT_SUCCESS) return res;
12350 //res = appendZeropadStart(sc);
12351 //if (res != VKFFT_SUCCESS) return res;
12352 }
12353 else {
12354 sc->tempLen = sprintf(sc->tempStr, "\
12355 if (%s==0)\n\
12356 {\n\
12357 sdata[%s * sharedStride + %" PRIu64 "] = sdata[%s * sharedStride];\n\
12359 res = VkAppendLine(sc);
12360 if (res != VKFFT_SUCCESS) return res;
12361 //res = appendZeropadEnd(sc);
12362 //if (res != VKFFT_SUCCESS) return res;
12363 res = appendBarrierVkFFT(sc, 1);
12364 if (res != VKFFT_SUCCESS) return res;
12365 //res = appendZeropadStart(sc);
12366 //if (res != VKFFT_SUCCESS) return res;
12367 }
12368 }
12369 uint64_t num_out = (sc->axisSwapped) ? (uint64_t)ceil(mult * (sc->fftDim / 2 + 1) / (double)sc->localSize[1]) : (uint64_t)ceil(mult * (sc->fftDim / 2 + 1) / (double)sc->localSize[0]);
12370 //num_out = (uint64_t)ceil(num_out / (double)sc->min_registers_per_thread);
12371 for (uint64_t i = 0; i < num_out; i++) {
12372 if (sc->localSize[1] == 1)
12373 sc->tempLen = sprintf(sc->tempStr, " combinedID = %s + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, (i + k * num_out) * sc->localSize[0]);
12374 else
12375 sc->tempLen = sprintf(sc->tempStr, " combinedID = (%s + %" PRIu64 " * %s) + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, sc->localSize[0], sc->gl_LocalInvocationID_y, (i + k * num_out) * sc->localSize[0] * sc->localSize[1]);
12376 res = VkAppendLine(sc);
12377 if (res != VKFFT_SUCCESS) return res;
12378
12379 if (!sc->axisSwapped) {
12380 sc->tempLen = sprintf(sc->tempStr, " %s = combinedID %% %" PRIu64 " + ((combinedID/%" PRIu64 ") * %" PRIu64 ");", sc->inoutID, sc->fftDim / 2 + 1, sc->fftDim / 2 + 1, sc->outputStride[1]);
12381 res = VkAppendLine(sc);
12382 if (res != VKFFT_SUCCESS) return res;
12383 }
12384 else {
12385 sc->tempLen = sprintf(sc->tempStr, " %s = combinedID %% %" PRIu64 " + ((combinedID/%" PRIu64 ") * %" PRIu64 ");", sc->inoutID, sc->fftDim / 2 + 1, sc->fftDim / 2 + 1, sc->outputStride[1]);
12386 res = VkAppendLine(sc);
12387 if (res != VKFFT_SUCCESS) return res;
12388 }
12389
12390 if (sc->axisSwapped) {
12391 if ((uint64_t)ceil(sc->size[1] / (double)mult) % sc->localSize[0] != 0) {
12392 sc->tempLen = sprintf(sc->tempStr, " if(combinedID / %" PRIu64 " + %s*%" PRIu64 "< %" PRIu64 "){", mult * (sc->fftDim / 2 + 1), sc->gl_WorkGroupID_y, sc->localSize[0], (uint64_t)ceil(sc->size[1] / (double)mult));
12393 res = VkAppendLine(sc);
12394 if (res != VKFFT_SUCCESS) return res;
12395 }
12396 if ((1 + i + k * num_out) * sc->localSize[0] * sc->localSize[1] >= mult * (sc->fftDim / 2 + 1) * sc->localSize[0]) {
12397 sc->tempLen = sprintf(sc->tempStr, " if(combinedID < %" PRIu64 "){", mult * (sc->fftDim / 2 + 1) * sc->localSize[0]);
12398 res = VkAppendLine(sc);
12399 if (res != VKFFT_SUCCESS) return res;
12400 }
12401 }
12402 else {
12403 if ((uint64_t)ceil(sc->size[1] / (double)mult) % sc->localSize[1] != 0) {
12404 sc->tempLen = sprintf(sc->tempStr, " if(combinedID / %" PRIu64 " + %s*%" PRIu64 "< %" PRIu64 "){", mult * (sc->fftDim / 2 + 1), sc->gl_WorkGroupID_y, sc->localSize[1], (uint64_t)ceil(sc->size[1] / (double)mult));
12405 res = VkAppendLine(sc);
12406 if (res != VKFFT_SUCCESS) return res;
12407 }
12408 if ((1 + i + k * num_out) * sc->localSize[0] * sc->localSize[1] >= mult * (sc->fftDim / 2 + 1) * sc->localSize[1]) {
12409 sc->tempLen = sprintf(sc->tempStr, " if(combinedID < %" PRIu64 "){", mult * (sc->fftDim / 2 + 1) * sc->localSize[1]);
12410 res = VkAppendLine(sc);
12411 if (res != VKFFT_SUCCESS) return res;
12412 }
12413 }
12414 if (sc->zeropad[1]) {
12415 sc->tempLen = sprintf(sc->tempStr, " if((inoutID %% %" PRIu64 " < %" PRIu64 ")||(inoutID %% %" PRIu64 " >= %" PRIu64 ")){\n", sc->outputStride[1], sc->fft_zeropad_left_write[sc->axis_id], sc->outputStride[1], sc->fft_zeropad_right_write[sc->axis_id]);
12416 res = VkAppendLine(sc);
12417 if (res != VKFFT_SUCCESS) return res;
12418 }
12419 sc->tempLen = sprintf(sc->tempStr, " %s = ", sc->inoutID);
12420 res = VkAppendLine(sc);
12421 if (res != VKFFT_SUCCESS) return res;
12422 res = indexOutputVkFFT(sc, uintType, writeType, sc->inoutID, 0, requestCoordinate, requestBatch);
12423 sc->tempLen = sprintf(sc->tempStr, ";\n");
12424 res = VkAppendLine(sc);
12425 if (res != VKFFT_SUCCESS) return res;
12427 if (res != VKFFT_SUCCESS) return res;
12428 if (sc->writeFromRegisters) {
12429 //not working yet
12430 if (sc->outputBufferBlockNum == 1)
12431 sc->tempLen = sprintf(sc->tempStr, " %s[inoutID] = %s%s%s;\n", outputsStruct, convTypeLeft, sc->regIDs[i + k * sc->registers_per_thread], convTypeRight);
12432 else
12433 sc->tempLen = sprintf(sc->tempStr, " outputBlocks[inoutID / %" PRIu64 "]%s[inoutID %% %" PRIu64 "] = %s%s%s;\n", sc->outputBufferBlockSize, outputsStruct, sc->outputBufferBlockSize, convTypeLeft, sc->regIDs[i + k * sc->registers_per_thread], convTypeRight);
12434 res = VkAppendLine(sc);
12435 if (res != VKFFT_SUCCESS) return res;
12436 }
12437 else {
12438 if (sc->mergeSequencesR2C) {
12439 if (sc->axisSwapped) {
12440 sc->tempLen = sprintf(sc->tempStr, "if ( (combinedID / %" PRIu64 ") %% 2 == 0){\n", sc->fftDim / 2 + 1);
12441 res = VkAppendLine(sc);
12442 if (res != VKFFT_SUCCESS) return res;
12443 sc->tempLen = sprintf(sc->tempStr, " %s.x = 0.5%s*(sdata[(combinedID %% %" PRIu64 ")* sharedStride + (combinedID / %" PRIu64 ")].x+sdata[(%" PRIu64 "-combinedID %% %" PRIu64 ")* sharedStride + (combinedID / %" PRIu64 ")].x);\n", sc->regIDs[0], LFending, sc->fftDim / 2 + 1, 2 * (sc->fftDim / 2 + 1), sc->fftDim, sc->fftDim / 2 + 1, 2 * (sc->fftDim / 2 + 1));
12444 res = VkAppendLine(sc);
12445 if (res != VKFFT_SUCCESS) return res;
12446 sc->tempLen = sprintf(sc->tempStr, " %s.y = 0.5%s*(sdata[(combinedID %% %" PRIu64 ")* sharedStride + (combinedID / %" PRIu64 ")].y-sdata[(%" PRIu64 "-combinedID %% %" PRIu64 ")* sharedStride + (combinedID / %" PRIu64 ")].y);\n", sc->regIDs[0], LFending, sc->fftDim / 2 + 1, 2 * (sc->fftDim / 2 + 1), sc->fftDim, sc->fftDim / 2 + 1, 2 * (sc->fftDim / 2 + 1));
12447 res = VkAppendLine(sc);
12448 if (res != VKFFT_SUCCESS) return res;
12449 sc->tempLen = sprintf(sc->tempStr, "}else{\n");
12450 res = VkAppendLine(sc);
12451 if (res != VKFFT_SUCCESS) return res;
12452 sc->tempLen = sprintf(sc->tempStr, " %s.x = 0.5%s*(sdata[(combinedID %% %" PRIu64 ")* sharedStride + (combinedID / %" PRIu64 ")].y+sdata[(%" PRIu64 "-combinedID %% %" PRIu64 ")* sharedStride + (combinedID / %" PRIu64 ")].y);\n", sc->regIDs[0], LFending, sc->fftDim / 2 + 1, 2 * (sc->fftDim / 2 + 1), sc->fftDim, sc->fftDim / 2 + 1, 2 * (sc->fftDim / 2 + 1));
12453 res = VkAppendLine(sc);
12454 if (res != VKFFT_SUCCESS) return res;
12455 sc->tempLen = sprintf(sc->tempStr, " %s.y = 0.5%s*(-sdata[(combinedID %% %" PRIu64 ")* sharedStride + (combinedID / %" PRIu64 ")].x+sdata[(%" PRIu64 "-combinedID %% %" PRIu64 ")* sharedStride + (combinedID / %" PRIu64 ")].x);\n", sc->regIDs[0], LFending, sc->fftDim / 2 + 1, 2 * (sc->fftDim / 2 + 1), sc->fftDim, sc->fftDim / 2 + 1, 2 * (sc->fftDim / 2 + 1));
12456 res = VkAppendLine(sc);
12457 if (res != VKFFT_SUCCESS) return res;
12458 sc->tempLen = sprintf(sc->tempStr, "}\n");
12459 res = VkAppendLine(sc);
12460 if (res != VKFFT_SUCCESS) return res;
12461 if (sc->outputBufferBlockNum == 1)
12462 sc->tempLen = sprintf(sc->tempStr, " %s[%s] = %s%s%s;\n", outputsStruct, sc->inoutID, convTypeLeft, sc->regIDs[0], convTypeRight);
12463 else
12464 sc->tempLen = sprintf(sc->tempStr, " outputBlocks[%s / %" PRIu64 "]%s[%s %% %" PRIu64 "] = %s%s%s;\n", sc->inoutID, sc->outputBufferBlockSize, outputsStruct, sc->inoutID, sc->outputBufferBlockSize, convTypeLeft, sc->regIDs[0], convTypeRight);
12465 res = VkAppendLine(sc);
12466 if (res != VKFFT_SUCCESS) return res;
12467 }
12468 else {
12469 sc->tempLen = sprintf(sc->tempStr, "if ( (combinedID / %" PRIu64 ") %% 2 == 0){\n", sc->fftDim / 2 + 1);
12470 res = VkAppendLine(sc);
12471 if (res != VKFFT_SUCCESS) return res;
12472 sc->tempLen = sprintf(sc->tempStr, " %s.x = 0.5%s*(sdata[(combinedID %% %" PRIu64 ") + (combinedID / %" PRIu64 ") * sharedStride].x+sdata[(%" PRIu64 "-combinedID %% %" PRIu64 ") + (combinedID / %" PRIu64 ") * sharedStride].x);\n", sc->regIDs[0], LFending, sc->fftDim / 2 + 1, 2 * (sc->fftDim / 2 + 1), sc->fftDim, sc->fftDim / 2 + 1, 2 * (sc->fftDim / 2 + 1));
12473 res = VkAppendLine(sc);
12474 if (res != VKFFT_SUCCESS) return res;
12475 sc->tempLen = sprintf(sc->tempStr, " %s.y = 0.5%s*(sdata[(combinedID %% %" PRIu64 ") + (combinedID / %" PRIu64 ") * sharedStride].y-sdata[(%" PRIu64 "-combinedID %% %" PRIu64 ") + (combinedID / %" PRIu64 ") * sharedStride].y);\n", sc->regIDs[0], LFending, sc->fftDim / 2 + 1, 2 * (sc->fftDim / 2 + 1), sc->fftDim, sc->fftDim / 2 + 1, 2 * (sc->fftDim / 2 + 1));
12476 res = VkAppendLine(sc);
12477 if (res != VKFFT_SUCCESS) return res;
12478 sc->tempLen = sprintf(sc->tempStr, "}else{\n");
12479 res = VkAppendLine(sc);
12480 if (res != VKFFT_SUCCESS) return res;
12481 sc->tempLen = sprintf(sc->tempStr, " %s.x = 0.5%s*(sdata[(combinedID %% %" PRIu64 ") + (combinedID / %" PRIu64 ") * sharedStride].y+sdata[(%" PRIu64 "-combinedID %% %" PRIu64 ") + (combinedID / %" PRIu64 ") * sharedStride].y);\n", sc->regIDs[0], LFending, sc->fftDim / 2 + 1, 2 * (sc->fftDim / 2 + 1), sc->fftDim, sc->fftDim / 2 + 1, 2 * (sc->fftDim / 2 + 1));
12482 res = VkAppendLine(sc);
12483 if (res != VKFFT_SUCCESS) return res;
12484 sc->tempLen = sprintf(sc->tempStr, " %s.y = 0.5%s*(-sdata[(combinedID %% %" PRIu64 ") + (combinedID / %" PRIu64 ") * sharedStride].x+sdata[(%" PRIu64 "-combinedID %% %" PRIu64 ") + (combinedID / %" PRIu64 ") * sharedStride].x);\n", sc->regIDs[0], LFending, sc->fftDim / 2 + 1, 2 * (sc->fftDim / 2 + 1), sc->fftDim, sc->fftDim / 2 + 1, 2 * (sc->fftDim / 2 + 1));
12485 res = VkAppendLine(sc);
12486 if (res != VKFFT_SUCCESS) return res;
12487 sc->tempLen = sprintf(sc->tempStr, "}\n");
12488 res = VkAppendLine(sc);
12489 if (res != VKFFT_SUCCESS) return res;
12490 if (sc->outputBufferBlockNum == 1)
12491 sc->tempLen = sprintf(sc->tempStr, " %s[%s] = %s%s%s;\n", outputsStruct, sc->inoutID, convTypeLeft, sc->regIDs[0], convTypeRight);
12492 else
12493 sc->tempLen = sprintf(sc->tempStr, " outputBlocks[%s / %" PRIu64 "]%s[%s %% %" PRIu64 "] = %s%s%s;\n", sc->inoutID, sc->outputBufferBlockSize, outputsStruct, sc->inoutID, sc->outputBufferBlockSize, convTypeLeft, sc->regIDs[0], convTypeRight);
12494 res = VkAppendLine(sc);
12495 if (res != VKFFT_SUCCESS) return res;
12496 }
12497 }
12498 else {
12499 if (!sc->axisSwapped) {
12500 if (sc->outputBufferBlockNum == 1)
12501 sc->tempLen = sprintf(sc->tempStr, " %s[inoutID] = %ssdata[(combinedID %% %" PRIu64 ") + (combinedID / %" PRIu64 ") * sharedStride]%s;\n", outputsStruct, convTypeLeft, sc->fftDim / 2 + 1, sc->fftDim / 2 + 1, convTypeRight);
12502 else
12503 sc->tempLen = sprintf(sc->tempStr, " outputBlocks[inoutID / %" PRIu64 "]%s[inoutID %% %" PRIu64 "] = %ssdata[(combinedID %% %" PRIu64 ") + (combinedID / %" PRIu64 ") * sharedStride]%s;\n", sc->outputBufferBlockSize, outputsStruct, sc->outputBufferBlockSize, convTypeLeft, sc->fftDim / 2 + 1, sc->fftDim / 2 + 1, convTypeRight);
12504 res = VkAppendLine(sc);
12505 if (res != VKFFT_SUCCESS) return res;
12506 }
12507 else {
12508 if (sc->outputBufferBlockNum == 1)
12509 sc->tempLen = sprintf(sc->tempStr, " %s[inoutID] = %ssdata[(combinedID %% %" PRIu64 ") * sharedStride + (combinedID / %" PRIu64 ")]%s;\n", outputsStruct, convTypeLeft, sc->fftDim / 2 + 1, sc->fftDim / 2 + 1, convTypeRight);
12510 else
12511 sc->tempLen = sprintf(sc->tempStr, " outputBlocks[inoutID / %" PRIu64 "]%s[inoutID %% %" PRIu64 "] = %ssdata[(combinedID %% %" PRIu64 ") * sharedStride + (combinedID / %" PRIu64 ")]%s;\n", sc->outputBufferBlockSize, outputsStruct, sc->outputBufferBlockSize, convTypeLeft, sc->fftDim / 2 + 1, sc->fftDim / 2 + 1, convTypeRight);
12512 res = VkAppendLine(sc);
12513 if (res != VKFFT_SUCCESS) return res;
12514 }
12515 }
12516 }
12518 if (res != VKFFT_SUCCESS) return res;
12519 if (sc->zeropad[1]) {
12520 sc->tempLen = sprintf(sc->tempStr, " }\n");
12521 res = VkAppendLine(sc);
12522 if (res != VKFFT_SUCCESS) return res;
12523 }
12524 if (sc->axisSwapped) {
12525 if ((1 + i + k * num_out) * sc->localSize[0] * sc->localSize[1] >= mult * (sc->fftDim / 2 + 1) * sc->localSize[0]) {
12526 sc->tempLen = sprintf(sc->tempStr, " }\n");
12527 res = VkAppendLine(sc);
12528 if (res != VKFFT_SUCCESS) return res;
12529 }
12530 }
12531 else {
12532 if ((1 + i + k * num_out) * sc->localSize[0] * sc->localSize[1] >= mult * (sc->fftDim / 2 + 1) * sc->localSize[1])
12533 {
12534 sc->tempLen = sprintf(sc->tempStr, " }\n");
12535 res = VkAppendLine(sc);
12536 if (res != VKFFT_SUCCESS) return res;
12537 }
12538 }
12539 if (sc->axisSwapped) {
12540 if ((uint64_t)ceil(sc->size[1] / (double)mult) % sc->localSize[0] != 0) {
12541 sc->tempLen = sprintf(sc->tempStr, " }\n");
12542 res = VkAppendLine(sc);
12543 if (res != VKFFT_SUCCESS) return res;
12544 }
12545 }
12546 else {
12547 if ((uint64_t)ceil(sc->size[1] / (double)mult) % sc->localSize[1] != 0) {
12548 sc->tempLen = sprintf(sc->tempStr, " }\n");
12549 res = VkAppendLine(sc);
12550 if (res != VKFFT_SUCCESS) return res;
12551 }
12552 }
12553 }
12554 }
12555 if (sc->zeropadBluestein[1]) sc->fftDim = sc->fft_dim_full;
12556 }
12557 else {
12558
12559 }
12560 /*sc->tempLen = sprintf(sc->tempStr, "\
12561if (%s==%" PRIu64 ") \n\
12562{\n", sc->gl_LocalInvocationID_x, sc->localSize[0] - 1);
12563 sc->tempLen = sprintf(sc->tempStr, " %s = ", sc->inoutID);
12564 sprintf(index_x, "%" PRIu64 "", sc->fftDim / 2);
12565 sprintf(index_y, "%s%s", sc->gl_GlobalInvocationID_y, shiftY);
12566 indexInputVkFFT(sc, uintType, writeType, index_x, index_y, requestCoordinate, requestBatch);
12567 sc->tempLen = sprintf(sc->tempStr, ";\n");
12568 //sc->tempLen = sprintf(sc->tempStr, " inoutID = indexInput(2 * (%s%s), %" PRIu64 ");\n", sc->gl_GlobalInvocationID_y, shiftY, sc->inputStride[2] / (sc->inputStride[1] + 2));
12569 if (sc->outputBufferBlockNum == 1)
12570 sc->tempLen = sprintf(sc->tempStr, " %s[inoutID]=%ssdata[(%" PRIu64 " + %s * sharedStride)]%s;\n", outputsStruct, convTypeLeft,sc->fftDim / 2, sc->gl_LocalInvocationID_y, convTypeRight);
12571 else
12572 sc->tempLen = sprintf(sc->tempStr, " outputBlocks[inoutID / %" PRIu64 "]%s[inoutID %% %" PRIu64 "]=%ssdata[(%" PRIu64 " + %s * sharedStride)]%s;\n", sc->outputBufferBlockSize, outputsStruct, sc->outputBufferBlockSize, convTypeLeft, sc->fftDim / 2, sc->gl_LocalInvocationID_y, convTypeRight);
12573
12574 VkAppendLine(sc, " }\n");*/
12575 }
12576 break;
12577 }
12578 case 6: //single_c2r
12579 {
12580 char shiftY[500] = "";
12581 if (sc->performWorkGroupShift[1])
12582 sprintf(shiftY, " + consts.workGroupShiftY * %" PRIu64 "", sc->localSize[1]);
12583
12584 if (!sc->writeFromRegisters) {
12585 res = appendBarrierVkFFT(sc, 1);
12586 if (res != VKFFT_SUCCESS) return res;
12587 }
12588 uint64_t mult = (sc->mergeSequencesR2C) ? 2 : 1;
12589 //res = appendZeropadStart(sc);
12590 //if (res != VKFFT_SUCCESS) return res;
12591 if (sc->reorderFourStep) {
12592 //Not implemented
12593 }
12594 else {
12595 if (sc->fftDim == sc->fft_dim_full) {
12596 for (uint64_t k = 0; k < sc->registerBoost; k++) {
12597 for (uint64_t i = 0; i < sc->min_registers_per_thread; i++) {
12598 if (sc->localSize[1] == 1)
12599 sc->tempLen = sprintf(sc->tempStr, " combinedID = %s + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, (i + k * sc->min_registers_per_thread) * sc->localSize[0]);
12600 else
12601 sc->tempLen = sprintf(sc->tempStr, " combinedID = (%s + %" PRIu64 " * %s) + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, sc->localSize[0], sc->gl_LocalInvocationID_y, (i + k * sc->min_registers_per_thread) * sc->localSize[0] * sc->localSize[1]);
12602 res = VkAppendLine(sc);
12603 if (res != VKFFT_SUCCESS) return res;
12604
12605 if (sc->outputStride[0] > 1)
12606 sc->tempLen = sprintf(sc->tempStr, " inoutID = (combinedID %% %" PRIu64 ") * %" PRIu64 " + (combinedID / %" PRIu64 ") * %" PRIu64 ";\n", sc->fftDim, sc->outputStride[0], sc->fftDim, mult * sc->outputStride[1]);
12607 else
12608 sc->tempLen = sprintf(sc->tempStr, " inoutID = (combinedID %% %" PRIu64 ") + (combinedID / %" PRIu64 ") * %" PRIu64 ";\n", sc->fftDim, sc->fftDim, mult * sc->outputStride[1]);
12609 res = VkAppendLine(sc);
12610 if (res != VKFFT_SUCCESS) return res;
12611
12612 if (sc->axisSwapped) {
12613 if ((uint64_t)ceil(sc->size[1] / (double)mult) % sc->localSize[0] != 0) {
12614 sc->tempLen = sprintf(sc->tempStr, " if(combinedID / %" PRIu64 " + (%s%s)*%" PRIu64 "< %" PRIu64 "){", sc->fftDim, sc->gl_WorkGroupID_y, shiftY, sc->localSize[0], (uint64_t)ceil(sc->size[1] / (double)mult));
12615 res = VkAppendLine(sc);
12616 if (res != VKFFT_SUCCESS) return res;
12617 }
12618 }
12619 else {
12620 if ((uint64_t)ceil(sc->size[1] / (double)mult) % sc->localSize[1] != 0) {
12621 sc->tempLen = sprintf(sc->tempStr, " if(combinedID / %" PRIu64 " + (%s%s)*%" PRIu64 "< %" PRIu64 "){", sc->fftDim, sc->gl_WorkGroupID_y, shiftY, sc->localSize[1], (uint64_t)ceil(sc->size[1] / (double)mult));
12622 res = VkAppendLine(sc);
12623 if (res != VKFFT_SUCCESS) return res;
12624 }
12625 }
12626 if (sc->zeropadBluestein[1]) {
12627 sc->tempLen = sprintf(sc->tempStr, " if((combinedID %% %" PRIu64 ") < %" PRIu64 "){\n", sc->fft_dim_full, sc->fft_zeropad_Bluestein_left_write[sc->axis_id]);
12628 res = VkAppendLine(sc);
12629 if (res != VKFFT_SUCCESS) return res;
12630 }
12631 if (sc->zeropad[1]) {
12632 sc->tempLen = sprintf(sc->tempStr, " if((inoutID %% %" PRIu64 " < %" PRIu64 ")||(inoutID %% %" PRIu64 " >= %" PRIu64 ")){\n", sc->outputStride[1], sc->fft_zeropad_left_write[sc->axis_id], sc->outputStride[1], sc->fft_zeropad_right_write[sc->axis_id]);
12633 res = VkAppendLine(sc);
12634 if (res != VKFFT_SUCCESS) return res;
12635 }
12637 if (res != VKFFT_SUCCESS) return res;
12638 sc->tempLen = sprintf(sc->tempStr, " %s = ", sc->inoutID);
12639 res = VkAppendLine(sc);
12640 if (res != VKFFT_SUCCESS) return res;
12641 res = indexOutputVkFFT(sc, uintType, writeType, sc->inoutID, 0, requestCoordinate, requestBatch);
12642 if (res != VKFFT_SUCCESS) return res;
12643 sc->tempLen = sprintf(sc->tempStr, ";\n");
12644 res = VkAppendLine(sc);
12645 if (res != VKFFT_SUCCESS) return res;
12646
12647 if (sc->writeFromRegisters) {
12648 if (sc->outputBufferBlockNum == 1)
12649 sc->tempLen = sprintf(sc->tempStr, " %s[%s] = %s%s.x%s;\n", outputsStruct, sc->inoutID, convTypeLeft, sc->regIDs[i], convTypeRight);
12650 else
12651 sc->tempLen = sprintf(sc->tempStr, " outputBlocks[%s / %" PRIu64 "]%s[%s %% %" PRIu64 "] = %s%s.x%s;\n", sc->inoutID, sc->outputBufferBlockSize, outputsStruct, sc->inoutID, sc->outputBufferBlockSize, convTypeLeft, sc->regIDs[i], convTypeRight);
12652 res = VkAppendLine(sc);
12653 if (res != VKFFT_SUCCESS) return res;
12654 if (sc->mergeSequencesR2C) {
12655 sc->tempLen = sprintf(sc->tempStr, " %s = %s + %" PRIu64 ";", sc->inoutID, sc->inoutID, sc->outputStride[1]);
12656 res = VkAppendLine(sc);
12657 if (res != VKFFT_SUCCESS) return res;
12658
12659 if (sc->outputBufferBlockNum == 1)
12660 sc->tempLen = sprintf(sc->tempStr, " %s[%s] = %s%s.y%s;\n", outputsStruct, sc->inoutID, convTypeLeft, sc->regIDs[i], convTypeRight);
12661 else
12662 sc->tempLen = sprintf(sc->tempStr, " outputBlocks[%s / %" PRIu64 "]%s[%s %% %" PRIu64 "] = %s%s.y%s;\n", sc->inoutID, sc->outputBufferBlockSize, outputsStruct, sc->inoutID, sc->outputBufferBlockSize, convTypeLeft, sc->regIDs[i], convTypeRight);
12663 res = VkAppendLine(sc);
12664 if (res != VKFFT_SUCCESS) return res;
12665 }
12666 }
12667 else {
12668 if (sc->axisSwapped) {
12669 if (sc->outputBufferBlockNum == 1)
12670 sc->tempLen = sprintf(sc->tempStr, " %s[%s] = %ssdata[(combinedID %% %" PRIu64 ") * sharedStride + (combinedID / %" PRIu64 ")].x%s;\n", outputsStruct, sc->inoutID, convTypeLeft, sc->fftDim, sc->fftDim, convTypeRight);
12671 else
12672 sc->tempLen = sprintf(sc->tempStr, " outputBlocks[%s / %" PRIu64 "]%s[%s %% %" PRIu64 "] = %ssdata[(combinedID %% %" PRIu64 ") * sharedStride + (combinedID / %" PRIu64 ")].x%s;\n", sc->inoutID, sc->outputBufferBlockSize, outputsStruct, sc->inoutID, sc->outputBufferBlockSize, convTypeLeft, sc->fftDim, sc->fftDim, convTypeRight);
12673 res = VkAppendLine(sc);
12674 if (res != VKFFT_SUCCESS) return res;
12675 if (sc->mergeSequencesR2C) {
12676 sc->tempLen = sprintf(sc->tempStr, " %s = %s + %" PRIu64 ";", sc->inoutID, sc->inoutID, sc->outputStride[1]);
12677 res = VkAppendLine(sc);
12678 if (res != VKFFT_SUCCESS) return res;
12679 if (sc->outputBufferBlockNum == 1)
12680 sc->tempLen = sprintf(sc->tempStr, " %s[%s] = %ssdata[(combinedID %% %" PRIu64 ")* sharedStride + (combinedID / %" PRIu64 ")].y%s;\n", outputsStruct, sc->inoutID, convTypeLeft, sc->fftDim, sc->fftDim, convTypeRight);
12681 else
12682 sc->tempLen = sprintf(sc->tempStr, " outputBlocks[%s / %" PRIu64 "]%s[%s %% %" PRIu64 "] = %ssdata[(combinedID %% %" PRIu64 ") * sharedStride+ (combinedID / %" PRIu64 ")].y%s;\n", sc->inoutID, sc->outputBufferBlockSize, outputsStruct, sc->inoutID, sc->outputBufferBlockSize, convTypeLeft, sc->fftDim, sc->fftDim, convTypeRight);
12683 res = VkAppendLine(sc);
12684 if (res != VKFFT_SUCCESS) return res;
12685 }
12686 }
12687 else {
12688 if (sc->outputBufferBlockNum == 1)
12689 sc->tempLen = sprintf(sc->tempStr, " %s[%s] = %ssdata[(combinedID %% %" PRIu64 ") + (combinedID / %" PRIu64 ") * sharedStride].x%s;\n", outputsStruct, sc->inoutID, convTypeLeft, sc->fftDim, sc->fftDim, convTypeRight);
12690 else
12691 sc->tempLen = sprintf(sc->tempStr, " outputBlocks[%s / %" PRIu64 "]%s[%s %% %" PRIu64 "] = %ssdata[(combinedID %% %" PRIu64 ") + (combinedID / %" PRIu64 ") * sharedStride].x%s;\n", sc->inoutID, sc->outputBufferBlockSize, outputsStruct, sc->inoutID, sc->outputBufferBlockSize, convTypeLeft, sc->fftDim, sc->fftDim, convTypeRight);
12692 res = VkAppendLine(sc);
12693 if (res != VKFFT_SUCCESS) return res;
12694 if (sc->mergeSequencesR2C) {
12695 sc->tempLen = sprintf(sc->tempStr, " %s = %s + %" PRIu64 ";", sc->inoutID, sc->inoutID, sc->outputStride[1]);
12696 res = VkAppendLine(sc);
12697 if (res != VKFFT_SUCCESS) return res;
12698 if (sc->outputBufferBlockNum == 1)
12699 sc->tempLen = sprintf(sc->tempStr, " %s[%s] = %ssdata[(combinedID %% %" PRIu64 ") + (combinedID / %" PRIu64 ") * sharedStride].y%s;\n", outputsStruct, sc->inoutID, convTypeLeft, sc->fftDim, sc->fftDim, convTypeRight);
12700 else
12701 sc->tempLen = sprintf(sc->tempStr, " outputBlocks[%s / %" PRIu64 "]%s[%s %% %" PRIu64 "] = %ssdata[(combinedID %% %" PRIu64 ") + (combinedID / %" PRIu64 ") * sharedStride].y%s;\n", sc->inoutID, sc->outputBufferBlockSize, outputsStruct, sc->inoutID, sc->outputBufferBlockSize, convTypeLeft, sc->fftDim, sc->fftDim, convTypeRight);
12702 res = VkAppendLine(sc);
12703 if (res != VKFFT_SUCCESS) return res;
12704 }
12705 }
12706 }
12708 if (res != VKFFT_SUCCESS) return res;
12709 if (sc->zeropad[1]) {
12710 sc->tempLen = sprintf(sc->tempStr, " }\n");
12711 res = VkAppendLine(sc);
12712 if (res != VKFFT_SUCCESS) return res;
12713 }
12714 if (sc->zeropadBluestein[1]) {
12715 sc->tempLen = sprintf(sc->tempStr, " }\n");
12716 res = VkAppendLine(sc);
12717 if (res != VKFFT_SUCCESS) return res;
12718 }
12719 if (sc->axisSwapped) {
12720 if ((uint64_t)ceil(sc->size[1] / (double)mult) % sc->localSize[0] != 0) {
12721 sc->tempLen = sprintf(sc->tempStr, " }");
12722 res = VkAppendLine(sc);
12723 if (res != VKFFT_SUCCESS) return res;
12724 }
12725 }
12726 else {
12727 if ((uint64_t)ceil(sc->size[1] / (double)mult) % sc->localSize[1] != 0) {
12728 sc->tempLen = sprintf(sc->tempStr, " }");
12729 res = VkAppendLine(sc);
12730 if (res != VKFFT_SUCCESS) return res;
12731 }
12732 }
12733 }
12734 }
12735 }
12736 else {
12737
12738 }
12739 }
12740
12741 break;
12742 }
12743 case 110://DCT-I nonstrided
12744 {
12745 if (!sc->writeFromRegisters) {
12746 res = appendBarrierVkFFT(sc, 1);
12747 if (res != VKFFT_SUCCESS) return res;
12748 }
12749 //res = appendZeropadStart(sc);
12750 //if (res != VKFFT_SUCCESS) return res;
12751 char shiftX[500] = "";
12752 if (sc->performWorkGroupShift[0])
12753 sprintf(shiftX, " + consts.workGroupShiftX ");
12754 char shiftY[500] = "";
12755 if (sc->performWorkGroupShift[1])
12756 sprintf(shiftY, " + consts.workGroupShiftY*%s ", sc->gl_WorkGroupSize_y);
12757 char shiftY2[500] = "";
12758 if (sc->performWorkGroupShift[1])
12759 sprintf(shiftY2, " + consts.workGroupShiftY ");
12760 uint64_t mult = (sc->mergeSequencesR2C) ? 2 : 1;
12761 if (sc->reorderFourStep) {
12762 //Not implemented
12763 }
12764 else {
12765 //appendBarrierVkFFT(sc, 1);
12766 //appendZeropadStart(sc);
12767 if (sc->fftDim == sc->fft_dim_full) {
12769 sc->fftDim = (sc->fftDim + 2) / 2;
12770 for (uint64_t k = 0; k < sc->registerBoost; k++) {
12771 if (sc->mergeSequencesR2C) {
12772 if (sc->axisSwapped) {
12773 sc->tempLen = sprintf(sc->tempStr, "\
12774 if (%s==0)\n\
12775 {\n\
12776 sdata[%s + %" PRIu64 "* sharedStride] = sdata[%s];\n\
12778 res = VkAppendLine(sc);
12779 if (res != VKFFT_SUCCESS) return res;
12780 //res = appendZeropadEnd(sc);
12781 //if (res != VKFFT_SUCCESS) return res;
12782 res = appendBarrierVkFFT(sc, 1);
12783 if (res != VKFFT_SUCCESS) return res;
12784 //res = appendZeropadStart(sc);
12785 //if (res != VKFFT_SUCCESS) return res;
12786 }
12787 else {
12788 sc->tempLen = sprintf(sc->tempStr, "\
12789 if (%s==0)\n\
12790 {\n\
12791 sdata[%s * sharedStride + %" PRIu64 "] = sdata[%s * sharedStride];\n\
12793 res = VkAppendLine(sc);
12794 if (res != VKFFT_SUCCESS) return res;
12795 //res = appendZeropadEnd(sc);
12796 //if (res != VKFFT_SUCCESS) return res;
12797 res = appendBarrierVkFFT(sc, 1);
12798 if (res != VKFFT_SUCCESS) return res;
12799 //res = appendZeropadStart(sc);
12800 //if (res != VKFFT_SUCCESS) return res;
12801 }
12802 }
12803 uint64_t num_out = (sc->axisSwapped) ? (uint64_t)ceil((sc->fftDim) / (double)sc->localSize[1]) : (uint64_t)ceil((sc->fftDim) / (double)sc->localSize[0]);
12804 //num_out = (uint64_t)ceil(num_out / (double)sc->min_registers_per_thread);
12805 for (uint64_t i = 0; i < num_out; i++) {
12806 if (sc->localSize[1] == 1)
12807 sc->tempLen = sprintf(sc->tempStr, " combinedID = %s + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, (i + k * num_out) * sc->localSize[0]);
12808 else
12809 sc->tempLen = sprintf(sc->tempStr, " combinedID = (%s + %" PRIu64 " * %s) + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, sc->localSize[0], sc->gl_LocalInvocationID_y, (i + k * num_out) * sc->localSize[0] * sc->localSize[1]);
12810 res = VkAppendLine(sc);
12811 if (res != VKFFT_SUCCESS) return res;
12812
12813 if (!sc->axisSwapped) {
12814 sc->tempLen = sprintf(sc->tempStr, " %s = combinedID %% %" PRIu64 " + ((combinedID/%" PRIu64 ") * %" PRIu64 ");\n", sc->inoutID, sc->fftDim, sc->fftDim, mult * sc->outputStride[1]);
12815 res = VkAppendLine(sc);
12816 if (res != VKFFT_SUCCESS) return res;
12817 }
12818 else {
12819 sc->tempLen = sprintf(sc->tempStr, " %s = combinedID %% %" PRIu64 " + ((combinedID/%" PRIu64 ") * %" PRIu64 ");\n", sc->inoutID, sc->fftDim, sc->fftDim, mult * sc->outputStride[1]);
12820 res = VkAppendLine(sc);
12821 if (res != VKFFT_SUCCESS) return res;
12822 }
12823 if (sc->axisSwapped) {
12824 if (sc->size[sc->axis_id + 1] % sc->localSize[0] != 0) {
12825 sc->tempLen = sprintf(sc->tempStr, " if(combinedID / %" PRIu64 " + %s*%" PRIu64 "< %" PRIu64 "){\n", (sc->fftDim), sc->gl_WorkGroupID_y, sc->localSize[0], (uint64_t)ceil(sc->size[1] / (double)mult));
12826 res = VkAppendLine(sc);
12827 if (res != VKFFT_SUCCESS) return res;
12828 }
12829 if ((1 + i + k * num_out) * sc->localSize[0] * sc->localSize[1] >= (sc->fftDim) * sc->localSize[0]) {
12830 sc->tempLen = sprintf(sc->tempStr, " if(combinedID < %" PRIu64 "){\n", (sc->fftDim) * sc->localSize[0]);
12831 res = VkAppendLine(sc);
12832 if (res != VKFFT_SUCCESS) return res;
12833 }
12834 }
12835 else {
12836 if (sc->size[sc->axis_id + 1] % sc->localSize[1] != 0) {
12837 sc->tempLen = sprintf(sc->tempStr, " if(combinedID / %" PRIu64 " + %s*%" PRIu64 "< %" PRIu64 "){\n", (sc->fftDim), sc->gl_WorkGroupID_y, sc->localSize[1], (uint64_t)ceil(sc->size[1] / (double)mult));
12838 res = VkAppendLine(sc);
12839 if (res != VKFFT_SUCCESS) return res;
12840 }
12841 if ((1 + i + k * num_out) * sc->localSize[0] * sc->localSize[1] >= (sc->fftDim) * sc->localSize[1]) {
12842 sc->tempLen = sprintf(sc->tempStr, " if(combinedID < %" PRIu64 "){\n", (sc->fftDim) * sc->localSize[1]);
12843 res = VkAppendLine(sc);
12844 if (res != VKFFT_SUCCESS) return res;
12845 }
12846 }
12847 if (sc->zeropad[1]) {
12848 sc->tempLen = sprintf(sc->tempStr, " if((inoutID %% %" PRIu64 " < %" PRIu64 ")||(inoutID %% %" PRIu64 " >= %" PRIu64 ")){\n", sc->outputStride[1], sc->fft_zeropad_left_write[sc->axis_id], sc->outputStride[1], sc->fft_zeropad_right_write[sc->axis_id]);
12849 res = VkAppendLine(sc);
12850 if (res != VKFFT_SUCCESS) return res;
12851 }
12852 sc->tempLen = sprintf(sc->tempStr, " %s = ", sc->inoutID);
12853 res = VkAppendLine(sc);
12854 if (res != VKFFT_SUCCESS) return res;
12855 res = indexOutputVkFFT(sc, uintType, writeType, sc->inoutID, 0, requestCoordinate, requestBatch);
12856 sc->tempLen = sprintf(sc->tempStr, ";\n");
12857 res = VkAppendLine(sc);
12858 if (res != VKFFT_SUCCESS) return res;
12860 if (res != VKFFT_SUCCESS) return res;
12861 if (sc->mergeSequencesR2C) {
12862 if (sc->axisSwapped) {
12863
12864 sc->tempLen = sprintf(sc->tempStr, " %s = (sdata[(combinedID %% %" PRIu64 ")* sharedStride + (combinedID / %" PRIu64 ")]);\n", sc->regIDs[0], sc->fftDim, sc->fftDim);
12865 res = VkAppendLine(sc);
12866 if (res != VKFFT_SUCCESS) return res;
12867 if (sc->outputBufferBlockNum == 1)
12868 sc->tempLen = sprintf(sc->tempStr, " %s[%s] = %s(%s.x)%s;\n", outputsStruct, sc->inoutID, convTypeLeft, sc->regIDs[0], convTypeRight);
12869 else
12870 sc->tempLen = sprintf(sc->tempStr, " outputBlocks[%s / %" PRIu64 "]%s[%s %% %" PRIu64 "] = %s(%s.x)%s;\n", sc->inoutID, sc->outputBufferBlockSize, outputsStruct, sc->inoutID, sc->outputBufferBlockSize, convTypeLeft, sc->regIDs[0], convTypeRight);
12871 res = VkAppendLine(sc);
12872 if (res != VKFFT_SUCCESS) return res;
12873
12874 if (sc->outputBufferBlockNum == 1)
12875 sc->tempLen = sprintf(sc->tempStr, " %s[%s+%" PRIu64 "] = %s(%s.y)%s;\n", outputsStruct, sc->inoutID, sc->outputStride[1], convTypeLeft, sc->regIDs[0], convTypeRight);
12876 else
12877 sc->tempLen = sprintf(sc->tempStr, " outputBlocks[(%s %" PRIu64 ")/ %" PRIu64 "]%s[(%s+%" PRIu64 ") %% %" PRIu64 "] = %s(%s.y)%s;\n", sc->inoutID, sc->outputStride[1], sc->outputBufferBlockSize, outputsStruct, sc->inoutID, sc->outputStride[1], sc->outputBufferBlockSize, convTypeLeft, sc->regIDs[0], convTypeRight);
12878 res = VkAppendLine(sc);
12879 if (res != VKFFT_SUCCESS) return res;
12880 if (sc->zeropad[1]) {
12881 sc->tempLen = sprintf(sc->tempStr, " }\n");
12882 res = VkAppendLine(sc);
12883 if (res != VKFFT_SUCCESS) return res;
12884 }
12885 }
12886 else {
12887 sc->tempLen = sprintf(sc->tempStr, " %s = (sdata[(combinedID %% %" PRIu64 ") + (combinedID / %" PRIu64 ") * sharedStride]);\n", sc->regIDs[0], sc->fftDim, sc->fftDim);
12888 res = VkAppendLine(sc);
12889 if (res != VKFFT_SUCCESS) return res;
12890 if (sc->outputBufferBlockNum == 1)
12891 sc->tempLen = sprintf(sc->tempStr, " %s[%s] = %s(%s.x)%s;\n", outputsStruct, sc->inoutID, convTypeLeft, sc->regIDs[0], convTypeRight);
12892 else
12893 sc->tempLen = sprintf(sc->tempStr, " outputBlocks[%s / %" PRIu64 "]%s[%s %% %" PRIu64 "] = %s(%s.x)%s;\n", sc->inoutID, sc->outputBufferBlockSize, outputsStruct, sc->inoutID, sc->outputBufferBlockSize, convTypeLeft, sc->regIDs[0], convTypeRight);
12894 res = VkAppendLine(sc);
12895 if (res != VKFFT_SUCCESS) return res;
12896 if (sc->outputBufferBlockNum == 1)
12897 sc->tempLen = sprintf(sc->tempStr, " %s[%s+%" PRIu64 "] = %s(%s.y)%s;\n", outputsStruct, sc->inoutID, sc->outputStride[1], convTypeLeft, sc->regIDs[0], convTypeRight);
12898 else
12899 sc->tempLen = sprintf(sc->tempStr, " outputBlocks[(%s %" PRIu64 ")/ %" PRIu64 "]%s[(%s+%" PRIu64 ") %% %" PRIu64 "] = %s(%s.y)%s;\n", sc->inoutID, sc->outputStride[1], sc->outputBufferBlockSize, outputsStruct, sc->inoutID, sc->outputStride[1], sc->outputBufferBlockSize, convTypeLeft, sc->regIDs[0], convTypeRight);
12900 res = VkAppendLine(sc);
12901 if (res != VKFFT_SUCCESS) return res;
12902 if (sc->zeropad[1]) {
12903 sc->tempLen = sprintf(sc->tempStr, " }\n");
12904 res = VkAppendLine(sc);
12905 if (res != VKFFT_SUCCESS) return res;
12906 }
12907 }
12908 }
12909 else {
12910 if (!sc->axisSwapped) {
12911 sc->tempLen = sprintf(sc->tempStr, " sdataID = (combinedID %% %" PRIu64 ") + (combinedID / %" PRIu64 ") * sharedStride;\n", sc->fftDim, sc->fftDim);
12912 res = VkAppendLine(sc);
12913 if (res != VKFFT_SUCCESS) return res;
12914 if (sc->outputBufferBlockNum == 1)
12915 sc->tempLen = sprintf(sc->tempStr, " %s[inoutID] = %s(sdata[sdataID].x)%s;\n", outputsStruct, convTypeLeft, convTypeRight);
12916 else
12917 sc->tempLen = sprintf(sc->tempStr, " outputBlocks[inoutID / %" PRIu64 "]%s[inoutID %% %" PRIu64 "] = %s(sdata[sdataID].x) %s;\n", sc->outputBufferBlockSize, outputsStruct, sc->outputBufferBlockSize, convTypeLeft, convTypeRight);
12918 res = VkAppendLine(sc);
12919 if (res != VKFFT_SUCCESS) return res;
12920 if (sc->zeropad[1]) {
12921 sc->tempLen = sprintf(sc->tempStr, " }\n");
12922 res = VkAppendLine(sc);
12923 if (res != VKFFT_SUCCESS) return res;
12924 }
12925 }
12926 else {
12927 sc->tempLen = sprintf(sc->tempStr, " sdataID = (combinedID %% %" PRIu64 ") * sharedStride + (combinedID / %" PRIu64 ");\n", sc->fftDim, sc->fftDim);
12928 res = VkAppendLine(sc);
12929 if (res != VKFFT_SUCCESS) return res;
12930 if (sc->outputBufferBlockNum == 1)
12931 sc->tempLen = sprintf(sc->tempStr, " %s[inoutID] = %s(sdata[sdataID].x)%s;\n", outputsStruct, convTypeLeft, convTypeRight);
12932 else
12933 sc->tempLen = sprintf(sc->tempStr, " outputBlocks[inoutID / %" PRIu64 "]%s[inoutID %% %" PRIu64 "] = %s(sdata[sdataID].x)%s;\n", sc->outputBufferBlockSize, outputsStruct, sc->outputBufferBlockSize, convTypeLeft, convTypeRight);
12934 res = VkAppendLine(sc);
12935 if (res != VKFFT_SUCCESS) return res;
12936 if (sc->zeropad[1]) {
12937 sc->tempLen = sprintf(sc->tempStr, " }\n");
12938 res = VkAppendLine(sc);
12939 if (res != VKFFT_SUCCESS) return res;
12940 }
12941 }
12942 }
12944 if (res != VKFFT_SUCCESS) return res;
12945
12946 if (sc->axisSwapped) {
12947 if ((1 + i + k * num_out) * sc->localSize[0] * sc->localSize[1] >= (sc->fftDim) * sc->localSize[0]) {
12948 sc->tempLen = sprintf(sc->tempStr, " }\n");
12949 res = VkAppendLine(sc);
12950 if (res != VKFFT_SUCCESS) return res;
12951 }
12952 }
12953 else {
12954 if ((1 + i + k * num_out) * sc->localSize[0] * sc->localSize[1] >= (sc->fftDim) * sc->localSize[1])
12955 {
12956 sc->tempLen = sprintf(sc->tempStr, " }\n");
12957 res = VkAppendLine(sc);
12958 if (res != VKFFT_SUCCESS) return res;
12959 }
12960 }
12961 if (sc->axisSwapped) {
12962 if (sc->size[sc->axis_id + 1] % sc->localSize[0] != 0) {
12963 sc->tempLen = sprintf(sc->tempStr, " }\n");
12964 res = VkAppendLine(sc);
12965 if (res != VKFFT_SUCCESS) return res;
12966 }
12967 }
12968 else {
12969 if (sc->size[sc->axis_id + 1] % sc->localSize[1] != 0) {
12970 sc->tempLen = sprintf(sc->tempStr, " }\n");
12971 res = VkAppendLine(sc);
12972 if (res != VKFFT_SUCCESS) return res;
12973 }
12974 }
12975 }
12976 }
12977 sc->fftDim = 2*sc->fftDim -2;
12978 if (sc->zeropadBluestein[1]) sc->fftDim = sc->fft_dim_full;
12979 }
12980 else {
12981
12982 }
12983 }
12984 break;
12985 }
12986 case 111://DCT-II strided
12987 {
12988 if (!sc->writeFromRegisters) {
12989 res = appendBarrierVkFFT(sc, 1);
12990 if (res != VKFFT_SUCCESS) return res;
12991 }
12992 //res = appendZeropadStart(sc);
12993 //if (res != VKFFT_SUCCESS) return res;
12994 char shiftX[500] = "";
12995 if (sc->performWorkGroupShift[0])
12996 sprintf(shiftX, " + consts.workGroupShiftX*%s ", sc->gl_WorkGroupSize_x);
12997 char shiftY[500] = "";
12998 if (sc->performWorkGroupShift[1])
12999 sprintf(shiftY, " + consts.workGroupShiftY*%s ", sc->gl_WorkGroupSize_y);
13000 char shiftY2[500] = "";
13001 if (sc->performWorkGroupShift[1])
13002 sprintf(shiftY2, " + consts.workGroupShiftY ");
13003 uint64_t mult = (sc->mergeSequencesR2C) ? 2 : 1;
13004 if (sc->reorderFourStep) {
13005 //Not implemented
13006 }
13007 else {
13008 //appendBarrierVkFFT(sc, 1);
13009 //appendZeropadStart(sc);
13010 if (sc->fftDim == sc->fft_dim_full) {
13012 sc->fftDim = (sc->fftDim + 2)/2;
13013 for (uint64_t k = 0; k < sc->registerBoost; k++) {
13014 if (sc->mergeSequencesR2C) {
13015 sc->tempLen = sprintf(sc->tempStr, "\
13016 if (%s==0)\n\
13017 {\n\
13018 sdata[%s + %" PRIu64 "* sharedStride] = sdata[%s];\n\
13020 res = VkAppendLine(sc);
13021 if (res != VKFFT_SUCCESS) return res;
13022 //res = appendZeropadEnd(sc);
13023 //if (res != VKFFT_SUCCESS) return res;
13024 res = appendBarrierVkFFT(sc, 1);
13025 if (res != VKFFT_SUCCESS) return res;
13026 //res = appendZeropadStart(sc);
13027 //if (res != VKFFT_SUCCESS) return res;
13028 }
13029 uint64_t num_out = (uint64_t)ceil(mult * (sc->fftDim) / (double)sc->localSize[1]);
13030 //num_out = (uint64_t)ceil(num_out / (double)sc->min_registers_per_thread);
13031 for (uint64_t i = 0; i < num_out; i++) {
13032 sc->tempLen = sprintf(sc->tempStr, " combinedID = (%s + %" PRIu64 " * %s) + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, sc->localSize[0], sc->gl_LocalInvocationID_y, (i + k * num_out) * sc->localSize[0] * sc->localSize[1]);
13033 res = VkAppendLine(sc);
13034 if (res != VKFFT_SUCCESS) return res;
13035
13036 sc->tempLen = sprintf(sc->tempStr, " %s = %s%s + ((combinedID/%" PRIu64 ") * %" PRIu64 ");\n", sc->inoutID, sc->gl_GlobalInvocationID_x, shiftX, sc->localSize[0], sc->outputStride[1]);
13037 res = VkAppendLine(sc);
13038 if (res != VKFFT_SUCCESS) return res;
13039 if (sc->size[0] % sc->localSize[0] != 0) {
13040 sc->tempLen = sprintf(sc->tempStr, " if(combinedID %% %" PRIu64 " + %s*%" PRIu64 "< %" PRIu64 "){\n", sc->localSize[0], sc->gl_WorkGroupID_x, sc->localSize[0], sc->size[0]);
13041 res = VkAppendLine(sc);
13042 if (res != VKFFT_SUCCESS) return res;
13043 }
13044 if ((1 + i + k * num_out) * sc->localSize[0] * sc->localSize[1] >= (sc->fftDim) * sc->localSize[0]) {
13045 sc->tempLen = sprintf(sc->tempStr, " if(combinedID < %" PRIu64 "){\n", (sc->fftDim) * sc->localSize[0]);
13046 res = VkAppendLine(sc);
13047 if (res != VKFFT_SUCCESS) return res;
13048 }
13049 if (sc->zeropad[1]) {
13050 sc->tempLen = sprintf(sc->tempStr, " if(((combinedID/%" PRIu64 ") %% %" PRIu64 " < %" PRIu64 ")||((combinedID/%" PRIu64 ") %% %" PRIu64 " >= %" PRIu64 ")){\n", sc->localSize[0], sc->fft_dim_full, sc->fft_zeropad_left_read[sc->axis_id], sc->localSize[0], sc->fft_dim_full, sc->fft_zeropad_right_read[sc->axis_id]);
13051 res = VkAppendLine(sc);
13052 if (res != VKFFT_SUCCESS) return res;
13053 }
13054 sc->tempLen = sprintf(sc->tempStr, " %s = ", sc->inoutID);
13055 res = VkAppendLine(sc);
13056 if (res != VKFFT_SUCCESS) return res;
13057 res = indexOutputVkFFT(sc, uintType, writeType, sc->inoutID, 0, requestCoordinate, requestBatch);
13058 sc->tempLen = sprintf(sc->tempStr, ";\n");
13059 res = VkAppendLine(sc);
13060 if (res != VKFFT_SUCCESS) return res;
13062 if (res != VKFFT_SUCCESS) return res;
13063
13064 if (sc->mergeSequencesR2C) {
13065 sc->tempLen = sprintf(sc->tempStr, " %s.x = 0.5%s*(sdata[(combinedID / %" PRIu64 ")* sharedStride + (combinedID %% %" PRIu64 ")].x+sdata[(%" PRIu64 "-combinedID / %" PRIu64 ")* sharedStride + (combinedID %% %" PRIu64 ")].x);\n", sc->regIDs[0], LFending, sc->localSize[0], sc->localSize[0], sc->fftDim, sc->localSize[0], sc->localSize[0]);
13066 res = VkAppendLine(sc);
13067 if (res != VKFFT_SUCCESS) return res;
13068 sc->tempLen = sprintf(sc->tempStr, " %s.y = 0.5%s*(sdata[(combinedID / %" PRIu64 ")* sharedStride + (combinedID %% %" PRIu64 ")].y-sdata[(%" PRIu64 "-combinedID / %" PRIu64 ")* sharedStride + (combinedID %% %" PRIu64 ")].y);\n", sc->regIDs[0], LFending, sc->localSize[0], sc->localSize[0], sc->fftDim, sc->localSize[0], sc->localSize[0]);
13069 res = VkAppendLine(sc);
13070 if (res != VKFFT_SUCCESS) return res;
13071 if (sc->outputBufferBlockNum == 1)
13072 sc->tempLen = sprintf(sc->tempStr, " %s[%s] = %s(%s.x*mult.x-%s.y*mult.y)%s;\n", outputsStruct, sc->inoutID, convTypeLeft, sc->regIDs[0], sc->regIDs[0], convTypeRight);
13073 else
13074 sc->tempLen = sprintf(sc->tempStr, " outputBlocks[%s / %" PRIu64 "]%s[%s %% %" PRIu64 "] = %s(%s.x*mult.x-%s.y*mult.y)%s;\n", sc->inoutID, sc->outputBufferBlockSize, outputsStruct, sc->inoutID, sc->outputBufferBlockSize, convTypeLeft, sc->regIDs[0], sc->regIDs[0], convTypeRight);
13075 res = VkAppendLine(sc);
13076 if (res != VKFFT_SUCCESS) return res;
13077
13078
13079 sc->tempLen = sprintf(sc->tempStr, " %s.x = 0.5%s*(sdata[(combinedID / %" PRIu64 ")* sharedStride + (combinedID %% %" PRIu64 ")].y+sdata[(%" PRIu64 "-combinedID / %" PRIu64 ")* sharedStride + (combinedID %% %" PRIu64 ")].y);\n", sc->regIDs[1], LFending, sc->localSize[0], sc->localSize[0], sc->fftDim, sc->localSize[0], sc->localSize[0]);
13080 res = VkAppendLine(sc);
13081 if (res != VKFFT_SUCCESS) return res;
13082 sc->tempLen = sprintf(sc->tempStr, " %s.y = 0.5%s*(-sdata[(combinedID / %" PRIu64 ")* sharedStride + (combinedID %% %" PRIu64 ")].x+sdata[(%" PRIu64 "-combinedID / %" PRIu64 ")* sharedStride + (combinedID %% %" PRIu64 ")].x);\n", sc->regIDs[1], LFending, sc->localSize[0], sc->localSize[0], sc->fftDim, sc->localSize[0], sc->localSize[0]);
13083 res = VkAppendLine(sc);
13084 if (res != VKFFT_SUCCESS) return res;
13085
13086 if (sc->outputBufferBlockNum == 1)
13087 sc->tempLen = sprintf(sc->tempStr, " %s[%s+%" PRIu64 "] = %s(%s.x*mult.x-%s.y*mult.y)%s;\n", outputsStruct, sc->inoutID, sc->outputStride[1], convTypeLeft, sc->regIDs[1], sc->regIDs[1], convTypeRight);
13088 else
13089 sc->tempLen = sprintf(sc->tempStr, " outputBlocks[(%s %" PRIu64 ")/ %" PRIu64 "]%s[(%s+%" PRIu64 ") %% %" PRIu64 "] = %s(%s.x*mult.x-%s.y*mult.y)%s;\n", sc->inoutID, sc->outputStride[1], sc->outputBufferBlockSize, outputsStruct, sc->inoutID, sc->outputStride[1], sc->outputBufferBlockSize, convTypeLeft, sc->regIDs[1], sc->regIDs[1], convTypeRight);
13090 res = VkAppendLine(sc);
13091 if (res != VKFFT_SUCCESS) return res;
13092 sc->tempLen = sprintf(sc->tempStr, " if(combinedID / %" PRIu64 " > 0){\n", sc->localSize[0]);
13093 res = VkAppendLine(sc);
13094 if (res != VKFFT_SUCCESS) return res;
13095 sc->tempLen = sprintf(sc->tempStr, " %s = (%" PRIu64 " - combinedID / %" PRIu64 ") + %s%s * %" PRIu64 ";\n", sc->inoutID, sc->fftDim, sc->localSize[0], sc->gl_GlobalInvocationID_x, shiftX, 2 * sc->outputStride[1]);
13096 res = VkAppendLine(sc);
13097 if (res != VKFFT_SUCCESS) return res;
13098
13099 if (sc->outputBufferBlockNum == 1)
13100 sc->tempLen = sprintf(sc->tempStr, " %s[%s] = -%s(%s.y*mult.x+%s.x*mult.y)%s;\n", outputsStruct, sc->inoutID, convTypeLeft, sc->regIDs[0], sc->regIDs[0], convTypeRight);
13101 else
13102 sc->tempLen = sprintf(sc->tempStr, " outputBlocks[%s / %" PRIu64 "]%s[%s %% %" PRIu64 "] = -%s(%s.y*mult.x+%s.x*mult.y)%s;\n", sc->inoutID, sc->outputBufferBlockSize, outputsStruct, sc->inoutID, sc->outputBufferBlockSize, convTypeLeft, sc->regIDs[0], sc->regIDs[0], convTypeRight);
13103 res = VkAppendLine(sc);
13104 if (res != VKFFT_SUCCESS) return res;
13105 if (sc->outputBufferBlockNum == 1)
13106 sc->tempLen = sprintf(sc->tempStr, " %s[%s+%" PRIu64 "] = -%s(%s.y*mult.x+%s.x*mult.y)%s;\n", outputsStruct, sc->inoutID, sc->outputStride[1], convTypeLeft, sc->regIDs[1], sc->regIDs[1], convTypeRight);
13107 else
13108 sc->tempLen = sprintf(sc->tempStr, " outputBlocks[(%s %" PRIu64 ")/ %" PRIu64 "]%s[(%s+%" PRIu64 ") %% %" PRIu64 "] = -%s(%s.y*mult.x+%s.x*mult.y)%s;\n", sc->inoutID, sc->outputStride[1], sc->outputBufferBlockSize, outputsStruct, sc->inoutID, sc->outputStride[1], sc->outputBufferBlockSize, convTypeLeft, sc->regIDs[1], sc->regIDs[1], convTypeRight);
13109 res = VkAppendLine(sc);
13110 if (res != VKFFT_SUCCESS) return res;
13111 sc->tempLen = sprintf(sc->tempStr, " }\n");
13112 res = VkAppendLine(sc);
13113 if (res != VKFFT_SUCCESS) return res;
13114 }
13115 else {
13116 sc->tempLen = sprintf(sc->tempStr, " sdataID = (combinedID %% %" PRIu64 ") + (combinedID / %" PRIu64 ") * sharedStride;\n", sc->localSize[0], sc->localSize[0]);
13117 res = VkAppendLine(sc);
13118 if (res != VKFFT_SUCCESS) return res;
13119 if (sc->outputBufferBlockNum == 1)
13120 sc->tempLen = sprintf(sc->tempStr, " %s[inoutID] = %s(sdata[sdataID].x)%s;\n", outputsStruct, convTypeLeft, convTypeRight);
13121 else
13122 sc->tempLen = sprintf(sc->tempStr, " outputBlocks[inoutID / %" PRIu64 "]%s[inoutID %% %" PRIu64 "] = %s(sdata[sdataID].x)%s;\n", sc->outputBufferBlockSize, outputsStruct, sc->outputBufferBlockSize, convTypeLeft, convTypeRight);
13123 res = VkAppendLine(sc);
13124 if (res != VKFFT_SUCCESS) return res;
13125 if (sc->zeropad[1]) {
13126 sc->tempLen = sprintf(sc->tempStr, " }\n");
13127 res = VkAppendLine(sc);
13128 if (res != VKFFT_SUCCESS) return res;
13129 }
13130 }
13132 if (res != VKFFT_SUCCESS) return res;
13133 if ((1 + i + k * num_out) * sc->localSize[0] * sc->localSize[1] >= (sc->fftDim) * sc->localSize[0]) {
13134 sc->tempLen = sprintf(sc->tempStr, " }\n");
13135 res = VkAppendLine(sc);
13136 if (res != VKFFT_SUCCESS) return res;
13137 }
13138 if (sc->size[0] % sc->localSize[0] != 0) {
13139 sc->tempLen = sprintf(sc->tempStr, " }\n");
13140 res = VkAppendLine(sc);
13141 if (res != VKFFT_SUCCESS) return res;
13142 }
13143 }
13144 }
13145 sc->fftDim = 2 * sc->fftDim - 2;
13146 if (sc->zeropadBluestein[1]) sc->fftDim = sc->fft_dim_full;
13147 }
13148 else {
13149
13150 }
13151 }
13152 break;
13153 }
13154 case 120://DCT-II nonstrided
13155 {
13156 if (!sc->writeFromRegisters) {
13157 res = appendBarrierVkFFT(sc, 1);
13158 if (res != VKFFT_SUCCESS) return res;
13159 }
13160 //res = appendZeropadStart(sc);
13161 //if (res != VKFFT_SUCCESS) return res;
13162 char shiftX[500] = "";
13163 if (sc->performWorkGroupShift[0])
13164 sprintf(shiftX, " + consts.workGroupShiftX ");
13165 char shiftY[500] = "";
13166 if (sc->performWorkGroupShift[1])
13167 sprintf(shiftY, " + consts.workGroupShiftY*%s ", sc->gl_WorkGroupSize_y);
13168 char shiftY2[500] = "";
13169 if (sc->performWorkGroupShift[1])
13170 sprintf(shiftY2, " + consts.workGroupShiftY ");
13171 uint64_t mult = (sc->mergeSequencesR2C) ? 2 : 1;
13172 if (sc->reorderFourStep) {
13173 //Not implemented
13174 }
13175 else {
13176 //appendBarrierVkFFT(sc, 1);
13177 //appendZeropadStart(sc);
13178 if (sc->fftDim == sc->fft_dim_full) {
13180 for (uint64_t k = 0; k < sc->registerBoost; k++) {
13181 if (sc->mergeSequencesR2C) {
13182 if (sc->axisSwapped) {
13183 sc->tempLen = sprintf(sc->tempStr, "\
13184 if (%s==0)\n\
13185 {\n\
13186 sdata[%s + %" PRIu64 "* sharedStride] = sdata[%s];\n\
13188 res = VkAppendLine(sc);
13189 if (res != VKFFT_SUCCESS) return res;
13190 //res = appendZeropadEnd(sc);
13191 //if (res != VKFFT_SUCCESS) return res;
13192 res = appendBarrierVkFFT(sc, 1);
13193 if (res != VKFFT_SUCCESS) return res;
13194 //res = appendZeropadStart(sc);
13195 //if (res != VKFFT_SUCCESS) return res;
13196 }
13197 else {
13198 sc->tempLen = sprintf(sc->tempStr, "\
13199 if (%s==0)\n\
13200 {\n\
13201 sdata[%s * sharedStride + %" PRIu64 "] = sdata[%s * sharedStride];\n\
13203 res = VkAppendLine(sc);
13204 if (res != VKFFT_SUCCESS) return res;
13205 //res = appendZeropadEnd(sc);
13206 //if (res != VKFFT_SUCCESS) return res;
13207 res = appendBarrierVkFFT(sc, 1);
13208 if (res != VKFFT_SUCCESS) return res;
13209 //res = appendZeropadStart(sc);
13210 //if (res != VKFFT_SUCCESS) return res;
13211 }
13212 }
13213 uint64_t num_out = (sc->axisSwapped) ? (uint64_t)ceil((sc->fftDim / 2 + 1) / (double)sc->localSize[1]) : (uint64_t)ceil((sc->fftDim / 2 + 1) / (double)sc->localSize[0]);
13214 //num_out = (uint64_t)ceil(num_out / (double)sc->min_registers_per_thread);
13215 for (uint64_t i = 0; i < num_out; i++) {
13216 if (sc->localSize[1] == 1)
13217 sc->tempLen = sprintf(sc->tempStr, " combinedID = %s + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, (i + k * num_out) * sc->localSize[0]);
13218 else
13219 sc->tempLen = sprintf(sc->tempStr, " combinedID = (%s + %" PRIu64 " * %s) + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, sc->localSize[0], sc->gl_LocalInvocationID_y, (i + k * num_out) * sc->localSize[0] * sc->localSize[1]);
13220 res = VkAppendLine(sc);
13221 if (res != VKFFT_SUCCESS) return res;
13222
13223 if (!sc->axisSwapped) {
13224 sc->tempLen = sprintf(sc->tempStr, " %s = combinedID %% %" PRIu64 " + ((combinedID/%" PRIu64 ") * %" PRIu64 ");\n", sc->inoutID, sc->fftDim / 2 + 1, sc->fftDim / 2 + 1, mult * sc->outputStride[1]);
13225 res = VkAppendLine(sc);
13226 if (res != VKFFT_SUCCESS) return res;
13227 }
13228 else {
13229 sc->tempLen = sprintf(sc->tempStr, " %s = combinedID %% %" PRIu64 " + ((combinedID/%" PRIu64 ") * %" PRIu64 ");\n", sc->inoutID, sc->fftDim / 2 + 1, sc->fftDim / 2 + 1, mult * sc->outputStride[1]);
13230 res = VkAppendLine(sc);
13231 if (res != VKFFT_SUCCESS) return res;
13232 }
13233 if (sc->axisSwapped) {
13234 if (sc->size[sc->axis_id + 1] % sc->localSize[0] != 0) {
13235 sc->tempLen = sprintf(sc->tempStr, " if(combinedID / %" PRIu64 " + %s*%" PRIu64 "< %" PRIu64 "){\n", (sc->fftDim / 2 + 1), sc->gl_WorkGroupID_y, sc->localSize[0], (uint64_t)ceil(sc->size[1] / (double)mult));
13236 res = VkAppendLine(sc);
13237 if (res != VKFFT_SUCCESS) return res;
13238 }
13239 if ((1 + i + k * num_out) * sc->localSize[0] * sc->localSize[1] >= (sc->fftDim / 2 + 1) * sc->localSize[0]) {
13240 sc->tempLen = sprintf(sc->tempStr, " if(combinedID < %" PRIu64 "){\n", (sc->fftDim / 2 + 1) * sc->localSize[0]);
13241 res = VkAppendLine(sc);
13242 if (res != VKFFT_SUCCESS) return res;
13243 }
13244 }
13245 else {
13246 if (sc->size[sc->axis_id + 1] % sc->localSize[1] != 0) {
13247 sc->tempLen = sprintf(sc->tempStr, " if(combinedID / %" PRIu64 " + %s*%" PRIu64 "< %" PRIu64 "){\n", (sc->fftDim / 2 + 1), sc->gl_WorkGroupID_y, sc->localSize[1], (uint64_t)ceil(sc->size[1] / (double)mult));
13248 res = VkAppendLine(sc);
13249 if (res != VKFFT_SUCCESS) return res;
13250 }
13251 if ((1 + i + k * num_out) * sc->localSize[0] * sc->localSize[1] >= (sc->fftDim / 2 + 1) * sc->localSize[1]) {
13252 sc->tempLen = sprintf(sc->tempStr, " if(combinedID < %" PRIu64 "){\n", (sc->fftDim / 2 + 1) * sc->localSize[1]);
13253 res = VkAppendLine(sc);
13254 if (res != VKFFT_SUCCESS) return res;
13255 }
13256 }
13257 if (sc->zeropad[1]) {
13258 sc->tempLen = sprintf(sc->tempStr, " if((inoutID %% %" PRIu64 " < %" PRIu64 ")||(inoutID %% %" PRIu64 " >= %" PRIu64 ")){\n", sc->outputStride[1], sc->fft_zeropad_left_write[sc->axis_id], sc->outputStride[1], sc->fft_zeropad_right_write[sc->axis_id]);
13259 res = VkAppendLine(sc);
13260 if (res != VKFFT_SUCCESS) return res;
13261 }
13262 sc->tempLen = sprintf(sc->tempStr, " %s = ", sc->inoutID);
13263 res = VkAppendLine(sc);
13264 if (res != VKFFT_SUCCESS) return res;
13265 res = indexOutputVkFFT(sc, uintType, writeType, sc->inoutID, 0, requestCoordinate, requestBatch);
13266 sc->tempLen = sprintf(sc->tempStr, ";\n");
13267 res = VkAppendLine(sc);
13268 if (res != VKFFT_SUCCESS) return res;
13270 if (res != VKFFT_SUCCESS) return res;
13271 if (sc->LUT) {
13272 sc->tempLen = sprintf(sc->tempStr, " mult = twiddleLUT[%" PRIu64 " + combinedID %% %" PRIu64 "];\n", sc->startDCT3LUT, sc->fftDim / 2 + 1);
13273 res = VkAppendLine(sc);
13274 if (res != VKFFT_SUCCESS) return res;
13275 sc->tempLen = sprintf(sc->tempStr, " mult.x = 2*mult.x;\n");
13276 res = VkAppendLine(sc);
13277 if (res != VKFFT_SUCCESS) return res;
13278 sc->tempLen = sprintf(sc->tempStr, " mult.y = -2*mult.y;\n");
13279 res = VkAppendLine(sc);
13280 if (res != VKFFT_SUCCESS) return res;
13281 }
13282 else {
13283 sc->tempLen = sprintf(sc->tempStr, " mult.x = 2*%s(%.17f%s * (combinedID %% %" PRIu64 ") );\n", cosDef, -double_PI / 2 / sc->fftDim, LFending, sc->fftDim / 2 + 1);
13284 res = VkAppendLine(sc);
13285 if (res != VKFFT_SUCCESS) return res;
13286 sc->tempLen = sprintf(sc->tempStr, " mult.y = 2*%s(%.17f%s * (combinedID %% %" PRIu64 ") );\n", sinDef, -double_PI / 2 / sc->fftDim, LFending, sc->fftDim / 2 + 1);
13287 res = VkAppendLine(sc);
13288 if (res != VKFFT_SUCCESS) return res;
13289 }
13290 if (sc->mergeSequencesR2C) {
13291 if (sc->axisSwapped) {
13292
13293 sc->tempLen = sprintf(sc->tempStr, " %s.x = 0.5%s*(sdata[(combinedID %% %" PRIu64 ")* sharedStride + (combinedID / %" PRIu64 ")].x+sdata[(%" PRIu64 "-combinedID %% %" PRIu64 ")* sharedStride + (combinedID / %" PRIu64 ")].x);\n", sc->regIDs[0], LFending, sc->fftDim / 2 + 1, (sc->fftDim / 2 + 1), sc->fftDim, sc->fftDim / 2 + 1, (sc->fftDim / 2 + 1));
13294 res = VkAppendLine(sc);
13295 if (res != VKFFT_SUCCESS) return res;
13296 sc->tempLen = sprintf(sc->tempStr, " %s.y = 0.5%s*(sdata[(combinedID %% %" PRIu64 ")* sharedStride + (combinedID / %" PRIu64 ")].y-sdata[(%" PRIu64 "-combinedID %% %" PRIu64 ")* sharedStride + (combinedID / %" PRIu64 ")].y);\n", sc->regIDs[0], LFending, sc->fftDim / 2 + 1, (sc->fftDim / 2 + 1), sc->fftDim, sc->fftDim / 2 + 1, (sc->fftDim / 2 + 1));
13297 res = VkAppendLine(sc);
13298 if (res != VKFFT_SUCCESS) return res;
13299 if (sc->outputBufferBlockNum == 1)
13300 sc->tempLen = sprintf(sc->tempStr, " %s[%s] = %s(%s.x*mult.x-%s.y*mult.y)%s;\n", outputsStruct, sc->inoutID, convTypeLeft, sc->regIDs[0], sc->regIDs[0], convTypeRight);
13301 else
13302 sc->tempLen = sprintf(sc->tempStr, " outputBlocks[%s / %" PRIu64 "]%s[%s %% %" PRIu64 "] = %s(%s.x*mult.x-%s.y*mult.y)%s;\n", sc->inoutID, sc->outputBufferBlockSize, outputsStruct, sc->inoutID, sc->outputBufferBlockSize, convTypeLeft, sc->regIDs[0], sc->regIDs[0], convTypeRight);
13303 res = VkAppendLine(sc);
13304 if (res != VKFFT_SUCCESS) return res;
13305
13306
13307 sc->tempLen = sprintf(sc->tempStr, " %s.x = 0.5%s*(sdata[(combinedID %% %" PRIu64 ")* sharedStride + (combinedID / %" PRIu64 ")].y+sdata[(%" PRIu64 "-combinedID %% %" PRIu64 ")* sharedStride + (combinedID / %" PRIu64 ")].y);\n", sc->regIDs[1], LFending, sc->fftDim / 2 + 1, (sc->fftDim / 2 + 1), sc->fftDim, sc->fftDim / 2 + 1, (sc->fftDim / 2 + 1));
13308 res = VkAppendLine(sc);
13309 if (res != VKFFT_SUCCESS) return res;
13310 sc->tempLen = sprintf(sc->tempStr, " %s.y = 0.5%s*(-sdata[(combinedID %% %" PRIu64 ")* sharedStride + (combinedID / %" PRIu64 ")].x+sdata[(%" PRIu64 "-combinedID %% %" PRIu64 ")* sharedStride + (combinedID / %" PRIu64 ")].x);\n", sc->regIDs[1], LFending, sc->fftDim / 2 + 1, (sc->fftDim / 2 + 1), sc->fftDim, sc->fftDim / 2 + 1, (sc->fftDim / 2 + 1));
13311 res = VkAppendLine(sc);
13312 if (res != VKFFT_SUCCESS) return res;
13313
13314 if (sc->outputBufferBlockNum == 1)
13315 sc->tempLen = sprintf(sc->tempStr, " %s[%s+%" PRIu64 "] = %s(%s.x*mult.x-%s.y*mult.y)%s;\n", outputsStruct, sc->inoutID, sc->outputStride[1], convTypeLeft, sc->regIDs[1], sc->regIDs[1], convTypeRight);
13316 else
13317 sc->tempLen = sprintf(sc->tempStr, " outputBlocks[(%s %" PRIu64 ")/ %" PRIu64 "]%s[(%s+%" PRIu64 ") %% %" PRIu64 "] = %s(%s.x*mult.x-%s.y*mult.y)%s;\n", sc->inoutID, sc->outputStride[1], sc->outputBufferBlockSize, outputsStruct, sc->inoutID, sc->outputStride[1], sc->outputBufferBlockSize, convTypeLeft, sc->regIDs[1], sc->regIDs[1], convTypeRight);
13318 res = VkAppendLine(sc);
13319 if (res != VKFFT_SUCCESS) return res;
13320 if (sc->zeropad[1]) {
13321 sc->tempLen = sprintf(sc->tempStr, " }\n");
13322 res = VkAppendLine(sc);
13323 if (res != VKFFT_SUCCESS) return res;
13324 }
13325 sc->tempLen = sprintf(sc->tempStr, " if(combinedID %% %" PRIu64 " > 0){\n", sc->fftDim / 2 + 1);
13326 res = VkAppendLine(sc);
13327 if (res != VKFFT_SUCCESS) return res;
13328 sc->tempLen = sprintf(sc->tempStr, " %s = (%" PRIu64 " - combinedID %% %" PRIu64 ") + ((combinedID/%" PRIu64 ") * %" PRIu64 ");\n", sc->inoutID, sc->fftDim, sc->fftDim / 2 + 1, sc->fftDim / 2 + 1, mult * sc->outputStride[1]);
13329 res = VkAppendLine(sc);
13330 if (res != VKFFT_SUCCESS) return res;
13331 sc->tempLen = sprintf(sc->tempStr, " %s = ", sc->inoutID);
13332 res = VkAppendLine(sc);
13333 if (res != VKFFT_SUCCESS) return res;
13334 res = indexOutputVkFFT(sc, uintType, writeType, sc->inoutID, 0, requestCoordinate, requestBatch);
13335 sc->tempLen = sprintf(sc->tempStr, ";\n");
13336 res = VkAppendLine(sc);
13337 if (res != VKFFT_SUCCESS) return res;
13338 if (sc->zeropad[1]) {
13339 sc->tempLen = sprintf(sc->tempStr, " if((inoutID %% %" PRIu64 " < %" PRIu64 ")||(inoutID %% %" PRIu64 " >= %" PRIu64 ")){\n", sc->outputStride[1], sc->fft_zeropad_left_write[sc->axis_id], sc->outputStride[1], sc->fft_zeropad_right_write[sc->axis_id]);
13340 res = VkAppendLine(sc);
13341 if (res != VKFFT_SUCCESS) return res;
13342 }
13343 if (sc->outputBufferBlockNum == 1)
13344 sc->tempLen = sprintf(sc->tempStr, " %s[%s] = -%s(%s.y*mult.x+%s.x*mult.y)%s;\n", outputsStruct, sc->inoutID, convTypeLeft, sc->regIDs[0], sc->regIDs[0], convTypeRight);
13345 else
13346 sc->tempLen = sprintf(sc->tempStr, " outputBlocks[%s / %" PRIu64 "]%s[%s %% %" PRIu64 "] = -%s(%s.y*mult.x+%s.x*mult.y)%s;\n", sc->inoutID, sc->outputBufferBlockSize, outputsStruct, sc->inoutID, sc->outputBufferBlockSize, convTypeLeft, sc->regIDs[0], sc->regIDs[0], convTypeRight);
13347 res = VkAppendLine(sc);
13348 if (res != VKFFT_SUCCESS) return res;
13349 if (sc->outputBufferBlockNum == 1)
13350 sc->tempLen = sprintf(sc->tempStr, " %s[%s+%" PRIu64 "] = -%s(%s.y*mult.x+%s.x*mult.y)%s;\n", outputsStruct, sc->inoutID, sc->outputStride[1], convTypeLeft, sc->regIDs[1], sc->regIDs[1], convTypeRight);
13351 else
13352 sc->tempLen = sprintf(sc->tempStr, " outputBlocks[(%s %" PRIu64 ")/ %" PRIu64 "]%s[(%s+%" PRIu64 ") %% %" PRIu64 "] = -%s(%s.y*mult.x+%s.x*mult.y)%s;\n", sc->inoutID, sc->outputStride[1], sc->outputBufferBlockSize, outputsStruct, sc->inoutID, sc->outputStride[1], sc->outputBufferBlockSize, convTypeLeft, sc->regIDs[1], sc->regIDs[1], convTypeRight);
13353 res = VkAppendLine(sc);
13354 if (res != VKFFT_SUCCESS) return res;
13355 if (sc->zeropad[1]) {
13356 sc->tempLen = sprintf(sc->tempStr, " }\n");
13357 res = VkAppendLine(sc);
13358 if (res != VKFFT_SUCCESS) return res;
13359 }
13360 sc->tempLen = sprintf(sc->tempStr, " }\n");
13361 res = VkAppendLine(sc);
13362 if (res != VKFFT_SUCCESS) return res;
13363 }
13364 else {
13365 sc->tempLen = sprintf(sc->tempStr, " %s.x = 0.5%s*(sdata[(combinedID %% %" PRIu64 ") + (combinedID / %" PRIu64 ") * sharedStride].x+sdata[(%" PRIu64 "-combinedID %% %" PRIu64 ") + (combinedID / %" PRIu64 ") * sharedStride].x);\n", sc->regIDs[0], LFending, sc->fftDim / 2 + 1, (sc->fftDim / 2 + 1), sc->fftDim, sc->fftDim / 2 + 1, (sc->fftDim / 2 + 1));
13366 res = VkAppendLine(sc);
13367 if (res != VKFFT_SUCCESS) return res;
13368 sc->tempLen = sprintf(sc->tempStr, " %s.y = 0.5%s*(sdata[(combinedID %% %" PRIu64 ") + (combinedID / %" PRIu64 ") * sharedStride].y-sdata[(%" PRIu64 "-combinedID %% %" PRIu64 ") + (combinedID / %" PRIu64 ") * sharedStride].y);\n", sc->regIDs[0], LFending, sc->fftDim / 2 + 1, (sc->fftDim / 2 + 1), sc->fftDim, sc->fftDim / 2 + 1, (sc->fftDim / 2 + 1));
13369 res = VkAppendLine(sc);
13370 if (res != VKFFT_SUCCESS) return res;
13371 if (sc->outputBufferBlockNum == 1)
13372 sc->tempLen = sprintf(sc->tempStr, " %s[%s] = %s(%s.x*mult.x-%s.y*mult.y)%s;\n", outputsStruct, sc->inoutID, convTypeLeft, sc->regIDs[0], sc->regIDs[0], convTypeRight);
13373 else
13374 sc->tempLen = sprintf(sc->tempStr, " outputBlocks[%s / %" PRIu64 "]%s[%s %% %" PRIu64 "] = %s(%s.x*mult.x-%s.y*mult.y)%s;\n", sc->inoutID, sc->outputBufferBlockSize, outputsStruct, sc->inoutID, sc->outputBufferBlockSize, convTypeLeft, sc->regIDs[0], sc->regIDs[0], convTypeRight);
13375 res = VkAppendLine(sc);
13376 if (res != VKFFT_SUCCESS) return res;
13377 sc->tempLen = sprintf(sc->tempStr, " %s.x = 0.5%s*(sdata[(combinedID %% %" PRIu64 ") + (combinedID / %" PRIu64 ") * sharedStride].y+sdata[(%" PRIu64 "-combinedID %% %" PRIu64 ") + (combinedID / %" PRIu64 ") * sharedStride].y);\n", sc->regIDs[1], LFending, sc->fftDim / 2 + 1, (sc->fftDim / 2 + 1), sc->fftDim, sc->fftDim / 2 + 1, (sc->fftDim / 2 + 1));
13378 res = VkAppendLine(sc);
13379 if (res != VKFFT_SUCCESS) return res;
13380 sc->tempLen = sprintf(sc->tempStr, " %s.y = 0.5%s*(-sdata[(combinedID %% %" PRIu64 ") + (combinedID / %" PRIu64 ") * sharedStride].x+sdata[(%" PRIu64 "-combinedID %% %" PRIu64 ") + (combinedID / %" PRIu64 ") * sharedStride].x);\n", sc->regIDs[1], LFending, sc->fftDim / 2 + 1, (sc->fftDim / 2 + 1), sc->fftDim, sc->fftDim / 2 + 1, (sc->fftDim / 2 + 1));
13381 res = VkAppendLine(sc);
13382 if (res != VKFFT_SUCCESS) return res;
13383
13384 if (sc->outputBufferBlockNum == 1)
13385 sc->tempLen = sprintf(sc->tempStr, " %s[%s+%" PRIu64 "] = %s(%s.x*mult.x-%s.y*mult.y)%s;\n", outputsStruct, sc->inoutID, sc->outputStride[1], convTypeLeft, sc->regIDs[1], sc->regIDs[1], convTypeRight);
13386 else
13387 sc->tempLen = sprintf(sc->tempStr, " outputBlocks[(%s %" PRIu64 ")/ %" PRIu64 "]%s[(%s+%" PRIu64 ") %% %" PRIu64 "] = %s(%s.x*mult.x-%s.y*mult.y)%s;\n", sc->inoutID, sc->outputStride[1], sc->outputBufferBlockSize, outputsStruct, sc->inoutID, sc->outputStride[1], sc->outputBufferBlockSize, convTypeLeft, sc->regIDs[1], sc->regIDs[1], convTypeRight);
13388 res = VkAppendLine(sc);
13389 if (res != VKFFT_SUCCESS) return res;
13390 if (sc->zeropad[1]) {
13391 sc->tempLen = sprintf(sc->tempStr, " }\n");
13392 res = VkAppendLine(sc);
13393 if (res != VKFFT_SUCCESS) return res;
13394 }
13395 sc->tempLen = sprintf(sc->tempStr, " if(combinedID %% %" PRIu64 " > 0){\n", sc->fftDim / 2 + 1);
13396 res = VkAppendLine(sc);
13397 if (res != VKFFT_SUCCESS) return res;
13398 sc->tempLen = sprintf(sc->tempStr, " %s = (%" PRIu64 " - combinedID %% %" PRIu64 ") + ((combinedID/%" PRIu64 ") * %" PRIu64 ");\n", sc->inoutID, sc->fftDim, sc->fftDim / 2 + 1, sc->fftDim / 2 + 1, 2 * sc->outputStride[1]);
13399 res = VkAppendLine(sc);
13400 if (res != VKFFT_SUCCESS) return res;
13401 sc->tempLen = sprintf(sc->tempStr, " %s = ", sc->inoutID);
13402 res = VkAppendLine(sc);
13403 if (res != VKFFT_SUCCESS) return res;
13404 res = indexOutputVkFFT(sc, uintType, writeType, sc->inoutID, 0, requestCoordinate, requestBatch);
13405 sc->tempLen = sprintf(sc->tempStr, ";\n");
13406 res = VkAppendLine(sc);
13407 if (res != VKFFT_SUCCESS) return res;
13408 if (sc->zeropad[1]) {
13409 sc->tempLen = sprintf(sc->tempStr, " if((inoutID %% %" PRIu64 " < %" PRIu64 ")||(inoutID %% %" PRIu64 " >= %" PRIu64 ")){\n", sc->outputStride[1], sc->fft_zeropad_left_write[sc->axis_id], sc->outputStride[1], sc->fft_zeropad_right_write[sc->axis_id]);
13410 res = VkAppendLine(sc);
13411 if (res != VKFFT_SUCCESS) return res;
13412 }
13413 if (sc->outputBufferBlockNum == 1)
13414 sc->tempLen = sprintf(sc->tempStr, " %s[%s] = -%s(%s.y*mult.x+%s.x*mult.y)%s;\n", outputsStruct, sc->inoutID, convTypeLeft, sc->regIDs[0], sc->regIDs[0], convTypeRight);
13415 else
13416 sc->tempLen = sprintf(sc->tempStr, " outputBlocks[%s / %" PRIu64 "]%s[%s %% %" PRIu64 "] = -%s(%s.y*mult.x+%s.x*mult.y)%s;\n", sc->inoutID, sc->outputBufferBlockSize, outputsStruct, sc->inoutID, sc->outputBufferBlockSize, convTypeLeft, sc->regIDs[0], sc->regIDs[0], convTypeRight);
13417 res = VkAppendLine(sc);
13418 if (res != VKFFT_SUCCESS) return res;
13419 if (sc->outputBufferBlockNum == 1)
13420 sc->tempLen = sprintf(sc->tempStr, " %s[%s+%" PRIu64 "] = -%s(%s.y*mult.x+%s.x*mult.y)%s;\n", outputsStruct, sc->inoutID, sc->outputStride[1], convTypeLeft, sc->regIDs[1], sc->regIDs[1], convTypeRight);
13421 else
13422 sc->tempLen = sprintf(sc->tempStr, " outputBlocks[(%s %" PRIu64 ")/ %" PRIu64 "]%s[(%s+%" PRIu64 ") %% %" PRIu64 "] = -%s(%s.y*mult.x+%s.x*mult.y)%s;\n", sc->inoutID, sc->outputStride[1], sc->outputBufferBlockSize, outputsStruct, sc->inoutID, sc->outputStride[1], sc->outputBufferBlockSize, convTypeLeft, sc->regIDs[1], sc->regIDs[1], convTypeRight);
13423 res = VkAppendLine(sc);
13424 if (res != VKFFT_SUCCESS) return res;
13425 if (sc->zeropad[1]) {
13426 sc->tempLen = sprintf(sc->tempStr, " }\n");
13427 res = VkAppendLine(sc);
13428 if (res != VKFFT_SUCCESS) return res;
13429 }
13430 sc->tempLen = sprintf(sc->tempStr, " }\n");
13431 res = VkAppendLine(sc);
13432 if (res != VKFFT_SUCCESS) return res;
13433 }
13434 }
13435 else {
13436 if (!sc->axisSwapped) {
13437 sc->tempLen = sprintf(sc->tempStr, " sdataID = (combinedID %% %" PRIu64 ") + (combinedID / %" PRIu64 ") * sharedStride;\n", sc->fftDim / 2 + 1, sc->fftDim / 2 + 1);
13438 res = VkAppendLine(sc);
13439 if (res != VKFFT_SUCCESS) return res;
13440 if (sc->outputBufferBlockNum == 1)
13441 sc->tempLen = sprintf(sc->tempStr, " %s[inoutID] = %s(sdata[sdataID].x*mult.x - sdata[sdataID].y*mult.y)%s;\n", outputsStruct, convTypeLeft, convTypeRight);
13442 else
13443 sc->tempLen = sprintf(sc->tempStr, " outputBlocks[inoutID / %" PRIu64 "]%s[inoutID %% %" PRIu64 "] = %s(sdata[sdataID].x*mult.x - sdata[sdataID].y*mult.y) %s;\n", sc->outputBufferBlockSize, outputsStruct, sc->outputBufferBlockSize, convTypeLeft, convTypeRight);
13444 res = VkAppendLine(sc);
13445 if (res != VKFFT_SUCCESS) return res;
13446 if (sc->zeropad[1]) {
13447 sc->tempLen = sprintf(sc->tempStr, " }\n");
13448 res = VkAppendLine(sc);
13449 if (res != VKFFT_SUCCESS) return res;
13450 }
13451 sc->tempLen = sprintf(sc->tempStr, " if(combinedID %% %" PRIu64 " > 0){\n", sc->fftDim / 2 + 1);
13452 res = VkAppendLine(sc);
13453 if (res != VKFFT_SUCCESS) return res;
13454 sc->tempLen = sprintf(sc->tempStr, " %s = (%" PRIu64 " - combinedID %% %" PRIu64 ") + ((combinedID/%" PRIu64 ") * %" PRIu64 ");\n", sc->inoutID, sc->fftDim, sc->fftDim / 2 + 1, sc->fftDim / 2 + 1, sc->outputStride[1]);
13455 res = VkAppendLine(sc);
13456 if (res != VKFFT_SUCCESS) return res;
13457 sc->tempLen = sprintf(sc->tempStr, " %s = ", sc->inoutID);
13458 res = VkAppendLine(sc);
13459 if (res != VKFFT_SUCCESS) return res;
13460 res = indexOutputVkFFT(sc, uintType, writeType, sc->inoutID, 0, requestCoordinate, requestBatch);
13461 sc->tempLen = sprintf(sc->tempStr, ";\n");
13462 res = VkAppendLine(sc);
13463 if (res != VKFFT_SUCCESS) return res;
13464 if (sc->zeropad[1]) {
13465 sc->tempLen = sprintf(sc->tempStr, " if((inoutID %% %" PRIu64 " < %" PRIu64 ")||(inoutID %% %" PRIu64 " >= %" PRIu64 ")){\n", sc->outputStride[1], sc->fft_zeropad_left_write[sc->axis_id], sc->outputStride[1], sc->fft_zeropad_right_write[sc->axis_id]);
13466 res = VkAppendLine(sc);
13467 if (res != VKFFT_SUCCESS) return res;
13468 }
13469 if (sc->outputBufferBlockNum == 1)
13470 sc->tempLen = sprintf(sc->tempStr, " %s[inoutID] = -%s(sdata[sdataID].y*mult.x + sdata[sdataID].x*mult.y)%s;\n", outputsStruct, convTypeLeft, convTypeRight);
13471 else
13472 sc->tempLen = sprintf(sc->tempStr, " outputBlocks[inoutID / %" PRIu64 "]%s[inoutID %% %" PRIu64 "] = -%s(sdata[sdataID].y*mult.x + sdata[sdataID].x*mult.y) %s;\n", sc->outputBufferBlockSize, outputsStruct, sc->outputBufferBlockSize, convTypeLeft, convTypeRight);
13473 res = VkAppendLine(sc);
13474 if (res != VKFFT_SUCCESS) return res;
13475 if (sc->zeropad[1]) {
13476 sc->tempLen = sprintf(sc->tempStr, " }\n");
13477 res = VkAppendLine(sc);
13478 if (res != VKFFT_SUCCESS) return res;
13479 }
13480 sc->tempLen = sprintf(sc->tempStr, " }\n");
13481 res = VkAppendLine(sc);
13482 if (res != VKFFT_SUCCESS) return res;
13483 }
13484 else {
13485 sc->tempLen = sprintf(sc->tempStr, " sdataID = (combinedID %% %" PRIu64 ") * sharedStride + (combinedID / %" PRIu64 ");\n", sc->fftDim / 2 + 1, sc->fftDim / 2 + 1);
13486 res = VkAppendLine(sc);
13487 if (res != VKFFT_SUCCESS) return res;
13488 if (sc->outputBufferBlockNum == 1)
13489 sc->tempLen = sprintf(sc->tempStr, " %s[inoutID] = %s(sdata[sdataID].x*mult.x -sdata[sdataID].y*mult.y)%s;\n", outputsStruct, convTypeLeft, convTypeRight);
13490 else
13491 sc->tempLen = sprintf(sc->tempStr, " outputBlocks[inoutID / %" PRIu64 "]%s[inoutID %% %" PRIu64 "] = %s(sdata[sdataID].x*mult.x - sdata[sdataID].y*mult.y)%s;\n", sc->outputBufferBlockSize, outputsStruct, sc->outputBufferBlockSize, convTypeLeft, convTypeRight);
13492 res = VkAppendLine(sc);
13493 if (res != VKFFT_SUCCESS) return res;
13494 if (sc->zeropad[1]) {
13495 sc->tempLen = sprintf(sc->tempStr, " }\n");
13496 res = VkAppendLine(sc);
13497 if (res != VKFFT_SUCCESS) return res;
13498 }
13499 sc->tempLen = sprintf(sc->tempStr, " if(combinedID %% %" PRIu64 " > 0){\n", sc->fftDim / 2 + 1);
13500 res = VkAppendLine(sc);
13501 if (res != VKFFT_SUCCESS) return res;
13502 sc->tempLen = sprintf(sc->tempStr, " %s = (%" PRIu64 " - combinedID %% %" PRIu64 ") + ((combinedID/%" PRIu64 ") * %" PRIu64 ");\n", sc->inoutID, sc->fftDim, sc->fftDim / 2 + 1, sc->fftDim / 2 + 1, sc->outputStride[1]);
13503 res = VkAppendLine(sc);
13504 if (res != VKFFT_SUCCESS) return res;
13505 sc->tempLen = sprintf(sc->tempStr, " %s = ", sc->inoutID);
13506 res = VkAppendLine(sc);
13507 if (res != VKFFT_SUCCESS) return res;
13508 res = indexOutputVkFFT(sc, uintType, writeType, sc->inoutID, 0, requestCoordinate, requestBatch);
13509 sc->tempLen = sprintf(sc->tempStr, ";\n");
13510 res = VkAppendLine(sc);
13511 if (res != VKFFT_SUCCESS) return res;
13512 if (sc->zeropad[1]) {
13513 sc->tempLen = sprintf(sc->tempStr, " if((inoutID %% %" PRIu64 " < %" PRIu64 ")||(inoutID %% %" PRIu64 " >= %" PRIu64 ")){\n", sc->outputStride[1], sc->fft_zeropad_left_write[sc->axis_id], sc->outputStride[1], sc->fft_zeropad_right_write[sc->axis_id]);
13514 res = VkAppendLine(sc);
13515 if (res != VKFFT_SUCCESS) return res;
13516 }
13517 if (sc->outputBufferBlockNum == 1)
13518 sc->tempLen = sprintf(sc->tempStr, " %s[inoutID] = -%s(sdata[sdataID].y*mult.x +sdata[sdataID].x*mult.y)%s;\n", outputsStruct, convTypeLeft, convTypeRight);
13519 else
13520 sc->tempLen = sprintf(sc->tempStr, " outputBlocks[inoutID / %" PRIu64 "]%s[inoutID %% %" PRIu64 "] = -%s(sdata[sdataID].y*mult.x + sdata[sdataID].x*mult.y)%s;\n", sc->outputBufferBlockSize, outputsStruct, sc->outputBufferBlockSize, convTypeLeft, convTypeRight);
13521 res = VkAppendLine(sc);
13522 if (res != VKFFT_SUCCESS) return res;
13523 if (sc->zeropad[1]) {
13524 sc->tempLen = sprintf(sc->tempStr, " }\n");
13525 res = VkAppendLine(sc);
13526 if (res != VKFFT_SUCCESS) return res;
13527 }
13528 sc->tempLen = sprintf(sc->tempStr, " }\n");
13529 res = VkAppendLine(sc);
13530 if (res != VKFFT_SUCCESS) return res;
13531 }
13532 }
13534 if (res != VKFFT_SUCCESS) return res;
13535
13536 if (sc->axisSwapped) {
13537 if ((1 + i + k * num_out) * sc->localSize[0] * sc->localSize[1] >= (sc->fftDim / 2 + 1) * sc->localSize[0]) {
13538 sc->tempLen = sprintf(sc->tempStr, " }\n");
13539 res = VkAppendLine(sc);
13540 if (res != VKFFT_SUCCESS) return res;
13541 }
13542 }
13543 else {
13544 if ((1 + i + k * num_out) * sc->localSize[0] * sc->localSize[1] >= (sc->fftDim / 2 + 1) * sc->localSize[1])
13545 {
13546 sc->tempLen = sprintf(sc->tempStr, " }\n");
13547 res = VkAppendLine(sc);
13548 if (res != VKFFT_SUCCESS) return res;
13549 }
13550 }
13551 if (sc->axisSwapped) {
13552 if (sc->size[sc->axis_id + 1] % sc->localSize[0] != 0) {
13553 sc->tempLen = sprintf(sc->tempStr, " }\n");
13554 res = VkAppendLine(sc);
13555 if (res != VKFFT_SUCCESS) return res;
13556 }
13557 }
13558 else {
13559 if (sc->size[sc->axis_id + 1] % sc->localSize[1] != 0) {
13560 sc->tempLen = sprintf(sc->tempStr, " }\n");
13561 res = VkAppendLine(sc);
13562 if (res != VKFFT_SUCCESS) return res;
13563 }
13564 }
13565 }
13566 }
13567 if (sc->zeropadBluestein[1]) sc->fftDim = sc->fft_dim_full;
13568 }
13569 else {
13570
13571 }
13572 }
13573 break;
13574 }
13575 case 121://DCT-II strided
13576 {
13577 if (!sc->writeFromRegisters) {
13578 res = appendBarrierVkFFT(sc, 1);
13579 if (res != VKFFT_SUCCESS) return res;
13580 }
13581 //res = appendZeropadStart(sc);
13582 //if (res != VKFFT_SUCCESS) return res;
13583 char shiftX[500] = "";
13584 if (sc->performWorkGroupShift[0])
13585 sprintf(shiftX, " + consts.workGroupShiftX*%s ", sc->gl_WorkGroupSize_x);
13586 char shiftY[500] = "";
13587 if (sc->performWorkGroupShift[1])
13588 sprintf(shiftY, " + consts.workGroupShiftY*%s ", sc->gl_WorkGroupSize_y);
13589 char shiftY2[500] = "";
13590 if (sc->performWorkGroupShift[1])
13591 sprintf(shiftY2, " + consts.workGroupShiftY ");
13592 uint64_t mult = (sc->mergeSequencesR2C) ? 2 : 1;
13593 if (sc->reorderFourStep) {
13594 //Not implemented
13595 }
13596 else {
13597 //appendBarrierVkFFT(sc, 1);
13598 //appendZeropadStart(sc);
13599 if (sc->fftDim == sc->fft_dim_full) {
13601 for (uint64_t k = 0; k < sc->registerBoost; k++) {
13602 if (sc->mergeSequencesR2C) {
13603 sc->tempLen = sprintf(sc->tempStr, "\
13604 if (%s==0)\n\
13605 {\n\
13606 sdata[%s + %" PRIu64 "* sharedStride] = sdata[%s];\n\
13608 res = VkAppendLine(sc);
13609 if (res != VKFFT_SUCCESS) return res;
13610 //res = appendZeropadEnd(sc);
13611 //if (res != VKFFT_SUCCESS) return res;
13612 res = appendBarrierVkFFT(sc, 1);
13613 if (res != VKFFT_SUCCESS) return res;
13614 //res = appendZeropadStart(sc);
13615 //if (res != VKFFT_SUCCESS) return res;
13616 }
13617 uint64_t num_out = (uint64_t)ceil(mult * (sc->fftDim / 2 + 1) / (double)sc->localSize[1]);
13618 //num_out = (uint64_t)ceil(num_out / (double)sc->min_registers_per_thread);
13619 for (uint64_t i = 0; i < num_out; i++) {
13620 sc->tempLen = sprintf(sc->tempStr, " combinedID = (%s + %" PRIu64 " * %s) + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, sc->localSize[0], sc->gl_LocalInvocationID_y, (i + k * num_out) * sc->localSize[0] * sc->localSize[1]);
13621 res = VkAppendLine(sc);
13622 if (res != VKFFT_SUCCESS) return res;
13623
13624 sc->tempLen = sprintf(sc->tempStr, " %s = %s%s + ((combinedID/%" PRIu64 ") * %" PRIu64 ");\n", sc->inoutID, sc->gl_GlobalInvocationID_x, shiftX, sc->localSize[0], sc->outputStride[1]);
13625 res = VkAppendLine(sc);
13626 if (res != VKFFT_SUCCESS) return res;
13627 if (sc->size[0] % sc->localSize[0] != 0) {
13628 sc->tempLen = sprintf(sc->tempStr, " if(combinedID %% %" PRIu64 " + %s*%" PRIu64 "< %" PRIu64 "){\n", sc->localSize[0], sc->gl_WorkGroupID_x, sc->localSize[0], sc->size[0]);
13629 res = VkAppendLine(sc);
13630 if (res != VKFFT_SUCCESS) return res;
13631 }
13632 if ((1 + i + k * num_out) * sc->localSize[0] * sc->localSize[1] >= (sc->fftDim / 2 + 1) * sc->localSize[0]) {
13633 sc->tempLen = sprintf(sc->tempStr, " if(combinedID < %" PRIu64 "){\n", (sc->fftDim / 2 + 1) * sc->localSize[0]);
13634 res = VkAppendLine(sc);
13635 if (res != VKFFT_SUCCESS) return res;
13636 }
13637 if (sc->zeropad[1]) {
13638 sc->tempLen = sprintf(sc->tempStr, " if(((combinedID/%" PRIu64 ") %% %" PRIu64 " < %" PRIu64 ")||((combinedID/%" PRIu64 ") %% %" PRIu64 " >= %" PRIu64 ")){\n", sc->localSize[0], sc->fft_dim_full, sc->fft_zeropad_left_read[sc->axis_id], sc->localSize[0], sc->fft_dim_full, sc->fft_zeropad_right_read[sc->axis_id]);
13639 res = VkAppendLine(sc);
13640 if (res != VKFFT_SUCCESS) return res;
13641 }
13642 sc->tempLen = sprintf(sc->tempStr, " %s = ", sc->inoutID);
13643 res = VkAppendLine(sc);
13644 if (res != VKFFT_SUCCESS) return res;
13645 res = indexOutputVkFFT(sc, uintType, writeType, sc->inoutID, 0, requestCoordinate, requestBatch);
13646 sc->tempLen = sprintf(sc->tempStr, ";\n");
13647 res = VkAppendLine(sc);
13648 if (res != VKFFT_SUCCESS) return res;
13650 if (res != VKFFT_SUCCESS) return res;
13651 if (sc->LUT) {
13652 sc->tempLen = sprintf(sc->tempStr, " mult = twiddleLUT[%" PRIu64 " + combinedID / %" PRIu64 "];\n", sc->startDCT3LUT, sc->localSize[0]);
13653 res = VkAppendLine(sc);
13654 if (res != VKFFT_SUCCESS) return res;
13655 sc->tempLen = sprintf(sc->tempStr, " mult.x = 2*mult.x;\n");
13656 res = VkAppendLine(sc);
13657 if (res != VKFFT_SUCCESS) return res;
13658 sc->tempLen = sprintf(sc->tempStr, " mult.y = -2*mult.y;\n");
13659 res = VkAppendLine(sc);
13660 if (res != VKFFT_SUCCESS) return res;
13661 }
13662 else {
13663 sc->tempLen = sprintf(sc->tempStr, " mult.x = 2*%s(%.17f%s * (combinedID / %" PRIu64 ") );\n", cosDef, -double_PI / 2 / sc->fftDim, LFending, sc->localSize[0]);
13664 res = VkAppendLine(sc);
13665 if (res != VKFFT_SUCCESS) return res;
13666 sc->tempLen = sprintf(sc->tempStr, " mult.y = 2*%s(%.17f%s * (combinedID / %" PRIu64 ") );\n", sinDef, -double_PI / 2 / sc->fftDim, LFending, sc->localSize[0]);
13667 res = VkAppendLine(sc);
13668 if (res != VKFFT_SUCCESS) return res;
13669 }
13670
13671 if (sc->mergeSequencesR2C) {
13672 sc->tempLen = sprintf(sc->tempStr, " %s.x = 0.5%s*(sdata[(combinedID / %" PRIu64 ")* sharedStride + (combinedID %% %" PRIu64 ")].x+sdata[(%" PRIu64 "-combinedID / %" PRIu64 ")* sharedStride + (combinedID %% %" PRIu64 ")].x);\n", sc->regIDs[0], LFending, sc->localSize[0], sc->localSize[0], sc->fftDim, sc->localSize[0], sc->localSize[0]);
13673 res = VkAppendLine(sc);
13674 if (res != VKFFT_SUCCESS) return res;
13675 sc->tempLen = sprintf(sc->tempStr, " %s.y = 0.5%s*(sdata[(combinedID / %" PRIu64 ")* sharedStride + (combinedID %% %" PRIu64 ")].y-sdata[(%" PRIu64 "-combinedID / %" PRIu64 ")* sharedStride + (combinedID %% %" PRIu64 ")].y);\n", sc->regIDs[0], LFending, sc->localSize[0], sc->localSize[0], sc->fftDim, sc->localSize[0], sc->localSize[0]);
13676 res = VkAppendLine(sc);
13677 if (res != VKFFT_SUCCESS) return res;
13678 if (sc->outputBufferBlockNum == 1)
13679 sc->tempLen = sprintf(sc->tempStr, " %s[%s] = %s(%s.x*mult.x-%s.y*mult.y)%s;\n", outputsStruct, sc->inoutID, convTypeLeft, sc->regIDs[0], sc->regIDs[0], convTypeRight);
13680 else
13681 sc->tempLen = sprintf(sc->tempStr, " outputBlocks[%s / %" PRIu64 "]%s[%s %% %" PRIu64 "] = %s(%s.x*mult.x-%s.y*mult.y)%s;\n", sc->inoutID, sc->outputBufferBlockSize, outputsStruct, sc->inoutID, sc->outputBufferBlockSize, convTypeLeft, sc->regIDs[0], sc->regIDs[0], convTypeRight);
13682 res = VkAppendLine(sc);
13683 if (res != VKFFT_SUCCESS) return res;
13684
13685
13686 sc->tempLen = sprintf(sc->tempStr, " %s.x = 0.5%s*(sdata[(combinedID / %" PRIu64 ")* sharedStride + (combinedID %% %" PRIu64 ")].y+sdata[(%" PRIu64 "-combinedID / %" PRIu64 ")* sharedStride + (combinedID %% %" PRIu64 ")].y);\n", sc->regIDs[1], LFending, sc->localSize[0], sc->localSize[0], sc->fftDim, sc->localSize[0], sc->localSize[0]);
13687 res = VkAppendLine(sc);
13688 if (res != VKFFT_SUCCESS) return res;
13689 sc->tempLen = sprintf(sc->tempStr, " %s.y = 0.5%s*(-sdata[(combinedID / %" PRIu64 ")* sharedStride + (combinedID %% %" PRIu64 ")].x+sdata[(%" PRIu64 "-combinedID / %" PRIu64 ")* sharedStride + (combinedID %% %" PRIu64 ")].x);\n", sc->regIDs[1], LFending, sc->localSize[0], sc->localSize[0], sc->fftDim, sc->localSize[0], sc->localSize[0]);
13690 res = VkAppendLine(sc);
13691 if (res != VKFFT_SUCCESS) return res;
13692
13693 if (sc->outputBufferBlockNum == 1)
13694 sc->tempLen = sprintf(sc->tempStr, " %s[%s+%" PRIu64 "] = %s(%s.x*mult.x-%s.y*mult.y)%s;\n", outputsStruct, sc->inoutID, sc->outputStride[1], convTypeLeft, sc->regIDs[1], sc->regIDs[1], convTypeRight);
13695 else
13696 sc->tempLen = sprintf(sc->tempStr, " outputBlocks[(%s %" PRIu64 ")/ %" PRIu64 "]%s[(%s+%" PRIu64 ") %% %" PRIu64 "] = %s(%s.x*mult.x-%s.y*mult.y)%s;\n", sc->inoutID, sc->outputStride[1], sc->outputBufferBlockSize, outputsStruct, sc->inoutID, sc->outputStride[1], sc->outputBufferBlockSize, convTypeLeft, sc->regIDs[1], sc->regIDs[1], convTypeRight);
13697 res = VkAppendLine(sc);
13698 if (res != VKFFT_SUCCESS) return res;
13699 sc->tempLen = sprintf(sc->tempStr, " if(combinedID / %" PRIu64 " > 0){\n", sc->localSize[0]);
13700 res = VkAppendLine(sc);
13701 if (res != VKFFT_SUCCESS) return res;
13702 sc->tempLen = sprintf(sc->tempStr, " %s = (%" PRIu64 " - combinedID / %" PRIu64 ") + %s%s * %" PRIu64 ";\n", sc->inoutID, sc->fftDim, sc->localSize[0], sc->gl_GlobalInvocationID_x, shiftX, 2 * sc->outputStride[1]);
13703 res = VkAppendLine(sc);
13704 if (res != VKFFT_SUCCESS) return res;
13705
13706 if (sc->outputBufferBlockNum == 1)
13707 sc->tempLen = sprintf(sc->tempStr, " %s[%s] = -%s(%s.y*mult.x+%s.x*mult.y)%s;\n", outputsStruct, sc->inoutID, convTypeLeft, sc->regIDs[0], sc->regIDs[0], convTypeRight);
13708 else
13709 sc->tempLen = sprintf(sc->tempStr, " outputBlocks[%s / %" PRIu64 "]%s[%s %% %" PRIu64 "] = -%s(%s.y*mult.x+%s.x*mult.y)%s;\n", sc->inoutID, sc->outputBufferBlockSize, outputsStruct, sc->inoutID, sc->outputBufferBlockSize, convTypeLeft, sc->regIDs[0], sc->regIDs[0], convTypeRight);
13710 res = VkAppendLine(sc);
13711 if (res != VKFFT_SUCCESS) return res;
13712 if (sc->outputBufferBlockNum == 1)
13713 sc->tempLen = sprintf(sc->tempStr, " %s[%s+%" PRIu64 "] = -%s(%s.y*mult.x+%s.x*mult.y)%s;\n", outputsStruct, sc->inoutID, sc->outputStride[1], convTypeLeft, sc->regIDs[1], sc->regIDs[1], convTypeRight);
13714 else
13715 sc->tempLen = sprintf(sc->tempStr, " outputBlocks[(%s %" PRIu64 ")/ %" PRIu64 "]%s[(%s+%" PRIu64 ") %% %" PRIu64 "] = -%s(%s.y*mult.x+%s.x*mult.y)%s;\n", sc->inoutID, sc->outputStride[1], sc->outputBufferBlockSize, outputsStruct, sc->inoutID, sc->outputStride[1], sc->outputBufferBlockSize, convTypeLeft, sc->regIDs[1], sc->regIDs[1], convTypeRight);
13716 res = VkAppendLine(sc);
13717 if (res != VKFFT_SUCCESS) return res;
13718 sc->tempLen = sprintf(sc->tempStr, " }\n");
13719 res = VkAppendLine(sc);
13720 if (res != VKFFT_SUCCESS) return res;
13721 }
13722 else {
13723 sc->tempLen = sprintf(sc->tempStr, " sdataID = (combinedID %% %" PRIu64 ") + (combinedID / %" PRIu64 ") * sharedStride;\n", sc->localSize[0], sc->localSize[0]);
13724 res = VkAppendLine(sc);
13725 if (res != VKFFT_SUCCESS) return res;
13726 if (sc->outputBufferBlockNum == 1)
13727 sc->tempLen = sprintf(sc->tempStr, " %s[inoutID] = %s(sdata[sdataID].x*mult.x -sdata[sdataID].y*mult.y)%s;\n", outputsStruct, convTypeLeft, convTypeRight);
13728 else
13729 sc->tempLen = sprintf(sc->tempStr, " outputBlocks[inoutID / %" PRIu64 "]%s[inoutID %% %" PRIu64 "] = %s(sdata[sdataID].x*mult.x - sdata[sdataID].y*mult.y)%s;\n", sc->outputBufferBlockSize, outputsStruct, sc->outputBufferBlockSize, convTypeLeft, convTypeRight);
13730 res = VkAppendLine(sc);
13731 if (res != VKFFT_SUCCESS) return res;
13732 if (sc->zeropad[1]) {
13733 sc->tempLen = sprintf(sc->tempStr, " }\n");
13734 res = VkAppendLine(sc);
13735 if (res != VKFFT_SUCCESS) return res;
13736 }
13737 sc->tempLen = sprintf(sc->tempStr, " if((combinedID/ %" PRIu64 ")> 0){\n", sc->localSize[0]);
13738 res = VkAppendLine(sc);
13739 if (res != VKFFT_SUCCESS) return res;
13740 sc->tempLen = sprintf(sc->tempStr, " %s = (%" PRIu64 " - combinedID / %" PRIu64 ") * %" PRIu64 " + %s%s;\n", sc->inoutID, sc->fftDim, sc->localSize[0], sc->outputStride[1], sc->gl_GlobalInvocationID_x, shiftX);
13741 res = VkAppendLine(sc);
13742 if (res != VKFFT_SUCCESS) return res;
13743 sc->tempLen = sprintf(sc->tempStr, " %s = ", sc->inoutID);
13744 res = VkAppendLine(sc);
13745 if (res != VKFFT_SUCCESS) return res;
13746 res = indexOutputVkFFT(sc, uintType, writeType, sc->inoutID, 0, requestCoordinate, requestBatch);
13747 sc->tempLen = sprintf(sc->tempStr, ";\n");
13748 res = VkAppendLine(sc);
13749 if (res != VKFFT_SUCCESS) return res;
13750 if (sc->zeropad[1]) {
13751 sc->tempLen = sprintf(sc->tempStr, " if(( (%" PRIu64 " - combinedID / %" PRIu64 ") %% %" PRIu64 " < %" PRIu64 ")||( (%" PRIu64 " - combinedID / %" PRIu64 ") %% %" PRIu64 " >= %" PRIu64 ")){\n", sc->fftDim, sc->localSize[0], sc->fft_dim_full, sc->fft_zeropad_left_read[sc->axis_id], sc->fftDim, sc->localSize[0], sc->fft_dim_full, sc->fft_zeropad_right_read[sc->axis_id]);
13752 res = VkAppendLine(sc);
13753 if (res != VKFFT_SUCCESS) return res;
13754 }
13755 if (sc->outputBufferBlockNum == 1)
13756 sc->tempLen = sprintf(sc->tempStr, " %s[inoutID] = -%s(sdata[sdataID].y*mult.x +sdata[sdataID].x*mult.y)%s;\n", outputsStruct, convTypeLeft, convTypeRight);
13757 else
13758 sc->tempLen = sprintf(sc->tempStr, " outputBlocks[inoutID / %" PRIu64 "]%s[inoutID %% %" PRIu64 "] = -%s(sdata[sdataID].y*mult.x + sdata[sdataID].x*mult.y)%s;\n", sc->outputBufferBlockSize, outputsStruct, sc->outputBufferBlockSize, convTypeLeft, convTypeRight);
13759 res = VkAppendLine(sc);
13760 if (res != VKFFT_SUCCESS) return res;
13761 if (sc->zeropad[1]) {
13762 sc->tempLen = sprintf(sc->tempStr, " }\n");
13763 res = VkAppendLine(sc);
13764 if (res != VKFFT_SUCCESS) return res;
13765 }
13766 sc->tempLen = sprintf(sc->tempStr, " }\n");
13767 res = VkAppendLine(sc);
13768 if (res != VKFFT_SUCCESS) return res;
13769 }
13771 if (res != VKFFT_SUCCESS) return res;
13772 if ((1 + i + k * num_out) * sc->localSize[0] * sc->localSize[1] >= (sc->fftDim / 2 + 1) * sc->localSize[0]) {
13773 sc->tempLen = sprintf(sc->tempStr, " }\n");
13774 res = VkAppendLine(sc);
13775 if (res != VKFFT_SUCCESS) return res;
13776 }
13777 if (sc->size[0] % sc->localSize[0] != 0) {
13778 sc->tempLen = sprintf(sc->tempStr, " }\n");
13779 res = VkAppendLine(sc);
13780 if (res != VKFFT_SUCCESS) return res;
13781 }
13782 }
13783 }
13784 if (sc->zeropadBluestein[1]) sc->fftDim = sc->fft_dim_full;
13785 }
13786 else {
13787
13788 }
13789 }
13790 break;
13791 }
13792 case 130://DCT-III nonstrided
13793 {
13794 if (!sc->writeFromRegisters) {
13795 res = appendBarrierVkFFT(sc, 1);
13796 if (res != VKFFT_SUCCESS) return res;
13797 }
13798 //res = appendZeropadStart(sc);
13799 //if (res != VKFFT_SUCCESS) return res;
13800 char shiftX[500] = "";
13801 if (sc->performWorkGroupShift[0])
13802 sprintf(shiftX, " + consts.workGroupShiftX ");
13803 char shiftY[500] = "";
13804 if (sc->performWorkGroupShift[1])
13805 sprintf(shiftY, " + consts.workGroupShiftY*%s ", sc->gl_WorkGroupSize_y);
13806 char shiftY2[500] = "";
13807 if (sc->performWorkGroupShift[1])
13808 sprintf(shiftY2, " + consts.workGroupShiftY ");
13809 uint64_t mult = (sc->mergeSequencesR2C) ? 2 : 1;
13810 if (sc->reorderFourStep) {
13811 //Not implemented
13812 }
13813 else {
13814 //appendBarrierVkFFT(sc, 1);
13815 //appendZeropadStart(sc);
13816 if (sc->fftDim == sc->fft_dim_full) {
13818 uint64_t maxBluesteinCutOff = 1;
13819 if (sc->zeropadBluestein[1]) {
13820 if (sc->axisSwapped)
13821 maxBluesteinCutOff = sc->fftDim * sc->localSize[0];
13822 else
13823 maxBluesteinCutOff = sc->fftDim * sc->localSize[1];
13824 }
13825 for (uint64_t k = 0; k < sc->registerBoost; k++) {
13826 //num_out = (uint64_t)ceil(num_out / (double)sc->min_registers_per_thread);
13827 for (uint64_t i = 0; i < sc->min_registers_per_thread; i++) {
13828 if (sc->localSize[1] == 1)
13829 sc->tempLen = sprintf(sc->tempStr, " combinedID = %s + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, (i + k * sc->min_registers_per_thread) * sc->localSize[0]);
13830 else
13831 sc->tempLen = sprintf(sc->tempStr, " combinedID = (%s + %" PRIu64 " * %s) + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, sc->localSize[0], sc->gl_LocalInvocationID_y, (i + k * sc->min_registers_per_thread) * sc->localSize[0] * sc->localSize[1]);
13832 res = VkAppendLine(sc);
13833 if (res != VKFFT_SUCCESS) return res;
13834 if (sc->zeropadBluestein[1]) {
13835 sc->tempLen = sprintf(sc->tempStr, " if(combinedID < %" PRIu64 "){\n", maxBluesteinCutOff);
13836 res = VkAppendLine(sc);
13837 if (res != VKFFT_SUCCESS) return res;
13838 }
13839 if (!sc->axisSwapped) {
13840 sc->tempLen = sprintf(sc->tempStr, " %s = combinedID %% %" PRIu64 " + ((combinedID/%" PRIu64 ") * %" PRIu64 ");\n", sc->inoutID, sc->fftDim, sc->fftDim, mult * sc->outputStride[1]);
13841 res = VkAppendLine(sc);
13842 if (res != VKFFT_SUCCESS) return res;
13843 }
13844 else {
13845 sc->tempLen = sprintf(sc->tempStr, " %s = combinedID %% %" PRIu64 " + ((combinedID/%" PRIu64 ") * %" PRIu64 ");\n", sc->inoutID, sc->fftDim, sc->fftDim, mult * sc->outputStride[1]);
13846 res = VkAppendLine(sc);
13847 if (res != VKFFT_SUCCESS) return res;
13848 }
13849 if (sc->axisSwapped) {
13850 if (sc->size[sc->axis_id + 1] % sc->localSize[0] != 0) {
13851 sc->tempLen = sprintf(sc->tempStr, " if(combinedID / %" PRIu64 " + %s*%" PRIu64 "< %" PRIu64 "){\n", sc->fftDim, sc->gl_WorkGroupID_y, sc->localSize[0], (uint64_t)ceil(sc->size[1] / (double)mult));
13852 res = VkAppendLine(sc);
13853 if (res != VKFFT_SUCCESS) return res;
13854 }
13855
13856 }
13857 else {
13858 if (sc->size[sc->axis_id + 1] % sc->localSize[1] != 0) {
13859 sc->tempLen = sprintf(sc->tempStr, " if(combinedID / %" PRIu64 " + %s*%" PRIu64 "< %" PRIu64 "){\n", sc->fftDim, sc->gl_WorkGroupID_y, sc->localSize[1], (uint64_t)ceil(sc->size[1] / (double)mult));
13860 res = VkAppendLine(sc);
13861 if (res != VKFFT_SUCCESS) return res;
13862 }
13863
13864 }
13865 if (sc->zeropad[1]) {
13866 sc->tempLen = sprintf(sc->tempStr, " if((inoutID %% %" PRIu64 " < %" PRIu64 ")||(inoutID %% %" PRIu64 " >= %" PRIu64 ")){\n", sc->outputStride[1], sc->fft_zeropad_left_write[sc->axis_id], sc->outputStride[1], sc->fft_zeropad_right_write[sc->axis_id]);
13867 res = VkAppendLine(sc);
13868 if (res != VKFFT_SUCCESS) return res;
13869 }
13870 sc->tempLen = sprintf(sc->tempStr, " %s = ", sc->inoutID);
13871 res = VkAppendLine(sc);
13872 if (res != VKFFT_SUCCESS) return res;
13873 res = indexOutputVkFFT(sc, uintType, writeType, sc->inoutID, 0, requestCoordinate, requestBatch);
13874 sc->tempLen = sprintf(sc->tempStr, ";\n");
13875 res = VkAppendLine(sc);
13876 if (res != VKFFT_SUCCESS) return res;
13878 if (res != VKFFT_SUCCESS) return res;
13879 if (sc->mergeSequencesR2C) {
13880 if (sc->axisSwapped) {
13881 sc->tempLen = sprintf(sc->tempStr, " sdataID = (((combinedID %% %" PRIu64 ") %% 2) * %" PRIu64 " + (1-2*((combinedID %% %" PRIu64 ") %% 2)) * ((combinedID %% %" PRIu64 ")/2)) * sharedStride + (combinedID / %" PRIu64 ");\n", sc->fftDim, sc->fftDim - 1, sc->fftDim, sc->fftDim, sc->fftDim);
13882 res = VkAppendLine(sc);
13883 if (res != VKFFT_SUCCESS) return res;
13884 }
13885 else {
13886 sc->tempLen = sprintf(sc->tempStr, " sdataID = (((combinedID %% %" PRIu64 ") %% 2) * %" PRIu64 " + (1-2*((combinedID %% %" PRIu64 ") %% 2)) * ((combinedID %% %" PRIu64 ")/2)) + (combinedID / %" PRIu64 ")* sharedStride;\n", sc->fftDim, sc->fftDim - 1, sc->fftDim, sc->fftDim, sc->fftDim);
13887 res = VkAppendLine(sc);
13888 if (res != VKFFT_SUCCESS) return res;
13889 }
13890
13891 if (sc->outputBufferBlockNum == 1)
13892 sc->tempLen = sprintf(sc->tempStr, " %s[%s] = %s(sdata[sdataID].x)%s;\n", outputsStruct, sc->inoutID, convTypeLeft, convTypeRight);
13893 else
13894 sc->tempLen = sprintf(sc->tempStr, " outputBlocks[%s / %" PRIu64 "]%s[%s %% %" PRIu64 "] = %s(sdata[sdataID].x)%s;\n", sc->inoutID, sc->outputBufferBlockSize, outputsStruct, sc->inoutID, sc->outputBufferBlockSize, convTypeLeft, convTypeRight);
13895 res = VkAppendLine(sc);
13896 if (res != VKFFT_SUCCESS) return res;
13897 sc->tempLen = sprintf(sc->tempStr, " %s = %s + %" PRIu64 ";\n", sc->inoutID, sc->inoutID, sc->outputStride[1]);
13898 res = VkAppendLine(sc);
13899 if (res != VKFFT_SUCCESS) return res;
13900 if (sc->outputBufferBlockNum == 1)
13901 sc->tempLen = sprintf(sc->tempStr, " %s[%s] = %s(sdata[sdataID].y)%s;\n", outputsStruct, sc->inoutID, convTypeLeft, convTypeRight);
13902 else
13903 sc->tempLen = sprintf(sc->tempStr, " outputBlocks[%s / %" PRIu64 "]%s[%s %% %" PRIu64 "] = %s(sdata[sdataID].y)%s;\n", sc->inoutID, sc->outputBufferBlockSize, outputsStruct, sc->inoutID, sc->outputBufferBlockSize, convTypeLeft, convTypeRight);
13904 res = VkAppendLine(sc);
13905 if (res != VKFFT_SUCCESS) return res;
13906 }
13907 else {
13908 if (!sc->axisSwapped) {
13909 sc->tempLen = sprintf(sc->tempStr, " sdataID = (((combinedID %% %" PRIu64 ") %% 2) * %" PRIu64 " + (1-2*((combinedID %% %" PRIu64 ") %% 2)) * ((combinedID %% %" PRIu64 ")/2)) + (combinedID / %" PRIu64 ") * sharedStride;\n", sc->fftDim, sc->fftDim - 1, sc->fftDim, sc->fftDim, sc->fftDim);
13910 res = VkAppendLine(sc);
13911 if (res != VKFFT_SUCCESS) return res;
13912 if (sc->outputBufferBlockNum == 1)
13913 sc->tempLen = sprintf(sc->tempStr, " %s[inoutID] = %s(sdata[sdataID].x)%s;\n", outputsStruct, convTypeLeft, convTypeRight);
13914 else
13915 sc->tempLen = sprintf(sc->tempStr, " outputBlocks[inoutID / %" PRIu64 "]%s[inoutID %% %" PRIu64 "] = %s(sdata[sdataID].x)%s;\n", sc->outputBufferBlockSize, outputsStruct, sc->outputBufferBlockSize, convTypeLeft, convTypeRight);
13916 res = VkAppendLine(sc);
13917 if (res != VKFFT_SUCCESS) return res;
13918 }
13919 else {
13920 sc->tempLen = sprintf(sc->tempStr, " sdataID = (((combinedID %% %" PRIu64 ") %% 2) * %" PRIu64 " + (1-2*((combinedID %% %" PRIu64 ") %% 2)) * ((combinedID %% %" PRIu64 ")/2)) * sharedStride + (combinedID / %" PRIu64 ");\n", sc->fftDim, sc->fftDim - 1, sc->fftDim, sc->fftDim, sc->fftDim);
13921 res = VkAppendLine(sc);
13922 if (res != VKFFT_SUCCESS) return res;
13923 if (sc->outputBufferBlockNum == 1)
13924 sc->tempLen = sprintf(sc->tempStr, " %s[inoutID] = %s(sdata[sdataID].x)%s;\n", outputsStruct, convTypeLeft, convTypeRight);
13925 else
13926 sc->tempLen = sprintf(sc->tempStr, " outputBlocks[inoutID / %" PRIu64 "]%s[inoutID %% %" PRIu64 "] = %s(sdata[sdataID].x)%s;\n", sc->outputBufferBlockSize, outputsStruct, sc->outputBufferBlockSize, convTypeLeft, convTypeRight);
13927 res = VkAppendLine(sc);
13928 if (res != VKFFT_SUCCESS) return res;
13929 }
13930 }
13932 if (res != VKFFT_SUCCESS) return res;
13933 if (sc->zeropad[1]) {
13934 sc->tempLen = sprintf(sc->tempStr, " }\n");
13935 res = VkAppendLine(sc);
13936 if (res != VKFFT_SUCCESS) return res;
13937 }
13938 if (sc->axisSwapped) {
13939 if (sc->size[sc->axis_id + 1] % sc->localSize[0] != 0) {
13940 sc->tempLen = sprintf(sc->tempStr, " }\n");
13941 res = VkAppendLine(sc);
13942 if (res != VKFFT_SUCCESS) return res;
13943 }
13944 }
13945 else {
13946 if (sc->size[sc->axis_id + 1] % sc->localSize[1] != 0) {
13947 sc->tempLen = sprintf(sc->tempStr, " }\n");
13948 res = VkAppendLine(sc);
13949 if (res != VKFFT_SUCCESS) return res;
13950 }
13951 }
13952 if (sc->zeropadBluestein[1]) {
13953 sc->tempLen = sprintf(sc->tempStr, " }\n");
13954 res = VkAppendLine(sc);
13955 if (res != VKFFT_SUCCESS) return res;
13956 }
13957 }
13958 }
13959 if (sc->zeropadBluestein[1]) sc->fftDim = sc->fft_dim_full;
13960
13961 }
13962 else {
13963
13964 }
13965 }
13966 break;
13967 }
13968 case 131://DCT-III strided
13969 {
13970 if (!sc->writeFromRegisters) {
13971 res = appendBarrierVkFFT(sc, 1);
13972 if (res != VKFFT_SUCCESS) return res;
13973 }
13974 //res = appendZeropadStart(sc);
13975 //if (res != VKFFT_SUCCESS) return res;
13976 char shiftX[500] = "";
13977 if (sc->performWorkGroupShift[0])
13978 sprintf(shiftX, " + consts.workGroupShiftX*%s ", sc->gl_WorkGroupSize_x);
13979 char shiftY[500] = "";
13980 if (sc->performWorkGroupShift[1])
13981 sprintf(shiftY, " + consts.workGroupShiftY*%s ", sc->gl_WorkGroupSize_y);
13982 char shiftY2[500] = "";
13983 if (sc->performWorkGroupShift[1])
13984 sprintf(shiftY2, " + consts.workGroupShiftY ");
13985 //uint64_t mult = (sc->mergeSequencesR2C) ? 2 : 1;
13986 if (sc->reorderFourStep) {
13987 //Not implemented
13988 }
13989 else {
13990 //appendBarrierVkFFT(sc, 1);
13991 //appendZeropadStart(sc);
13992 if (sc->fftDim == sc->fft_dim_full) {
13994 for (uint64_t k = 0; k < sc->registerBoost; k++) {
13995 if (sc->mergeSequencesR2C) {
13996 sc->tempLen = sprintf(sc->tempStr, "\
13997 if (%s==0)\n\
13998 {\n\
13999 sdata[%s + %" PRIu64 "* sharedStride] = sdata[%s];\n\
14001 res = VkAppendLine(sc);
14002 if (res != VKFFT_SUCCESS) return res;
14003 //res = appendZeropadEnd(sc);
14004 //if (res != VKFFT_SUCCESS) return res;
14005 res = appendBarrierVkFFT(sc, 1);
14006 if (res != VKFFT_SUCCESS) return res;
14007 //res = appendZeropadStart(sc);
14008 //if (res != VKFFT_SUCCESS) return res;
14009 }
14010 //num_out = (uint64_t)ceil(num_out / (double)sc->min_registers_per_thread);
14011 for (uint64_t i = 0; i < sc->min_registers_per_thread; i++) {
14012 sc->tempLen = sprintf(sc->tempStr, " combinedID = (%s + %" PRIu64 " * %s) + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, sc->localSize[0], sc->gl_LocalInvocationID_y, (i + k * sc->min_registers_per_thread) * sc->localSize[0] * sc->localSize[1]);
14013 res = VkAppendLine(sc);
14014 if (res != VKFFT_SUCCESS) return res;
14015 if (sc->zeropadBluestein[1]) {
14016 sc->tempLen = sprintf(sc->tempStr, " if(combinedID < %" PRIu64 "){\n", sc->fftDim * sc->localSize[0]);
14017 res = VkAppendLine(sc);
14018 if (res != VKFFT_SUCCESS) return res;
14019 }
14020 sc->tempLen = sprintf(sc->tempStr, " %s = %s%s + ((combinedID/%" PRIu64 ") * %" PRIu64 ");\n", sc->inoutID, sc->gl_GlobalInvocationID_x, shiftX, sc->localSize[0], sc->outputStride[1]);
14021 res = VkAppendLine(sc);
14022 if (res != VKFFT_SUCCESS) return res;
14023 if (sc->size[0] % sc->localSize[0] != 0) {
14024 sc->tempLen = sprintf(sc->tempStr, " if(combinedID %% %" PRIu64 " + %s*%" PRIu64 "< %" PRIu64 "){\n", sc->localSize[0], sc->gl_WorkGroupID_x, sc->localSize[0], sc->size[0]);
14025 res = VkAppendLine(sc);
14026 if (res != VKFFT_SUCCESS) return res;
14027 }
14028 /*if ((1 + i + k * num_out) * sc->localSize[0] * sc->localSize[1] >= mult * (sc->fftDim / 2 + 1) * sc->localSize[0]) {
14029 sc->tempLen = sprintf(sc->tempStr, " if(combinedID < %" PRIu64 "){\n", mult * (sc->fftDim / 2 + 1) * sc->localSize[0]);
14030 res = VkAppendLine(sc);
14031 if (res != VKFFT_SUCCESS) return res;
14032 }*/
14033 if (sc->zeropad[1]) {
14034 sc->tempLen = sprintf(sc->tempStr, " if(((combinedID/%" PRIu64 ") %% %" PRIu64 " < %" PRIu64 ")||((combinedID/%" PRIu64 ") %% %" PRIu64 " >= %" PRIu64 ")){\n", sc->localSize[0], sc->fft_dim_full, sc->fft_zeropad_left_read[sc->axis_id], sc->localSize[0], sc->fft_dim_full, sc->fft_zeropad_right_read[sc->axis_id]);
14035 res = VkAppendLine(sc);
14036 if (res != VKFFT_SUCCESS) return res;
14037 }
14038 sc->tempLen = sprintf(sc->tempStr, " %s = ", sc->inoutID);
14039 res = VkAppendLine(sc);
14040 if (res != VKFFT_SUCCESS) return res;
14041 res = indexOutputVkFFT(sc, uintType, writeType, sc->inoutID, 0, requestCoordinate, requestBatch);
14042 sc->tempLen = sprintf(sc->tempStr, ";\n");
14043 res = VkAppendLine(sc);
14044 if (res != VKFFT_SUCCESS) return res;
14046 if (res != VKFFT_SUCCESS) return res;
14047 if (sc->mergeSequencesR2C) {
14048 sc->tempLen = sprintf(sc->tempStr, " %s.x = 0.5%s*(sdata[(combinedID / %" PRIu64 ")* sharedStride + (combinedID %% %" PRIu64 ")].x+sdata[(%" PRIu64 "-combinedID / %" PRIu64 ")* sharedStride + (combinedID %% %" PRIu64 ")].x);\n", sc->regIDs[0], LFending, sc->localSize[0], sc->localSize[0], sc->fftDim, sc->localSize[0], sc->localSize[0]);
14049 res = VkAppendLine(sc);
14050 if (res != VKFFT_SUCCESS) return res;
14051 sc->tempLen = sprintf(sc->tempStr, " %s.y = 0.5%s*(sdata[(combinedID / %" PRIu64 ")* sharedStride + (combinedID %% %" PRIu64 ")].y-sdata[(%" PRIu64 "-combinedID / %" PRIu64 ")* sharedStride + (combinedID %% %" PRIu64 ")].y);\n", sc->regIDs[0], LFending, sc->localSize[0], sc->localSize[0], sc->fftDim, sc->localSize[0], sc->localSize[0]);
14052 res = VkAppendLine(sc);
14053 if (res != VKFFT_SUCCESS) return res;
14054 if (sc->outputBufferBlockNum == 1)
14055 sc->tempLen = sprintf(sc->tempStr, " %s[%s] = %s(%s.x*mult.x-%s.y*mult.y)%s;\n", outputsStruct, sc->inoutID, convTypeLeft, sc->regIDs[0], sc->regIDs[0], convTypeRight);
14056 else
14057 sc->tempLen = sprintf(sc->tempStr, " outputBlocks[%s / %" PRIu64 "]%s[%s %% %" PRIu64 "] = %s(%s.x*mult.x-%s.y*mult.y)%s;\n", sc->inoutID, sc->outputBufferBlockSize, outputsStruct, sc->inoutID, sc->outputBufferBlockSize, convTypeLeft, sc->regIDs[0], sc->regIDs[0], convTypeRight);
14058 res = VkAppendLine(sc);
14059 if (res != VKFFT_SUCCESS) return res;
14060
14061
14062 sc->tempLen = sprintf(sc->tempStr, " %s.x = 0.5%s*(sdata[(combinedID / %" PRIu64 ")* sharedStride + (combinedID %% %" PRIu64 ")].y+sdata[(%" PRIu64 "-combinedID / %" PRIu64 ")* sharedStride + (combinedID %% %" PRIu64 ")].y);\n", sc->regIDs[1], LFending, sc->localSize[0], sc->localSize[0], sc->fftDim, sc->localSize[0], sc->localSize[0]);
14063 res = VkAppendLine(sc);
14064 if (res != VKFFT_SUCCESS) return res;
14065 sc->tempLen = sprintf(sc->tempStr, " %s.y = 0.5%s*(-sdata[(combinedID / %" PRIu64 ")* sharedStride + (combinedID %% %" PRIu64 ")].x+sdata[(%" PRIu64 "-combinedID / %" PRIu64 ")* sharedStride + (combinedID %% %" PRIu64 ")].x);\n", sc->regIDs[1], LFending, sc->localSize[0], sc->localSize[0], sc->fftDim, sc->localSize[0], sc->localSize[0]);
14066 res = VkAppendLine(sc);
14067 if (res != VKFFT_SUCCESS) return res;
14068
14069 if (sc->outputBufferBlockNum == 1)
14070 sc->tempLen = sprintf(sc->tempStr, " %s[%s+%" PRIu64 "] = %s(%s.x*mult.x-%s.y*mult.y)%s;\n", outputsStruct, sc->inoutID, sc->outputStride[1], convTypeLeft, sc->regIDs[1], sc->regIDs[1], convTypeRight);
14071 else
14072 sc->tempLen = sprintf(sc->tempStr, " outputBlocks[(%s %" PRIu64 ")/ %" PRIu64 "]%s[(%s+%" PRIu64 ") %% %" PRIu64 "] = %s(%s.x*mult.x-%s.y*mult.y)%s;\n", sc->inoutID, sc->outputStride[1], sc->outputBufferBlockSize, outputsStruct, sc->inoutID, sc->outputStride[1], sc->outputBufferBlockSize, convTypeLeft, sc->regIDs[1], sc->regIDs[1], convTypeRight);
14073 res = VkAppendLine(sc);
14074 if (res != VKFFT_SUCCESS) return res;
14075 sc->tempLen = sprintf(sc->tempStr, " if(combinedID / %" PRIu64 " > 0){\n", sc->localSize[0]);
14076 res = VkAppendLine(sc);
14077 if (res != VKFFT_SUCCESS) return res;
14078 sc->tempLen = sprintf(sc->tempStr, " %s = (%" PRIu64 " - combinedID / %" PRIu64 ") + %s%s * %" PRIu64 ";\n", sc->inoutID, sc->fftDim, sc->localSize[0], sc->gl_GlobalInvocationID_x, shiftX, 2 * sc->outputStride[1]);
14079 res = VkAppendLine(sc);
14080 if (res != VKFFT_SUCCESS) return res;
14081
14082 if (sc->outputBufferBlockNum == 1)
14083 sc->tempLen = sprintf(sc->tempStr, " %s[%s] = -%s(%s.y*mult.x+%s.x*mult.y)%s;\n", outputsStruct, sc->inoutID, convTypeLeft, sc->regIDs[0], sc->regIDs[0], convTypeRight);
14084 else
14085 sc->tempLen = sprintf(sc->tempStr, " outputBlocks[%s / %" PRIu64 "]%s[%s %% %" PRIu64 "] = -%s(%s.y*mult.x+%s.x*mult.y)%s;\n", sc->inoutID, sc->outputBufferBlockSize, outputsStruct, sc->inoutID, sc->outputBufferBlockSize, convTypeLeft, sc->regIDs[0], sc->regIDs[0], convTypeRight);
14086 res = VkAppendLine(sc);
14087 if (res != VKFFT_SUCCESS) return res;
14088 if (sc->outputBufferBlockNum == 1)
14089 sc->tempLen = sprintf(sc->tempStr, " %s[%s+%" PRIu64 "] = -%s(%s.y*mult.x+%s.x*mult.y)%s;\n", outputsStruct, sc->inoutID, sc->outputStride[1], convTypeLeft, sc->regIDs[1], sc->regIDs[1], convTypeRight);
14090 else
14091 sc->tempLen = sprintf(sc->tempStr, " outputBlocks[(%s %" PRIu64 ")/ %" PRIu64 "]%s[(%s+%" PRIu64 ") %% %" PRIu64 "] = -%s(%s.y*mult.x+%s.x*mult.y)%s;\n", sc->inoutID, sc->outputStride[1], sc->outputBufferBlockSize, outputsStruct, sc->inoutID, sc->outputStride[1], sc->outputBufferBlockSize, convTypeLeft, sc->regIDs[1], sc->regIDs[1], convTypeRight);
14092 res = VkAppendLine(sc);
14093 if (res != VKFFT_SUCCESS) return res;
14094 sc->tempLen = sprintf(sc->tempStr, " }\n");
14095 res = VkAppendLine(sc);
14096 if (res != VKFFT_SUCCESS) return res;
14097 }
14098 else {
14099 sc->tempLen = sprintf(sc->tempStr, " sdataID = (((combinedID / %" PRIu64 ") %% 2) * %" PRIu64 " + (1-2*((combinedID / %" PRIu64 ") %% 2)) * ((combinedID / %" PRIu64 ")/2)) * sharedStride + (combinedID %% %" PRIu64 ");\n", sc->localSize[0], sc->fftDim - 1, sc->localSize[0], sc->localSize[0], sc->localSize[0]);
14100 res = VkAppendLine(sc);
14101 if (res != VKFFT_SUCCESS) return res;
14102 if (sc->outputBufferBlockNum == 1)
14103 sc->tempLen = sprintf(sc->tempStr, " %s[inoutID] = %s(sdata[sdataID].x)%s;\n", outputsStruct, convTypeLeft, convTypeRight);
14104 else
14105 sc->tempLen = sprintf(sc->tempStr, " outputBlocks[inoutID / %" PRIu64 "]%s[inoutID %% %" PRIu64 "] = %s(sdata[sdataID].x)%s;\n", sc->outputBufferBlockSize, outputsStruct, sc->outputBufferBlockSize, convTypeLeft, convTypeRight);
14106 res = VkAppendLine(sc);
14107 if (res != VKFFT_SUCCESS) return res;
14108 }
14110 if (res != VKFFT_SUCCESS) return res;
14111 if (sc->zeropad[1]) {
14112 sc->tempLen = sprintf(sc->tempStr, " }\n");
14113 res = VkAppendLine(sc);
14114 if (res != VKFFT_SUCCESS) return res;
14115 }
14116 /*if ((1 + i + k * num_out) * sc->localSize[0] * sc->localSize[1] >= mult * (sc->fftDim / 2 + 1) * sc->localSize[0]) {
14117 sc->tempLen = sprintf(sc->tempStr, " }\n");
14118 res = VkAppendLine(sc);
14119 if (res != VKFFT_SUCCESS) return res;
14120 }*/
14121 if (sc->size[0] % sc->localSize[0] != 0) {
14122 sc->tempLen = sprintf(sc->tempStr, " }\n");
14123 res = VkAppendLine(sc);
14124 if (res != VKFFT_SUCCESS) return res;
14125 }
14126 if (sc->zeropadBluestein[1]) {
14127 sc->tempLen = sprintf(sc->tempStr, " }\n");
14128 res = VkAppendLine(sc);
14129 if (res != VKFFT_SUCCESS) return res;
14130 }
14131 }
14132 }
14133 if (sc->zeropadBluestein[1]) sc->fftDim = sc->fft_dim_full;
14134 }
14135 else {
14136
14137 }
14138 }
14139 break;
14140 }
14141 case 140: //DCT-IV nonstrided as 8N DFT
14142 {
14143 if (!sc->writeFromRegisters) {
14144 res = appendBarrierVkFFT(sc, 1);
14145 if (res != VKFFT_SUCCESS) return res;
14146 }
14147 res = appendZeropadStart(sc);
14148 if (res != VKFFT_SUCCESS) return res;
14149 char shiftX[500] = "";
14150 if (sc->performWorkGroupShift[0])
14151 sprintf(shiftX, " + consts.workGroupShiftX ");
14152 char shiftY[500] = "";
14153 if (sc->axisSwapped) {
14154 if (sc->performWorkGroupShift[1])
14155 sprintf(shiftY, " + consts.workGroupShiftY*%s ", sc->gl_WorkGroupSize_x);
14156 }
14157 else {
14158 if (sc->performWorkGroupShift[1])
14159 sprintf(shiftY, " + consts.workGroupShiftY*%s ", sc->gl_WorkGroupSize_y);
14160 }
14161
14162 char shiftY2[100] = "";
14163 if (sc->performWorkGroupShift[1])
14164 sprintf(shiftY, " + consts.workGroupShiftY ");
14165 if (sc->fftDim < sc->fft_dim_full) {
14166 if (sc->axisSwapped) {
14167 if (!sc->reorderFourStep) {
14168 sc->tempLen = sprintf(sc->tempStr, " if((%s+%" PRIu64 "*%s)< numActiveThreads) {\n", sc->gl_LocalInvocationID_x, sc->localSize[0], sc->gl_LocalInvocationID_y);
14169 res = VkAppendLine(sc);
14170 if (res != VKFFT_SUCCESS) return res;
14171 }
14172 else {
14173 sc->tempLen = sprintf(sc->tempStr, " if (((%s + %" PRIu64 " * %s) %% %" PRIu64 " + ((%s%s) / %" PRIu64 ")*%" PRIu64 " < %" PRIu64 ")){\n", sc->gl_LocalInvocationID_x, sc->localSize[0], sc->gl_LocalInvocationID_y, sc->localSize[0], sc->gl_WorkGroupID_x, shiftX, sc->firstStageStartSize / sc->fftDim, sc->localSize[0], sc->fft_dim_full / sc->firstStageStartSize);
14174 res = VkAppendLine(sc);
14175 if (res != VKFFT_SUCCESS) return res;
14176 }
14177 }
14178 else {
14179 sc->tempLen = sprintf(sc->tempStr, " if (((%s + %" PRIu64 " * %s) %% %" PRIu64 " + ((%s%s) / %" PRIu64 ")*%" PRIu64 " < %" PRIu64 ")){\n", sc->gl_LocalInvocationID_x, sc->localSize[0], sc->gl_LocalInvocationID_y, sc->localSize[1], sc->gl_WorkGroupID_x, shiftX, sc->firstStageStartSize / sc->fftDim, sc->localSize[1], sc->fft_dim_full / sc->firstStageStartSize);
14180 res = VkAppendLine(sc);
14181 if (res != VKFFT_SUCCESS) return res;
14182 }
14183 }
14184 else {
14185 sc->tempLen = sprintf(sc->tempStr, " { \n");
14186 res = VkAppendLine(sc);
14187 if (res != VKFFT_SUCCESS) return res;
14188 }
14189 //if (sc->reorderFourStep) {
14190 if (sc->fftDim == sc->fft_dim_full) {
14191 for (uint64_t k = 0; k < sc->registerBoost; k++) {
14192 for (uint64_t i = 0; i < (uint64_t)ceil(sc->min_registers_per_thread / 8.0); i++) {
14193 if (sc->localSize[1] == 1)
14194 sc->tempLen = sprintf(sc->tempStr, " combinedID = %s + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, (i + k * sc->min_registers_per_thread) * sc->localSize[0]);
14195 else
14196 sc->tempLen = sprintf(sc->tempStr, " combinedID = (%s + %" PRIu64 " * %s) + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, sc->localSize[0], sc->gl_LocalInvocationID_y, (i + k * sc->min_registers_per_thread) * sc->localSize[0] * sc->localSize[1]);
14197 res = VkAppendLine(sc);
14198 if (res != VKFFT_SUCCESS) return res;
14199
14200 if (sc->outputStride[0] > 1)
14201 sc->tempLen = sprintf(sc->tempStr, " inoutID = (combinedID %% %" PRIu64 ") * %" PRIu64 " + (combinedID / %" PRIu64 ") * %" PRIu64 ";\n", sc->fftDim / 8, sc->outputStride[0], sc->fftDim / 8, sc->outputStride[1]);
14202 else
14203 sc->tempLen = sprintf(sc->tempStr, " inoutID = (combinedID %% %" PRIu64 ") + (combinedID / %" PRIu64 ") * %" PRIu64 ";\n", sc->fftDim / 8, sc->fftDim / 8, sc->outputStride[1]);
14204 res = VkAppendLine(sc);
14205 if (res != VKFFT_SUCCESS) return res;
14206 if (sc->axisSwapped) {
14207 if (sc->size[sc->axis_id + 1] % sc->localSize[0] != 0) {
14208 sc->tempLen = sprintf(sc->tempStr, " if(combinedID / %" PRIu64 " + (%s%s)*%" PRIu64 "< %" PRIu64 "){", sc->fftDim / 8, sc->gl_WorkGroupID_y, shiftY2, sc->localSize[0], sc->size[sc->axis_id + 1]);
14209 res = VkAppendLine(sc);
14210 if (res != VKFFT_SUCCESS) return res;
14211 }
14212 sc->tempLen = sprintf(sc->tempStr, " if(combinedID < %" PRIu64 "){\n", sc->fftDim / 8 * sc->localSize[0]);
14213 res = VkAppendLine(sc);
14214 if (res != VKFFT_SUCCESS) return res;
14215 }
14216 else {
14217 if (sc->size[sc->axis_id + 1] % sc->localSize[1] != 0) {
14218 sc->tempLen = sprintf(sc->tempStr, " if(combinedID / %" PRIu64 " + (%s%s)*%" PRIu64 "< %" PRIu64 "){", sc->fftDim / 8, sc->gl_WorkGroupID_y, shiftY2, sc->localSize[1], sc->size[sc->axis_id + 1]);
14219 res = VkAppendLine(sc);
14220 if (res != VKFFT_SUCCESS) return res;
14221 }
14222 sc->tempLen = sprintf(sc->tempStr, " if(combinedID < %" PRIu64 "){\n", sc->fftDim / 8 * sc->localSize[1]);
14223 res = VkAppendLine(sc);
14224 if (res != VKFFT_SUCCESS) return res;
14225 }
14226 if (sc->zeropad[1]) {
14227 sc->tempLen = sprintf(sc->tempStr, " if((inoutID %% %" PRIu64 " < %" PRIu64 ")||(inoutID %% %" PRIu64 " >= %" PRIu64 ")){\n", sc->outputStride[1], sc->fft_zeropad_left_write[sc->axis_id], sc->outputStride[1], sc->fft_zeropad_right_write[sc->axis_id]);
14228 res = VkAppendLine(sc);
14229 if (res != VKFFT_SUCCESS) return res;
14230 }
14231 sc->tempLen = sprintf(sc->tempStr, " %s = ", sc->inoutID);
14232 res = VkAppendLine(sc);
14233 if (res != VKFFT_SUCCESS) return res;
14234 res = indexOutputVkFFT(sc, uintType, writeType, sc->inoutID, 0, requestCoordinate, requestBatch);
14235 if (res != VKFFT_SUCCESS) return res;
14236 sc->tempLen = sprintf(sc->tempStr, ";\n");
14237 res = VkAppendLine(sc);
14238 if (res != VKFFT_SUCCESS) return res;
14240 if (res != VKFFT_SUCCESS) return res;
14241 if (sc->writeFromRegisters) {
14242 if (sc->outputBufferBlockNum == 1)
14243 sc->tempLen = sprintf(sc->tempStr, " %s[%s] = %s%s%s;\n", outputsStruct, sc->inoutID, convTypeLeft, sc->regIDs[i + k * sc->registers_per_thread], convTypeRight);
14244 else
14245 sc->tempLen = sprintf(sc->tempStr, " outputBlocks[%s / %" PRIu64 "]%s[%s %% %" PRIu64 "] = %s%s%s;\n", sc->inoutID, sc->outputBufferBlockSize, outputsStruct, sc->inoutID, sc->outputBufferBlockSize, convTypeLeft, sc->regIDs[i + k * sc->registers_per_thread], convTypeRight);
14246 res = VkAppendLine(sc);
14247 if (res != VKFFT_SUCCESS) return res;
14248 }
14249 else {
14250 if (sc->axisSwapped) {
14251 if (sc->outputBufferBlockNum == 1)
14252 sc->tempLen = sprintf(sc->tempStr, " %s[%s] = %ssdata[(2*(combinedID %% %" PRIu64 ")+1) * sharedStride + (combinedID / %" PRIu64 ")].x/2%s;\n", outputsStruct, sc->inoutID, convTypeLeft, sc->fftDim / 8, sc->fftDim / 8, convTypeRight);
14253 else
14254 sc->tempLen = sprintf(sc->tempStr, " outputBlocks[%s / %" PRIu64 "]%s[%s %% %" PRIu64 "] = %ssdata[(2*(combinedID %% %" PRIu64 ")+1) * sharedStride + (combinedID / %" PRIu64 ")].x/2%s;\n", sc->inoutID, sc->outputBufferBlockSize, outputsStruct, sc->inoutID, sc->outputBufferBlockSize, convTypeLeft, sc->fftDim / 8, sc->fftDim / 8, convTypeRight);
14255 res = VkAppendLine(sc);
14256 if (res != VKFFT_SUCCESS) return res;
14257 }
14258 else {
14259 if (sc->outputBufferBlockNum == 1)
14260 sc->tempLen = sprintf(sc->tempStr, " %s[%s] = %ssdata[2*(combinedID %% %" PRIu64 ")+1 + (combinedID / %" PRIu64 ") * sharedStride].x/2%s;\n", outputsStruct, sc->inoutID, convTypeLeft, sc->fftDim / 8, sc->fftDim / 8, convTypeRight);
14261 else
14262 sc->tempLen = sprintf(sc->tempStr, " outputBlocks[%s / %" PRIu64 "]%s[%s %% %" PRIu64 "] = %ssdata[2*(combinedID %% %" PRIu64 ")+1 + (combinedID / %" PRIu64 ") * sharedStride].x/2%s;\n", sc->inoutID, sc->outputBufferBlockSize, outputsStruct, sc->inoutID, sc->outputBufferBlockSize, convTypeLeft, sc->fftDim / 8, sc->fftDim / 8, convTypeRight);
14263 res = VkAppendLine(sc);
14264 if (res != VKFFT_SUCCESS) return res;
14265 }
14266 }
14268 if (res != VKFFT_SUCCESS) return res;
14269 if (sc->zeropad[1]) {
14270 sc->tempLen = sprintf(sc->tempStr, " }\n");
14271 res = VkAppendLine(sc);
14272 if (res != VKFFT_SUCCESS) return res;
14273 }
14274 sc->tempLen = sprintf(sc->tempStr, " }\n");
14275 res = VkAppendLine(sc);
14276 if (res != VKFFT_SUCCESS) return res;
14277 if (sc->axisSwapped) {
14278 if (sc->size[sc->axis_id + 1] % sc->localSize[0] != 0) {
14279 sc->tempLen = sprintf(sc->tempStr, " }");
14280 res = VkAppendLine(sc);
14281 if (res != VKFFT_SUCCESS) return res;
14282 }
14283 }
14284 else {
14285 if (sc->size[sc->axis_id + 1] % sc->localSize[1] != 0) {
14286 sc->tempLen = sprintf(sc->tempStr, " }");
14287 res = VkAppendLine(sc);
14288 if (res != VKFFT_SUCCESS) return res;
14289 }
14290 }
14291 }
14292 }
14293 }
14294 /*else {
14295 for (uint64_t k = 0; k < sc->registerBoost; k++) {
14296 for (uint64_t i = 0; i < sc->min_registers_per_thread; i++) {
14297 if (sc->localSize[1] == 1)
14298 sc->tempLen = sprintf(sc->tempStr, " combinedID = %s + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, (i + k * sc->min_registers_per_thread) * sc->localSize[0]);
14299 else
14300 sc->tempLen = sprintf(sc->tempStr, " combinedID = (%s + %" PRIu64 " * %s) + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, sc->localSize[0], sc->gl_LocalInvocationID_y, (i + k * sc->min_registers_per_thread) * sc->localSize[0] * sc->localSize[1]);
14301 res = VkAppendLine(sc);
14302 if (res != VKFFT_SUCCESS) return res;
14303 if (sc->axisSwapped) {
14304 sc->tempLen = sprintf(sc->tempStr, " inoutID = combinedID %% %" PRIu64 " + ((%s%s) / %" PRIu64 ")*%" PRIu64 " + ((combinedID/%" PRIu64 ") * %" PRIu64 ")+ ((%s%s) %% %" PRIu64 ") * %" PRIu64 ";\n", sc->localSize[0], sc->gl_WorkGroupID_x, shiftX, sc->firstStageStartSize / sc->fftDim, sc->localSize[0], sc->localSize[0], sc->fft_dim_full / sc->fftDim, sc->gl_WorkGroupID_x, shiftX, sc->firstStageStartSize / sc->fftDim, sc->fft_dim_full / sc->firstStageStartSize);
14305 res = VkAppendLine(sc);
14306 if (res != VKFFT_SUCCESS) return res;
14307 }
14308 else {
14309 if (sc->localSize[1] == 1)
14310 sc->tempLen = sprintf(sc->tempStr, " inoutID = (%s%s)/%" PRIu64 "+ (combinedID * %" PRIu64 ")+ ((%s%s) %% %" PRIu64 ") * %" PRIu64 ";\n", sc->gl_WorkGroupID_x, shiftX, sc->firstStageStartSize / sc->fftDim, sc->fft_dim_full / sc->fftDim, sc->gl_WorkGroupID_x, shiftX, sc->firstStageStartSize / sc->fftDim, sc->fft_dim_full / sc->firstStageStartSize);
14311 else
14312 sc->tempLen = sprintf(sc->tempStr, " inoutID = combinedID %% %" PRIu64 " + ((%s%s) / %" PRIu64 ")*%" PRIu64 " + ((combinedID/%" PRIu64 ") * %" PRIu64 ")+ ((%s%s) %% %" PRIu64 ") * %" PRIu64 ";\n", sc->localSize[1], sc->gl_WorkGroupID_x, shiftX, sc->firstStageStartSize / sc->fftDim, sc->localSize[1], sc->localSize[1], sc->fft_dim_full / sc->fftDim, sc->gl_WorkGroupID_x, shiftX, sc->firstStageStartSize / sc->fftDim, sc->fft_dim_full / sc->firstStageStartSize);
14313 res = VkAppendLine(sc);
14314 if (res != VKFFT_SUCCESS) return res;
14315 }
14316 if (sc->zeropad[1]) {
14317 sc->tempLen = sprintf(sc->tempStr, " if((inoutID %% %" PRIu64 " < %" PRIu64 ")||(inoutID %% %" PRIu64 " >= %" PRIu64 ")){\n", sc->fft_dim_full, sc->fft_zeropad_left_write[sc->axis_id], sc->fft_dim_full, sc->fft_zeropad_right_write[sc->axis_id]);
14318 res = VkAppendLine(sc);
14319 if (res != VKFFT_SUCCESS) return res;
14320 }
14321 sc->tempLen = sprintf(sc->tempStr, " %s = ", sc->inoutID);
14322 res = VkAppendLine(sc);
14323 if (res != VKFFT_SUCCESS) return res;
14324 res = indexOutputVkFFT(sc, uintType, writeType, sc->inoutID, 0, requestCoordinate, requestBatch);
14325 if (res != VKFFT_SUCCESS) return res;
14326 sc->tempLen = sprintf(sc->tempStr, ";\n");
14327 res = VkAppendLine(sc);
14328 if (res != VKFFT_SUCCESS) return res;
14329 res = appendZeropadStartReadWriteStage(sc, 0);
14330 if (res != VKFFT_SUCCESS) return res;
14331 if (sc->writeFromRegisters) {
14332 if (sc->outputBufferBlockNum == 1)
14333 sc->tempLen = sprintf(sc->tempStr, " %s[%s] = %s%s%s;\n", outputsStruct, sc->inoutID, convTypeLeft, sc->regIDs[i + k * sc->registers_per_thread], convTypeRight);
14334 else
14335 sc->tempLen = sprintf(sc->tempStr, " outputBlocks[%s / %" PRIu64 "]%s[%s %% %" PRIu64 "] = %s%s%s;\n", sc->inoutID, sc->outputBufferBlockSize, outputsStruct, sc->inoutID, sc->outputBufferBlockSize, convTypeLeft, sc->regIDs[i + k * sc->registers_per_thread], convTypeRight);
14336 res = VkAppendLine(sc);
14337 if (res != VKFFT_SUCCESS) return res;
14338 }
14339 else {
14340 if (sc->axisSwapped) {
14341 if (sc->outputBufferBlockNum == 1)
14342 sc->tempLen = sprintf(sc->tempStr, " %s[%s] = %ssdata[(combinedID %% %s)+(combinedID/%s)*sharedStride]%s;\n", outputsStruct, sc->inoutID, convTypeLeft, sc->gl_WorkGroupSize_x, sc->gl_WorkGroupSize_x, convTypeRight);
14343 else
14344 sc->tempLen = sprintf(sc->tempStr, " outputBlocks[%s / %" PRIu64 "]%s[%s %% %" PRIu64 "] = %ssdata[(combinedID %% %s)+(combinedID/%s)*sharedStride]%s;\n", sc->inoutID, sc->outputBufferBlockSize, outputsStruct, sc->inoutID, sc->outputBufferBlockSize, convTypeLeft, sc->gl_WorkGroupSize_x, sc->gl_WorkGroupSize_x, convTypeRight);
14345 res = VkAppendLine(sc);
14346 if (res != VKFFT_SUCCESS) return res;
14347 }
14348 else {
14349 if (sc->outputBufferBlockNum == 1)
14350 sc->tempLen = sprintf(sc->tempStr, " %s[%s] = %ssdata[(combinedID %% %s)*sharedStride+combinedID/%s]%s;\n", outputsStruct, sc->inoutID, convTypeLeft, sc->gl_WorkGroupSize_y, sc->gl_WorkGroupSize_y, convTypeRight);
14351 else
14352 sc->tempLen = sprintf(sc->tempStr, " outputBlocks[%s / %" PRIu64 "]%s[%s %% %" PRIu64 "] = %ssdata[(combinedID %% %s)*sharedStride+combinedID/%s]%s;\n", sc->inoutID, sc->outputBufferBlockSize, outputsStruct, sc->inoutID, sc->outputBufferBlockSize, convTypeLeft, sc->gl_WorkGroupSize_y, sc->gl_WorkGroupSize_y, convTypeRight);
14353 res = VkAppendLine(sc);
14354 if (res != VKFFT_SUCCESS) return res;
14355 }
14356 }
14357 res = appendZeropadEndReadWriteStage(sc);
14358 if (res != VKFFT_SUCCESS) return res;
14359 if (sc->zeropad[1]) {
14360 sc->tempLen = sprintf(sc->tempStr, " }");
14361 res = VkAppendLine(sc);
14362 if (res != VKFFT_SUCCESS) return res;
14363 }
14364 }
14365 }
14366 }*/
14367 /*}
14368 else {
14369 if (sc->fftDim == sc->fft_dim_full) {
14370 for (uint64_t k = 0; k < sc->registerBoost; k++) {
14371 for (uint64_t i = 0; i < sc->min_registers_per_thread; i++) {
14372 if (sc->localSize[1] == 1)
14373 sc->tempLen = sprintf(sc->tempStr, " combinedID = %s + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, (i + k * sc->min_registers_per_thread) * sc->localSize[0]);
14374 else
14375 sc->tempLen = sprintf(sc->tempStr, " combinedID = (%s + %" PRIu64 " * %s) + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, sc->localSize[0], sc->gl_LocalInvocationID_y, (i + k * sc->min_registers_per_thread) * sc->localSize[0] * sc->localSize[1]);
14376 res = VkAppendLine(sc);
14377 if (res != VKFFT_SUCCESS) return res;
14378
14379 if (sc->outputStride[0] > 1)
14380 sc->tempLen = sprintf(sc->tempStr, " inoutID = (combinedID %% %" PRIu64 ") * %" PRIu64 " + (combinedID / %" PRIu64 ") * %" PRIu64 ";\n", sc->fftDim, sc->outputStride[0], sc->fftDim, sc->outputStride[1]);
14381 else
14382 sc->tempLen = sprintf(sc->tempStr, " inoutID = (combinedID %% %" PRIu64 ") + (combinedID / %" PRIu64 ") * %" PRIu64 ";\n", sc->fftDim, sc->fftDim, sc->outputStride[1]);
14383 res = VkAppendLine(sc);
14384 if (res != VKFFT_SUCCESS) return res;
14385 if (sc->axisSwapped) {
14386 if (sc->size[sc->axis_id + 1] % sc->localSize[0] != 0) {
14387 sc->tempLen = sprintf(sc->tempStr, " if(combinedID / %" PRIu64 " + %s*%" PRIu64 "< %" PRIu64 "){", sc->fftDim, sc->gl_WorkGroupID_y, sc->localSize[0], sc->size[sc->axis_id + 1]);
14388 res = VkAppendLine(sc);
14389 if (res != VKFFT_SUCCESS) return res;
14390 }
14391 }
14392 else {
14393 if (sc->size[sc->axis_id + 1] % sc->localSize[1] != 0) {
14394 sc->tempLen = sprintf(sc->tempStr, " if(combinedID / %" PRIu64 " + %s*%" PRIu64 "< %" PRIu64 "){", sc->fftDim, sc->gl_WorkGroupID_y, sc->localSize[1], sc->size[sc->axis_id + 1]);
14395 res = VkAppendLine(sc);
14396 if (res != VKFFT_SUCCESS) return res;
14397 }
14398 }
14399 if (sc->zeropad[1]) {
14400 sc->tempLen = sprintf(sc->tempStr, " if((inoutID %% %" PRIu64 " < %" PRIu64 ")||(inoutID %% %" PRIu64 " >= %" PRIu64 ")){\n", sc->outputStride[1], sc->fft_zeropad_left_write[sc->axis_id], sc->outputStride[1], sc->fft_zeropad_right_write[sc->axis_id]);
14401 res = VkAppendLine(sc);
14402 if (res != VKFFT_SUCCESS) return res;
14403 }
14404 sc->tempLen = sprintf(sc->tempStr, " %s = ", sc->inoutID);
14405 res = VkAppendLine(sc);
14406 if (res != VKFFT_SUCCESS) return res;
14407 res = indexOutputVkFFT(sc, uintType, writeType, sc->inoutID, 0, requestCoordinate, requestBatch);
14408 if (res != VKFFT_SUCCESS) return res;
14409 sc->tempLen = sprintf(sc->tempStr, ";\n");
14410 res = VkAppendLine(sc);
14411 if (res != VKFFT_SUCCESS) return res;
14412 res = appendZeropadStartReadWriteStage(sc, 0);
14413 if (res != VKFFT_SUCCESS) return res;
14414 if (sc->writeFromRegisters) {
14415 if (sc->outputBufferBlockNum == 1)
14416 sc->tempLen = sprintf(sc->tempStr, " %s[%s] = %s%s%s;\n", outputsStruct, sc->inoutID, convTypeLeft, sc->regIDs[i + k * sc->registers_per_thread], convTypeRight);
14417 else
14418 sc->tempLen = sprintf(sc->tempStr, " outputBlocks[%s / %" PRIu64 "]%s[%s %% %" PRIu64 "] = %s%s%s;\n", sc->inoutID, sc->outputBufferBlockSize, outputsStruct, sc->inoutID, sc->outputBufferBlockSize, convTypeLeft, sc->regIDs[i + k * sc->registers_per_thread], convTypeRight);
14419 res = VkAppendLine(sc);
14420 if (res != VKFFT_SUCCESS) return res;
14421 }
14422 else {
14423 if (sc->axisSwapped) {
14424 if (sc->outputBufferBlockNum == 1)
14425 sc->tempLen = sprintf(sc->tempStr, " %s[%s] = %ssdata[(combinedID %% %" PRIu64 ") * sharedStride + (combinedID / %" PRIu64 ")]%s;\n", outputsStruct, sc->inoutID, convTypeLeft, sc->fftDim, sc->fftDim, convTypeRight);
14426 else
14427 sc->tempLen = sprintf(sc->tempStr, " outputBlocks[%s / %" PRIu64 "]%s[%s %% %" PRIu64 "] = %ssdata[(combinedID %% %" PRIu64 ") * sharedStride + (combinedID / %" PRIu64 ")]%s;\n", sc->inoutID, sc->outputBufferBlockSize, outputsStruct, sc->inoutID, sc->outputBufferBlockSize, convTypeLeft, sc->fftDim, sc->fftDim, convTypeRight);
14428 res = VkAppendLine(sc);
14429 if (res != VKFFT_SUCCESS) return res;
14430 }
14431 else {
14432 if (sc->outputBufferBlockNum == 1)
14433 sc->tempLen = sprintf(sc->tempStr, " %s[%s] = %ssdata[(combinedID %% %" PRIu64 ") + (combinedID / %" PRIu64 ") * sharedStride]%s;\n", outputsStruct, sc->inoutID, convTypeLeft, sc->fftDim, sc->fftDim, convTypeRight);
14434 else
14435 sc->tempLen = sprintf(sc->tempStr, " outputBlocks[%s / %" PRIu64 "]%s[%s %% %" PRIu64 "] = %ssdata[(combinedID %% %" PRIu64 ") + (combinedID / %" PRIu64 ") * sharedStride]%s;\n", sc->inoutID, sc->outputBufferBlockSize, outputsStruct, sc->inoutID, sc->outputBufferBlockSize, convTypeLeft, sc->fftDim, sc->fftDim, convTypeRight);
14436 res = VkAppendLine(sc);
14437 if (res != VKFFT_SUCCESS) return res;
14438 }
14439 }
14440 res = appendZeropadEndReadWriteStage(sc);
14441 if (res != VKFFT_SUCCESS) return res;
14442 if (sc->zeropad[1]) {
14443 sc->tempLen = sprintf(sc->tempStr, " }\n");
14444 res = VkAppendLine(sc);
14445 if (res != VKFFT_SUCCESS) return res;
14446 }
14447 if (sc->axisSwapped) {
14448 if (sc->size[sc->axis_id + 1] % sc->localSize[0] != 0) {
14449 sc->tempLen = sprintf(sc->tempStr, " }");
14450 res = VkAppendLine(sc);
14451 if (res != VKFFT_SUCCESS) return res;
14452 }
14453 }
14454 else {
14455 if (sc->size[sc->axis_id + 1] % sc->localSize[1] != 0) {
14456 sc->tempLen = sprintf(sc->tempStr, " }");
14457 res = VkAppendLine(sc);
14458 if (res != VKFFT_SUCCESS) return res;
14459 }
14460 }
14461 }
14462 }
14463 }
14464 else {
14465 for (uint64_t k = 0; k < sc->registerBoost; k++) {
14466 for (uint64_t i = 0; i < sc->min_registers_per_thread; i++) {
14467 if (sc->localSize[1] == 1)
14468 sc->tempLen = sprintf(sc->tempStr, " combinedID = %s + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, (i + k * sc->min_registers_per_thread) * sc->localSize[0]);
14469 else
14470 sc->tempLen = sprintf(sc->tempStr, " combinedID = (%s + %" PRIu64 " * %s) + %" PRIu64 " * numActiveThreads;\n", sc->gl_LocalInvocationID_x, sc->localSize[0], sc->gl_LocalInvocationID_y, (i + k * sc->min_registers_per_thread));
14471 res = VkAppendLine(sc);
14472 if (res != VKFFT_SUCCESS) return res;
14473 if (sc->axisSwapped) {
14474 sc->tempLen = sprintf(sc->tempStr, " inoutID = (combinedID %% %" PRIu64 ")+(combinedID / %" PRIu64 ") * %" PRIu64 " + (((%s%s) %% %" PRIu64 ") * %" PRIu64 " + ((%s%s) / %" PRIu64 ") * %" PRIu64 ");", sc->fftDim, sc->fftDim, sc->firstStageStartSize, sc->gl_WorkGroupID_x, shiftX, sc->firstStageStartSize / sc->fftDim, sc->fftDim, sc->gl_WorkGroupID_x, shiftX, sc->firstStageStartSize / sc->fftDim, sc->localSize[0] * sc->firstStageStartSize);
14475 res = VkAppendLine(sc);
14476 if (res != VKFFT_SUCCESS) return res;
14477 }
14478 else {
14479 sc->tempLen = sprintf(sc->tempStr, " inoutID = %s+%" PRIu64 "+%s * %" PRIu64 " + (((%s%s) %% %" PRIu64 ") * %" PRIu64 " + ((%s%s) / %" PRIu64 ") * %" PRIu64 ");", sc->gl_LocalInvocationID_x, (i + k * sc->min_registers_per_thread) * sc->localSize[0], sc->gl_LocalInvocationID_y, sc->firstStageStartSize, sc->gl_WorkGroupID_x, shiftX, sc->firstStageStartSize / sc->fftDim, sc->fftDim, sc->gl_WorkGroupID_x, shiftX, sc->firstStageStartSize / sc->fftDim, sc->localSize[1] * sc->firstStageStartSize);
14480 res = VkAppendLine(sc);
14481 if (res != VKFFT_SUCCESS) return res;
14482 }
14483 if (sc->zeropad[1]) {
14484 sc->tempLen = sprintf(sc->tempStr, " if((inoutID %% %" PRIu64 " < %" PRIu64 ")||(inoutID %% %" PRIu64 " >= %" PRIu64 ")){\n", sc->fft_dim_full, sc->fft_zeropad_left_write[sc->axis_id], sc->fft_dim_full, sc->fft_zeropad_right_write[sc->axis_id]);
14485 res = VkAppendLine(sc);
14486 if (res != VKFFT_SUCCESS) return res;
14487 }
14488 sc->tempLen = sprintf(sc->tempStr, " %s = ", sc->inoutID);
14489 res = VkAppendLine(sc);
14490 if (res != VKFFT_SUCCESS) return res;
14491 res = indexOutputVkFFT(sc, uintType, writeType, sc->inoutID, 0, requestCoordinate, requestBatch);
14492 if (res != VKFFT_SUCCESS) return res;
14493 sc->tempLen = sprintf(sc->tempStr, ";\n");
14494 res = VkAppendLine(sc);
14495 if (res != VKFFT_SUCCESS) return res;
14496 //sc->tempLen = sprintf(sc->tempStr, " inoutID = indexOutput(%s+i*%" PRIu64 "+%s * %" PRIu64 " + (((%s%s) %% %" PRIu64 ") * %" PRIu64 " + ((%s%s) / %" PRIu64 ") * %" PRIu64 ")%s%s);\n", sc->gl_LocalInvocationID_x, sc->localSize[0], sc->gl_LocalInvocationID_y, sc->firstStageStartSize, sc->gl_WorkGroupID_x, shiftX, sc->firstStageStartSize / sc->fftDim, sc->fftDim, sc->gl_WorkGroupID_x, shiftX, sc->firstStageStartSize / sc->fftDim, sc->localSize[1] * sc->firstStageStartSize, requestCoordinate, requestBatch);
14497 res = appendZeropadStartReadWriteStage(sc, 0);
14498 if (res != VKFFT_SUCCESS) return res;
14499 if (sc->writeFromRegisters) {
14500 if (sc->outputBufferBlockNum == 1)
14501 sc->tempLen = sprintf(sc->tempStr, " %s[inoutID]=%s%s%s;\n", outputsStruct, convTypeLeft, sc->regIDs[i + k * sc->registers_per_thread], convTypeRight);
14502 else
14503 sc->tempLen = sprintf(sc->tempStr, " outputBlocks[inoutID / %" PRIu64 "]%s[inoutID %% %" PRIu64 "] = %s%s%s;\n", sc->outputBufferBlockSize, outputsStruct, sc->outputBufferBlockSize, convTypeLeft, sc->regIDs[i + k * sc->registers_per_thread], convTypeRight);
14504 res = VkAppendLine(sc);
14505 if (res != VKFFT_SUCCESS) return res;
14506 }
14507 else {
14508 if (sc->axisSwapped) {
14509 if (sc->outputBufferBlockNum == 1)
14510 sc->tempLen = sprintf(sc->tempStr, " %s[inoutID]=%ssdata[%s + sharedStride*(%s + %" PRIu64 ")]%s;\n", outputsStruct, convTypeLeft, sc->gl_LocalInvocationID_x, sc->gl_LocalInvocationID_y, (i + k * sc->min_registers_per_thread) * sc->localSize[1], convTypeRight);
14511 else
14512 sc->tempLen = sprintf(sc->tempStr, " outputBlocks[inoutID / %" PRIu64 "]%s[inoutID %% %" PRIu64 "] = %ssdata[%s + sharedStride*(%s + %" PRIu64 ")]%s;\n", sc->outputBufferBlockSize, outputsStruct, sc->outputBufferBlockSize, convTypeLeft, sc->gl_LocalInvocationID_x, sc->gl_LocalInvocationID_y, (i + k * sc->min_registers_per_thread) * sc->localSize[1], convTypeRight);
14513 res = VkAppendLine(sc);
14514 if (res != VKFFT_SUCCESS) return res;
14515 }
14516 else {
14517 if (sc->outputBufferBlockNum == 1)
14518 sc->tempLen = sprintf(sc->tempStr, " %s[inoutID]=%ssdata[sharedStride*%s + (%s + %" PRIu64 ")]%s;\n", outputsStruct, convTypeLeft, sc->gl_LocalInvocationID_y, sc->gl_LocalInvocationID_x, (i + k * sc->min_registers_per_thread) * sc->localSize[0], convTypeRight);
14519 else
14520 sc->tempLen = sprintf(sc->tempStr, " outputBlocks[inoutID / %" PRIu64 "]%s[inoutID %% %" PRIu64 "] = %ssdata[sharedStride*%s + (%s + %" PRIu64 ")]%s;\n", sc->outputBufferBlockSize, outputsStruct, sc->outputBufferBlockSize, convTypeLeft, sc->gl_LocalInvocationID_y, sc->gl_LocalInvocationID_x, (i + k * sc->min_registers_per_thread) * sc->localSize[0], convTypeRight);
14521 res = VkAppendLine(sc);
14522 if (res != VKFFT_SUCCESS) return res;
14523 }
14524 }
14525 appendZeropadEndReadWriteStage(sc);
14526 if (res != VKFFT_SUCCESS) return res;
14527 if (sc->zeropad[1]) {
14528 sc->tempLen = sprintf(sc->tempStr, " }\n");
14529 res = VkAppendLine(sc);
14530 if (res != VKFFT_SUCCESS) return res;
14531 }
14532 }
14533 }
14534 }
14535 }*/
14536 sc->tempLen = sprintf(sc->tempStr, " }\n");
14537 res = VkAppendLine(sc);
14538 if (res != VKFFT_SUCCESS) return res;
14539 break;
14540 }
14541 case 141: //DCT-IV strided as 8N DFT
14542 {
14543 if (!sc->writeFromRegisters) {
14544 res = appendBarrierVkFFT(sc, 1);
14545 if (res != VKFFT_SUCCESS) return res;
14546 }
14547 res = appendZeropadStart(sc);
14548 if (res != VKFFT_SUCCESS) return res;
14549 char shiftX[500] = "";
14550 if (sc->performWorkGroupShift[0])
14551 sprintf(shiftX, " + consts.workGroupShiftX * %s ", sc->gl_WorkGroupSize_x);
14552 if (sc->fftDim != sc->fft_dim_full)
14553 sc->tempLen = sprintf(sc->tempStr, " if (((%s%s) / %" PRIu64 ") %% (%" PRIu64 ")+((%s%s) / %" PRIu64 ") * (%" PRIu64 ") < %" PRIu64 ") {\n", sc->gl_GlobalInvocationID_x, shiftX, sc->fft_dim_x, sc->stageStartSize, sc->gl_GlobalInvocationID_x, shiftX, sc->fft_dim_x * sc->stageStartSize, sc->fftDim * sc->stageStartSize, sc->size[sc->axis_id]);
14554 else
14555 sc->tempLen = sprintf(sc->tempStr, " {\n");
14556 res = VkAppendLine(sc);
14557 if (res != VKFFT_SUCCESS) return res;
14558 //if ((sc->reorderFourStep) && (sc->stageStartSize == 1)) {
14559 for (uint64_t k = 0; k < sc->registerBoost; k++) {
14560 for (uint64_t i = 0; i < (uint64_t)ceil(sc->min_registers_per_thread / 8.0); i++) {
14561 if (sc->fftDim == sc->fft_dim_full)
14562 sc->tempLen = sprintf(sc->tempStr, " inoutID = (%s + %" PRIu64 ");\n", sc->gl_LocalInvocationID_y, (i + k * sc->min_registers_per_thread) * sc->localSize[1]);
14563 else
14564 sc->tempLen = sprintf(sc->tempStr, " inoutID = (%s + %" PRIu64 ") * (%" PRIu64 ") + (((%s%s) / %" PRIu64 ") %% (%" PRIu64 ")) * (%" PRIu64 ") + ((%s%s) / %" PRIu64 ");\n", sc->gl_LocalInvocationID_y, (i + k * sc->min_registers_per_thread) * sc->localSize[1], sc->fft_dim_full / sc->fftDim, sc->gl_GlobalInvocationID_x, shiftX, sc->fft_dim_x, sc->firstStageStartSize / sc->fftDim, sc->fft_dim_full / sc->firstStageStartSize, sc->gl_GlobalInvocationID_x, shiftX, sc->fft_dim_x * (sc->firstStageStartSize / sc->fftDim));
14565 res = VkAppendLine(sc);
14566 if (res != VKFFT_SUCCESS) return res;
14567 sc->tempLen = sprintf(sc->tempStr, " if(inoutID < %" PRIu64 "){\n", sc->fftDim / 8);
14568 res = VkAppendLine(sc);
14569 if (res != VKFFT_SUCCESS) return res;
14570 if (sc->zeropad[1]) {
14571 sc->tempLen = sprintf(sc->tempStr, " if((inoutID %% %" PRIu64 " < %" PRIu64 ")||(inoutID %% %" PRIu64 " >= %" PRIu64 ")){\n", sc->fft_dim_full, sc->fft_zeropad_left_write[sc->axis_id], sc->fft_dim_full, sc->fft_zeropad_right_write[sc->axis_id]);
14572 res = VkAppendLine(sc);
14573 if (res != VKFFT_SUCCESS) return res;
14574 }
14575 sc->tempLen = sprintf(sc->tempStr, " %s = ", sc->inoutID);
14576 res = VkAppendLine(sc);
14577 if (res != VKFFT_SUCCESS) return res;
14578 sprintf(index_x, "(%s%s) %% (%" PRIu64 ")", sc->gl_GlobalInvocationID_x, shiftX, sc->fft_dim_x);
14579 res = indexOutputVkFFT(sc, uintType, writeType, index_x, sc->inoutID, requestCoordinate, requestBatch);
14580 if (res != VKFFT_SUCCESS) return res;
14581 sc->tempLen = sprintf(sc->tempStr, ";\n");
14582 res = VkAppendLine(sc);
14583 if (res != VKFFT_SUCCESS) return res;
14584 if (sc->outputBufferBlockNum == 1)
14585 sc->tempLen = sprintf(sc->tempStr, " %s[%s] = %ssdata[%s*(2*(%s+%" PRIu64 ")+1) + %s].x/2%s;\n", outputsStruct, sc->inoutID, convTypeLeft, sc->sharedStride, sc->gl_LocalInvocationID_y, (i + k * sc->min_registers_per_thread) * sc->localSize[1], sc->gl_LocalInvocationID_x, convTypeRight);
14586 else
14587 sc->tempLen = sprintf(sc->tempStr, " outputBlocks[%s / %" PRIu64 "]%s[%s %% %" PRIu64 "] = %ssdata[%s*(2*(%s+%" PRIu64 ")+1) + %s].x/2%s;\n", sc->inoutID, sc->outputBufferBlockSize, outputsStruct, sc->inoutID, sc->outputBufferBlockSize, convTypeLeft, sc->sharedStride, sc->gl_LocalInvocationID_y, (i + k * sc->min_registers_per_thread) * sc->localSize[1], sc->gl_LocalInvocationID_x, convTypeRight);
14588 res = VkAppendLine(sc);
14589 if (res != VKFFT_SUCCESS) return res;
14590
14591 if (sc->zeropad[1]) {
14592 sc->tempLen = sprintf(sc->tempStr, " }\n");
14593 res = VkAppendLine(sc);
14594 if (res != VKFFT_SUCCESS) return res;
14595 }
14596 sc->tempLen = sprintf(sc->tempStr, " }\n");
14597 res = VkAppendLine(sc);
14598 if (res != VKFFT_SUCCESS) return res;
14599 }
14600 }
14601 /*}
14602 else {
14603 for (uint64_t k = 0; k < sc->registerBoost; k++) {
14604 for (uint64_t i = 0; i < sc->min_registers_per_thread; i++) {
14605 if (sc->zeropad[1]) {
14606 sc->tempLen = sprintf(sc->tempStr, " inoutID = (%s + %" PRIu64 ") * %" PRIu64 " + ((%s%s) / %" PRIu64 ") %% (%" PRIu64 ")+((%s%s) / %" PRIu64 ") * (%" PRIu64 ");\n", sc->gl_LocalInvocationID_y, (i + k * sc->min_registers_per_thread) * sc->localSize[1], sc->stageStartSize, sc->gl_GlobalInvocationID_x, shiftX, sc->fft_dim_x, sc->stageStartSize, sc->gl_GlobalInvocationID_x, shiftX, sc->fft_dim_x * sc->stageStartSize, sc->stageStartSize * sc->fftDim);
14607 res = VkAppendLine(sc);
14608 if (res != VKFFT_SUCCESS) return res;
14609 sc->tempLen = sprintf(sc->tempStr, " if((inoutID %% %" PRIu64 " < %" PRIu64 ")||(inoutID %% %" PRIu64 " >= %" PRIu64 ")){\n", sc->fft_dim_full, sc->fft_zeropad_left_write[sc->axis_id], sc->fft_dim_full, sc->fft_zeropad_right_write[sc->axis_id]);
14610 res = VkAppendLine(sc);
14611 if (res != VKFFT_SUCCESS) return res;
14612 }
14613 sc->tempLen = sprintf(sc->tempStr, " %s = ", sc->inoutID);
14614 res = VkAppendLine(sc);
14615 if (res != VKFFT_SUCCESS) return res;
14616 sprintf(index_x, "(%s%s) %% (%" PRIu64 ")", sc->gl_GlobalInvocationID_x, shiftX, sc->fft_dim_x);
14617 sprintf(index_y, "%" PRIu64 " * (%s + %" PRIu64 ") + ((%s%s) / %" PRIu64 ") %% (%" PRIu64 ")+((%s%s) / %" PRIu64 ") * (%" PRIu64 ")", sc->stageStartSize, sc->gl_LocalInvocationID_y, (i + k * sc->min_registers_per_thread) * sc->localSize[1], sc->gl_GlobalInvocationID_x, shiftX, sc->fft_dim_x, sc->stageStartSize, sc->gl_GlobalInvocationID_x, shiftX, sc->fft_dim_x * sc->stageStartSize, sc->stageStartSize * sc->fftDim);
14618 res = indexOutputVkFFT(sc, uintType, writeType, index_x, index_y, requestCoordinate, requestBatch);
14619 if (res != VKFFT_SUCCESS) return res;
14620 sc->tempLen = sprintf(sc->tempStr, ";\n");
14621 res = VkAppendLine(sc);
14622 if (res != VKFFT_SUCCESS) return res;
14623 //sc->tempLen = sprintf(sc->tempStr, " inoutID = indexOutput((%s%s) %% (%" PRIu64 "), %" PRIu64 " * (%s + %" PRIu64 ") + ((%s%s) / %" PRIu64 ") %% (%" PRIu64 ")+((%s%s) / %" PRIu64 ") * (%" PRIu64 ")%s%s);\n", sc->gl_GlobalInvocationID_x, shiftX, sc->fft_dim_x, sc->stageStartSize, sc->gl_LocalInvocationID_y, (i + k * sc->min_registers_per_thread) * sc->localSize[1], sc->gl_GlobalInvocationID_x, shiftX, sc->fft_dim_x, sc->stageStartSize, sc->gl_GlobalInvocationID_x, shiftX, sc->fft_dim_x * sc->stageStartSize, sc->stageStartSize * sc->fftDim, requestCoordinate, requestBatch);
14624 if (sc->writeFromRegisters) {
14625 if (sc->outputBufferBlockNum == 1)
14626 sc->tempLen = sprintf(sc->tempStr, " %s[inoutID] = %s%s%s;\n", outputsStruct, convTypeLeft, sc->regIDs[i + k * sc->registers_per_thread], convTypeRight);
14627 else
14628 sc->tempLen = sprintf(sc->tempStr, " outputBlocks[inoutID / %" PRIu64 "]%s[inoutID %% %" PRIu64 "] = %s%s%s;\n", sc->outputBufferBlockSize, outputsStruct, sc->outputBufferBlockSize, convTypeLeft, sc->regIDs[i + k * sc->registers_per_thread], convTypeRight);
14629 res = VkAppendLine(sc);
14630 if (res != VKFFT_SUCCESS) return res;
14631 }
14632 else {
14633 if (sc->outputBufferBlockNum == 1)
14634 sc->tempLen = sprintf(sc->tempStr, " %s[inoutID] = %ssdata[%s*(%s+%" PRIu64 ") + %s]%s;\n", outputsStruct, convTypeLeft, sc->sharedStride, sc->gl_LocalInvocationID_y, (i + k * sc->min_registers_per_thread) * sc->localSize[1], sc->gl_LocalInvocationID_x, convTypeRight);
14635 else
14636 sc->tempLen = sprintf(sc->tempStr, " outputBlocks[inoutID / %" PRIu64 "]%s[inoutID %% %" PRIu64 "] = %ssdata[%s*(%s+%" PRIu64 ") + %s]%s;\n", sc->outputBufferBlockSize, outputsStruct, sc->outputBufferBlockSize, convTypeLeft, sc->sharedStride, sc->gl_LocalInvocationID_y, (i + k * sc->min_registers_per_thread) * sc->localSize[1], sc->gl_LocalInvocationID_x, convTypeRight);
14637 res = VkAppendLine(sc);
14638 if (res != VKFFT_SUCCESS) return res;
14639 }
14640 if (sc->zeropad[1]) {
14641 sc->tempLen = sprintf(sc->tempStr, " }\n");
14642 res = VkAppendLine(sc);
14643 if (res != VKFFT_SUCCESS) return res;
14644 }
14645 }
14646 }
14647 }*/
14648 sc->tempLen = sprintf(sc->tempStr, " }\n");
14649 res = VkAppendLine(sc);
14650 if (res != VKFFT_SUCCESS) return res;
14651 break;
14652
14653 }
14654 case 142://DCT-IV nonstrided as 2xN/2 DCT-II
14655 {
14656 if (!sc->writeFromRegisters) {
14657 res = appendBarrierVkFFT(sc, 1);
14658 if (res != VKFFT_SUCCESS) return res;
14659 }
14660 //res = appendZeropadStart(sc);
14661 //if (res != VKFFT_SUCCESS) return res;
14662 char shiftX[500] = "";
14663 if (sc->performWorkGroupShift[0])
14664 sprintf(shiftX, " + consts.workGroupShiftX ");
14665 char shiftY[500] = "";
14666 if (sc->performWorkGroupShift[1])
14667 sprintf(shiftY, " + consts.workGroupShiftY*%s ", sc->gl_WorkGroupSize_y);
14668 char shiftY2[500] = "";
14669 if (sc->performWorkGroupShift[1])
14670 sprintf(shiftY2, " + consts.workGroupShiftY ");
14671 if (sc->reorderFourStep) {
14672 //Not implemented
14673 }
14674 else {
14675 //appendBarrierVkFFT(sc, 1);
14676 //appendZeropadStart(sc);
14677 if (sc->fftDim == sc->fft_dim_full) {
14679 uint64_t maxBluesteinCutOff = 1;
14680 if (sc->zeropadBluestein[1]) {
14681 if (sc->axisSwapped)
14682 maxBluesteinCutOff = sc->fftDim * sc->localSize[0];
14683 else
14684 maxBluesteinCutOff = sc->fftDim * sc->localSize[1];
14685 }
14686 for (uint64_t k = 0; k < sc->registerBoost; k++) {
14687 //num_out = (uint64_t)ceil(num_out / (double)sc->min_registers_per_thread);
14688 for (uint64_t i = 0; i < sc->min_registers_per_thread; i++) {
14689 if (sc->localSize[1] == 1)
14690 sc->tempLen = sprintf(sc->tempStr, " combinedID = %s + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, (i + k * sc->min_registers_per_thread) * sc->localSize[0]);
14691 else
14692 sc->tempLen = sprintf(sc->tempStr, " combinedID = (%s + %" PRIu64 " * %s) + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, sc->localSize[0], sc->gl_LocalInvocationID_y, (i + k * sc->min_registers_per_thread) * sc->localSize[0] * sc->localSize[1]);
14693 res = VkAppendLine(sc);
14694 if (res != VKFFT_SUCCESS) return res;
14695 if (sc->zeropadBluestein[1]) {
14696 sc->tempLen = sprintf(sc->tempStr, " if(combinedID < %" PRIu64 "){\n", maxBluesteinCutOff);
14697 res = VkAppendLine(sc);
14698 if (res != VKFFT_SUCCESS) return res;
14699 }
14700 if (sc->axisSwapped) {
14701 sc->tempLen = sprintf(sc->tempStr, " sdataID = (((combinedID %% %" PRIu64 ") %% 2) * %" PRIu64 " + (1-2*((combinedID %% %" PRIu64 ") %% 2)) * ((combinedID %% %" PRIu64 ")/2)) * sharedStride + (combinedID / %" PRIu64 ");\n", sc->fftDim, sc->fftDim - 1, sc->fftDim, sc->fftDim, sc->fftDim);
14702 res = VkAppendLine(sc);
14703 if (res != VKFFT_SUCCESS) return res;
14704 }
14705 else {
14706 sc->tempLen = sprintf(sc->tempStr, " sdataID = (((combinedID %% %" PRIu64 ") %% 2) * %" PRIu64 " + (1-2*((combinedID %% %" PRIu64 ") %% 2)) * ((combinedID %% %" PRIu64 ")/2)) + (combinedID / %" PRIu64 ")* sharedStride;\n", sc->fftDim, sc->fftDim - 1, sc->fftDim, sc->fftDim, sc->fftDim);
14707 res = VkAppendLine(sc);
14708 if (res != VKFFT_SUCCESS) return res;
14709 }
14710
14711 sc->tempLen = sprintf(sc->tempStr, " %s = sdata[sdataID];\n", sc->regIDs[i + k * sc->registers_per_thread]);
14712 res = VkAppendLine(sc);
14713 if (res != VKFFT_SUCCESS) return res;
14714 sc->tempLen = sprintf(sc->tempStr, " %s.y = %s.y * (1.0%s - 2 * ((combinedID %% %" PRIu64 ")%%2));\n", sc->regIDs[i + k * sc->registers_per_thread], sc->regIDs[i + k * sc->registers_per_thread], LFending, sc->fftDim);
14715 res = VkAppendLine(sc);
14716 if (res != VKFFT_SUCCESS) return res;
14717 if (sc->zeropadBluestein[1]) {
14718 sc->tempLen = sprintf(sc->tempStr, " }\n");
14719 res = VkAppendLine(sc);
14720 if (res != VKFFT_SUCCESS) return res;
14721 }
14722 }
14723 }
14724 for (uint64_t k = 0; k < sc->registerBoost; k++) {
14725 for (uint64_t i = 0; i < sc->min_registers_per_thread; i++) {
14726 if (sc->localSize[1] == 1)
14727 sc->tempLen = sprintf(sc->tempStr, " combinedID = %s + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, (i + k * sc->min_registers_per_thread) * sc->localSize[0]);
14728 else
14729 sc->tempLen = sprintf(sc->tempStr, " combinedID = (%s + %" PRIu64 " * %s) + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, sc->localSize[0], sc->gl_LocalInvocationID_y, (i + k * sc->min_registers_per_thread) * sc->localSize[0] * sc->localSize[1]);
14730 res = VkAppendLine(sc);
14731 if (res != VKFFT_SUCCESS) return res;
14732 if (sc->axisSwapped) {
14733 if (sc->size[1] % sc->localSize[0] != 0) {
14734 sc->tempLen = sprintf(sc->tempStr, " if(combinedID / %" PRIu64 " + %s*%" PRIu64 "< %" PRIu64 "){\n", (sc->fftDim), sc->gl_WorkGroupID_y, sc->localSize[0], sc->size[sc->axis_id + 1]);
14735 res = VkAppendLine(sc);
14736 if (res != VKFFT_SUCCESS) return res;
14737 }
14738 if ((1 + i + k * sc->min_registers_per_thread) * sc->localSize[0] * sc->localSize[1] >= (sc->fftDim) * sc->localSize[0]) {
14739 sc->tempLen = sprintf(sc->tempStr, " if(combinedID < %" PRIu64 "){\n", (sc->fftDim) * sc->localSize[0]);
14740 res = VkAppendLine(sc);
14741 if (res != VKFFT_SUCCESS) return res;
14742 }
14743 }
14744 else {
14745 if (sc->size[1] % sc->localSize[1] != 0) {
14746 sc->tempLen = sprintf(sc->tempStr, " if(combinedID / %" PRIu64 " + %s*%" PRIu64 "< %" PRIu64 "){\n", (sc->fftDim), sc->gl_WorkGroupID_y, sc->localSize[1], sc->size[sc->axis_id + 1]);
14747 res = VkAppendLine(sc);
14748 if (res != VKFFT_SUCCESS) return res;
14749 }
14750 if ((1 + i + k * sc->min_registers_per_thread) * sc->localSize[0] * sc->localSize[1] >= (sc->fftDim) * sc->localSize[1]) {
14751 sc->tempLen = sprintf(sc->tempStr, " if(combinedID < %" PRIu64 "){\n", (sc->fftDim) * sc->localSize[1]);
14752 res = VkAppendLine(sc);
14753 if (res != VKFFT_SUCCESS) return res;
14754 }
14755 }
14756 sc->tempLen = sprintf(index_x, "combinedID %% %" PRIu64 " + ((combinedID/%" PRIu64 ") * %" PRIu64 ")", sc->fftDim, sc->fftDim, sc->outputStride[1]);
14757 sc->tempLen = sprintf(sc->tempStr, " %s = ", sc->inoutID);
14758 res = VkAppendLine(sc);
14759 if (res != VKFFT_SUCCESS) return res;
14760 res = indexOutputVkFFT(sc, uintType, writeType, index_x, 0, requestCoordinate, requestBatch);
14761 sc->tempLen = sprintf(sc->tempStr, ";\n");
14762 res = VkAppendLine(sc);
14763 if (res != VKFFT_SUCCESS) return res;
14765 if (res != VKFFT_SUCCESS) return res;
14766 if (sc->LUT) {
14767 sc->tempLen = sprintf(sc->tempStr, " mult = twiddleLUT[%" PRIu64 " + combinedID %% %" PRIu64 "];\n", sc->startDCT4LUT, sc->fftDim);
14768 res = VkAppendLine(sc);
14769 if (res != VKFFT_SUCCESS) return res;
14770 }
14771 else {
14772 sc->tempLen = sprintf(sc->tempStr, " mult.x = %s(%.17f%s * (2*(combinedID %% %" PRIu64 ")+1) );\n", cosDef, -double_PI / 8 / sc->fftDim, LFending, sc->fftDim);
14773 res = VkAppendLine(sc);
14774 if (res != VKFFT_SUCCESS) return res;
14775 sc->tempLen = sprintf(sc->tempStr, " mult.y = %s(%.17f%s * (2*(combinedID %% %" PRIu64 ")+1) );\n", sinDef, -double_PI / 8 / sc->fftDim, LFending, sc->fftDim);
14776 res = VkAppendLine(sc);
14777 if (res != VKFFT_SUCCESS) return res;
14778 }
14779 if (sc->zeropad[1]) {
14780 sc->tempLen = sprintf(sc->tempStr, " if((inoutID %% %" PRIu64 " < %" PRIu64 ")||(inoutID %% %" PRIu64 " >= %" PRIu64 ")){\n", sc->outputStride[1], sc->fft_zeropad_left_write[sc->axis_id], sc->outputStride[1], sc->fft_zeropad_right_write[sc->axis_id]);
14781 res = VkAppendLine(sc);
14782 if (res != VKFFT_SUCCESS) return res;
14783 }
14784 if (sc->outputBufferBlockNum == 1)
14785 sc->tempLen = sprintf(sc->tempStr, " %s[inoutID] = %s(%s.x*mult.x - %s.y*mult.y)%s;\n", outputsStruct, convTypeLeft, sc->regIDs[i + k * sc->registers_per_thread], sc->regIDs[i + k * sc->registers_per_thread], convTypeRight);
14786 else
14787 sc->tempLen = sprintf(sc->tempStr, " outputBlocks[inoutID / %" PRIu64 "]%s[inoutID %% %" PRIu64 "] = %s(%s.x*mult.x - %s.y*mult.y)%s;\n", sc->outputBufferBlockSize, outputsStruct, sc->outputBufferBlockSize, convTypeLeft, sc->regIDs[i + k * sc->registers_per_thread], sc->regIDs[i + k * sc->registers_per_thread], convTypeRight);
14788 res = VkAppendLine(sc);
14789 if (res != VKFFT_SUCCESS) return res;
14790 if (sc->zeropad[1]) {
14791 sc->tempLen = sprintf(sc->tempStr, " }\n");
14792 res = VkAppendLine(sc);
14793 if (res != VKFFT_SUCCESS) return res;
14794 }
14795 sc->tempLen = sprintf(index_x, "%" PRIu64 " - combinedID %% %" PRIu64 " + ((combinedID/%" PRIu64 ") * %" PRIu64 ")", 2 * sc->fftDim - 1, sc->fftDim, sc->fftDim, sc->outputStride[1]);
14796 sc->tempLen = sprintf(sc->tempStr, " %s = ", sc->inoutID);
14797 res = VkAppendLine(sc);
14798 if (res != VKFFT_SUCCESS) return res;
14799 res = indexOutputVkFFT(sc, uintType, writeType, index_x, 0, requestCoordinate, requestBatch);
14800 sc->tempLen = sprintf(sc->tempStr, ";\n");
14801 res = VkAppendLine(sc);
14802 if (sc->zeropad[1]) {
14803 sc->tempLen = sprintf(sc->tempStr, " if((inoutID %% %" PRIu64 " < %" PRIu64 ")||(inoutID %% %" PRIu64 " >= %" PRIu64 ")){\n", sc->outputStride[1], sc->fft_zeropad_left_write[sc->axis_id], sc->outputStride[1], sc->fft_zeropad_right_write[sc->axis_id]);
14804 res = VkAppendLine(sc);
14805 if (res != VKFFT_SUCCESS) return res;
14806 }
14807 if (sc->outputBufferBlockNum == 1)
14808 sc->tempLen = sprintf(sc->tempStr, " %s[inoutID] = %s(-%s.x*mult.y - %s.y*mult.x)%s;\n", outputsStruct, convTypeLeft, sc->regIDs[i + k * sc->registers_per_thread], sc->regIDs[i + k * sc->registers_per_thread], convTypeRight);
14809 else
14810 sc->tempLen = sprintf(sc->tempStr, " outputBlocks[inoutID / %" PRIu64 "]%s[inoutID %% %" PRIu64 "] = %s(-%s.x*mult.y - %s.y*mult.x)%s;\n", sc->outputBufferBlockSize, outputsStruct, sc->outputBufferBlockSize, convTypeLeft, sc->regIDs[i + k * sc->registers_per_thread], sc->regIDs[i + k * sc->registers_per_thread], convTypeRight);
14811 res = VkAppendLine(sc);
14812 if (res != VKFFT_SUCCESS) return res;
14814 if (res != VKFFT_SUCCESS) return res;
14815 if (sc->zeropad[1]) {
14816 sc->tempLen = sprintf(sc->tempStr, " }\n");
14817 res = VkAppendLine(sc);
14818 if (res != VKFFT_SUCCESS) return res;
14819 }
14820 if (sc->axisSwapped) {
14821 if ((1 + i + k * sc->min_registers_per_thread) * sc->localSize[0] * sc->localSize[1] >= (sc->fftDim) * sc->localSize[0]) {
14822 sc->tempLen = sprintf(sc->tempStr, " }\n");
14823 res = VkAppendLine(sc);
14824 if (res != VKFFT_SUCCESS) return res;
14825 }
14826 }
14827 else {
14828 if ((1 + i + k * sc->min_registers_per_thread) * sc->localSize[0] * sc->localSize[1] >= (sc->fftDim) * sc->localSize[1])
14829 {
14830 sc->tempLen = sprintf(sc->tempStr, " }\n");
14831 res = VkAppendLine(sc);
14832 if (res != VKFFT_SUCCESS) return res;
14833 }
14834 }
14835 if (sc->axisSwapped) {
14836 if (sc->size[1] % sc->localSize[0] != 0) {
14837 sc->tempLen = sprintf(sc->tempStr, " }\n");
14838 res = VkAppendLine(sc);
14839 if (res != VKFFT_SUCCESS) return res;
14840 }
14841 }
14842 else {
14843 if (sc->size[1] % sc->localSize[1] != 0) {
14844 sc->tempLen = sprintf(sc->tempStr, " }\n");
14845 res = VkAppendLine(sc);
14846 if (res != VKFFT_SUCCESS) return res;
14847 }
14848 }
14849 }
14850 }
14851 if (sc->zeropadBluestein[1]) sc->fftDim = sc->fft_dim_full;
14852 }
14853 else {
14854
14855 }
14856 }
14857 break;
14858 }
14859 case 143://DCT-IV strided as 2xN/2 DCT-II
14860 {
14861 if (!sc->writeFromRegisters) {
14862 res = appendBarrierVkFFT(sc, 1);
14863 if (res != VKFFT_SUCCESS) return res;
14864 }
14865 //res = appendZeropadStart(sc);
14866 //if (res != VKFFT_SUCCESS) return res;
14867 char shiftX[500] = "";
14868 if (sc->performWorkGroupShift[0])
14869 sprintf(shiftX, " + consts.workGroupShiftX ");
14870 char shiftX2[500] = "";
14871 if (sc->performWorkGroupShift[0])
14872 sprintf(shiftX2, " + consts.workGroupShiftX * %s ", sc->gl_WorkGroupSize_x);
14873 char shiftY[500] = "";
14874 if (sc->performWorkGroupShift[1])
14875 sprintf(shiftY, " + consts.workGroupShiftY*%s ", sc->gl_WorkGroupSize_y);
14876 char shiftY2[500] = "";
14877 if (sc->performWorkGroupShift[1])
14878 sprintf(shiftY2, " + consts.workGroupShiftY ");
14879 if (sc->reorderFourStep) {
14880 //Not implemented
14881 }
14882 else {
14883 //appendBarrierVkFFT(sc, 1);
14884 //appendZeropadStart(sc);
14885 if (sc->fftDim == sc->fft_dim_full) {
14887 for (uint64_t k = 0; k < sc->registerBoost; k++) {
14888 //num_out = (uint64_t)ceil(num_out / (double)sc->min_registers_per_thread);
14889 for (uint64_t i = 0; i < sc->min_registers_per_thread; i++) {
14890 if (sc->localSize[1] == 1)
14891 sc->tempLen = sprintf(sc->tempStr, " combinedID = %s + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, (i + k * sc->min_registers_per_thread) * sc->localSize[0]);
14892 else
14893 sc->tempLen = sprintf(sc->tempStr, " combinedID = (%s + %" PRIu64 " * %s) + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, sc->localSize[0], sc->gl_LocalInvocationID_y, (i + k * sc->min_registers_per_thread) * sc->localSize[0] * sc->localSize[1]);
14894 res = VkAppendLine(sc);
14895 if (res != VKFFT_SUCCESS) return res;
14896 if (sc->zeropadBluestein[1]) {
14897 sc->tempLen = sprintf(sc->tempStr, " if(combinedID < %" PRIu64 "){\n", sc->fftDim * sc->localSize[0]);
14898 res = VkAppendLine(sc);
14899 if (res != VKFFT_SUCCESS) return res;
14900 }
14901 sc->tempLen = sprintf(sc->tempStr, " sdataID = (((combinedID / %" PRIu64 ") %% 2) * %" PRIu64 " + (1-2*((combinedID / %" PRIu64 ") %% 2)) * ((combinedID / %" PRIu64 ")/2)) * sharedStride + (combinedID %% %" PRIu64 ");\n", sc->localSize[0], sc->fftDim - 1, sc->localSize[0], sc->localSize[0], sc->localSize[0]);
14902 res = VkAppendLine(sc);
14903 if (res != VKFFT_SUCCESS) return res;
14904
14905 sc->tempLen = sprintf(sc->tempStr, " %s = sdata[sdataID];\n", sc->regIDs[i + k * sc->registers_per_thread]);
14906 res = VkAppendLine(sc);
14907 if (res != VKFFT_SUCCESS) return res;
14908 sc->tempLen = sprintf(sc->tempStr, " %s.y = %s.y * (1.0%s - 2 * ((combinedID / %" PRIu64 ")%%2));\n", sc->regIDs[i + k * sc->registers_per_thread], sc->regIDs[i + k * sc->registers_per_thread], LFending, sc->localSize[0]);
14909 res = VkAppendLine(sc);
14910 if (res != VKFFT_SUCCESS) return res;
14911 if (sc->zeropadBluestein[1]) {
14912 sc->tempLen = sprintf(sc->tempStr, " }\n");
14913 res = VkAppendLine(sc);
14914 if (res != VKFFT_SUCCESS) return res;
14915 }
14916 }
14917 }
14918 for (uint64_t k = 0; k < sc->registerBoost; k++) {
14919 for (uint64_t i = 0; i < sc->min_registers_per_thread; i++) {
14920 if (sc->localSize[1] == 1)
14921 sc->tempLen = sprintf(sc->tempStr, " combinedID = %s + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, (i + k * sc->min_registers_per_thread) * sc->localSize[0]);
14922 else
14923 sc->tempLen = sprintf(sc->tempStr, " combinedID = (%s + %" PRIu64 " * %s) + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, sc->localSize[0], sc->gl_LocalInvocationID_y, (i + k * sc->min_registers_per_thread) * sc->localSize[0] * sc->localSize[1]);
14924 res = VkAppendLine(sc);
14925 if (res != VKFFT_SUCCESS) return res;
14926 if (sc->zeropadBluestein[1]) {
14927 sc->tempLen = sprintf(sc->tempStr, " if(combinedID < %" PRIu64 "){\n", sc->fftDim * sc->localSize[0]);
14928 res = VkAppendLine(sc);
14929 if (res != VKFFT_SUCCESS) return res;
14930 }
14931 if ((uint64_t)ceil(sc->size[0]) % sc->localSize[0] != 0) {
14932 sc->tempLen = sprintf(sc->tempStr, " if ((%s%s) < %" PRIu64 ") {\n", sc->gl_GlobalInvocationID_x, shiftX2, (uint64_t)ceil(sc->size[0]));
14933 res = VkAppendLine(sc);
14934 if (res != VKFFT_SUCCESS) return res;
14935 }
14936 if ((1 + i + k * sc->min_registers_per_thread) * sc->localSize[0] * sc->localSize[1] >= (sc->fftDim) * sc->localSize[0]) {
14937 sc->tempLen = sprintf(sc->tempStr, " if(combinedID < %" PRIu64 "){\n", (sc->fftDim) * sc->localSize[0]);
14938 res = VkAppendLine(sc);
14939 if (res != VKFFT_SUCCESS) return res;
14940 }
14941 sc->tempLen = sprintf(sc->tempStr, " %s = ", sc->inoutID);
14942 res = VkAppendLine(sc);
14943 if (res != VKFFT_SUCCESS) return res;
14944 sprintf(index_x, "(%s%s) %% (%" PRIu64 ")", sc->gl_GlobalInvocationID_x, shiftX2, sc->fft_dim_x);
14945 sprintf(index_y, "(%s + %" PRIu64 ")", sc->gl_LocalInvocationID_y, (i + k * 2 * sc->min_registers_per_thread) * sc->localSize[1]);
14946 res = indexOutputVkFFT(sc, uintType, writeType, index_x, index_y, requestCoordinate, requestBatch);
14947 if (res != VKFFT_SUCCESS) return res;
14948 sc->tempLen = sprintf(sc->tempStr, ";\n");
14949 res = VkAppendLine(sc);
14950 if (res != VKFFT_SUCCESS) return res;
14952 if (res != VKFFT_SUCCESS) return res;
14953 if (sc->LUT) {
14954 sc->tempLen = sprintf(sc->tempStr, " mult = twiddleLUT[%" PRIu64 " + combinedID / %" PRIu64 "];\n", sc->startDCT4LUT, sc->localSize[0]);
14955 res = VkAppendLine(sc);
14956 if (res != VKFFT_SUCCESS) return res;
14957 }
14958 else {
14959 sc->tempLen = sprintf(sc->tempStr, " mult.x = %s(%.17f%s * (2*(combinedID / %" PRIu64 ")+1) );\n", cosDef, -double_PI / 8 / sc->fftDim, LFending, sc->localSize[0]);
14960 res = VkAppendLine(sc);
14961 if (res != VKFFT_SUCCESS) return res;
14962 sc->tempLen = sprintf(sc->tempStr, " mult.y = %s(%.17f%s * (2*(combinedID / %" PRIu64 ")+1) );\n", sinDef, -double_PI / 8 / sc->fftDim, LFending, sc->localSize[0]);
14963 res = VkAppendLine(sc);
14964 if (res != VKFFT_SUCCESS) return res;
14965 }
14966 if (sc->zeropad[1]) {
14967 sc->tempLen = sprintf(sc->tempStr, " if((%s %% %" PRIu64 " < %" PRIu64 ")||(%s %% %" PRIu64 " >= %" PRIu64 ")){\n", index_y, sc->fft_dim_full, sc->fft_zeropad_left_read[sc->axis_id], index_y, sc->fft_dim_full, sc->fft_zeropad_right_read[sc->axis_id]);
14968 res = VkAppendLine(sc);
14969 if (res != VKFFT_SUCCESS) return res;
14970 }
14971 if (sc->outputBufferBlockNum == 1)
14972 sc->tempLen = sprintf(sc->tempStr, " %s[inoutID] = %s(%s.x*mult.x - %s.y*mult.y)%s;\n", outputsStruct, convTypeLeft, sc->regIDs[i + k * sc->registers_per_thread], sc->regIDs[i + k * sc->registers_per_thread], convTypeRight);
14973 else
14974 sc->tempLen = sprintf(sc->tempStr, " outputBlocks[inoutID / %" PRIu64 "]%s[inoutID %% %" PRIu64 "] = %s(%s.x*mult.x - %s.y*mult.y)%s;\n", sc->outputBufferBlockSize, outputsStruct, sc->outputBufferBlockSize, convTypeLeft, sc->regIDs[i + k * sc->registers_per_thread], sc->regIDs[i + k * sc->registers_per_thread], convTypeRight);
14975 res = VkAppendLine(sc);
14976 if (res != VKFFT_SUCCESS) return res;
14977 if (sc->zeropad[1]) {
14978 sc->tempLen = sprintf(sc->tempStr, " }\n");
14979 res = VkAppendLine(sc);
14980 if (res != VKFFT_SUCCESS) return res;
14981 }
14982 sc->tempLen = sprintf(sc->tempStr, " %s = ", sc->inoutID);
14983 res = VkAppendLine(sc);
14984 if (res != VKFFT_SUCCESS) return res;
14985 sprintf(index_x, "(%s%s) %% (%" PRIu64 ")", sc->gl_GlobalInvocationID_x, shiftX2, sc->fft_dim_x);
14986 sprintf(index_y, "(%" PRIu64 " - (%s + %" PRIu64 "))", 2 * sc->fftDim - 1, sc->gl_LocalInvocationID_y, (i + k * 2 * sc->min_registers_per_thread) * sc->localSize[1]);
14987 res = indexOutputVkFFT(sc, uintType, writeType, index_x, index_y, requestCoordinate, requestBatch);
14988 if (res != VKFFT_SUCCESS) return res;
14989 sc->tempLen = sprintf(sc->tempStr, ";\n");
14990 res = VkAppendLine(sc);
14991 if (res != VKFFT_SUCCESS) return res;
14992 if (sc->zeropad[1]) {
14993 sc->tempLen = sprintf(sc->tempStr, " if((%s %% %" PRIu64 " < %" PRIu64 ")||(%s %% %" PRIu64 " >= %" PRIu64 ")){\n", index_y, sc->fft_dim_full, sc->fft_zeropad_left_read[sc->axis_id], index_y, sc->fft_dim_full, sc->fft_zeropad_right_read[sc->axis_id]);
14994 res = VkAppendLine(sc);
14995 if (res != VKFFT_SUCCESS) return res;
14996 }
14997 if (sc->outputBufferBlockNum == 1)
14998 sc->tempLen = sprintf(sc->tempStr, " %s[inoutID] = %s(-%s.x*mult.y - %s.y*mult.x)%s;\n", outputsStruct, convTypeLeft, sc->regIDs[i + k * sc->registers_per_thread], sc->regIDs[i + k * sc->registers_per_thread], convTypeRight);
14999 else
15000 sc->tempLen = sprintf(sc->tempStr, " outputBlocks[inoutID / %" PRIu64 "]%s[inoutID %% %" PRIu64 "] = %s(-%s.x*mult.y - %s.y*mult.x)%s;\n", sc->outputBufferBlockSize, outputsStruct, sc->outputBufferBlockSize, convTypeLeft, sc->regIDs[i + k * sc->registers_per_thread], sc->regIDs[i + k * sc->registers_per_thread], convTypeRight);
15001 res = VkAppendLine(sc);
15002 if (res != VKFFT_SUCCESS) return res;
15004 if (res != VKFFT_SUCCESS) return res;
15005 if (sc->zeropad[1]) {
15006 sc->tempLen = sprintf(sc->tempStr, " }\n");
15007 res = VkAppendLine(sc);
15008 if (res != VKFFT_SUCCESS) return res;
15009 }
15010 if ((1 + i + k * sc->min_registers_per_thread) * sc->localSize[0] * sc->localSize[1] >= (sc->fftDim) * sc->localSize[0]) {
15011 sc->tempLen = sprintf(sc->tempStr, " }\n");
15012 res = VkAppendLine(sc);
15013 if (res != VKFFT_SUCCESS) return res;
15014 }
15015 if ((uint64_t)ceil(sc->size[0]) % sc->localSize[0] != 0) {
15016 sc->tempLen = sprintf(sc->tempStr, " }\n");
15017 res = VkAppendLine(sc);
15018 if (res != VKFFT_SUCCESS) return res;
15019 }
15020 if (sc->zeropadBluestein[1]) {
15021 sc->tempLen = sprintf(sc->tempStr, " }\n");
15022 res = VkAppendLine(sc);
15023 if (res != VKFFT_SUCCESS) return res;
15024 }
15025 }
15026 }
15027 if (sc->zeropadBluestein[1]) sc->fftDim = sc->fft_dim_full;
15028 }
15029 else {
15030
15031 }
15032 }
15033 break;
15034 }
15035 case 144://odd DCT-IV nonstrided as N FFT
15036 {
15037 if (!sc->writeFromRegisters) {
15038 res = appendBarrierVkFFT(sc, 1);
15039 if (res != VKFFT_SUCCESS) return res;
15040 }
15041 //res = appendZeropadStart(sc);
15042 //if (res != VKFFT_SUCCESS) return res;
15043 char shiftX[500] = "";
15044 if (sc->performWorkGroupShift[0])
15045 sprintf(shiftX, " + consts.workGroupShiftX ");
15046 char shiftY[500] = "";
15047 if (sc->performWorkGroupShift[1])
15048 sprintf(shiftY, " + consts.workGroupShiftY*%s ", sc->gl_WorkGroupSize_y);
15049 char shiftY2[500] = "";
15050 if (sc->performWorkGroupShift[1])
15051 sprintf(shiftY2, " + consts.workGroupShiftY ");
15052 uint64_t mult = (sc->mergeSequencesR2C) ? 2 : 1;
15053 if (sc->reorderFourStep) {
15054 //Not implemented
15055 }
15056 else {
15057 //appendBarrierVkFFT(sc, 1);
15058 //appendZeropadStart(sc);
15059 if (sc->fftDim == sc->fft_dim_full) {
15061 for (uint64_t k = 0; k < sc->registerBoost; k++) {
15062 if (sc->mergeSequencesR2C) {
15063 if (sc->axisSwapped) {
15064 sc->tempLen = sprintf(sc->tempStr, "\
15065 if (%s==0)\n\
15066 {\n\
15067 sdata[%s + %" PRIu64 "* sharedStride] = sdata[%s];\n\
15069 res = VkAppendLine(sc);
15070 if (res != VKFFT_SUCCESS) return res;
15071 //res = appendZeropadEnd(sc);
15072 //if (res != VKFFT_SUCCESS) return res;
15073 res = appendBarrierVkFFT(sc, 1);
15074 if (res != VKFFT_SUCCESS) return res;
15075 //res = appendZeropadStart(sc);
15076 //if (res != VKFFT_SUCCESS) return res;
15077 }
15078 else {
15079 sc->tempLen = sprintf(sc->tempStr, "\
15080 if (%s==0)\n\
15081 {\n\
15082 sdata[%s * sharedStride + %" PRIu64 "] = sdata[%s * sharedStride];\n\
15084 res = VkAppendLine(sc);
15085 if (res != VKFFT_SUCCESS) return res;
15086 //res = appendZeropadEnd(sc);
15087 //if (res != VKFFT_SUCCESS) return res;
15088 res = appendBarrierVkFFT(sc, 1);
15089 if (res != VKFFT_SUCCESS) return res;
15090 //res = appendZeropadStart(sc);
15091 //if (res != VKFFT_SUCCESS) return res;
15092 }
15093 }
15094 //uint64_t num_out = (sc->axisSwapped) ? (uint64_t)ceil(mult * (sc->fftDim / 2 + 1) / (double)sc->localSize[1]) : (uint64_t)ceil(mult * (sc->fftDim / 2 + 1) / (double)sc->localSize[0]);
15095 //num_out = (uint64_t)ceil(num_out / (double)sc->min_registers_per_thread);
15096 for (uint64_t i = 0; i < mult*sc->min_registers_per_thread; i++) {
15097 if (sc->localSize[1] == 1)
15098 sc->tempLen = sprintf(sc->tempStr, " combinedID = %s + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, (i + k * sc->min_registers_per_thread) * sc->localSize[0]);
15099 else
15100 sc->tempLen = sprintf(sc->tempStr, " combinedID = (%s + %" PRIu64 " * %s) + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, sc->localSize[0], sc->gl_LocalInvocationID_y, (i + k * sc->min_registers_per_thread) * sc->localSize[0] * sc->localSize[1]);
15101 res = VkAppendLine(sc);
15102 if (res != VKFFT_SUCCESS) return res;
15103
15104 sc->tempLen = sprintf(sc->tempStr, " %s = combinedID %% %" PRIu64 " + ((combinedID/%" PRIu64 ") * %" PRIu64 ");\n", sc->inoutID, sc->fftDim, sc->fftDim, sc->outputStride[1]);
15105 res = VkAppendLine(sc);
15106 if (res != VKFFT_SUCCESS) return res;
15107
15108 if (sc->axisSwapped) {
15109 if ((uint64_t)ceil(sc->size[1] / (double)mult) % sc->localSize[0] != 0) {
15110 sc->tempLen = sprintf(sc->tempStr, " if(combinedID / %" PRIu64 " + %s*%" PRIu64 "< %" PRIu64 "){\n", mult * sc->fftDim, sc->gl_WorkGroupID_y, sc->localSize[0], (uint64_t)ceil(sc->size[1] / (double)mult));
15111 res = VkAppendLine(sc);
15112 if (res != VKFFT_SUCCESS) return res;
15113 }
15114 if ((1 + i + k * sc->min_registers_per_thread) * sc->localSize[0] * sc->localSize[1] >= mult * sc->fftDim * sc->localSize[0]) {
15115 sc->tempLen = sprintf(sc->tempStr, " if(combinedID < %" PRIu64 "){\n", mult * sc->fftDim * sc->localSize[0]);
15116 res = VkAppendLine(sc);
15117 if (res != VKFFT_SUCCESS) return res;
15118 }
15119 }
15120 else {
15121 if ((uint64_t)ceil(sc->size[1] / (double)mult) % sc->localSize[1] != 0) {
15122 sc->tempLen = sprintf(sc->tempStr, " if(combinedID / %" PRIu64 " + %s*%" PRIu64 "< %" PRIu64 "){\n", mult * sc->fftDim, sc->gl_WorkGroupID_y, sc->localSize[1], (uint64_t)ceil(sc->size[1] / (double)mult));
15123 res = VkAppendLine(sc);
15124 if (res != VKFFT_SUCCESS) return res;
15125 }
15126 if ((1 + i + k * sc->min_registers_per_thread) * sc->localSize[0] * sc->localSize[1] >= mult * sc->fftDim * sc->localSize[1]) {
15127 sc->tempLen = sprintf(sc->tempStr, " if(combinedID < %" PRIu64 "){\n", mult * sc->fftDim * sc->localSize[1]);
15128 res = VkAppendLine(sc);
15129 if (res != VKFFT_SUCCESS) return res;
15130 }
15131 }
15132 if (sc->zeropad[1]) {
15133 sc->tempLen = sprintf(sc->tempStr, " if((inoutID %% %" PRIu64 " < %" PRIu64 ")||(inoutID %% %" PRIu64 " >= %" PRIu64 ")){\n", sc->outputStride[1], sc->fft_zeropad_left_write[sc->axis_id], sc->outputStride[1], sc->fft_zeropad_right_write[sc->axis_id]);
15134 res = VkAppendLine(sc);
15135 if (res != VKFFT_SUCCESS) return res;
15136 }
15137 sc->tempLen = sprintf(sc->tempStr, " %s = ", sc->inoutID);
15138 res = VkAppendLine(sc);
15139 if (res != VKFFT_SUCCESS) return res;
15140 res = indexOutputVkFFT(sc, uintType, writeType, sc->inoutID, 0, requestCoordinate, requestBatch);
15141 sc->tempLen = sprintf(sc->tempStr, ";\n");
15142 res = VkAppendLine(sc);
15143 if (res != VKFFT_SUCCESS) return res;
15145 if (res != VKFFT_SUCCESS) return res;
15146 if (sc->writeFromRegisters) {
15147 //not working yet
15148 if (sc->outputBufferBlockNum == 1)
15149 sc->tempLen = sprintf(sc->tempStr, " %s[inoutID] = %s%s%s;\n", outputsStruct, convTypeLeft, sc->regIDs[i + k * sc->registers_per_thread], convTypeRight);
15150 else
15151 sc->tempLen = sprintf(sc->tempStr, " outputBlocks[inoutID / %" PRIu64 "]%s[inoutID %% %" PRIu64 "] = %s%s%s;\n", sc->outputBufferBlockSize, outputsStruct, sc->outputBufferBlockSize, convTypeLeft, sc->regIDs[i + k * sc->registers_per_thread], convTypeRight);
15152 res = VkAppendLine(sc);
15153 if (res != VKFFT_SUCCESS) return res;
15154 }
15155 else {
15156 sc->tempLen = sprintf(sc->tempStr, " sdataID = combinedID %% %" PRIu64 ";\n", sc->fftDim);
15157 res = VkAppendLine(sc);
15158 if (res != VKFFT_SUCCESS) return res;
15159 sc->tempLen = sprintf(sc->tempStr, " if(sdataID < %" PRIu64 "){\n", sc->fftDim/4);
15160 res = VkAppendLine(sc);
15161 if (res != VKFFT_SUCCESS) return res;
15162 if (sc->mergeSequencesR2C) {
15163 if (sc->axisSwapped) {
15164 sc->tempLen = sprintf(sc->tempStr, "if ( (combinedID / %" PRIu64 ") %% 2 == 0){\n", sc->fftDim);
15165 res = VkAppendLine(sc);
15166 if (res != VKFFT_SUCCESS) return res;
15167 sc->tempLen = sprintf(sc->tempStr, " %s.x = 0.5%s*(sdata[(2*sdataID+1) * sharedStride + (combinedID / %" PRIu64 ")].x+sdata[(%" PRIu64 "- (2*sdataID+1)) * sharedStride + (combinedID / %" PRIu64 ")].x);\n", sc->regIDs[0], LFending, mult*sc->fftDim, sc->fftDim, mult * sc->fftDim);
15168 res = VkAppendLine(sc);
15169 if (res != VKFFT_SUCCESS) return res;
15170 sc->tempLen = sprintf(sc->tempStr, " %s.y = 0.5%s*(sdata[(2*sdataID+1) * sharedStride + (combinedID / %" PRIu64 ")].y-sdata[(%" PRIu64 "- (2*sdataID+1)) * sharedStride + (combinedID / %" PRIu64 ")].y);\n", sc->regIDs[0], LFending, mult * sc->fftDim, sc->fftDim, mult * sc->fftDim);
15171 res = VkAppendLine(sc);
15172 if (res != VKFFT_SUCCESS) return res;
15173 sc->tempLen = sprintf(sc->tempStr, "}else{\n");
15174 res = VkAppendLine(sc);
15175 if (res != VKFFT_SUCCESS) return res;
15176 sc->tempLen = sprintf(sc->tempStr, " %s.x = 0.5%s*(sdata[(2*sdataID+1) * sharedStride + (combinedID / %" PRIu64 ")].y+sdata[(%" PRIu64 "- (2*sdataID+1)) * sharedStride + (combinedID / %" PRIu64 ")].y);\n", sc->regIDs[0], LFending, mult * sc->fftDim, sc->fftDim, mult * sc->fftDim);
15177 res = VkAppendLine(sc);
15178 if (res != VKFFT_SUCCESS) return res;
15179 sc->tempLen = sprintf(sc->tempStr, " %s.y = 0.5%s*(-sdata[(2*sdataID+1) * sharedStride + (combinedID / %" PRIu64 ")].x+sdata[(%" PRIu64 "- (2*sdataID+1)) * sharedStride + (combinedID / %" PRIu64 ")].x);\n", sc->regIDs[0], LFending, mult * sc->fftDim, sc->fftDim, mult * sc->fftDim);
15180 res = VkAppendLine(sc);
15181 if (res != VKFFT_SUCCESS) return res;
15182 sc->tempLen = sprintf(sc->tempStr, "}\n");
15183 res = VkAppendLine(sc);
15184 if (res != VKFFT_SUCCESS) return res;
15185 }
15186 else {
15187 sc->tempLen = sprintf(sc->tempStr, "if ( (combinedID / %" PRIu64 ") %% 2 == 0){\n", sc->fftDim);
15188 res = VkAppendLine(sc);
15189 if (res != VKFFT_SUCCESS) return res;
15190 sc->tempLen = sprintf(sc->tempStr, " %s.x = 0.5%s*(sdata[(2*sdataID+1) + (combinedID / %" PRIu64 ") * sharedStride].x+sdata[(%" PRIu64 "- (2*sdataID+1)) + (combinedID / %" PRIu64 ") * sharedStride].x);\n", sc->regIDs[0], LFending, mult * sc->fftDim, sc->fftDim, mult * sc->fftDim);
15191 res = VkAppendLine(sc);
15192 if (res != VKFFT_SUCCESS) return res;
15193 sc->tempLen = sprintf(sc->tempStr, " %s.y = 0.5%s*(sdata[(2*sdataID+1) + (combinedID / %" PRIu64 ") * sharedStride].y-sdata[(%" PRIu64 "- (2*sdataID+1)) + (combinedID / %" PRIu64 ") * sharedStride].y);\n", sc->regIDs[0], LFending, mult * sc->fftDim, sc->fftDim, mult * sc->fftDim);
15194 res = VkAppendLine(sc);
15195 if (res != VKFFT_SUCCESS) return res;
15196 sc->tempLen = sprintf(sc->tempStr, "}else{\n");
15197 res = VkAppendLine(sc);
15198 if (res != VKFFT_SUCCESS) return res;
15199 sc->tempLen = sprintf(sc->tempStr, " %s.x = 0.5%s*(sdata[(2*sdataID+1) + (combinedID / %" PRIu64 ") * sharedStride].y+sdata[(%" PRIu64 "- (2*sdataID+1)) + (combinedID / %" PRIu64 ") * sharedStride].y);\n", sc->regIDs[0], LFending, mult * sc->fftDim, sc->fftDim, mult * sc->fftDim);
15200 res = VkAppendLine(sc);
15201 if (res != VKFFT_SUCCESS) return res;
15202 sc->tempLen = sprintf(sc->tempStr, " %s.y = 0.5%s*(-sdata[(2*sdataID+1) + (combinedID / %" PRIu64 ") * sharedStride].x+sdata[(%" PRIu64 "- (2*sdataID+1)) + (combinedID / %" PRIu64 ") * sharedStride].x);\n", sc->regIDs[0], LFending, mult * sc->fftDim, sc->fftDim, mult * sc->fftDim);
15203 res = VkAppendLine(sc);
15204 if (res != VKFFT_SUCCESS) return res;
15205 sc->tempLen = sprintf(sc->tempStr, "}\n");
15206 res = VkAppendLine(sc);
15207 if (res != VKFFT_SUCCESS) return res;
15208 }
15209 }
15210 else {
15211 if (!sc->axisSwapped)
15212 sc->tempLen = sprintf(sc->tempStr, " %s = sdata[(2*sdataID+1) + (combinedID / %" PRIu64 ") * sharedStride];\n", sc->regIDs[0], sc->fftDim);
15213 else
15214 sc->tempLen = sprintf(sc->tempStr, " %s = sdata[(2*sdataID+1) * sharedStride + (combinedID / %" PRIu64 ")];\n", sc->regIDs[0], sc->fftDim);
15215 res = VkAppendLine(sc);
15216 if (res != VKFFT_SUCCESS) return res;
15217 }
15218 sc->tempLen = sprintf(sc->tempStr, " if ((((sdataID + 1)/2) %% 2) != 0) \n\
15219 %s.x = -%s.x;\n\
15220 else\n\
15221 %s.x = %s.x;\n", sc->regIDs[1], sc->regIDs[0], sc->regIDs[1], sc->regIDs[0]);
15222 res = VkAppendLine(sc);
15223 if (res != VKFFT_SUCCESS) return res;
15224 sc->tempLen = sprintf(sc->tempStr, " if ((((sdataID)/2) %% 2) != 0) \n\
15225 %s.x += %s.y;\n\
15226 else\n\
15227 %s.x -= %s.y;\n", sc->regIDs[1], sc->regIDs[0], sc->regIDs[1], sc->regIDs[0]);
15228 res = VkAppendLine(sc);
15229 if (res != VKFFT_SUCCESS) return res;
15230 sc->tempLen = sprintf(sc->tempStr, " }\n\n");
15231
15232
15233 res = VkAppendLine(sc);
15234 if (res != VKFFT_SUCCESS) return res;
15235 sc->tempLen = sprintf(sc->tempStr, " if((sdataID < %" PRIu64 ")&&(sdataID >= %" PRIu64 ")){\n", sc->fftDim/2, sc->fftDim/4);
15236 res = VkAppendLine(sc);
15237 if (res != VKFFT_SUCCESS) return res;
15238 if (sc->mergeSequencesR2C) {
15239 if (sc->axisSwapped) {
15240 sc->tempLen = sprintf(sc->tempStr, "if ( (combinedID / %" PRIu64 ") %% 2 == 0){\n", sc->fftDim);
15241 res = VkAppendLine(sc);
15242 if (res != VKFFT_SUCCESS) return res;
15243 sc->tempLen = sprintf(sc->tempStr, " %s.x = 0.5%s*(sdata[(%" PRIu64 " - 2*sdataID) * sharedStride + (combinedID / %" PRIu64 ")].x+sdata[(%" PRIu64 " + 2*sdataID) * sharedStride + (combinedID / %" PRIu64 ")].x);\n", sc->regIDs[0], LFending, 2 * (sc->fftDim / 2), mult * sc->fftDim, sc->fftDim - 2 * (sc->fftDim / 2), mult * sc->fftDim);
15244 res = VkAppendLine(sc);
15245 if (res != VKFFT_SUCCESS) return res;
15246 sc->tempLen = sprintf(sc->tempStr, " %s.y = 0.5%s*(sdata[(%" PRIu64 " - 2*sdataID) * sharedStride + (combinedID / %" PRIu64 ")].y-sdata[(%" PRIu64 " + 2*sdataID) * sharedStride + (combinedID / %" PRIu64 ")].y);\n", sc->regIDs[0], LFending, 2 * (sc->fftDim / 2), mult * sc->fftDim, sc->fftDim - 2 * (sc->fftDim / 2), mult * sc->fftDim);
15247 res = VkAppendLine(sc);
15248 if (res != VKFFT_SUCCESS) return res;
15249 sc->tempLen = sprintf(sc->tempStr, "}else{\n");
15250 res = VkAppendLine(sc);
15251 if (res != VKFFT_SUCCESS) return res;
15252 sc->tempLen = sprintf(sc->tempStr, " %s.x = 0.5%s*(sdata[(%" PRIu64 " - 2*sdataID) * sharedStride + (combinedID / %" PRIu64 ")].y+sdata[(%" PRIu64 " + 2*sdataID) * sharedStride + (combinedID / %" PRIu64 ")].y);\n", sc->regIDs[0], LFending, 2 * (sc->fftDim / 2), mult * sc->fftDim, sc->fftDim - 2 * (sc->fftDim / 2), mult * sc->fftDim);
15253 res = VkAppendLine(sc);
15254 if (res != VKFFT_SUCCESS) return res;
15255 sc->tempLen = sprintf(sc->tempStr, " %s.y = 0.5%s*(-sdata[(%" PRIu64 " - 2*sdataID) * sharedStride + (combinedID / %" PRIu64 ")].x+sdata[(%" PRIu64 " + 2*sdataID) * sharedStride + (combinedID / %" PRIu64 ")].x);\n", sc->regIDs[0], LFending, 2 * (sc->fftDim / 2), mult * sc->fftDim, sc->fftDim - 2 * (sc->fftDim / 2), mult * sc->fftDim);
15256 res = VkAppendLine(sc);
15257 if (res != VKFFT_SUCCESS) return res;
15258 sc->tempLen = sprintf(sc->tempStr, "}\n");
15259 res = VkAppendLine(sc);
15260 if (res != VKFFT_SUCCESS) return res;
15261 }
15262 else {
15263 sc->tempLen = sprintf(sc->tempStr, "if ( (combinedID / %" PRIu64 ") %% 2 == 0){\n", sc->fftDim);
15264 res = VkAppendLine(sc);
15265 if (res != VKFFT_SUCCESS) return res;
15266 sc->tempLen = sprintf(sc->tempStr, " %s.x = 0.5%s*(sdata[(%" PRIu64 " - 2*sdataID) + (combinedID / %" PRIu64 ") * sharedStride].x+sdata[(%" PRIu64 " + 2*sdataID) + (combinedID / %" PRIu64 ") * sharedStride].x);\n", sc->regIDs[0], LFending, 2 * (sc->fftDim / 2), mult * sc->fftDim, sc->fftDim - 2 * (sc->fftDim / 2), mult * sc->fftDim);
15267 res = VkAppendLine(sc);
15268 if (res != VKFFT_SUCCESS) return res;
15269 sc->tempLen = sprintf(sc->tempStr, " %s.y = 0.5%s*(sdata[(%" PRIu64 " - 2*sdataID) + (combinedID / %" PRIu64 ") * sharedStride].y-sdata[(%" PRIu64 " + 2*sdataID) + (combinedID / %" PRIu64 ") * sharedStride].y);\n", sc->regIDs[0], LFending, 2 * (sc->fftDim / 2), mult * sc->fftDim, sc->fftDim - 2 * (sc->fftDim / 2), mult * sc->fftDim);
15270 res = VkAppendLine(sc);
15271 if (res != VKFFT_SUCCESS) return res;
15272 sc->tempLen = sprintf(sc->tempStr, "}else{\n");
15273 res = VkAppendLine(sc);
15274 if (res != VKFFT_SUCCESS) return res;
15275 sc->tempLen = sprintf(sc->tempStr, " %s.x = 0.5%s*(sdata[(%" PRIu64 " - 2*sdataID) + (combinedID / %" PRIu64 ") * sharedStride].y+sdata[(%" PRIu64 " + 2*sdataID) + (combinedID / %" PRIu64 ") * sharedStride].y);\n", sc->regIDs[0], LFending, 2 * (sc->fftDim / 2), mult * sc->fftDim, sc->fftDim - 2 * (sc->fftDim / 2), mult * sc->fftDim);
15276 res = VkAppendLine(sc);
15277 if (res != VKFFT_SUCCESS) return res;
15278 sc->tempLen = sprintf(sc->tempStr, " %s.y = 0.5%s*(-sdata[(%" PRIu64 " - 2*sdataID) + (combinedID / %" PRIu64 ") * sharedStride].x+sdata[(%" PRIu64 " + 2*sdataID) + (combinedID / %" PRIu64 ") * sharedStride].x);\n", sc->regIDs[0], LFending, 2 * (sc->fftDim / 2), mult * sc->fftDim, sc->fftDim - 2 * (sc->fftDim / 2), mult * sc->fftDim);
15279 res = VkAppendLine(sc);
15280 if (res != VKFFT_SUCCESS) return res;
15281 sc->tempLen = sprintf(sc->tempStr, "}\n");
15282 res = VkAppendLine(sc);
15283 if (res != VKFFT_SUCCESS) return res;
15284 }
15285 }
15286 else {
15287 if (!sc->axisSwapped)
15288 sc->tempLen = sprintf(sc->tempStr, " %s = sdata[(%" PRIu64 " - 2*sdataID) + (combinedID / %" PRIu64 ") * sharedStride];\n", sc->regIDs[0], 2 * (sc->fftDim / 2), sc->fftDim);
15289 else
15290 sc->tempLen = sprintf(sc->tempStr, " %s = sdata[(%" PRIu64 " - 2*sdataID) * sharedStride + (combinedID / %" PRIu64 ")];\n", sc->regIDs[0], 2 * (sc->fftDim / 2), sc->fftDim);
15291 res = VkAppendLine(sc);
15292 if (res != VKFFT_SUCCESS) return res;
15293 }
15294 sc->tempLen = sprintf(sc->tempStr, " if ((((sdataID+1)/2) %% 2) != 0) \n\
15295 %s.x = -%s.x;\n\
15296 else\n\
15297 %s.x = %s.x;\n", sc->regIDs[1], sc->regIDs[0], sc->regIDs[1], sc->regIDs[0]);
15298 res = VkAppendLine(sc);
15299 if (res != VKFFT_SUCCESS) return res;
15300 sc->tempLen = sprintf(sc->tempStr, " if ((((sdataID)/2) %% 2) != 0) \n\
15301 %s.x -= %s.y;\n\
15302 else\n\
15303 %s.x += %s.y;\n", sc->regIDs[1], sc->regIDs[0], sc->regIDs[1], sc->regIDs[0]);
15304 res = VkAppendLine(sc);
15305 if (res != VKFFT_SUCCESS) return res;
15306 sc->tempLen = sprintf(sc->tempStr, " }\n\n");
15307 res = VkAppendLine(sc);
15308 if (res != VKFFT_SUCCESS) return res;
15309
15310
15311 sc->tempLen = sprintf(sc->tempStr, " if((sdataID < %" PRIu64 ")&&(sdataID >= %" PRIu64 ")){\n", 3 * sc->fftDim / 4, sc->fftDim / 2);
15312 res = VkAppendLine(sc);
15313 if (res != VKFFT_SUCCESS) return res;
15314 if (sc->mergeSequencesR2C) {
15315 if (sc->axisSwapped) {
15316 sc->tempLen = sprintf(sc->tempStr, "if ( (combinedID / %" PRIu64 ") %% 2 == 0){\n", sc->fftDim);
15317 res = VkAppendLine(sc);
15318 if (res != VKFFT_SUCCESS) return res;
15319 sc->tempLen = sprintf(sc->tempStr, " %s.x = 0.5%s*(sdata[(2*sdataID - %" PRIu64 ") * sharedStride + (combinedID / %" PRIu64 ")].x+sdata[(%" PRIu64 " - 2*sdataID) * sharedStride + (combinedID / %" PRIu64 ")].x);\n", sc->regIDs[0], LFending, 2 * (sc->fftDim / 2), mult * sc->fftDim, sc->fftDim + 2 * (sc->fftDim / 2), mult * sc->fftDim);
15320 res = VkAppendLine(sc);
15321 if (res != VKFFT_SUCCESS) return res;
15322 sc->tempLen = sprintf(sc->tempStr, " %s.y = 0.5%s*(sdata[(2*sdataID - %" PRIu64 ") * sharedStride + (combinedID / %" PRIu64 ")].y-sdata[(%" PRIu64 " - 2*sdataID) * sharedStride + (combinedID / %" PRIu64 ")].y);\n", sc->regIDs[0], LFending, 2 * (sc->fftDim / 2), mult * sc->fftDim, sc->fftDim + 2 * (sc->fftDim / 2), mult * sc->fftDim);
15323 res = VkAppendLine(sc);
15324 if (res != VKFFT_SUCCESS) return res;
15325 sc->tempLen = sprintf(sc->tempStr, "}else{\n");
15326 res = VkAppendLine(sc);
15327 if (res != VKFFT_SUCCESS) return res;
15328 sc->tempLen = sprintf(sc->tempStr, " %s.x = 0.5%s*(sdata[(2*sdataID - %" PRIu64 ") * sharedStride + (combinedID / %" PRIu64 ")].y+sdata[(%" PRIu64 " - 2*sdataID) * sharedStride + (combinedID / %" PRIu64 ")].y);\n", sc->regIDs[0], LFending, 2 * (sc->fftDim / 2), mult * sc->fftDim, sc->fftDim + 2 * (sc->fftDim / 2), mult * sc->fftDim);
15329 res = VkAppendLine(sc);
15330 if (res != VKFFT_SUCCESS) return res;
15331 sc->tempLen = sprintf(sc->tempStr, " %s.y = 0.5%s*(-sdata[(2*sdataID - %" PRIu64 ") * sharedStride + (combinedID / %" PRIu64 ")].x+sdata[(%" PRIu64 " - 2*sdataID) * sharedStride + (combinedID / %" PRIu64 ")].x);\n", sc->regIDs[0], LFending, 2 * (sc->fftDim / 2), mult * sc->fftDim, sc->fftDim + 2 * (sc->fftDim / 2), mult * sc->fftDim);
15332 res = VkAppendLine(sc);
15333 if (res != VKFFT_SUCCESS) return res;
15334 sc->tempLen = sprintf(sc->tempStr, "}\n");
15335 res = VkAppendLine(sc);
15336 if (res != VKFFT_SUCCESS) return res;
15337 }
15338 else {
15339 sc->tempLen = sprintf(sc->tempStr, "if ( (combinedID / %" PRIu64 ") %% 2 == 0){\n", sc->fftDim);
15340 res = VkAppendLine(sc);
15341 if (res != VKFFT_SUCCESS) return res;
15342 sc->tempLen = sprintf(sc->tempStr, " %s.x = 0.5%s*(sdata[(2*sdataID - %" PRIu64 ") + (combinedID / %" PRIu64 ") * sharedStride].x+sdata[(%" PRIu64 " - 2*sdataID) + (combinedID / %" PRIu64 ") * sharedStride].x);\n", sc->regIDs[0], LFending, 2 * (sc->fftDim / 2), mult * sc->fftDim, sc->fftDim + 2 * (sc->fftDim / 2), mult * sc->fftDim);
15343 res = VkAppendLine(sc);
15344 if (res != VKFFT_SUCCESS) return res;
15345 sc->tempLen = sprintf(sc->tempStr, " %s.y = 0.5%s*(sdata[(2*sdataID - %" PRIu64 ") + (combinedID / %" PRIu64 ") * sharedStride].y-sdata[(%" PRIu64 " - 2*sdataID) + (combinedID / %" PRIu64 ") * sharedStride].y);\n", sc->regIDs[0], LFending, 2 * (sc->fftDim / 2), mult * sc->fftDim, sc->fftDim + 2 * (sc->fftDim / 2), mult * sc->fftDim);
15346 res = VkAppendLine(sc);
15347 if (res != VKFFT_SUCCESS) return res;
15348 sc->tempLen = sprintf(sc->tempStr, "}else{\n");
15349 res = VkAppendLine(sc);
15350 if (res != VKFFT_SUCCESS) return res;
15351 sc->tempLen = sprintf(sc->tempStr, " %s.x = 0.5%s*(sdata[(2*sdataID - %" PRIu64 ") + (combinedID / %" PRIu64 ") * sharedStride].y+sdata[(%" PRIu64 " - 2*sdataID) + (combinedID / %" PRIu64 ") * sharedStride].y);\n", sc->regIDs[0], LFending, 2 * (sc->fftDim / 2), mult * sc->fftDim, sc->fftDim + 2 * (sc->fftDim / 2), mult * sc->fftDim);
15352 res = VkAppendLine(sc);
15353 if (res != VKFFT_SUCCESS) return res;
15354 sc->tempLen = sprintf(sc->tempStr, " %s.y = 0.5%s*(-sdata[(2*sdataID - %" PRIu64 ") + (combinedID / %" PRIu64 ") * sharedStride].x+sdata[(%" PRIu64 " - 2*sdataID) + (combinedID / %" PRIu64 ") * sharedStride].x);\n", sc->regIDs[0], LFending, 2 * (sc->fftDim / 2), mult * sc->fftDim, sc->fftDim + 2 * (sc->fftDim / 2), mult * sc->fftDim);
15355 res = VkAppendLine(sc);
15356 if (res != VKFFT_SUCCESS) return res;
15357 sc->tempLen = sprintf(sc->tempStr, "}\n");
15358 res = VkAppendLine(sc);
15359 if (res != VKFFT_SUCCESS) return res;
15360 }
15361 }
15362 else {
15363 if (!sc->axisSwapped)
15364 sc->tempLen = sprintf(sc->tempStr, " %s = sdata[(2*sdataID - %" PRIu64 ") + (combinedID / %" PRIu64 ") * sharedStride];\n", sc->regIDs[0], 2 * (sc->fftDim / 2), sc->fftDim);
15365 else
15366 sc->tempLen = sprintf(sc->tempStr, " %s = sdata[(2*sdataID - %" PRIu64 ") * sharedStride + (combinedID / %" PRIu64 ")];\n", sc->regIDs[0], 2 * (sc->fftDim / 2), sc->fftDim);
15367 res = VkAppendLine(sc);
15368 if (res != VKFFT_SUCCESS) return res;
15369 }
15370 sc->tempLen = sprintf(sc->tempStr, " if ((((sdataID+1)/2) %% 2) != 0) \n\
15371 %s.x = -%s.x;\n\
15372 else\n\
15373 %s.x = %s.x;\n", sc->regIDs[1], sc->regIDs[0], sc->regIDs[1], sc->regIDs[0]);
15374 res = VkAppendLine(sc);
15375 if (res != VKFFT_SUCCESS) return res;
15376 sc->tempLen = sprintf(sc->tempStr, " if ((((sdataID)/2) %% 2) != 0) \n\
15377 %s.x += %s.y;\n\
15378 else\n\
15379 %s.x -= %s.y;\n", sc->regIDs[1], sc->regIDs[0], sc->regIDs[1], sc->regIDs[0]);
15380 res = VkAppendLine(sc);
15381 if (res != VKFFT_SUCCESS) return res;
15382 sc->tempLen = sprintf(sc->tempStr, " }\n\n");
15383 res = VkAppendLine(sc);
15384 if (res != VKFFT_SUCCESS) return res;
15385
15386
15387 sc->tempLen = sprintf(sc->tempStr, " if((sdataID >= %" PRIu64 ")){\n", 3*sc->fftDim / 4);
15388 res = VkAppendLine(sc);
15389 if (res != VKFFT_SUCCESS) return res;
15390 if (sc->mergeSequencesR2C) {
15391 if (sc->axisSwapped) {
15392 sc->tempLen = sprintf(sc->tempStr, "if ( (combinedID / %" PRIu64 ") %% 2 == 0){\n", sc->fftDim);
15393 res = VkAppendLine(sc);
15394 if (res != VKFFT_SUCCESS) return res;
15395 sc->tempLen = sprintf(sc->tempStr, " %s.x = 0.5%s*(sdata[(%" PRIu64 " - 2*sdataID) * sharedStride + (combinedID / %" PRIu64 ")].x+sdata[(2*sdataID - %" PRIu64 ") * sharedStride + (combinedID / %" PRIu64 ")].x);\n", sc->regIDs[0], LFending, 2 * sc->fftDim - 1, mult * sc->fftDim, sc->fftDim - 1, mult * sc->fftDim);
15396 res = VkAppendLine(sc);
15397 if (res != VKFFT_SUCCESS) return res;
15398 sc->tempLen = sprintf(sc->tempStr, " %s.y = 0.5%s*(sdata[(%" PRIu64 " - 2*sdataID) * sharedStride + (combinedID / %" PRIu64 ")].y-sdata[(2*sdataID - %" PRIu64 ") * sharedStride + (combinedID / %" PRIu64 ")].y);\n", sc->regIDs[0], LFending, 2 * sc->fftDim - 1, mult * sc->fftDim, sc->fftDim - 1, mult * sc->fftDim);
15399 res = VkAppendLine(sc);
15400 if (res != VKFFT_SUCCESS) return res;
15401 sc->tempLen = sprintf(sc->tempStr, "}else{\n");
15402 res = VkAppendLine(sc);
15403 if (res != VKFFT_SUCCESS) return res;
15404 sc->tempLen = sprintf(sc->tempStr, " %s.x = 0.5%s*(sdata[(%" PRIu64 " - 2*sdataID) * sharedStride + (combinedID / %" PRIu64 ")].y+sdata[(2*sdataID - %" PRIu64 ") * sharedStride + (combinedID / %" PRIu64 ")].y);\n", sc->regIDs[0], LFending, 2 * sc->fftDim - 1, mult * sc->fftDim, sc->fftDim - 1, mult * sc->fftDim);
15405 res = VkAppendLine(sc);
15406 if (res != VKFFT_SUCCESS) return res;
15407 sc->tempLen = sprintf(sc->tempStr, " %s.y = 0.5%s*(-sdata[(%" PRIu64 " - 2*sdataID) * sharedStride + (combinedID / %" PRIu64 ")].x+sdata[(2*sdataID - %" PRIu64 ") * sharedStride + (combinedID / %" PRIu64 ")].x);\n", sc->regIDs[0], LFending, 2 * sc->fftDim - 1, mult * sc->fftDim, sc->fftDim - 1, mult * sc->fftDim);
15408 res = VkAppendLine(sc);
15409 if (res != VKFFT_SUCCESS) return res;
15410 sc->tempLen = sprintf(sc->tempStr, "}\n");
15411 res = VkAppendLine(sc);
15412 if (res != VKFFT_SUCCESS) return res;
15413 }
15414 else {
15415 sc->tempLen = sprintf(sc->tempStr, "if ( (combinedID / %" PRIu64 ") %% 2 == 0){\n", sc->fftDim);
15416 res = VkAppendLine(sc);
15417 if (res != VKFFT_SUCCESS) return res;
15418 sc->tempLen = sprintf(sc->tempStr, " %s.x = 0.5%s*(sdata[(%" PRIu64 " - 2*sdataID) + (combinedID / %" PRIu64 ") * sharedStride].x+sdata[(2*sdataID - %" PRIu64 ") + (combinedID / %" PRIu64 ") * sharedStride].x);\n", sc->regIDs[0], LFending, 2 * sc->fftDim - 1, mult * sc->fftDim, sc->fftDim - 1, mult * sc->fftDim);
15419 res = VkAppendLine(sc);
15420 if (res != VKFFT_SUCCESS) return res;
15421 sc->tempLen = sprintf(sc->tempStr, " %s.y = 0.5%s*(sdata[(%" PRIu64 " - 2*sdataID) + (combinedID / %" PRIu64 ") * sharedStride].y-sdata[(2*sdataID - %" PRIu64 ") + (combinedID / %" PRIu64 ") * sharedStride].y);\n", sc->regIDs[0], LFending, 2 * sc->fftDim - 1, mult * sc->fftDim, sc->fftDim - 1, mult * sc->fftDim);
15422 res = VkAppendLine(sc);
15423 if (res != VKFFT_SUCCESS) return res;
15424 sc->tempLen = sprintf(sc->tempStr, "}else{\n");
15425 res = VkAppendLine(sc);
15426 if (res != VKFFT_SUCCESS) return res;
15427 sc->tempLen = sprintf(sc->tempStr, " %s.x = 0.5%s*(sdata[(%" PRIu64 " - 2*sdataID) + (combinedID / %" PRIu64 ") * sharedStride].y+sdata[(2*sdataID - %" PRIu64 ") + (combinedID / %" PRIu64 ") * sharedStride].y);\n", sc->regIDs[0], LFending, 2 * sc->fftDim - 1, mult * sc->fftDim, sc->fftDim - 1, mult * sc->fftDim);
15428 res = VkAppendLine(sc);
15429 if (res != VKFFT_SUCCESS) return res;
15430 sc->tempLen = sprintf(sc->tempStr, " %s.y = 0.5%s*(-sdata[(%" PRIu64 " - 2*sdataID) + (combinedID / %" PRIu64 ") * sharedStride].x+sdata[(2*sdataID - %" PRIu64 ") + (combinedID / %" PRIu64 ") * sharedStride].x);\n", sc->regIDs[0], LFending, 2 * sc->fftDim - 1, mult * sc->fftDim, sc->fftDim - 1, mult * sc->fftDim);
15431 res = VkAppendLine(sc);
15432 if (res != VKFFT_SUCCESS) return res;
15433 sc->tempLen = sprintf(sc->tempStr, "}\n");
15434 res = VkAppendLine(sc);
15435 if (res != VKFFT_SUCCESS) return res;
15436 }
15437 }
15438 else {
15439 if (!sc->axisSwapped)
15440 sc->tempLen = sprintf(sc->tempStr, " %s = sdata[(%" PRIu64 " - 2*sdataID) + (combinedID / %" PRIu64 ") * sharedStride];\n", sc->regIDs[0], 2 * sc->fftDim - 1, sc->fftDim);
15441 else
15442 sc->tempLen = sprintf(sc->tempStr, " %s = sdata[(%" PRIu64 " - 2*sdataID) * sharedStride + (combinedID / %" PRIu64 ")];\n", sc->regIDs[0], 2 * sc->fftDim - 1, sc->fftDim);
15443 res = VkAppendLine(sc);
15444 if (res != VKFFT_SUCCESS) return res;
15445 }
15446 sc->tempLen = sprintf(sc->tempStr, " if ((((sdataID+1)/2) %% 2) != 0) \n\
15447 %s.x = -%s.x;\n\
15448 else\n\
15449 %s.x = %s.x;\n", sc->regIDs[1], sc->regIDs[0], sc->regIDs[1], sc->regIDs[0]);
15450 res = VkAppendLine(sc);
15451 if (res != VKFFT_SUCCESS) return res;
15452 sc->tempLen = sprintf(sc->tempStr, " if ((((sdataID)/2) %% 2) != 0) \n\
15453 %s.x -= %s.y;\n\
15454 else\n\
15455 %s.x += %s.y;\n", sc->regIDs[1], sc->regIDs[0], sc->regIDs[1], sc->regIDs[0]);
15456 res = VkAppendLine(sc);
15457 if (res != VKFFT_SUCCESS) return res;
15458 sc->tempLen = sprintf(sc->tempStr, " }\n\n");
15459 res = VkAppendLine(sc);
15460 if (res != VKFFT_SUCCESS) return res;
15461 sc->tempLen = sprintf(sc->tempStr, " %s.x *= 1.41421356237309504880%s;\n", sc->regIDs[1], LFending);
15462 res = VkAppendLine(sc);
15463 if (res != VKFFT_SUCCESS) return res;
15464 if (sc->outputBufferBlockNum == 1)
15465 sc->tempLen = sprintf(sc->tempStr, " %s[inoutID] = %s%s.x%s;\n", outputsStruct, convTypeLeft, sc->regIDs[1], convTypeRight);
15466 else
15467 sc->tempLen = sprintf(sc->tempStr, " outputBlocks[inoutID / %" PRIu64 "]%s[inoutID %% %" PRIu64 "] = %s%s.x%s;\n", sc->outputBufferBlockSize, outputsStruct, sc->outputBufferBlockSize, convTypeLeft, sc->regIDs[1], convTypeRight);
15468 res = VkAppendLine(sc);
15469 if (res != VKFFT_SUCCESS) return res;
15470 }
15472 if (res != VKFFT_SUCCESS) return res;
15473 if (sc->zeropad[1]) {
15474 sc->tempLen = sprintf(sc->tempStr, " }\n");
15475 res = VkAppendLine(sc);
15476 if (res != VKFFT_SUCCESS) return res;
15477 }
15478 if (sc->axisSwapped) {
15479 if ((1 + i + k * sc->min_registers_per_thread) * sc->localSize[0] * sc->localSize[1] >= mult * sc->fftDim * sc->localSize[0]) {
15480 sc->tempLen = sprintf(sc->tempStr, " }\n");
15481 res = VkAppendLine(sc);
15482 if (res != VKFFT_SUCCESS) return res;
15483 }
15484 }
15485 else {
15486 if ((1 + i + k * sc->min_registers_per_thread) * sc->localSize[0] * sc->localSize[1] >= mult * sc->fftDim * sc->localSize[1])
15487 {
15488 sc->tempLen = sprintf(sc->tempStr, " }\n");
15489 res = VkAppendLine(sc);
15490 if (res != VKFFT_SUCCESS) return res;
15491 }
15492 }
15493 if (sc->axisSwapped) {
15494 if ((uint64_t)ceil(sc->size[1] / (double)mult) % sc->localSize[0] != 0) {
15495 sc->tempLen = sprintf(sc->tempStr, " }\n");
15496 res = VkAppendLine(sc);
15497 if (res != VKFFT_SUCCESS) return res;
15498 }
15499 }
15500 else {
15501 if ((uint64_t)ceil(sc->size[1] / (double)mult) % sc->localSize[1] != 0) {
15502 sc->tempLen = sprintf(sc->tempStr, " }\n");
15503 res = VkAppendLine(sc);
15504 if (res != VKFFT_SUCCESS) return res;
15505 }
15506 }
15507 }
15508 }
15509 if (sc->zeropadBluestein[1]) sc->fftDim = sc->fft_dim_full;
15510 /*for (uint64_t k = 0; k < sc->registerBoost; k++) {
15511 for (uint64_t i = 0; i < sc->min_registers_per_thread; i++) {
15512 if (sc->localSize[1] == 1)
15513 sc->tempLen = sprintf(sc->tempStr, " combinedID = %s + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, (i + k * sc->min_registers_per_thread) * sc->localSize[0]);
15514 else
15515 sc->tempLen = sprintf(sc->tempStr, " combinedID = (%s + %" PRIu64 " * %s) + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, sc->localSize[0], sc->gl_LocalInvocationID_y, (i + k * sc->min_registers_per_thread) * sc->localSize[0] * sc->localSize[1]);
15516 res = VkAppendLine(sc);
15517 if (res != VKFFT_SUCCESS) return res;
15518 sc->tempLen = sprintf(sc->tempStr, " if(%s + %" PRIu64 " < %" PRIu64 "){\n", sc->gl_LocalInvocationID_x, (i + k * sc->min_registers_per_thread) * sc->localSize[0], (sc->fftDim-1)/2);
15519 res = VkAppendLine(sc);
15520 if (res != VKFFT_SUCCESS) return res;
15521 if (sc->axisSwapped) {
15522 sc->tempLen = sprintf(sc->tempStr, " w = sdata[(2*(combinedID %% %" PRIu64 ")+1)* sharedStride + (combinedID / %" PRIu64 ")];\n",sc->fftDim, sc->fftDim);
15523 res = VkAppendLine(sc);
15524 if (res != VKFFT_SUCCESS) return res;
15525 sc->tempLen = sprintf(sc->tempStr, " %s = sdata[(2*(combinedID %% %" PRIu64 ")+2)* sharedStride + (combinedID / %" PRIu64 ")];\n", sc->regIDs[i + k * sc->min_registers_per_thread], sc->fftDim, sc->fftDim);
15526 res = VkAppendLine(sc);
15527 if (res != VKFFT_SUCCESS) return res;
15528 }
15529 else {
15530 sc->tempLen = sprintf(sc->tempStr, " w = sdata[(2*(combinedID %% %" PRIu64 ")+1) + (combinedID / %" PRIu64 ") * sharedStride];\n", sc->fftDim, sc->fftDim);
15531 res = VkAppendLine(sc);
15532 if (res != VKFFT_SUCCESS) return res;
15533 sc->tempLen = sprintf(sc->tempStr, " %s = sdata[(2*(combinedID %% %" PRIu64 ")+2) + (combinedID / %" PRIu64 ") * sharedStride];\n", sc->regIDs[i + k * sc->min_registers_per_thread], sc->fftDim, sc->fftDim);
15534 res = VkAppendLine(sc);
15535 if (res != VKFFT_SUCCESS) return res;
15536 }
15537 sc->tempLen = sprintf(sc->tempStr, " }else{\n");
15538 res = VkAppendLine(sc);
15539 if (res != VKFFT_SUCCESS) return res;
15540 if (sc->axisSwapped) {
15541 sc->tempLen = sprintf(sc->tempStr, " w = sdata[(2*(%" PRIu64 " - combinedID %% %" PRIu64 ")-1)* sharedStride + (combinedID / %" PRIu64 ")];\n", sc->fftDim, sc->fftDim, sc->fftDim);
15542 res = VkAppendLine(sc);
15543 if (res != VKFFT_SUCCESS) return res;
15544 sc->tempLen = sprintf(sc->tempStr, " %s = sdata[(2*(%" PRIu64 " - combinedID %% %" PRIu64 "))* sharedStride + (combinedID / %" PRIu64 ")];\n", sc->regIDs[i + k * sc->min_registers_per_thread], sc->fftDim, sc->fftDim, sc->fftDim);
15545 res = VkAppendLine(sc);
15546 if (res != VKFFT_SUCCESS) return res;
15547 }
15548 else {
15549 sc->tempLen = sprintf(sc->tempStr, " w = sdata[(2*(%" PRIu64 " - combinedID %% %" PRIu64 ")-1) + (combinedID / %" PRIu64 ")* sharedStride];\n", sc->fftDim, sc->fftDim, sc->fftDim);
15550 res = VkAppendLine(sc);
15551 if (res != VKFFT_SUCCESS) return res;
15552 sc->tempLen = sprintf(sc->tempStr, " %s = sdata[(2*(%" PRIu64 " - combinedID %% %" PRIu64 ")) + (combinedID / %" PRIu64 ")* sharedStride];\n", sc->regIDs[i + k * sc->min_registers_per_thread], sc->fftDim, sc->fftDim, sc->fftDim);
15553 res = VkAppendLine(sc);
15554 if (res != VKFFT_SUCCESS) return res;
15555 }
15556 sc->tempLen = sprintf(sc->tempStr, " }\n");
15557 res = VkAppendLine(sc);
15558 if (res != VKFFT_SUCCESS) return res;
15559
15560 }
15561 }*/
15562
15563 }
15564 else {
15565
15566 }
15567 }
15568 break;
15569 }
15570 case 145://odd DCT-IV strided as N FFT
15571 {
15572 if (!sc->writeFromRegisters) {
15573 res = appendBarrierVkFFT(sc, 1);
15574 if (res != VKFFT_SUCCESS) return res;
15575 }
15576 //res = appendZeropadStart(sc);
15577 //if (res != VKFFT_SUCCESS) return res;
15578 char shiftX[500] = "";
15579 if (sc->performWorkGroupShift[0])
15580 sprintf(shiftX, " + consts.workGroupShiftX*%s ", sc->gl_WorkGroupSize_x);
15581 char shiftY[500] = "";
15582 if (sc->performWorkGroupShift[1])
15583 sprintf(shiftY, " + consts.workGroupShiftY*%s ", sc->gl_WorkGroupSize_y);
15584 char shiftY2[500] = "";
15585 if (sc->performWorkGroupShift[1])
15586 sprintf(shiftY2, " + consts.workGroupShiftY ");
15587 uint64_t mult = (sc->mergeSequencesR2C) ? 2 : 1;
15588 if (sc->reorderFourStep) {
15589 //Not implemented
15590 }
15591 else {
15592 //appendBarrierVkFFT(sc, 1);
15593 //appendZeropadStart(sc);
15594 if (sc->fftDim == sc->fft_dim_full) {
15596 for (uint64_t k = 0; k < sc->registerBoost; k++) {
15597 for (uint64_t i = 0; i < sc->min_registers_per_thread; i++) {
15598 sc->tempLen = sprintf(sc->tempStr, " combinedID = (%s + %" PRIu64 " * %s) + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, sc->localSize[0], sc->gl_LocalInvocationID_y, (i + k * sc->min_registers_per_thread) * sc->localSize[0] * sc->localSize[1]);
15599 res = VkAppendLine(sc);
15600 if (res != VKFFT_SUCCESS) return res;
15601
15602 sc->tempLen = sprintf(sc->tempStr, " %s = %s%s + ((combinedID/%" PRIu64 ") * %" PRIu64 ");\n", sc->inoutID, sc->gl_GlobalInvocationID_x, shiftX, sc->localSize[0], sc->outputStride[1]);
15603 res = VkAppendLine(sc);
15604 if (res != VKFFT_SUCCESS) return res;
15605 if (sc->size[0] % sc->localSize[0] != 0) {
15606 sc->tempLen = sprintf(sc->tempStr, " if(combinedID %% %" PRIu64 " + %s*%" PRIu64 "< %" PRIu64 "){\n", sc->localSize[0], sc->gl_WorkGroupID_x, sc->localSize[0], sc->size[0]);
15607 res = VkAppendLine(sc);
15608 if (res != VKFFT_SUCCESS) return res;
15609 }
15610 if ((1 + i + k * sc->min_registers_per_thread) * sc->localSize[0] * sc->localSize[1] >= sc->fftDim * sc->localSize[0]) {
15611 sc->tempLen = sprintf(sc->tempStr, " if(combinedID < %" PRIu64 "){\n", sc->fftDim * sc->localSize[0]);
15612 res = VkAppendLine(sc);
15613 if (res != VKFFT_SUCCESS) return res;
15614 }
15615 if (sc->zeropad[1]) {
15616 sc->tempLen = sprintf(sc->tempStr, " if(((combinedID/%" PRIu64 ") %% %" PRIu64 " < %" PRIu64 ")||((combinedID/%" PRIu64 ") %% %" PRIu64 " >= %" PRIu64 ")){\n", sc->localSize[0], sc->fft_dim_full, sc->fft_zeropad_left_read[sc->axis_id], sc->localSize[0], sc->fft_dim_full, sc->fft_zeropad_right_read[sc->axis_id]);
15617 res = VkAppendLine(sc);
15618 if (res != VKFFT_SUCCESS) return res;
15619 }
15620 sc->tempLen = sprintf(sc->tempStr, " %s = ", sc->inoutID);
15621 res = VkAppendLine(sc);
15622 if (res != VKFFT_SUCCESS) return res;
15623 res = indexOutputVkFFT(sc, uintType, writeType, sc->inoutID, 0, requestCoordinate, requestBatch);
15624 sc->tempLen = sprintf(sc->tempStr, ";\n");
15625 res = VkAppendLine(sc);
15626 if (res != VKFFT_SUCCESS) return res;
15628 if (res != VKFFT_SUCCESS) return res;
15629 sc->tempLen = sprintf(sc->tempStr, " sdataID = combinedID / %" PRIu64 ";\n", sc->localSize[0]);
15630 res = VkAppendLine(sc);
15631 if (res != VKFFT_SUCCESS) return res;
15632 sc->tempLen = sprintf(sc->tempStr, " if(sdataID < %" PRIu64 "){\n", sc->fftDim / 4);
15633 res = VkAppendLine(sc);
15634 if (res != VKFFT_SUCCESS) return res;
15635 sc->tempLen = sprintf(sc->tempStr, " %s = sdata[(2*sdataID+1) * sharedStride + %s];\n", sc->regIDs[0], sc->gl_LocalInvocationID_x);
15636 res = VkAppendLine(sc);
15637 if (res != VKFFT_SUCCESS) return res;
15638 sc->tempLen = sprintf(sc->tempStr, " if ((((sdataID + 1)/2) %% 2) != 0) \n\
15639 %s.x = -%s.x;\n\
15640 else\n\
15641 %s.x = %s.x;\n", sc->regIDs[1], sc->regIDs[0], sc->regIDs[1], sc->regIDs[0]);
15642 res = VkAppendLine(sc);
15643 if (res != VKFFT_SUCCESS) return res;
15644 sc->tempLen = sprintf(sc->tempStr, " if ((((sdataID)/2) %% 2) != 0) \n\
15645 %s.x += %s.y;\n\
15646 else\n\
15647 %s.x -= %s.y;\n", sc->regIDs[1], sc->regIDs[0], sc->regIDs[1], sc->regIDs[0]);
15648 res = VkAppendLine(sc);
15649 if (res != VKFFT_SUCCESS) return res;
15650 sc->tempLen = sprintf(sc->tempStr, " }\n\n");
15651
15652
15653 res = VkAppendLine(sc);
15654 if (res != VKFFT_SUCCESS) return res;
15655 sc->tempLen = sprintf(sc->tempStr, " if((sdataID < %" PRIu64 ")&&(sdataID >= %" PRIu64 ")){\n", sc->fftDim / 2, sc->fftDim / 4);
15656 res = VkAppendLine(sc);
15657 if (res != VKFFT_SUCCESS) return res;
15658 sc->tempLen = sprintf(sc->tempStr, " %s = sdata[(%" PRIu64 " - 2*sdataID) * sharedStride + %s];\n", sc->regIDs[0], 2 * (sc->fftDim / 2), sc->gl_LocalInvocationID_x);
15659 res = VkAppendLine(sc);
15660 if (res != VKFFT_SUCCESS) return res;
15661 sc->tempLen = sprintf(sc->tempStr, " if ((((sdataID+1)/2) %% 2) != 0) \n\
15662 %s.x = -%s.x;\n\
15663 else\n\
15664 %s.x = %s.x;\n", sc->regIDs[1], sc->regIDs[0], sc->regIDs[1], sc->regIDs[0]);
15665 res = VkAppendLine(sc);
15666 if (res != VKFFT_SUCCESS) return res;
15667 sc->tempLen = sprintf(sc->tempStr, " if ((((sdataID)/2) %% 2) != 0) \n\
15668 %s.x -= %s.y;\n\
15669 else\n\
15670 %s.x += %s.y;\n", sc->regIDs[1], sc->regIDs[0], sc->regIDs[1], sc->regIDs[0]);
15671 res = VkAppendLine(sc);
15672 if (res != VKFFT_SUCCESS) return res;
15673 sc->tempLen = sprintf(sc->tempStr, " }\n\n");
15674 res = VkAppendLine(sc);
15675 if (res != VKFFT_SUCCESS) return res;
15676
15677
15678 sc->tempLen = sprintf(sc->tempStr, " if((sdataID < %" PRIu64 ")&&(sdataID >= %" PRIu64 ")){\n", 3 * sc->fftDim / 4, sc->fftDim / 2);
15679 res = VkAppendLine(sc);
15680 if (res != VKFFT_SUCCESS) return res;
15681 sc->tempLen = sprintf(sc->tempStr, " %s = sdata[(2*sdataID - %" PRIu64 ") * sharedStride + %s];\n", sc->regIDs[0], 2 * (sc->fftDim / 2), sc->gl_LocalInvocationID_x);
15682 res = VkAppendLine(sc);
15683 if (res != VKFFT_SUCCESS) return res;
15684 sc->tempLen = sprintf(sc->tempStr, " if ((((sdataID+1)/2) %% 2) != 0) \n\
15685 %s.x = -%s.x;\n\
15686 else\n\
15687 %s.x = %s.x;\n", sc->regIDs[1], sc->regIDs[0], sc->regIDs[1], sc->regIDs[0]);
15688 res = VkAppendLine(sc);
15689 if (res != VKFFT_SUCCESS) return res;
15690 sc->tempLen = sprintf(sc->tempStr, " if ((((sdataID)/2) %% 2) != 0) \n\
15691 %s.x += %s.y;\n\
15692 else\n\
15693 %s.x -= %s.y;\n", sc->regIDs[1], sc->regIDs[0], sc->regIDs[1], sc->regIDs[0]);
15694 res = VkAppendLine(sc);
15695 if (res != VKFFT_SUCCESS) return res;
15696 sc->tempLen = sprintf(sc->tempStr, " }\n\n");
15697 res = VkAppendLine(sc);
15698 if (res != VKFFT_SUCCESS) return res;
15699
15700
15701 sc->tempLen = sprintf(sc->tempStr, " if((sdataID >= %" PRIu64 ")){\n", 3 * sc->fftDim / 4);
15702 res = VkAppendLine(sc);
15703 if (res != VKFFT_SUCCESS) return res;
15704 sc->tempLen = sprintf(sc->tempStr, " %s = sdata[(%" PRIu64 " - 2*sdataID) * sharedStride + %s];\n", sc->regIDs[0], 2 * sc->fftDim - 1, sc->gl_LocalInvocationID_x);
15705 res = VkAppendLine(sc);
15706 if (res != VKFFT_SUCCESS) return res;
15707 sc->tempLen = sprintf(sc->tempStr, " if ((((sdataID+1)/2) %% 2) != 0) \n\
15708 %s.x = -%s.x;\n\
15709 else\n\
15710 %s.x = %s.x;\n", sc->regIDs[1], sc->regIDs[0], sc->regIDs[1], sc->regIDs[0]);
15711 res = VkAppendLine(sc);
15712 if (res != VKFFT_SUCCESS) return res;
15713 sc->tempLen = sprintf(sc->tempStr, " if ((((sdataID)/2) %% 2) != 0) \n\
15714 %s.x -= %s.y;\n\
15715 else\n\
15716 %s.x += %s.y;\n", sc->regIDs[1], sc->regIDs[0], sc->regIDs[1], sc->regIDs[0]);
15717 res = VkAppendLine(sc);
15718 if (res != VKFFT_SUCCESS) return res;
15719 sc->tempLen = sprintf(sc->tempStr, " }\n\n");
15720 res = VkAppendLine(sc);
15721 if (res != VKFFT_SUCCESS) return res;
15722 sc->tempLen = sprintf(sc->tempStr, " %s.x *= 1.41421356237309504880%s;\n", sc->regIDs[1], LFending);
15723 res = VkAppendLine(sc);
15724 if (res != VKFFT_SUCCESS) return res;
15725 if (sc->outputBufferBlockNum == 1)
15726 sc->tempLen = sprintf(sc->tempStr, " %s[inoutID] = %s%s.x%s;\n", outputsStruct, convTypeLeft, sc->regIDs[1], convTypeRight);
15727 else
15728 sc->tempLen = sprintf(sc->tempStr, " outputBlocks[inoutID / %" PRIu64 "]%s[inoutID %% %" PRIu64 "] = %s%s.x%s;\n", sc->outputBufferBlockSize, outputsStruct, sc->outputBufferBlockSize, convTypeLeft, sc->regIDs[1], convTypeRight);
15729 res = VkAppendLine(sc);
15730 if (res != VKFFT_SUCCESS) return res;
15731 if (sc->zeropad[1]) {
15732 sc->tempLen = sprintf(sc->tempStr, " }\n");
15733 res = VkAppendLine(sc);
15734 if (res != VKFFT_SUCCESS) return res;
15735 }
15736
15738 if (res != VKFFT_SUCCESS) return res;
15739 if ((1 + i + k * sc->min_registers_per_thread) * sc->localSize[0] * sc->localSize[1] >= sc->fftDim * sc->localSize[0]) {
15740 sc->tempLen = sprintf(sc->tempStr, " }\n");
15741 res = VkAppendLine(sc);
15742 if (res != VKFFT_SUCCESS) return res;
15743 }
15744 if (sc->size[0] % sc->localSize[0] != 0) {
15745 sc->tempLen = sprintf(sc->tempStr, " }\n");
15746 res = VkAppendLine(sc);
15747 if (res != VKFFT_SUCCESS) return res;
15748 }
15749 }
15750 }
15751 if (sc->zeropadBluestein[1]) sc->fftDim = sc->fft_dim_full;
15752 }
15753 else {
15754
15755 }
15756 }
15757 break;
15758 }
15759 }
15760 //res = appendZeropadEnd(sc);
15761 //if (res != VKFFT_SUCCESS) return res;
15762 return res;
15763}
15764static inline VkFFTResult shaderGenVkFFT_R2C_decomposition(char* output, VkFFTSpecializationConstantsLayout* sc, const char* floatType, const char* floatTypeInputMemory, const char* floatTypeOutputMemory, const char* floatTypeKernelMemory, const char* uintType, uint64_t type) {
15766 //appendLicense(output);
15767 sc->output = output;
15768 sc->tempStr = (char*)malloc(sizeof(char) * sc->maxTempLength);
15769 if (!sc->tempStr) return VKFFT_ERROR_MALLOC_FAILED;
15770 sc->tempLen = 0;
15771 sc->currentLen = 0;
15772 char vecType[30];
15773 char vecTypeInput[30];
15774 char vecTypeOutput[30];
15775 char inputsStruct[20] = "";
15776 char outputsStruct[20] = "";
15777 char LFending[4] = "";
15778 if (!strcmp(floatType, "float")) sprintf(LFending, "f");
15779#if(VKFFT_BACKEND==0)
15780 if (sc->inputBufferBlockNum == 1)
15781 sprintf(inputsStruct, "inputs");
15782 else
15783 sprintf(inputsStruct, ".inputs");
15784 if (sc->outputBufferBlockNum == 1)
15785 sprintf(outputsStruct, "outputs");
15786 else
15787 sprintf(outputsStruct, ".outputs");
15788 if (!strcmp(floatType, "half")) sprintf(vecType, "f16vec2");
15789 if (!strcmp(floatType, "float")) sprintf(vecType, "vec2");
15790 if (!strcmp(floatType, "double")) sprintf(vecType, "dvec2");
15791 if (!strcmp(floatTypeInputMemory, "half")) sprintf(vecTypeInput, "f16vec2");
15792 if (!strcmp(floatTypeInputMemory, "float")) sprintf(vecTypeInput, "vec2");
15793 if (!strcmp(floatTypeInputMemory, "double")) sprintf(vecTypeInput, "dvec2");
15794 if (!strcmp(floatTypeOutputMemory, "half")) sprintf(vecTypeOutput, "f16vec2");
15795 if (!strcmp(floatTypeOutputMemory, "float")) sprintf(vecTypeOutput, "vec2");
15796 if (!strcmp(floatTypeOutputMemory, "double")) sprintf(vecTypeOutput, "dvec2");
15797 sprintf(sc->gl_LocalInvocationID_x, "gl_LocalInvocationID.x");
15798 sprintf(sc->gl_LocalInvocationID_y, "gl_LocalInvocationID.y");
15799 sprintf(sc->gl_LocalInvocationID_z, "gl_LocalInvocationID.z");
15800 sprintf(sc->gl_GlobalInvocationID_x, "gl_GlobalInvocationID.x");
15801 sprintf(sc->gl_GlobalInvocationID_y, "gl_GlobalInvocationID.y");
15802 sprintf(sc->gl_GlobalInvocationID_z, "gl_GlobalInvocationID.z");
15803 sprintf(sc->gl_WorkGroupID_x, "gl_WorkGroupID.x");
15804 sprintf(sc->gl_WorkGroupID_y, "gl_WorkGroupID.y");
15805 sprintf(sc->gl_WorkGroupID_z, "gl_WorkGroupID.z");
15806 sprintf(sc->gl_WorkGroupSize_x, "gl_WorkGroupSize.x");
15807 sprintf(sc->gl_WorkGroupSize_y, "gl_WorkGroupSize.y");
15808 sprintf(sc->gl_WorkGroupSize_z, "gl_WorkGroupSize.z");
15809 if (!strcmp(floatType, "double")) sprintf(LFending, "LF");
15810 char cosDef[20] = "cos";
15811 char sinDef[20] = "sin";
15812#elif(VKFFT_BACKEND==1)
15813 sprintf(inputsStruct, "inputs");
15814 sprintf(outputsStruct, "outputs");
15815 if (!strcmp(floatType, "half")) sprintf(vecType, "f16vec2");
15816 if (!strcmp(floatType, "float")) sprintf(vecType, "float2");
15817 if (!strcmp(floatType, "double")) sprintf(vecType, "double2");
15818 if (!strcmp(floatTypeInputMemory, "half")) sprintf(vecTypeInput, "f16vec2");
15819 if (!strcmp(floatTypeInputMemory, "float")) sprintf(vecTypeInput, "float2");
15820 if (!strcmp(floatTypeInputMemory, "double")) sprintf(vecTypeInput, "double2");
15821 if (!strcmp(floatTypeOutputMemory, "half")) sprintf(vecTypeOutput, "f16vec2");
15822 if (!strcmp(floatTypeOutputMemory, "float")) sprintf(vecTypeOutput, "float2");
15823 if (!strcmp(floatTypeOutputMemory, "double")) sprintf(vecTypeOutput, "double2");
15824 sprintf(sc->gl_LocalInvocationID_x, "threadIdx.x");
15825 sprintf(sc->gl_LocalInvocationID_y, "threadIdx.y");
15826 sprintf(sc->gl_LocalInvocationID_z, "threadIdx.z");
15827 sprintf(sc->gl_GlobalInvocationID_x, "(threadIdx.x + blockIdx.x * blockDim.x)");
15828 sprintf(sc->gl_GlobalInvocationID_y, "(threadIdx.y + blockIdx.y * blockDim.y)");
15829 sprintf(sc->gl_GlobalInvocationID_z, "(threadIdx.z + blockIdx.z * blockDim.z)");
15830 sprintf(sc->gl_WorkGroupID_x, "blockIdx.x");
15831 sprintf(sc->gl_WorkGroupID_y, "blockIdx.y");
15832 sprintf(sc->gl_WorkGroupID_z, "blockIdx.z");
15833 sprintf(sc->gl_WorkGroupSize_x, "blockDim.x");
15834 sprintf(sc->gl_WorkGroupSize_y, "blockDim.y");
15835 sprintf(sc->gl_WorkGroupSize_z, "blockDim.z");
15836 if (!strcmp(floatType, "double")) sprintf(LFending, "l");
15837 char cosDef[20] = "__cosf";
15838 char sinDef[20] = "__sinf";
15839#elif(VKFFT_BACKEND==2)
15840 sprintf(inputsStruct, "inputs");
15841 sprintf(outputsStruct, "outputs");
15842 if (!strcmp(floatType, "half")) sprintf(vecType, "f16vec2");
15843 if (!strcmp(floatType, "float")) sprintf(vecType, "float2");
15844 if (!strcmp(floatType, "double")) sprintf(vecType, "double2");
15845 if (!strcmp(floatTypeInputMemory, "half")) sprintf(vecTypeInput, "f16vec2");
15846 if (!strcmp(floatTypeInputMemory, "float")) sprintf(vecTypeInput, "float2");
15847 if (!strcmp(floatTypeInputMemory, "double")) sprintf(vecTypeInput, "double2");
15848 if (!strcmp(floatTypeOutputMemory, "half")) sprintf(vecTypeOutput, "f16vec2");
15849 if (!strcmp(floatTypeOutputMemory, "float")) sprintf(vecTypeOutput, "float2");
15850 if (!strcmp(floatTypeOutputMemory, "double")) sprintf(vecTypeOutput, "double2");
15851 sprintf(sc->gl_LocalInvocationID_x, "threadIdx.x");
15852 sprintf(sc->gl_LocalInvocationID_y, "threadIdx.y");
15853 sprintf(sc->gl_LocalInvocationID_z, "threadIdx.z");
15854 sprintf(sc->gl_GlobalInvocationID_x, "(threadIdx.x + blockIdx.x * blockDim.x)");
15855 sprintf(sc->gl_GlobalInvocationID_y, "(threadIdx.y + blockIdx.y * blockDim.y)");
15856 sprintf(sc->gl_GlobalInvocationID_z, "(threadIdx.z + blockIdx.z * blockDim.z)");
15857 sprintf(sc->gl_WorkGroupID_x, "blockIdx.x");
15858 sprintf(sc->gl_WorkGroupID_y, "blockIdx.y");
15859 sprintf(sc->gl_WorkGroupID_z, "blockIdx.z");
15860 sprintf(sc->gl_WorkGroupSize_x, "blockDim.x");
15861 sprintf(sc->gl_WorkGroupSize_y, "blockDim.y");
15862 sprintf(sc->gl_WorkGroupSize_z, "blockDim.z");
15863 if (!strcmp(floatType, "double")) sprintf(LFending, "l");
15864 char cosDef[20] = "__cosf";
15865 char sinDef[20] = "__sinf";
15866#elif(VKFFT_BACKEND==3)
15867 sprintf(inputsStruct, "inputs");
15868 sprintf(outputsStruct, "outputs");
15869 if (!strcmp(floatType, "half")) sprintf(vecType, "f16vec2");
15870 if (!strcmp(floatType, "float")) sprintf(vecType, "float2");
15871 if (!strcmp(floatType, "double")) sprintf(vecType, "double2");
15872 if (!strcmp(floatTypeInputMemory, "half")) sprintf(vecTypeInput, "f16vec2");
15873 if (!strcmp(floatTypeInputMemory, "float")) sprintf(vecTypeInput, "float2");
15874 if (!strcmp(floatTypeInputMemory, "double")) sprintf(vecTypeInput, "double2");
15875 if (!strcmp(floatTypeOutputMemory, "half")) sprintf(vecTypeOutput, "f16vec2");
15876 if (!strcmp(floatTypeOutputMemory, "float")) sprintf(vecTypeOutput, "float2");
15877 if (!strcmp(floatTypeOutputMemory, "double")) sprintf(vecTypeOutput, "double2");
15878 sprintf(sc->gl_LocalInvocationID_x, "get_local_id(0)");
15879 sprintf(sc->gl_LocalInvocationID_y, "get_local_id(1)");
15880 sprintf(sc->gl_LocalInvocationID_z, "get_local_id(2)");
15881 sprintf(sc->gl_GlobalInvocationID_x, "get_global_id(0)");
15882 sprintf(sc->gl_GlobalInvocationID_y, "get_global_id(1)");
15883 sprintf(sc->gl_GlobalInvocationID_z, "get_global_id(2)");
15884 sprintf(sc->gl_WorkGroupID_x, "get_group_id(0)");
15885 sprintf(sc->gl_WorkGroupID_y, "get_group_id(1)");
15886 sprintf(sc->gl_WorkGroupID_z, "get_group_id(2)");
15887 sprintf(sc->gl_WorkGroupSize_x, "get_local_size(0)");
15888 sprintf(sc->gl_WorkGroupSize_y, "get_local_size(1)");
15889 sprintf(sc->gl_WorkGroupSize_z, "get_local_size(2)");
15890 //if (!strcmp(floatType, "double")) sprintf(LFending, "l");
15891 char cosDef[20] = "native_cos";
15892 char sinDef[20] = "native_sin";
15893#endif
15894 sprintf(sc->stageInvocationID, "stageInvocationID");
15895 sprintf(sc->blockInvocationID, "blockInvocationID");
15896 sprintf(sc->tshuffle, "tshuffle");
15897 sprintf(sc->sharedStride, "sharedStride");
15898 sprintf(sc->combinedID, "combinedID");
15899 sprintf(sc->inoutID, "inoutID");
15900 sprintf(sc->sdataID, "sdataID");
15901
15902 char convTypeLeftInput[20] = "";
15903 char convTypeRightInput[20] = "";
15904 if ((!strcmp(floatType, "float")) && (strcmp(floatTypeInputMemory, "float"))) {
15905#if(VKFFT_BACKEND==0)
15906 sprintf(convTypeLeftInput, "vec2(");
15907 sprintf(convTypeRightInput, ")");
15908#elif(VKFFT_BACKEND==1)
15909 sprintf(convTypeLeftInput, "conv_float2(");
15910 sprintf(convTypeRightInput, ")");
15911#elif(VKFFT_BACKEND==2)
15912 sprintf(convTypeLeftInput, "conv_float2(");
15913 sprintf(convTypeRightInput, ")");
15914#elif(VKFFT_BACKEND==3)
15915 sprintf(convTypeLeftInput, "conv_float2(");
15916 sprintf(convTypeRightInput, ")");
15917#endif
15918 }
15919 if ((!strcmp(floatType, "double")) && (strcmp(floatTypeInputMemory, "double"))) {
15920#if(VKFFT_BACKEND==0)
15921 sprintf(convTypeLeftInput, "dvec2(");
15922 sprintf(convTypeRightInput, ")");
15923#elif(VKFFT_BACKEND==1)
15924 sprintf(convTypeLeftInput, "conv_double2(");
15925 sprintf(convTypeRightInput, ")");
15926#elif(VKFFT_BACKEND==2)
15927 sprintf(convTypeLeftInput, "conv_double2(");
15928 sprintf(convTypeRightInput, ")");
15929#elif(VKFFT_BACKEND==3)
15930 sprintf(convTypeLeftInput, "conv_double2(");
15931 sprintf(convTypeRightInput, ")");
15932#endif
15933 }
15934
15935 char convTypeLeftOutput[20] = "";
15936 char convTypeRightOutput[20] = "";
15937 if ((!strcmp(floatTypeOutputMemory, "half")) && (strcmp(floatType, "half"))) {
15938 sprintf(convTypeLeftOutput, "f16vec2(");
15939 sprintf(convTypeRightOutput, ")");
15940 }
15941 if ((!strcmp(floatTypeOutputMemory, "float")) && (strcmp(floatType, "float"))) {
15942#if(VKFFT_BACKEND==0)
15943 sprintf(convTypeLeftOutput, "vec2(");
15944 sprintf(convTypeRightOutput, ")");
15945#elif(VKFFT_BACKEND==1)
15946 sprintf(convTypeLeftOutput, "(float2)");
15947#elif(VKFFT_BACKEND==2)
15948 sprintf(convTypeLeftOutput, "(float2)");
15949#elif(VKFFT_BACKEND==3)
15950 sprintf(convTypeLeftOutput, "conv_float2(");
15951 sprintf(convTypeRightOutput, ")");
15952#endif
15953 }
15954 if ((!strcmp(floatTypeOutputMemory, "double")) && (strcmp(floatType, "double"))) {
15955#if(VKFFT_BACKEND==0)
15956 sprintf(convTypeLeftOutput, "dvec2(");
15957 sprintf(convTypeRightOutput, ")");
15958#elif(VKFFT_BACKEND==1)
15959 sprintf(convTypeLeftOutput, "(double2)");
15960#elif(VKFFT_BACKEND==2)
15961 sprintf(convTypeLeftOutput, "(double2)");
15962#elif(VKFFT_BACKEND==3)
15963 sprintf(convTypeLeftOutput, "conv_double2(");
15964 sprintf(convTypeRightOutput, ")");
15965#endif
15966 }
15967 //sprintf(sc->tempReg, "temp");
15968 res = appendVersion(sc);
15969 if (res != VKFFT_SUCCESS) return res;
15970 res = appendExtensions(sc, floatType, floatTypeInputMemory, floatTypeOutputMemory, floatTypeKernelMemory);
15971 if (res != VKFFT_SUCCESS) return res;
15972 res = appendLayoutVkFFT(sc);
15973 if (res != VKFFT_SUCCESS) return res;
15974 res = appendConstantsVkFFT(sc, floatType, uintType);
15975 if (res != VKFFT_SUCCESS) return res;
15976 if ((!sc->LUT) && (!strcmp(floatType, "double"))) {
15977 res = appendSinCos20(sc, floatType, uintType);
15978 if (res != VKFFT_SUCCESS) return res;
15979 }
15980 if (strcmp(floatType, floatTypeInputMemory)) {
15981 res = appendConversion(sc, floatType, floatTypeInputMemory);
15982 if (res != VKFFT_SUCCESS) return res;
15983 }
15984 if (strcmp(floatType, floatTypeOutputMemory) && strcmp(floatTypeInputMemory, floatTypeOutputMemory)) {
15985 res = appendConversion(sc, floatType, floatTypeOutputMemory);
15986 if (res != VKFFT_SUCCESS) return res;
15987 }
15988 res = appendPushConstantsVkFFT(sc, floatType, uintType);
15989 if (res != VKFFT_SUCCESS) return res;
15990 uint64_t id = 0;
15991 res = appendInputLayoutVkFFT(sc, id, floatTypeInputMemory, 0);
15992 if (res != VKFFT_SUCCESS) return res;
15993 id++;
15994 res = appendOutputLayoutVkFFT(sc, id, floatTypeOutputMemory, 0);
15995 if (res != VKFFT_SUCCESS) return res;
15996 id++;
15997 if (sc->convolutionStep) {
15998 res = appendKernelLayoutVkFFT(sc, id, floatTypeKernelMemory);
15999 if (res != VKFFT_SUCCESS) return res;
16000 id++;
16001 }
16002 if (sc->LUT) {
16003 res = appendLUTLayoutVkFFT(sc, id, floatType);
16004 if (res != VKFFT_SUCCESS) return res;
16005 id++;
16006 }
16007 //appendIndexInputVkFFT(sc, uintType, type);
16008 //appendIndexOutputVkFFT(sc, uintType, type);
16009 /*uint64_t appendedRadix[10] = { 0,0,0,0,0,0,0,0,0,0 };
16010 for (uint64_t i = 0; i < sc->numStages; i++) {
16011 if (appendedRadix[sc->stageRadix[i]] == 0) {
16012 appendedRadix[sc->stageRadix[i]] = 1;
16013 appendRadixKernelVkFFT(sc, floatType, uintType, sc->stageRadix[i]);
16014 }
16015 }*/
16016#if(VKFFT_BACKEND==0)
16017 sc->tempLen = sprintf(sc->tempStr, "void main() {\n");
16018 res = VkAppendLine(sc);
16019 if (res != VKFFT_SUCCESS) return res;
16020#elif(VKFFT_BACKEND==1)
16021 sc->tempLen = sprintf(sc->tempStr, "extern \"C\" __global__ __launch_bounds__(%" PRIu64 ") void VkFFT_main_R2C ", sc->localSize[0] * sc->localSize[1] * sc->localSize[2]);
16022 res = VkAppendLine(sc);
16023 if (res != VKFFT_SUCCESS) return res;
16024 sc->tempLen = sprintf(sc->tempStr, "(%s* inputs, %s* outputs", vecTypeInput, vecTypeOutput);
16025 res = VkAppendLine(sc);
16026 if (res != VKFFT_SUCCESS) return res;
16027 if (sc->convolutionStep) {
16028 sc->tempLen = sprintf(sc->tempStr, ", %s* kernel_obj", vecType);
16029 res = VkAppendLine(sc);
16030 if (res != VKFFT_SUCCESS) return res;
16031 }
16032 if (sc->LUT) {
16033 sc->tempLen = sprintf(sc->tempStr, ", %s* twiddleLUT", vecType);
16034 res = VkAppendLine(sc);
16035 if (res != VKFFT_SUCCESS) return res;
16036 }
16037 sc->tempLen = sprintf(sc->tempStr, ") {\n");
16038 res = VkAppendLine(sc);
16039 if (res != VKFFT_SUCCESS) return res;
16040 //sc->tempLen = sprintf(sc->tempStr, ", const PushConsts consts) {\n");
16041#elif(VKFFT_BACKEND==2)
16042 sc->tempLen = sprintf(sc->tempStr, "extern \"C\" __launch_bounds__(%" PRIu64 ") __global__ void VkFFT_main_R2C ", sc->localSize[0] * sc->localSize[1] * sc->localSize[2]);
16043 res = VkAppendLine(sc);
16044 if (res != VKFFT_SUCCESS) return res;
16045 sc->tempLen = sprintf(sc->tempStr, "(%s* inputs, %s* outputs", vecTypeInput, vecTypeOutput);
16046 res = VkAppendLine(sc);
16047 if (res != VKFFT_SUCCESS) return res;
16048 if (sc->convolutionStep) {
16049 sc->tempLen = sprintf(sc->tempStr, ", %s* kernel_obj", vecType);
16050 res = VkAppendLine(sc);
16051 if (res != VKFFT_SUCCESS) return res;
16052 }
16053 if (sc->LUT) {
16054 sc->tempLen = sprintf(sc->tempStr, ", %s* twiddleLUT", vecType);
16055 res = VkAppendLine(sc);
16056 if (res != VKFFT_SUCCESS) return res;
16057 }
16058 sc->tempLen = sprintf(sc->tempStr, ") {\n");
16059 res = VkAppendLine(sc);
16060 if (res != VKFFT_SUCCESS) return res;
16061 //sc->tempLen = sprintf(sc->tempStr, ", const PushConsts consts) {\n");
16062#elif(VKFFT_BACKEND==3)
16063 sc->tempLen = sprintf(sc->tempStr, "__kernel __attribute__((reqd_work_group_size(%" PRIu64 ", %" PRIu64 ", %" PRIu64 "))) void VkFFT_main_R2C ", sc->localSize[0], sc->localSize[1], sc->localSize[2]);
16064 res = VkAppendLine(sc);
16065 if (res != VKFFT_SUCCESS) return res;
16066 sc->tempLen = sprintf(sc->tempStr, "(__global %s* inputs, __global %s* outputs", vecTypeInput, vecTypeOutput);
16067 res = VkAppendLine(sc);
16068 if (res != VKFFT_SUCCESS) return res;
16069 if (sc->convolutionStep) {
16070 sc->tempLen = sprintf(sc->tempStr, ", __global %s* kernel_obj", vecType);
16071 res = VkAppendLine(sc);
16072 if (res != VKFFT_SUCCESS) return res;
16073 }
16074 if (sc->LUT) {
16075 sc->tempLen = sprintf(sc->tempStr, ", __global %s* twiddleLUT", vecType);
16076 res = VkAppendLine(sc);
16077 if (res != VKFFT_SUCCESS) return res;
16078 }
16079 sc->tempLen = sprintf(sc->tempStr, ", PushConsts consts");
16080 res = VkAppendLine(sc);
16081 if (res != VKFFT_SUCCESS) return res;
16082 sc->tempLen = sprintf(sc->tempStr, ") {\n");
16083 res = VkAppendLine(sc);
16084 if (res != VKFFT_SUCCESS) return res;
16085 //sc->tempLen = sprintf(sc->tempStr, ", const PushConsts consts) {\n");
16086#endif
16087 char index_x[2000] = "";
16088 char idX[500] = "";
16089 if (sc->performWorkGroupShift[0])
16090 sprintf(idX, "(%s + consts.workGroupShiftX * %s)", sc->gl_GlobalInvocationID_x, sc->gl_WorkGroupSize_x);
16091 else
16092 sprintf(idX, "%s", sc->gl_GlobalInvocationID_x);
16093 res = appendZeropadStart(sc);
16094 if (res != VKFFT_SUCCESS) return res;
16095 sc->tempLen = sprintf(sc->tempStr, "%s id_x = %s %% %" PRIu64 ";\n", uintType, idX, (uint64_t)ceil(sc->size[0] / 4.0));
16096 res = VkAppendLine(sc);
16097 if (res != VKFFT_SUCCESS) return res;
16098 sc->tempLen = sprintf(sc->tempStr, "%s id_y = (%s / %" PRIu64 ") %% %" PRIu64 ";\n", uintType, idX, (uint64_t)ceil(sc->size[0] / 4.0), sc->size[1]);
16099 res = VkAppendLine(sc);
16100 if (res != VKFFT_SUCCESS) return res;
16101 sc->tempLen = sprintf(sc->tempStr, "%s id_z = (%s / %" PRIu64 ") / %" PRIu64 ";\n", uintType, idX, (uint64_t)ceil(sc->size[0] / 4.0), sc->size[1]);
16102 res = VkAppendLine(sc);
16103 if (res != VKFFT_SUCCESS) return res;
16104 sc->tempLen = sprintf(sc->tempStr, "if (%s < %" PRIu64 "){\n", idX, (uint64_t)ceil(sc->size[0] / 4.0) * sc->size[1] * sc->size[2]);
16105 res = VkAppendLine(sc);
16106 if (res != VKFFT_SUCCESS) return res;
16107
16108 sc->tempLen = sprintf(sc->tempStr, "%s inoutID = ", uintType);
16109 res = VkAppendLine(sc);
16110 if (res != VKFFT_SUCCESS) return res;
16111 sprintf(index_x, "id_x + id_y*%" PRIu64 " +id_z*%" PRIu64 "", sc->inputStride[1], sc->inputStride[2]);
16112 res = indexInputVkFFT(sc, uintType, 0, index_x, 0, 0, 0);
16113 if (res != VKFFT_SUCCESS) return res;
16114 sc->tempLen = sprintf(sc->tempStr, ";\n");
16115 res = VkAppendLine(sc);
16116 if (res != VKFFT_SUCCESS) return res;
16117
16118 sc->tempLen = sprintf(sc->tempStr, "%s inoutID2;\n", uintType);
16119 res = VkAppendLine(sc);
16120 if (res != VKFFT_SUCCESS) return res;
16121 sc->tempLen = sprintf(sc->tempStr, "%s inoutID3;\n", uintType);
16122 res = VkAppendLine(sc);
16123 if (res != VKFFT_SUCCESS) return res;
16124 if (sc->inputBufferBlockNum == 1)
16125 sc->tempLen = sprintf(sc->tempStr, " %s t0 = %s%s[inoutID]%s;\n", vecType, convTypeLeftInput, inputsStruct, convTypeRightInput);
16126 else
16127 sc->tempLen = sprintf(sc->tempStr, " %s t0 = %sinputBlocks[inoutID / %" PRIu64 "]%s[inoutID %% %" PRIu64 "]%s;\n", vecType, convTypeLeftInput, sc->inputBufferBlockSize, inputsStruct, sc->inputBufferBlockSize, convTypeRightInput);
16128 res = VkAppendLine(sc);
16129 if (res != VKFFT_SUCCESS) return res;
16130 sc->tempLen = sprintf(sc->tempStr, " %s tf;\n", vecType);
16131 res = VkAppendLine(sc);
16132 if (res != VKFFT_SUCCESS) return res;
16133 if (sc->size[0] % 4 == 0) {
16134 sc->tempLen = sprintf(sc->tempStr, "if (id_x == 0) {\n");
16135 res = VkAppendLine(sc);
16136 if (res != VKFFT_SUCCESS) return res;
16137
16138 sc->tempLen = sprintf(sc->tempStr, " inoutID2 = ");
16139 res = VkAppendLine(sc);
16140 if (res != VKFFT_SUCCESS) return res;
16141 sprintf(index_x, "%" PRIu64 " + id_y*%" PRIu64 " +id_z*%" PRIu64 "", (sc->size[0] / 2), sc->inputStride[1], sc->inputStride[2]);
16142 res = indexInputVkFFT(sc, uintType, 0, index_x, 0, 0, 0);
16143 if (res != VKFFT_SUCCESS) return res;
16144 sc->tempLen = sprintf(sc->tempStr, ";\n");
16145 res = VkAppendLine(sc);
16146 if (res != VKFFT_SUCCESS) return res;
16147
16148 sc->tempLen = sprintf(sc->tempStr, " inoutID3 = ");
16149 res = VkAppendLine(sc);
16150 if (res != VKFFT_SUCCESS) return res;
16151 sprintf(index_x, "%" PRIu64 " + id_y*%" PRIu64 " +id_z*%" PRIu64 "", (uint64_t)ceil(sc->size[0] / 4.0), sc->inputStride[1], sc->inputStride[2]);
16152 res = indexInputVkFFT(sc, uintType, 0, index_x, 0, 0, 0);
16153 if (res != VKFFT_SUCCESS) return res;
16154 sc->tempLen = sprintf(sc->tempStr, ";\n");
16155 res = VkAppendLine(sc);
16156 if (res != VKFFT_SUCCESS) return res;
16157 if (sc->inputBufferBlockNum == 1)
16158 sc->tempLen = sprintf(sc->tempStr, " tf = %s%s[inoutID3]%s;\n", convTypeLeftInput, inputsStruct, convTypeRightInput);
16159 else
16160 sc->tempLen = sprintf(sc->tempStr, " tf = %sinputBlocks[inoutID3 / %" PRIu64 "]%s[inoutID3 %% %" PRIu64 "]%s;\n", convTypeLeftInput, sc->inputBufferBlockSize, inputsStruct, sc->inputBufferBlockSize, convTypeRightInput);
16161 res = VkAppendLine(sc);
16162 if (res != VKFFT_SUCCESS) return res;
16163
16164 sc->tempLen = sprintf(sc->tempStr, "} else {\n");
16165 res = VkAppendLine(sc);
16166 if (res != VKFFT_SUCCESS) return res;
16167
16168 sc->tempLen = sprintf(sc->tempStr, " inoutID2 = ");
16169 res = VkAppendLine(sc);
16170 if (res != VKFFT_SUCCESS) return res;
16171 sprintf(index_x, "(%" PRIu64 "-id_x) + id_y*%" PRIu64 " +id_z*%" PRIu64 "", (sc->size[0] / 2), sc->inputStride[1], sc->inputStride[2]);
16172 res = indexInputVkFFT(sc, uintType, 0, index_x, 0, 0, 0);
16173 if (res != VKFFT_SUCCESS) return res;
16174 sc->tempLen = sprintf(sc->tempStr, ";\n");
16175
16176 res = VkAppendLine(sc);
16177 if (res != VKFFT_SUCCESS) return res;
16178 sc->tempLen = sprintf(sc->tempStr, "}");
16179 res = VkAppendLine(sc);
16180 if (res != VKFFT_SUCCESS) return res;
16181 }
16182 else {
16183 sc->tempLen = sprintf(sc->tempStr, "inoutID2 = ");
16184 res = VkAppendLine(sc);
16185 if (res != VKFFT_SUCCESS) return res;
16186 sprintf(index_x, "(%" PRIu64 "-id_x) + id_y*%" PRIu64 " +id_z*%" PRIu64 "", (sc->size[0] / 2), sc->inputStride[1], sc->inputStride[2]);
16187 res = indexInputVkFFT(sc, uintType, 0, index_x, 0, 0, 0);
16188 if (res != VKFFT_SUCCESS) return res;
16189 sc->tempLen = sprintf(sc->tempStr, ";\n");
16190 res = VkAppendLine(sc);
16191 if (res != VKFFT_SUCCESS) return res;
16192 }
16193 if (sc->inputBufferBlockNum == 1)
16194 sc->tempLen = sprintf(sc->tempStr, " %s t1 = %s%s[inoutID2]%s;\n", vecType, convTypeLeftInput, inputsStruct, convTypeRightInput);
16195 else
16196 sc->tempLen = sprintf(sc->tempStr, " %s t1 = %sinputBlocks[inoutID2 / %" PRIu64 "]%s[inoutID2 %% %" PRIu64 "]%s;\n", vecType, convTypeLeftInput, sc->inputBufferBlockSize, inputsStruct, sc->inputBufferBlockSize, convTypeRightInput);
16197 res = VkAppendLine(sc);
16198 if (res != VKFFT_SUCCESS) return res;
16199
16200 sc->tempLen = sprintf(sc->tempStr, " %s t2;\n", vecType);
16201 res = VkAppendLine(sc);
16202 if (res != VKFFT_SUCCESS) return res;
16203 sc->tempLen = sprintf(sc->tempStr, " %s t3;\n", vecType);
16204 res = VkAppendLine(sc);
16205 if (res != VKFFT_SUCCESS) return res;
16206 sc->tempLen = sprintf(sc->tempStr, "if (id_x == 0) {\n");
16207 res = VkAppendLine(sc);
16208 if (res != VKFFT_SUCCESS) return res;
16209 if (sc->size[0] % 4 == 0) {
16210 if (!sc->inverse) {
16211 sc->tempLen = sprintf(sc->tempStr, " t2.x = t0.x+t0.y;\n");
16212 res = VkAppendLine(sc);
16213 if (res != VKFFT_SUCCESS) return res;
16214 sc->tempLen = sprintf(sc->tempStr, " t2.y = 0;\n");
16215 res = VkAppendLine(sc);
16216 if (res != VKFFT_SUCCESS) return res;
16217 sc->tempLen = sprintf(sc->tempStr, " t3.x = t0.x-t0.y;\n");
16218 res = VkAppendLine(sc);
16219 if (res != VKFFT_SUCCESS) return res;
16220 sc->tempLen = sprintf(sc->tempStr, " t3.y = 0;\n");
16221 res = VkAppendLine(sc);
16222 if (res != VKFFT_SUCCESS) return res;
16223 }
16224 else {
16225 sc->tempLen = sprintf(sc->tempStr, " t2.x = (t0.x+t1.x);\n");
16226 res = VkAppendLine(sc);
16227 if (res != VKFFT_SUCCESS) return res;
16228 sc->tempLen = sprintf(sc->tempStr, " t2.y = (t0.x-t1.x);\n");
16229 res = VkAppendLine(sc);
16230 if (res != VKFFT_SUCCESS) return res;
16231 }
16232 sc->tempLen = sprintf(sc->tempStr, " tf.y = -tf.y;\n");
16233 res = VkAppendLine(sc);
16234 if (res != VKFFT_SUCCESS) return res;
16235 if (sc->inverse) {
16236 res = VkMulComplexNumber(sc, "tf", "tf", "2");
16237 if (res != VKFFT_SUCCESS) return res;
16238 }
16239 if (sc->outputBufferBlockNum == 1)
16240 sc->tempLen = sprintf(sc->tempStr, " %s[inoutID] = %st2%s;\n", outputsStruct, convTypeLeftOutput, convTypeRightOutput);
16241 else
16242 sc->tempLen = sprintf(sc->tempStr, " outputBlocks[inoutID / %" PRIu64 "]%s[inoutID %% %" PRIu64 "] = %st2%s;\n", sc->outputBufferBlockSize, outputsStruct, sc->outputBufferBlockSize, convTypeLeftOutput, convTypeRightOutput);
16243 res = VkAppendLine(sc);
16244 if (res != VKFFT_SUCCESS) return res;
16245 if (!sc->inverse) {
16246 if (sc->outputBufferBlockNum == 1)
16247 sc->tempLen = sprintf(sc->tempStr, " %s[inoutID2] = %st3%s;\n", outputsStruct, convTypeLeftOutput, convTypeRightOutput);
16248 else
16249 sc->tempLen = sprintf(sc->tempStr, " outputBlocks[inoutID2 / %" PRIu64 "]%s[inoutID2 %% %" PRIu64 "] = %st3%s;\n", sc->outputBufferBlockSize, outputsStruct, sc->outputBufferBlockSize, convTypeLeftOutput, convTypeRightOutput);
16250 res = VkAppendLine(sc);
16251 if (res != VKFFT_SUCCESS) return res;
16252 }
16253 if (sc->outputBufferBlockNum == 1)
16254 sc->tempLen = sprintf(sc->tempStr, " %s[inoutID3] = %stf%s;\n", outputsStruct, convTypeLeftOutput, convTypeRightOutput);
16255 else
16256 sc->tempLen = sprintf(sc->tempStr, " outputBlocks[inoutID3 / %" PRIu64 "]%s[inoutID3 %% %" PRIu64 "] = %stf%s;\n", sc->outputBufferBlockSize, outputsStruct, sc->outputBufferBlockSize, convTypeLeftOutput, convTypeRightOutput);
16257 res = VkAppendLine(sc);
16258 if (res != VKFFT_SUCCESS) return res;
16259
16260 }
16261 else {
16262 if (!sc->inverse) {
16263 sc->tempLen = sprintf(sc->tempStr, " t2.x = t0.x+t0.y;\n");
16264 res = VkAppendLine(sc);
16265 if (res != VKFFT_SUCCESS) return res;
16266 sc->tempLen = sprintf(sc->tempStr, " t2.y = 0;\n");
16267 res = VkAppendLine(sc);
16268 if (res != VKFFT_SUCCESS) return res;
16269 sc->tempLen = sprintf(sc->tempStr, " t3.x = t0.x-t0.y;\n");
16270 res = VkAppendLine(sc);
16271 if (res != VKFFT_SUCCESS) return res;
16272 sc->tempLen = sprintf(sc->tempStr, " t3.y = 0;\n");
16273 res = VkAppendLine(sc);
16274 if (res != VKFFT_SUCCESS) return res;
16275 }
16276 else {
16277 sc->tempLen = sprintf(sc->tempStr, " t2.x = (t0.x+t1.x);\n");
16278 res = VkAppendLine(sc);
16279 if (res != VKFFT_SUCCESS) return res;
16280 sc->tempLen = sprintf(sc->tempStr, " t2.y = (t0.x-t1.x);\n");
16281 res = VkAppendLine(sc);
16282 if (res != VKFFT_SUCCESS) return res;
16283 }
16284 if (sc->outputBufferBlockNum == 1)
16285 sc->tempLen = sprintf(sc->tempStr, " %s[inoutID] = %st2%s;\n", outputsStruct, convTypeLeftOutput, convTypeRightOutput);
16286 else
16287 sc->tempLen = sprintf(sc->tempStr, " outputBlocks[inoutID / %" PRIu64 "]%s[inoutID %% %" PRIu64 "] = %st2%s;\n", sc->outputBufferBlockSize, outputsStruct, sc->outputBufferBlockSize, convTypeLeftOutput, convTypeRightOutput);
16288 res = VkAppendLine(sc);
16289 if (res != VKFFT_SUCCESS) return res;
16290 if (!sc->inverse) {
16291 if (sc->outputBufferBlockNum == 1)
16292 sc->tempLen = sprintf(sc->tempStr, " %s[inoutID2] = %st3%s;\n", outputsStruct, convTypeLeftOutput, convTypeRightOutput);
16293 else
16294 sc->tempLen = sprintf(sc->tempStr, " outputBlocks[inoutID2 / %" PRIu64 "]%s[inoutID2 %% %" PRIu64 "] = %st3%s;\n", sc->outputBufferBlockSize, outputsStruct, sc->outputBufferBlockSize, convTypeLeftOutput, convTypeRightOutput);
16295 res = VkAppendLine(sc);
16296 if (res != VKFFT_SUCCESS) return res;
16297 }
16298 }
16299 sc->tempLen = sprintf(sc->tempStr, "} else {\n");
16300 res = VkAppendLine(sc);
16301 if (res != VKFFT_SUCCESS) return res;
16302 res = VkAddComplex(sc, "t2", "t0", "t1");
16303 if (res != VKFFT_SUCCESS) return res;
16304 res = VkSubComplex(sc, "t3", "t0", "t1");
16305 if (res != VKFFT_SUCCESS) return res;
16306 if (!sc->inverse) {
16307 res = VkMulComplexNumber(sc, "t2", "t2", "0.5");
16308 if (res != VKFFT_SUCCESS) return res;
16309 res = VkMulComplexNumber(sc, "t3", "t3", "0.5");
16310 if (res != VKFFT_SUCCESS) return res;
16311 }
16312 if (sc->LUT) {
16313 sc->tempLen = sprintf(sc->tempStr, " tf = twiddleLUT[id_x];\n");
16314 res = VkAppendLine(sc);
16315 if (res != VKFFT_SUCCESS) return res;
16316 }
16317 else {
16318 sc->tempLen = sprintf(sc->tempStr, " %s angle = (loc_PI*id_x)/%" PRIu64 ";\n", floatType, sc->size[0] / 2);
16319 res = VkAppendLine(sc);
16320 if (res != VKFFT_SUCCESS) return res;
16321 if (!strcmp(floatType, "float")) {
16322 sc->tempLen = sprintf(sc->tempStr, " tf.x = %s(angle);\n", cosDef);
16323 res = VkAppendLine(sc);
16324 if (res != VKFFT_SUCCESS) return res;
16325 sc->tempLen = sprintf(sc->tempStr, " tf.y = %s(angle);\n", sinDef);
16326 res = VkAppendLine(sc);
16327 if (res != VKFFT_SUCCESS) return res;
16328 }
16329 if (!strcmp(floatType, "double")) {
16330 sc->tempLen = sprintf(sc->tempStr, " tf = sincos_20(angle);\n");
16331 res = VkAppendLine(sc);
16332 if (res != VKFFT_SUCCESS) return res;
16333 }
16334 }
16335 if (!sc->inverse) {
16336 sc->tempLen = sprintf(sc->tempStr, " t0.x = tf.x*t2.y-tf.y*t3.x;\n");
16337 res = VkAppendLine(sc);
16338 if (res != VKFFT_SUCCESS) return res;
16339 sc->tempLen = sprintf(sc->tempStr, " t0.y = -tf.y*t2.y-tf.x*t3.x;\n");
16340 res = VkAppendLine(sc);
16341 if (res != VKFFT_SUCCESS) return res;
16342 sc->tempLen = sprintf(sc->tempStr, " t1.x = t2.x-t0.x;\n");
16343 res = VkAppendLine(sc);
16344 if (res != VKFFT_SUCCESS) return res;
16345 sc->tempLen = sprintf(sc->tempStr, " t1.y = -t3.y+t0.y;\n");
16346 res = VkAppendLine(sc);
16347 if (res != VKFFT_SUCCESS) return res;
16348 sc->tempLen = sprintf(sc->tempStr, " t0.x = t2.x+t0.x;\n");
16349 res = VkAppendLine(sc);
16350 if (res != VKFFT_SUCCESS) return res;
16351 sc->tempLen = sprintf(sc->tempStr, " t0.y = t3.y+t0.y;\n");
16352 res = VkAppendLine(sc);
16353 if (res != VKFFT_SUCCESS) return res;
16354 }
16355 else {
16356 sc->tempLen = sprintf(sc->tempStr, " t0.x = tf.x*t2.y+tf.y*t3.x;\n");
16357 res = VkAppendLine(sc);
16358 if (res != VKFFT_SUCCESS) return res;
16359 sc->tempLen = sprintf(sc->tempStr, " t0.y = -tf.y*t2.y+tf.x*t3.x;\n");
16360 res = VkAppendLine(sc);
16361 if (res != VKFFT_SUCCESS) return res;
16362 sc->tempLen = sprintf(sc->tempStr, " t1.x = t2.x+t0.x;\n");
16363 res = VkAppendLine(sc);
16364 if (res != VKFFT_SUCCESS) return res;
16365 sc->tempLen = sprintf(sc->tempStr, " t1.y = -t3.y+t0.y;\n");
16366 res = VkAppendLine(sc);
16367 if (res != VKFFT_SUCCESS) return res;
16368 sc->tempLen = sprintf(sc->tempStr, " t0.x = t2.x-t0.x;\n");
16369 res = VkAppendLine(sc);
16370 if (res != VKFFT_SUCCESS) return res;
16371 sc->tempLen = sprintf(sc->tempStr, " t0.y = t3.y+t0.y;\n");
16372 res = VkAppendLine(sc);
16373 if (res != VKFFT_SUCCESS) return res;
16374 }
16375 //sc->tempLen = sprintf(sc->tempStr, " t0.x = t2.x+tf.x*t2.y-tf.y*t3.x;\n");
16376 //sc->tempLen = sprintf(sc->tempStr, " t0.y = t3.y-tf.y*t2.y-tf.x*t3.x;\n");
16377 //sc->tempLen = sprintf(sc->tempStr, " t1.x = t2.x-tf.x*t2.y+tf.y*t3.x;\n");
16378 //sc->tempLen = sprintf(sc->tempStr, " t1.y = -t3.y-tf.y*t2.y-tf.x*t3.x;\n");
16379
16380 if (sc->outputBufferBlockNum == 1)
16381 sc->tempLen = sprintf(sc->tempStr, " %s[inoutID] = %st0%s;\n", outputsStruct, convTypeLeftOutput, convTypeRightOutput);
16382 else
16383 sc->tempLen = sprintf(sc->tempStr, " outputBlocks[inoutID / %" PRIu64 "]%s[inoutID %% %" PRIu64 "] = %st0%s;\n", sc->outputBufferBlockSize, outputsStruct, sc->outputBufferBlockSize, convTypeLeftOutput, convTypeRightOutput);
16384 res = VkAppendLine(sc);
16385 if (res != VKFFT_SUCCESS) return res;
16386
16387 if (sc->outputBufferBlockNum == 1)
16388 sc->tempLen = sprintf(sc->tempStr, " %s[inoutID2] = %st1%s;\n", outputsStruct, convTypeLeftOutput, convTypeRightOutput);
16389 else
16390 sc->tempLen = sprintf(sc->tempStr, " outputBlocks[inoutID2 / %" PRIu64 "]%s[inoutID2 %% %" PRIu64 "] = %st1%s;\n", sc->outputBufferBlockSize, outputsStruct, sc->outputBufferBlockSize, convTypeLeftOutput, convTypeRightOutput);
16391 res = VkAppendLine(sc);
16392 if (res != VKFFT_SUCCESS) return res;
16393
16394 sc->tempLen = sprintf(sc->tempStr, "}\n");
16395 res = VkAppendLine(sc);
16396 if (res != VKFFT_SUCCESS) return res;
16397 sc->tempLen = sprintf(sc->tempStr, "}\n");
16398 res = VkAppendLine(sc);
16399 if (res != VKFFT_SUCCESS) return res;
16400 res = appendZeropadEnd(sc);
16401 if (res != VKFFT_SUCCESS) return res;
16402 sc->tempLen = sprintf(sc->tempStr, "}\n");
16403 res = VkAppendLine(sc);
16404 if (res != VKFFT_SUCCESS) return res;
16405
16406 //printf("%s", output);
16407 return res;
16408}
// Releases the scratch allocations owned by the layout `sc`: tempStr,
// disableThreadsStart, disableThreadsEnd, every regIDs[i] and the regIDs array.
// Each pointer is reset to 0 after it is freed, so a second call finds nothing to free.
// NOTE(review): the function header (listing line 16409) is not present in this
// extract; upstream VkFFT presumably names this routine freeShaderGenVkFFT - confirm.
16410 if (sc->tempStr) {
16411 free(sc->tempStr);
16412 sc->tempStr = 0;
16413 }
16414 if (sc->disableThreadsStart) {
16415 free(sc->disableThreadsStart);
16416 sc->disableThreadsStart = 0;
16417 }
// The guard must test disableThreadsEnd itself: disableThreadsStart is always 0 at
// this point, so testing it here skipped the free and leaked disableThreadsEnd.
16418 if (sc->disableThreadsEnd) {
16419 free(sc->disableThreadsEnd);
16420 sc->disableThreadsEnd = 0;
16421 }
// regIDs holds registers_per_thread * registerBoost separately allocated entries.
16422 if (sc->regIDs) {
16423 for (uint64_t i = 0; i < sc->registers_per_thread * sc->registerBoost; i++) {
16424 if (sc->regIDs[i]) {
16425 free(sc->regIDs[i]);
16426 sc->regIDs[i] = 0;
16427 }
16428 }
16429 free(sc->regIDs);
16430 sc->regIDs = 0;
16431 }
16432}
// Generates the source text of one FFT kernel into `output`, using `sc` as the
// generator state. Text is staged in sc->tempStr and appended with VkAppendLine.
//   output                 - destination buffer; stored in sc->output.
//   sc                     - layout/generator state; tempStr, disableThreadsStart,
//                            disableThreadsEnd and the identifier-name strings are set here.
//   floatType              - element type name; "half", "float" and "double" are recognised.
//   floatTypeInputMemory   - type name used for the `inputs` buffer.
//   floatTypeOutputMemory  - type name used for the `outputs` buffer.
//   floatTypeKernelMemory  - type name forwarded to the kernel-layout and convolution helpers.
//   uintType               - unsigned integer type name forwarded to the append* helpers.
//   type                   - kernel variant selector forwarded to the layout/read/write helpers.
// Returns VKFFT_ERROR_MALLOC_FAILED when the tempStr allocation fails; otherwise the
// first non-success code reported by a helper, or the result of the final append.
// NOTE(review): several listing lines inside this function are absent from this extract
// (for example the declaration of `res` and the bodies of the malloc-failure checks);
// consult the upstream header before relying on the error paths as shown here.
16433static inline VkFFTResult shaderGenVkFFT(char* output, VkFFTSpecializationConstantsLayout* sc, const char* floatType, const char* floatTypeInputMemory, const char* floatTypeOutputMemory, const char* floatTypeKernelMemory, const char* uintType, uint64_t type) {
16435 //appendLicense(output);
// Attach the output buffer and allocate the staging string (maxTempLength chars).
16436 sc->output = output;
16437 sc->tempStr = (char*)malloc(sizeof(char) * sc->maxTempLength);
16438 if (!sc->tempStr) return VKFFT_ERROR_MALLOC_FAILED;
16439 sc->tempLen = 0;
16440 sc->currentLen = 0;
// Two-component vector type names; only written when the float type name is
// "half", "float" or "double" (any other name leaves the array unset).
16441 char vecType[30];
16442 char vecTypeInput[30];
16443 char vecTypeOutput[30];
// Per-backend spelling of the vector types and of the thread/workgroup index
// expressions that the generated kernel text will use.
16444#if(VKFFT_BACKEND==0)
16445 if (!strcmp(floatType, "half")) sprintf(vecType, "f16vec2");
16446 if (!strcmp(floatType, "float")) sprintf(vecType, "vec2");
16447 if (!strcmp(floatType, "double")) sprintf(vecType, "dvec2");
16448 if (!strcmp(floatTypeInputMemory, "half")) sprintf(vecTypeInput, "f16vec2");
16449 if (!strcmp(floatTypeInputMemory, "float")) sprintf(vecTypeInput, "vec2");
16450 if (!strcmp(floatTypeInputMemory, "double")) sprintf(vecTypeInput, "dvec2");
16451 if (!strcmp(floatTypeOutputMemory, "half")) sprintf(vecTypeOutput, "f16vec2");
16452 if (!strcmp(floatTypeOutputMemory, "float")) sprintf(vecTypeOutput, "vec2");
16453 if (!strcmp(floatTypeOutputMemory, "double")) sprintf(vecTypeOutput, "dvec2");
16454 sprintf(sc->gl_LocalInvocationID_x, "gl_LocalInvocationID.x");
16455 sprintf(sc->gl_LocalInvocationID_y, "gl_LocalInvocationID.y");
16456 sprintf(sc->gl_LocalInvocationID_z, "gl_LocalInvocationID.z");
16457 sprintf(sc->gl_GlobalInvocationID_x, "gl_GlobalInvocationID.x");
16458 sprintf(sc->gl_GlobalInvocationID_y, "gl_GlobalInvocationID.y");
16459 sprintf(sc->gl_GlobalInvocationID_z, "gl_GlobalInvocationID.z");
16460 sprintf(sc->gl_WorkGroupID_x, "gl_WorkGroupID.x");
16461 sprintf(sc->gl_WorkGroupID_y, "gl_WorkGroupID.y");
16462 sprintf(sc->gl_WorkGroupID_z, "gl_WorkGroupID.z");
16463 sprintf(sc->gl_WorkGroupSize_x, "gl_WorkGroupSize.x");
16464 sprintf(sc->gl_WorkGroupSize_y, "gl_WorkGroupSize.y");
16465 sprintf(sc->gl_WorkGroupSize_z, "gl_WorkGroupSize.z");
16466#elif(VKFFT_BACKEND==1)
16467 if (!strcmp(floatType, "half")) sprintf(vecType, "f16vec2");
16468 if (!strcmp(floatType, "float")) sprintf(vecType, "float2");
16469 if (!strcmp(floatType, "double")) sprintf(vecType, "double2");
16470 if (!strcmp(floatTypeInputMemory, "half")) sprintf(vecTypeInput, "f16vec2");
16471 if (!strcmp(floatTypeInputMemory, "float")) sprintf(vecTypeInput, "float2");
16472 if (!strcmp(floatTypeInputMemory, "double")) sprintf(vecTypeInput, "double2");
16473 if (!strcmp(floatTypeOutputMemory, "half")) sprintf(vecTypeOutput, "f16vec2");
16474 if (!strcmp(floatTypeOutputMemory, "float")) sprintf(vecTypeOutput, "float2");
16475 if (!strcmp(floatTypeOutputMemory, "double")) sprintf(vecTypeOutput, "double2");
16476 sprintf(sc->gl_LocalInvocationID_x, "threadIdx.x");
16477 sprintf(sc->gl_LocalInvocationID_y, "threadIdx.y");
16478 sprintf(sc->gl_LocalInvocationID_z, "threadIdx.z");
16479 sprintf(sc->gl_GlobalInvocationID_x, "(threadIdx.x + blockIdx.x * blockDim.x)");
16480 sprintf(sc->gl_GlobalInvocationID_y, "(threadIdx.y + blockIdx.y * blockDim.y)");
16481 sprintf(sc->gl_GlobalInvocationID_z, "(threadIdx.z + blockIdx.z * blockDim.z)");
16482 sprintf(sc->gl_WorkGroupID_x, "blockIdx.x");
16483 sprintf(sc->gl_WorkGroupID_y, "blockIdx.y");
16484 sprintf(sc->gl_WorkGroupID_z, "blockIdx.z");
16485 sprintf(sc->gl_WorkGroupSize_x, "blockDim.x");
16486 sprintf(sc->gl_WorkGroupSize_y, "blockDim.y");
16487 sprintf(sc->gl_WorkGroupSize_z, "blockDim.z");
16488#elif(VKFFT_BACKEND==2)
16489 if (!strcmp(floatType, "half")) sprintf(vecType, "f16vec2");
16490 if (!strcmp(floatType, "float")) sprintf(vecType, "float2");
16491 if (!strcmp(floatType, "double")) sprintf(vecType, "double2");
16492 if (!strcmp(floatTypeInputMemory, "half")) sprintf(vecTypeInput, "f16vec2");
16493 if (!strcmp(floatTypeInputMemory, "float")) sprintf(vecTypeInput, "float2");
16494 if (!strcmp(floatTypeInputMemory, "double")) sprintf(vecTypeInput, "double2");
16495 if (!strcmp(floatTypeOutputMemory, "half")) sprintf(vecTypeOutput, "f16vec2");
16496 if (!strcmp(floatTypeOutputMemory, "float")) sprintf(vecTypeOutput, "float2");
16497 if (!strcmp(floatTypeOutputMemory, "double")) sprintf(vecTypeOutput, "double2");
16498 sprintf(sc->gl_LocalInvocationID_x, "threadIdx.x");
16499 sprintf(sc->gl_LocalInvocationID_y, "threadIdx.y");
16500 sprintf(sc->gl_LocalInvocationID_z, "threadIdx.z");
16501 sprintf(sc->gl_GlobalInvocationID_x, "(threadIdx.x + blockIdx.x * blockDim.x)");
16502 sprintf(sc->gl_GlobalInvocationID_y, "(threadIdx.y + blockIdx.y * blockDim.y)");
16503 sprintf(sc->gl_GlobalInvocationID_z, "(threadIdx.z + blockIdx.z * blockDim.z)");
16504 sprintf(sc->gl_WorkGroupID_x, "blockIdx.x");
16505 sprintf(sc->gl_WorkGroupID_y, "blockIdx.y");
16506 sprintf(sc->gl_WorkGroupID_z, "blockIdx.z");
16507 sprintf(sc->gl_WorkGroupSize_x, "blockDim.x");
16508 sprintf(sc->gl_WorkGroupSize_y, "blockDim.y");
16509 sprintf(sc->gl_WorkGroupSize_z, "blockDim.z");
16510#elif(VKFFT_BACKEND==3)
16511 if (!strcmp(floatType, "half")) sprintf(vecType, "f16vec2");
16512 if (!strcmp(floatType, "float")) sprintf(vecType, "float2");
16513 if (!strcmp(floatType, "double")) sprintf(vecType, "double2");
16514 if (!strcmp(floatTypeInputMemory, "half")) sprintf(vecTypeInput, "f16vec2");
16515 if (!strcmp(floatTypeInputMemory, "float")) sprintf(vecTypeInput, "float2");
16516 if (!strcmp(floatTypeInputMemory, "double")) sprintf(vecTypeInput, "double2");
16517 if (!strcmp(floatTypeOutputMemory, "half")) sprintf(vecTypeOutput, "f16vec2");
16518 if (!strcmp(floatTypeOutputMemory, "float")) sprintf(vecTypeOutput, "float2");
16519 if (!strcmp(floatTypeOutputMemory, "double")) sprintf(vecTypeOutput, "double2");
16520 sprintf(sc->gl_LocalInvocationID_x, "get_local_id(0)");
16521 sprintf(sc->gl_LocalInvocationID_y, "get_local_id(1)");
16522 sprintf(sc->gl_LocalInvocationID_z, "get_local_id(2)");
16523 sprintf(sc->gl_GlobalInvocationID_x, "get_global_id(0)");
16524 sprintf(sc->gl_GlobalInvocationID_y, "get_global_id(1)");
16525 sprintf(sc->gl_GlobalInvocationID_z, "get_global_id(2)");
16526 sprintf(sc->gl_WorkGroupID_x, "get_group_id(0)");
16527 sprintf(sc->gl_WorkGroupID_y, "get_group_id(1)");
16528 sprintf(sc->gl_WorkGroupID_z, "get_group_id(2)");
16529 sprintf(sc->gl_WorkGroupSize_x, "get_local_size(0)");
16530 sprintf(sc->gl_WorkGroupSize_y, "get_local_size(1)");
16531 sprintf(sc->gl_WorkGroupSize_z, "get_local_size(2)");
16532#endif
// Backend-independent names of the helper variables used in the generated kernel.
16533 sprintf(sc->stageInvocationID, "stageInvocationID");
16534 sprintf(sc->blockInvocationID, "blockInvocationID");
16535 sprintf(sc->tshuffle, "tshuffle");
16536 sprintf(sc->sharedStride, "sharedStride");
16537 sprintf(sc->combinedID, "combinedID");
16538 sprintf(sc->inoutID, "inoutID");
16539 sprintf(sc->sdataID, "sdataID");
16540 //sprintf(sc->tempReg, "temp");
// Scratch strings (500 and 2 chars), both initialised to the empty string below.
// NOTE(review): the bodies of the two allocation-failure checks are absent from
// this extract (listing lines 16543-16544 and 16548-16549).
16541 sc->disableThreadsStart = (char*)malloc(sizeof(char) * 500);
16542 if (!sc->disableThreadsStart) {
16545 }
16546 sc->disableThreadsEnd = (char*)malloc(sizeof(char) * 2);
16547 if (!sc->disableThreadsEnd) {
16550 }
16551 sc->disableThreadsStart[0] = 0;
16552 sc->disableThreadsEnd[0] = 0;
// Preamble of the generated source: version, extensions, layout and constants.
16553 res = appendVersion(sc);
16554 if (res != VKFFT_SUCCESS) {
16556 return res;
16557 }
16558 res = appendExtensions(sc, floatType, floatTypeInputMemory, floatTypeOutputMemory, floatTypeKernelMemory);
16559 if (res != VKFFT_SUCCESS) {
16561 return res;
16562 }
16563 res = appendLayoutVkFFT(sc);
16564 if (res != VKFFT_SUCCESS) {
16566 return res;
16567 }
16568 res = appendConstantsVkFFT(sc, floatType, uintType);
16569 if (res != VKFFT_SUCCESS) {
16571 return res;
16572 }
// appendSinCos20 is only emitted for "double" when no lookup table (sc->LUT) is used.
16573 if ((!sc->LUT) && (!strcmp(floatType, "double"))) {
16574 res = appendSinCos20(sc, floatType, uintType);
16575 if (res != VKFFT_SUCCESS) {
16577 return res;
16578 }
16579 }
// Conversion helpers are emitted only when a memory type differs from floatType;
// the output conversion is skipped when it equals the input one already emitted.
16580 if (strcmp(floatType, floatTypeInputMemory)) {
16581 res = appendConversion(sc, floatType, floatTypeInputMemory);
16582 if (res != VKFFT_SUCCESS) {
16584 return res;
16585 }
16586 }
16587 if (strcmp(floatType, floatTypeOutputMemory) && strcmp(floatTypeInputMemory, floatTypeOutputMemory)) {
16588 res = appendConversion(sc, floatType, floatTypeOutputMemory);
16589 if (res != VKFFT_SUCCESS) {
16591 return res;
16592 }
16593 }
16594 res = appendPushConstantsVkFFT(sc, floatType, uintType);
16595 if (res != VKFFT_SUCCESS) {
16597 return res;
16598 }
// `id` is passed to each buffer-layout helper and incremented after each one:
// input, output, then the optional kernel, LUT and Bluestein layouts.
16599 uint64_t id = 0;
16600 res = appendInputLayoutVkFFT(sc, id, floatTypeInputMemory, type);
16601 if (res != VKFFT_SUCCESS) {
16603 return res;
16604 }
16605 id++;
16606 res = appendOutputLayoutVkFFT(sc, id, floatTypeOutputMemory, type);
16607 if (res != VKFFT_SUCCESS) {
16609 return res;
16610 }
16611 id++;
16612 if (sc->convolutionStep) {
16613 res = appendKernelLayoutVkFFT(sc, id, floatTypeKernelMemory);
16614 if (res != VKFFT_SUCCESS) {
16616 return res;
16617 }
16618 id++;
16619 }
16620 if (sc->LUT) {
16621 res = appendLUTLayoutVkFFT(sc, id, floatType);
16622 if (res != VKFFT_SUCCESS) {
16624 return res;
16625 }
16626 id++;
16627 }
16628 if (sc->useBluesteinFFT) {
16629 res = appendBluesteinLayoutVkFFT(sc, id, floatType);
16630 if (res != VKFFT_SUCCESS) {
16632 return res;
16633 }
// NOTE(review): listing lines 16634 and 16636 are absent here; the two increments
// below are presumably conditional in the upstream source - confirm.
16635 id++;
16637 id++;
16638 }
16639 //appendIndexInputVkFFT(sc, uintType, type);
16640 //appendIndexOutputVkFFT(sc, uintType, type);
16641 /*uint64_t appendedRadix[10] = { 0,0,0,0,0,0,0,0,0,0 };
16642 for (uint64_t i = 0; i < sc->numStages; i++) {
16643 if (appendedRadix[sc->stageRadix[i]] == 0) {
16644 appendedRadix[sc->stageRadix[i]] = 1;
16645 appendRadixKernelVkFFT(sc, floatType, uintType, sc->stageRadix[i]);
16646 }
16647 }*/
// locType is `type`, except that types 0, 5, 6, 110, 120, 130, 140, 142 and 144
// are mapped to 1 when sc->axisSwapped is set.
16648 uint64_t locType = (((type == 0) || (type == 5) || (type == 6) || (type == 110) || (type == 120) || (type == 130) || (type == 140) || (type == 142) || (type == 144)) && (sc->axisSwapped)) ? 1 : type;
// Kernel entry point. Backend 0 emits shared memory followed by "void main() {";
// backends 1-3 emit a VkFFT_main signature whose parameter list depends on `type`
// and on the convolution/LUT/Bluestein options, then the shared-memory declaration.
16649#if(VKFFT_BACKEND==0)
16650 res = appendSharedMemoryVkFFT(sc, floatType, uintType, locType);
16651 if (res != VKFFT_SUCCESS) {
16653 return res;
16654 }
16655 sc->tempLen = sprintf(sc->tempStr, "void main() {\n");
16656 res = VkAppendLine(sc);
16657 if (res != VKFFT_SUCCESS) {
16659 return res;
16660 }
16661#elif(VKFFT_BACKEND==1)
16662 sc->tempLen = sprintf(sc->tempStr, "extern __shared__ float shared[];\n");
16663 res = VkAppendLine(sc);
16664 if (res != VKFFT_SUCCESS) {
16666 return res;
16667 }
16668 sc->tempLen = sprintf(sc->tempStr, "extern \"C\" __global__ void __launch_bounds__(%" PRIu64 ") VkFFT_main ", sc->localSize[0] * sc->localSize[1] * sc->localSize[2]);
16669 res = VkAppendLine(sc);
16670 if (res != VKFFT_SUCCESS) {
16672 return res;
16673 }
// Type 5 uses a scalar input pointer, type 6 a scalar output pointer, types 110-145
// use scalar pointers on both sides; every other type uses the vector types.
16674 switch (type) {
16675 case 5:
16676 {
16677 sc->tempLen = sprintf(sc->tempStr, "(%s* inputs, %s* outputs", floatTypeInputMemory, vecTypeOutput);
16678 break;
16679 }
16680 case 6:
16681 {
16682 sc->tempLen = sprintf(sc->tempStr, "(%s* inputs, %s* outputs", vecTypeInput, floatTypeOutputMemory);
16683 break;
16684 }
16685 case 110:
16686 {
16687 sc->tempLen = sprintf(sc->tempStr, "(%s* inputs, %s* outputs", floatTypeInputMemory, floatTypeOutputMemory);
16688 break;
16689 }
16690 case 111:
16691 {
16692 sc->tempLen = sprintf(sc->tempStr, "(%s* inputs, %s* outputs", floatTypeInputMemory, floatTypeOutputMemory);
16693 break;
16694 }
16695 case 120:
16696 {
16697 sc->tempLen = sprintf(sc->tempStr, "(%s* inputs, %s* outputs", floatTypeInputMemory, floatTypeOutputMemory);
16698 break;
16699 }
16700 case 121:
16701 {
16702 sc->tempLen = sprintf(sc->tempStr, "(%s* inputs, %s* outputs", floatTypeInputMemory, floatTypeOutputMemory);
16703 break;
16704 }
16705 case 130:
16706 {
16707 sc->tempLen = sprintf(sc->tempStr, "(%s* inputs, %s* outputs", floatTypeInputMemory, floatTypeOutputMemory);
16708 break;
16709 }
16710 case 131:
16711 {
16712 sc->tempLen = sprintf(sc->tempStr, "(%s* inputs, %s* outputs", floatTypeInputMemory, floatTypeOutputMemory);
16713 break;
16714 }
16715 case 140:
16716 {
16717 sc->tempLen = sprintf(sc->tempStr, "(%s* inputs, %s* outputs", floatTypeInputMemory, floatTypeOutputMemory);
16718 break;
16719 }
16720 case 141:
16721 {
16722 sc->tempLen = sprintf(sc->tempStr, "(%s* inputs, %s* outputs", floatTypeInputMemory, floatTypeOutputMemory);
16723 break;
16724 }
16725 case 142:
16726 {
16727 sc->tempLen = sprintf(sc->tempStr, "(%s* inputs, %s* outputs", floatTypeInputMemory, floatTypeOutputMemory);
16728 break;
16729 }
16730 case 143:
16731 {
16732 sc->tempLen = sprintf(sc->tempStr, "(%s* inputs, %s* outputs", floatTypeInputMemory, floatTypeOutputMemory);
16733 break;
16734 }
16735 case 144:
16736 {
16737 sc->tempLen = sprintf(sc->tempStr, "(%s* inputs, %s* outputs", floatTypeInputMemory, floatTypeOutputMemory);
16738 break;
16739 }
16740 case 145:
16741 {
16742 sc->tempLen = sprintf(sc->tempStr, "(%s* inputs, %s* outputs", floatTypeInputMemory, floatTypeOutputMemory);
16743 break;
16744 }
16745 default:
16746 {
16747 sc->tempLen = sprintf(sc->tempStr, "(%s* inputs, %s* outputs", vecTypeInput, vecTypeOutput);
16748 break;
16749 }
16750 }
16751 res = VkAppendLine(sc);
16752 if (res != VKFFT_SUCCESS) {
16754 return res;
16755 }
16756
16757 if (sc->convolutionStep) {
16758 sc->tempLen = sprintf(sc->tempStr, ", %s* kernel_obj", vecType);
16759 res = VkAppendLine(sc);
16760 if (res != VKFFT_SUCCESS) {
16762 return res;
16763 }
16764 }
16765 if (sc->LUT) {
16766 sc->tempLen = sprintf(sc->tempStr, ", %s* twiddleLUT", vecType);
16767 res = VkAppendLine(sc);
16768 if (res != VKFFT_SUCCESS) {
16770 return res;
16771 }
16772 }
16773 if (sc->BluesteinConvolutionStep) {
16774 sc->tempLen = sprintf(sc->tempStr, ", %s* BluesteinConvolutionKernel", vecType);
16775 res = VkAppendLine(sc);
16776 if (res != VKFFT_SUCCESS) {
16778 return res;
16779 }
16780 }
// NOTE(review): the condition guarding the next parameter (listing line 16781) is absent from this extract.
16782 sc->tempLen = sprintf(sc->tempStr, ", %s* BluesteinMultiplication", vecType);
16783 res = VkAppendLine(sc);
16784 if (res != VKFFT_SUCCESS) {
16786 return res;
16787 }
16788 }
16789 sc->tempLen = sprintf(sc->tempStr, ") {\n");
16790 res = VkAppendLine(sc);
16791 if (res != VKFFT_SUCCESS) {
16793 return res;
16794 }
16795 //sc->tempLen = sprintf(sc->tempStr, ", const PushConsts consts) {\n");
16796 res = appendSharedMemoryVkFFT(sc, floatType, uintType, locType);
16797 if (res != VKFFT_SUCCESS) {
16799 return res;
16800 }
16801#elif(VKFFT_BACKEND==2)
16802 sc->tempLen = sprintf(sc->tempStr, "extern __shared__ float shared[];\n");
16803 res = VkAppendLine(sc);
16804 if (res != VKFFT_SUCCESS) {
16806 return res;
16807 }
16808 sc->tempLen = sprintf(sc->tempStr, "extern \"C\" __launch_bounds__(%" PRIu64 ") __global__ void VkFFT_main ", sc->localSize[0] * sc->localSize[1] * sc->localSize[2]);
16809 res = VkAppendLine(sc);
16810 if (res != VKFFT_SUCCESS) {
16812 return res;
16813 }
// Same parameter-type selection as in the backend 1 branch.
16814 switch (type) {
16815 case 5:
16816 {
16817 sc->tempLen = sprintf(sc->tempStr, "(%s* inputs, %s* outputs", floatTypeInputMemory, vecTypeOutput);
16818 break;
16819 }
16820 case 6:
16821 {
16822 sc->tempLen = sprintf(sc->tempStr, "(%s* inputs, %s* outputs", vecTypeInput, floatTypeOutputMemory);
16823 break;
16824 }
16825 case 110:
16826 {
16827 sc->tempLen = sprintf(sc->tempStr, "(%s* inputs, %s* outputs", floatTypeInputMemory, floatTypeOutputMemory);
16828 break;
16829 }
16830 case 111:
16831 {
16832 sc->tempLen = sprintf(sc->tempStr, "(%s* inputs, %s* outputs", floatTypeInputMemory, floatTypeOutputMemory);
16833 break;
16834 }
16835 case 120:
16836 {
16837 sc->tempLen = sprintf(sc->tempStr, "(%s* inputs, %s* outputs", floatTypeInputMemory, floatTypeOutputMemory);
16838 break;
16839 }
16840 case 121:
16841 {
16842 sc->tempLen = sprintf(sc->tempStr, "(%s* inputs, %s* outputs", floatTypeInputMemory, floatTypeOutputMemory);
16843 break;
16844 }
16845 case 130:
16846 {
16847 sc->tempLen = sprintf(sc->tempStr, "(%s* inputs, %s* outputs", floatTypeInputMemory, floatTypeOutputMemory);
16848 break;
16849 }
16850 case 131:
16851 {
16852 sc->tempLen = sprintf(sc->tempStr, "(%s* inputs, %s* outputs", floatTypeInputMemory, floatTypeOutputMemory);
16853 break;
16854 }
16855 case 140:
16856 {
16857 sc->tempLen = sprintf(sc->tempStr, "(%s* inputs, %s* outputs", floatTypeInputMemory, floatTypeOutputMemory);
16858 break;
16859 }
16860 case 141:
16861 {
16862 sc->tempLen = sprintf(sc->tempStr, "(%s* inputs, %s* outputs", floatTypeInputMemory, floatTypeOutputMemory);
16863 break;
16864 }
16865 case 142:
16866 {
16867 sc->tempLen = sprintf(sc->tempStr, "(%s* inputs, %s* outputs", floatTypeInputMemory, floatTypeOutputMemory);
16868 break;
16869 }
16870 case 143:
16871 {
16872 sc->tempLen = sprintf(sc->tempStr, "(%s* inputs, %s* outputs", floatTypeInputMemory, floatTypeOutputMemory);
16873 break;
16874 }
16875 case 144:
16876 {
16877 sc->tempLen = sprintf(sc->tempStr, "(%s* inputs, %s* outputs", floatTypeInputMemory, floatTypeOutputMemory);
16878 break;
16879 }
16880 case 145:
16881 {
16882 sc->tempLen = sprintf(sc->tempStr, "(%s* inputs, %s* outputs", floatTypeInputMemory, floatTypeOutputMemory);
16883 break;
16884 }
16885 default:
16886 {
16887 sc->tempLen = sprintf(sc->tempStr, "(%s* inputs, %s* outputs", vecTypeInput, vecTypeOutput);
16888 break;
16889 }
16890 }
16891 res = VkAppendLine(sc);
16892 if (res != VKFFT_SUCCESS) {
16894 return res;
16895 }
16896 if (sc->convolutionStep) {
16897 sc->tempLen = sprintf(sc->tempStr, ", %s* kernel_obj", vecType);
16898 res = VkAppendLine(sc);
16899 if (res != VKFFT_SUCCESS) {
16901 return res;
16902 }
16903 }
16904 if (sc->LUT) {
16905 sc->tempLen = sprintf(sc->tempStr, ", %s* twiddleLUT", vecType);
16906 res = VkAppendLine(sc);
16907 if (res != VKFFT_SUCCESS) {
16909 return res;
16910 }
16911 }
16912 if (sc->BluesteinConvolutionStep) {
16913 sc->tempLen = sprintf(sc->tempStr, ", %s* BluesteinConvolutionKernel", vecType);
16914 res = VkAppendLine(sc);
16915 if (res != VKFFT_SUCCESS) {
16917 return res;
16918 }
16919 }
// NOTE(review): the condition guarding the next parameter (listing line 16920) is absent from this extract.
16921 sc->tempLen = sprintf(sc->tempStr, ", %s* BluesteinMultiplication", vecType);
16922 res = VkAppendLine(sc);
16923 if (res != VKFFT_SUCCESS) {
16925 return res;
16926 }
16927 }
16928 sc->tempLen = sprintf(sc->tempStr, ") {\n");
16929 res = VkAppendLine(sc);
16930 if (res != VKFFT_SUCCESS) {
16932 return res;
16933 }
16934 //sc->tempLen = sprintf(sc->tempStr, ", const PushConsts consts) {\n");
16935 res = appendSharedMemoryVkFFT(sc, floatType, uintType, locType);
16936 if (res != VKFFT_SUCCESS) {
16938 return res;
16939 }
16940#elif(VKFFT_BACKEND==3)
16941 sc->tempLen = sprintf(sc->tempStr, "__kernel __attribute__((reqd_work_group_size(%" PRIu64 ", %" PRIu64 ", %" PRIu64 "))) void VkFFT_main ", sc->localSize[0], sc->localSize[1], sc->localSize[2]);
16942 res = VkAppendLine(sc);
16943 if (res != VKFFT_SUCCESS) {
16945 return res;
16946 }
// Same parameter-type selection as above, with every pointer qualified __global.
16947 switch (type) {
16948 case 5:
16949 {
16950 sc->tempLen = sprintf(sc->tempStr, "(__global %s* inputs, __global %s* outputs", floatTypeInputMemory, vecTypeOutput);
16951 break;
16952 }
16953 case 6:
16954 {
16955 sc->tempLen = sprintf(sc->tempStr, "(__global %s* inputs, __global %s* outputs", vecTypeInput, floatTypeOutputMemory);
16956 break;
16957 }
16958 case 110:
16959 {
16960 sc->tempLen = sprintf(sc->tempStr, "(__global %s* inputs, __global %s* outputs", floatTypeInputMemory, floatTypeOutputMemory);
16961 break;
16962 }
16963 case 111:
16964 {
16965 sc->tempLen = sprintf(sc->tempStr, "(__global %s* inputs, __global %s* outputs", floatTypeInputMemory, floatTypeOutputMemory);
16966 break;
16967 }
16968 case 120:
16969 {
16970 sc->tempLen = sprintf(sc->tempStr, "(__global %s* inputs, __global %s* outputs", floatTypeInputMemory, floatTypeOutputMemory);
16971 break;
16972 }
16973 case 121:
16974 {
16975 sc->tempLen = sprintf(sc->tempStr, "(__global %s* inputs, __global %s* outputs", floatTypeInputMemory, floatTypeOutputMemory);
16976 break;
16977 }
16978 case 130:
16979 {
16980 sc->tempLen = sprintf(sc->tempStr, "(__global %s* inputs, __global %s* outputs", floatTypeInputMemory, floatTypeOutputMemory);
16981 break;
16982 }
16983 case 131:
16984 {
16985 sc->tempLen = sprintf(sc->tempStr, "(__global %s* inputs, __global %s* outputs", floatTypeInputMemory, floatTypeOutputMemory);
16986 break;
16987 }
16988 case 140:
16989 {
16990 sc->tempLen = sprintf(sc->tempStr, "(__global %s* inputs, __global %s* outputs", floatTypeInputMemory, floatTypeOutputMemory);
16991 break;
16992 }
16993 case 141:
16994 {
16995 sc->tempLen = sprintf(sc->tempStr, "(__global %s* inputs, __global %s* outputs", floatTypeInputMemory, floatTypeOutputMemory);
16996 break;
16997 }
16998 case 142:
16999 {
17000 sc->tempLen = sprintf(sc->tempStr, "(__global %s* inputs, __global %s* outputs", floatTypeInputMemory, floatTypeOutputMemory);
17001 break;
17002 }
17003 case 143:
17004 {
17005 sc->tempLen = sprintf(sc->tempStr, "(__global %s* inputs, __global %s* outputs", floatTypeInputMemory, floatTypeOutputMemory);
17006 break;
17007 }
17008 case 144:
17009 {
17010 sc->tempLen = sprintf(sc->tempStr, "(__global %s* inputs, __global %s* outputs", floatTypeInputMemory, floatTypeOutputMemory);
17011 break;
17012 }
17013 case 145:
17014 {
17015 sc->tempLen = sprintf(sc->tempStr, "(__global %s* inputs, __global %s* outputs", floatTypeInputMemory, floatTypeOutputMemory);
17016 break;
17017 }
17018 default:
17019 {
17020 sc->tempLen = sprintf(sc->tempStr, "(__global %s* inputs, __global %s* outputs", vecTypeInput, vecTypeOutput);
17021 break;
17022 }
17023 }
17024 res = VkAppendLine(sc);
17025 if (res != VKFFT_SUCCESS) {
17027 return res;
17028 }
17029 if (sc->convolutionStep) {
17030 sc->tempLen = sprintf(sc->tempStr, ", __global %s* kernel_obj", vecType);
17031 res = VkAppendLine(sc);
17032 if (res != VKFFT_SUCCESS) {
17034 return res;
17035 }
17036 }
17037 if (sc->LUT) {
17038 sc->tempLen = sprintf(sc->tempStr, ", __global %s* twiddleLUT", vecType);
17039 res = VkAppendLine(sc);
17040 if (res != VKFFT_SUCCESS) {
17042 return res;
17043 }
17044 }
17045 if (sc->BluesteinConvolutionStep) {
17046 sc->tempLen = sprintf(sc->tempStr, ", __global %s* BluesteinConvolutionKernel", vecType);
17047 res = VkAppendLine(sc);
17048 if (res != VKFFT_SUCCESS) {
17050 return res;
17051 }
17052 }
// NOTE(review): the condition guarding the next parameter (listing line 17053) is absent from this extract.
17054 sc->tempLen = sprintf(sc->tempStr, ", __global %s* BluesteinMultiplication", vecType);
17055 res = VkAppendLine(sc);
17056 if (res != VKFFT_SUCCESS) {
17058 return res;
17059 }
17060 }
// Only this backend passes the push constants as an explicit kernel argument.
17061 sc->tempLen = sprintf(sc->tempStr, ", PushConsts consts");
17062 res = VkAppendLine(sc);
17063 if (res != VKFFT_SUCCESS) {
17065 return res;
17066 }
17067 sc->tempLen = sprintf(sc->tempStr, ") {\n");
17068 res = VkAppendLine(sc);
17069 if (res != VKFFT_SUCCESS) {
17071 return res;
17072 }
17073 //sc->tempLen = sprintf(sc->tempStr, ", const PushConsts consts) {\n");
17074 res = appendSharedMemoryVkFFT(sc, floatType, uintType, locType);
17075 if (res != VKFFT_SUCCESS) {
17077 return res;
17078 }
17079#endif
17080 //if (type==0) sc->tempLen = sprintf(sc->tempStr, "return;\n");
// Kernel body: initialisation, then the read/write register configuration.
17081 res = appendInitialization(sc, floatType, uintType, type);
17082 if (res != VKFFT_SUCCESS) {
17084 return res;
17085 }
17086 res = setReadToRegisters(sc, type);
17087 if (res != VKFFT_SUCCESS) {
17089 return res;
17090 }
17091 res = setWriteFromRegisters(sc, type);
17092 if (res != VKFFT_SUCCESS) {
17094 return res;
17095 }
// For matrix convolution the generated code opens a loop over `coordinate`; it is
// closed again by the " coordinate++;}" line emitted after the forward stages.
17096 if ((sc->convolutionStep) && (sc->matrixConvolution > 1)) {
17097 sc->tempLen = sprintf(sc->tempStr, " for (%s coordinate=%" PRIu64 "; coordinate > 0; coordinate--){\n\
17098 coordinate--;\n", uintType, sc->matrixConvolution);
17099 res = VkAppendLine(sc);
17100 if (res != VKFFT_SUCCESS) {
17102 return res;
17103 }
17104 }
17105 res = appendReadDataVkFFT(sc, floatType, floatTypeInputMemory, uintType, type);
17106 if (res != VKFFT_SUCCESS) {
17108 return res;
17109 }
// NOTE(review): the condition guarding this call (listing line 17110) is absent from this extract.
17111 res = appendBluesteinMultiplication(sc, floatType, uintType, locType, 0);
17112 if (res != VKFFT_SUCCESS) {
17114 return res;
17115 }
17116 }
17117 //appendBarrierVkFFT(sc, 1);
17118 res = appendReorder4StepRead(sc, floatType, uintType, locType);
17119 if (res != VKFFT_SUCCESS) {
17121 return res;
17122 }
17123 res = appendBoostThreadDataReorder(sc, floatType, uintType, locType, 1);
17124 if (res != VKFFT_SUCCESS) {
17126 return res;
17127 }
// Radix stages. stageSize is multiplied and stageAngle divided by the radix after
// each stage; the starting angle is +pi when sc->inverse is set and -pi otherwise.
17128 uint64_t stageSize = 1;
17129 uint64_t stageSizeSum = 0;
17130 double PI_const = 3.1415926535897932384626433832795;
17131 double stageAngle = (sc->inverse) ? PI_const : -PI_const;
17132 for (uint64_t i = 0; i < sc->numStages; i++) {
// Last stage with register boost: radix stage followed by the register-boost
// shuffle. This path reads stageRadix[i - 1].
17133 if ((i == sc->numStages - 1) && (sc->registerBoost > 1)) {
17134 res = appendRadixStage(sc, floatType, uintType, stageSize, stageSizeSum, stageAngle, sc->stageRadix[i], locType);
17135 if (res != VKFFT_SUCCESS) {
17137 return res;
17138 }
17139 res = appendRegisterBoostShuffle(sc, floatType, stageSize, sc->stageRadix[i - 1], sc->stageRadix[i], stageAngle);
17140 if (res != VKFFT_SUCCESS) {
17142 return res;
17143 }
17144 }
17145 else {
17146
17147 res = appendRadixStage(sc, floatType, uintType, stageSize, stageSizeSum, stageAngle, sc->stageRadix[i], locType);
17148 if (res != VKFFT_SUCCESS) {
17150 return res;
17151 }
// Per-radix increment of stageSizeSum; radices not listed leave it unchanged.
17152 switch (sc->stageRadix[i]) {
17153 case 2:
17154 stageSizeSum += stageSize;
17155 break;
17156 case 3:
17157 stageSizeSum += stageSize * 2;
17158 break;
17159 case 4:
17160 stageSizeSum += stageSize * 2;
17161 break;
17162 case 5:
17163 stageSizeSum += stageSize * 4;
17164 break;
17165 case 7:
17166 stageSizeSum += stageSize * 6;
17167 break;
17168 case 8:
17169 stageSizeSum += stageSize * 3;
17170 break;
17171 case 11:
17172 stageSizeSum += stageSize * 10;
17173 break;
17174 case 13:
17175 stageSizeSum += stageSize * 12;
17176 break;
17177 }
// The shuffle receives the next stage's radix, or the current radix again on the last stage.
17178 if (i == sc->numStages - 1) {
17179 res = appendRadixShuffle(sc, floatType, uintType, stageSize, stageSizeSum, stageAngle, sc->stageRadix[i], sc->stageRadix[i], locType);
17180 if (res != VKFFT_SUCCESS) {
17182 return res;
17183 }
17184 }
17185 else {
17186 res = appendRadixShuffle(sc, floatType, uintType, stageSize, stageSizeSum, stageAngle, sc->stageRadix[i], sc->stageRadix[i + 1], locType);
17187 if (res != VKFFT_SUCCESS) {
17189 return res;
17190 }
17191 }
17192 stageSize *= sc->stageRadix[i];
17193 stageAngle /= sc->stageRadix[i];
17194 }
17195 }
17196
// Convolution path: store registers, apply the kernel, then run a second pass of
// radix stages with sc->inverse forced to 1 and the angle restarted at +pi.
17197 if ((sc->convolutionStep) || (sc->useBluesteinFFT && sc->BluesteinConvolutionStep)) {
17198 res = appendCoordinateRegisterStore(sc, locType);
17199 if (res != VKFFT_SUCCESS) {
17201 return res;
17202 }
17203
17204 if (sc->matrixConvolution > 1) {
17205 sc->tempLen = sprintf(sc->tempStr, " coordinate++;}\n");
17206 res = VkAppendLine(sc);
17207 if (res != VKFFT_SUCCESS) {
17209 return res;
17210 }
17211 }
17212 if (sc->numKernels > 1) {
17213 res = appendPreparationBatchedKernelConvolution(sc, floatType, floatTypeKernelMemory, uintType, locType);
17214 if (res != VKFFT_SUCCESS) {
17216 return res;
17217 }
17218 }
// NOTE(review): the condition selecting between the two convolution variants
// (listing line 17219) is absent from this extract.
17220 {
17221 res = appendBluesteinConvolution(sc, floatType, floatTypeKernelMemory, uintType, locType);
17222 if (res != VKFFT_SUCCESS) {
17224 return res;
17225 }
17226 }
17227 else {
17228 res = appendKernelConvolution(sc, floatType, floatTypeKernelMemory, uintType, locType);
17229 if (res != VKFFT_SUCCESS) {
17231 return res;
17232 }
17233 }
17234 if (sc->matrixConvolution > 1) {
17235 sc->tempLen = sprintf(sc->tempStr, " for (%s coordinate=0; coordinate < %" PRIu64 "; coordinate++){\n", uintType, sc->matrixConvolution);
17236 res = VkAppendLine(sc);
17237 if (res != VKFFT_SUCCESS) {
17239 return res;
17240 }
17241 }
17242 res = appendCoordinateRegisterPull(sc, locType);
17243 if (res != VKFFT_SUCCESS) {
17245 return res;
17246 }
17247
17248 stageSize = 1;
17249 stageSizeSum = 0;
17250 stageAngle = PI_const;
17251 sc->inverse = 1;
17252 for (uint64_t i = 0; i < sc->numStages; i++) {
17253 res = appendRadixStage(sc, floatType, uintType, stageSize, stageSizeSum, stageAngle, sc->stageRadix[i], locType);
17254 if (res != VKFFT_SUCCESS) {
17256 return res;
17257 }
17258 switch (sc->stageRadix[i]) {
17259 case 2:
17260 stageSizeSum += stageSize;
17261 break;
17262 case 3:
17263 stageSizeSum += stageSize * 2;
17264 break;
17265 case 4:
17266 stageSizeSum += stageSize * 2;
17267 break;
17268 case 5:
17269 stageSizeSum += stageSize * 4;
17270 break;
17271 case 7:
17272 stageSizeSum += stageSize * 6;
17273 break;
17274 case 8:
17275 stageSizeSum += stageSize * 3;
17276 break;
17277 case 11:
17278 stageSizeSum += stageSize * 10;
17279 break;
17280 case 13:
17281 stageSizeSum += stageSize * 12;
17282 break;
17283 }
17284 if (i == sc->numStages - 1) {
17285 res = appendRadixShuffle(sc, floatType, uintType, stageSize, stageSizeSum, stageAngle, sc->stageRadix[i], sc->stageRadix[i], locType);
17286 if (res != VKFFT_SUCCESS) {
17288 return res;
17289 }
17290 }
17291 else {
17292 res = appendRadixShuffle(sc, floatType, uintType, stageSize, stageSizeSum, stageAngle, sc->stageRadix[i], sc->stageRadix[i + 1], locType);
17293 if (res != VKFFT_SUCCESS) {
17295 return res;
17296 }
17297 }
17298 stageSize *= sc->stageRadix[i];
17299 stageAngle /= sc->stageRadix[i];
17300 }
17301 }
// Output side: thread-data reorder, four-step reorder, then the write-out.
17302 res = appendBoostThreadDataReorder(sc, floatType, uintType, locType, 0);
17303 if (res != VKFFT_SUCCESS) {
17305 return res;
17306 }
17307 res = appendReorder4StepWrite(sc, floatType, uintType, locType);
17308 if (res != VKFFT_SUCCESS) {
17310 return res;
17311 }
// NOTE(review): the condition guarding this call (listing line 17312) is absent from this extract.
17313 res = appendBluesteinMultiplication(sc, floatType, uintType, locType, 1);
17314 if (res != VKFFT_SUCCESS) {
17316 return res;
17317 }
17318 }
17319 res = appendWriteDataVkFFT(sc, floatType, floatTypeOutputMemory, uintType, type);
17320 if (res != VKFFT_SUCCESS) {
17322 return res;
17323 }
// Emit the closing braces: one each for the matrix-convolution and multi-kernel
// cases when they apply, then the final one.
17324 if ((sc->convolutionStep) && (sc->matrixConvolution > 1))
17325 {
17326 sc->tempLen = sprintf(sc->tempStr, " }\n");
17327 res = VkAppendLine(sc);
17328 if (res != VKFFT_SUCCESS) {
17330 return res;
17331 }
17332 }
17333 if ((sc->convolutionStep) && (sc->numKernels > 1))
17334 {
17335 sc->tempLen = sprintf(sc->tempStr, " }\n");
17336 res = VkAppendLine(sc);
17337 if (res != VKFFT_SUCCESS) {
17339 return res;
17340 }
17341 }
17342 sc->tempLen = sprintf(sc->tempStr, "}\n");
17343 res = VkAppendLine(sc);
17344 if (res != VKFFT_SUCCESS) {
17346 return res;
17347 }
17349 //if (sc->useBluesteinFFT)
17350 //printf("%s", output);
17351 return res;
17352}
17353#if(VKFFT_BACKEND==0)
// Selects a device memory type for an allocation.
//   app             - supplies configuration.physicalDevice[0], whose memory properties are queried.
//   memoryTypeBits  - bit i set means memory type i is acceptable.
//   memorySize      - minimum size, in bytes, that the heap backing the type must have.
//   properties      - property flags that must all be present on the chosen type.
//   memoryTypeIndex - out: memoryTypeIndex[0] receives the index of the first matching type.
// Returns VKFFT_SUCCESS as soon as a matching type is found.
// NOTE(review): this listing shows no return statement after the loop (listing line
// 17366 is absent) - confirm the no-match return value against the upstream header.
17354static inline VkFFTResult findMemoryType(VkFFTApplication* app, uint64_t memoryTypeBits, uint64_t memorySize, VkMemoryPropertyFlags properties, uint32_t* memoryTypeIndex) {
17355 VkPhysicalDeviceMemoryProperties memoryProperties = { 0 };
17356
17357 vkGetPhysicalDeviceMemoryProperties(app->configuration.physicalDevice[0], &memoryProperties);
17358
// First type that is allowed by the mask, has every requested property flag,
// and sits on a heap of at least memorySize bytes.
17359 for (uint64_t i = 0; i < memoryProperties.memoryTypeCount; ++i) {
17360 if ((memoryTypeBits & ((uint64_t)1 << i)) && ((memoryProperties.memoryTypes[i].propertyFlags & properties) == properties) && (memoryProperties.memoryHeaps[memoryProperties.memoryTypes[i].heapIndex].size >= memorySize))
17361 {
17362 memoryTypeIndex[0] = (uint32_t)i;
17363 return VKFFT_SUCCESS;
17364 }
17365 }
17367}
17368static inline VkFFTResult allocateFFTBuffer(VkFFTApplication* app, VkBuffer* buffer, VkDeviceMemory* deviceMemory, VkBufferUsageFlags usageFlags, VkMemoryPropertyFlags propertyFlags, VkDeviceSize size) {
17369 VkFFTResult resFFT = VKFFT_SUCCESS;
17370 VkResult res = VK_SUCCESS;
17371 uint32_t queueFamilyIndices;
17372 VkBufferCreateInfo bufferCreateInfo = { VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO };
17373 bufferCreateInfo.sharingMode = VK_SHARING_MODE_EXCLUSIVE;
17374 bufferCreateInfo.queueFamilyIndexCount = 1;
17375 bufferCreateInfo.pQueueFamilyIndices = &queueFamilyIndices;
17376 bufferCreateInfo.size = size;
17377 bufferCreateInfo.usage = usageFlags;
17378 res = vkCreateBuffer(app->configuration.device[0], &bufferCreateInfo, 0, buffer);
17379 if (res != VK_SUCCESS) return VKFFT_ERROR_FAILED_TO_CREATE_BUFFER;
17380 VkMemoryRequirements memoryRequirements = { 0 };
17381 vkGetBufferMemoryRequirements(app->configuration.device[0], buffer[0], &memoryRequirements);
17382 VkMemoryAllocateInfo memoryAllocateInfo = { VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO };
17383 memoryAllocateInfo.allocationSize = memoryRequirements.size;
17384 resFFT = findMemoryType(app, memoryRequirements.memoryTypeBits, memoryRequirements.size, propertyFlags, &memoryAllocateInfo.memoryTypeIndex);
17385 if (resFFT != VKFFT_SUCCESS) return resFFT;
17386 res = vkAllocateMemory(app->configuration.device[0], &memoryAllocateInfo, 0, deviceMemory);
17387 if (res != VK_SUCCESS) return VKFFT_ERROR_FAILED_TO_ALLOCATE_MEMORY;
17388 res = vkBindBufferMemory(app->configuration.device[0], buffer[0], deviceMemory[0], 0);
17389 if (res != VK_SUCCESS) return VKFFT_ERROR_FAILED_TO_BIND_BUFFER_MEMORY;
17390 return resFFT;
17391}
17392static inline VkFFTResult transferDataFromCPU(VkFFTApplication* app, void* arr, VkBuffer* buffer, VkDeviceSize bufferSize) {
17393 VkResult res = VK_SUCCESS;
17394 VkFFTResult resFFT = VKFFT_SUCCESS;
17395 VkDeviceSize stagingBufferSize = bufferSize;
17396 VkBuffer stagingBuffer = { 0 };
17397 VkDeviceMemory stagingBufferMemory = { 0 };
17398 resFFT = allocateFFTBuffer(app, &stagingBuffer, &stagingBufferMemory, VK_BUFFER_USAGE_TRANSFER_SRC_BIT, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT, stagingBufferSize);
17399 if (resFFT != VKFFT_SUCCESS) return resFFT;
17400 void* data;
17401 res = vkMapMemory(app->configuration.device[0], stagingBufferMemory, 0, stagingBufferSize, 0, &data);
17402 if (res != VK_SUCCESS) return VKFFT_ERROR_FAILED_TO_MAP_MEMORY;
17403 memcpy(data, arr, stagingBufferSize);
17404 vkUnmapMemory(app->configuration.device[0], stagingBufferMemory);
17405 VkCommandBufferAllocateInfo commandBufferAllocateInfo = { VK_STRUCTURE_TYPE_COMMAND_BUFFER_ALLOCATE_INFO };
17406 commandBufferAllocateInfo.commandPool = app->configuration.commandPool[0];
17407 commandBufferAllocateInfo.level = VK_COMMAND_BUFFER_LEVEL_PRIMARY;
17408 commandBufferAllocateInfo.commandBufferCount = 1;
17409 VkCommandBuffer commandBuffer = { 0 };
17410 res = vkAllocateCommandBuffers(app->configuration.device[0], &commandBufferAllocateInfo, &commandBuffer);
17411 if (res != VK_SUCCESS) return VKFFT_ERROR_FAILED_TO_ALLOCATE_COMMAND_BUFFERS;
17412 VkCommandBufferBeginInfo commandBufferBeginInfo = { VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO };
17413 commandBufferBeginInfo.flags = VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT;
17414 res = vkBeginCommandBuffer(commandBuffer, &commandBufferBeginInfo);
17415 if (res != VK_SUCCESS) return VKFFT_ERROR_FAILED_TO_BEGIN_COMMAND_BUFFER;
17416 VkBufferCopy copyRegion = { 0 };
17417 copyRegion.srcOffset = 0;
17418 copyRegion.dstOffset = 0;
17419 copyRegion.size = stagingBufferSize;
17420 vkCmdCopyBuffer(commandBuffer, stagingBuffer, buffer[0], 1, &copyRegion);
17421 res = vkEndCommandBuffer(commandBuffer);
17422 if (res != VK_SUCCESS) return VKFFT_ERROR_FAILED_TO_END_COMMAND_BUFFER;
17423 VkSubmitInfo submitInfo = { VK_STRUCTURE_TYPE_SUBMIT_INFO };
17424 submitInfo.commandBufferCount = 1;
17425 submitInfo.pCommandBuffers = &commandBuffer;
17426 res = vkQueueSubmit(app->configuration.queue[0], 1, &submitInfo, app->configuration.fence[0]);
17427 if (res != VK_SUCCESS) return VKFFT_ERROR_FAILED_TO_SUBMIT_QUEUE;
17428 res = vkWaitForFences(app->configuration.device[0], 1, app->configuration.fence, VK_TRUE, 100000000000);
17429 if (res != VK_SUCCESS) return VKFFT_ERROR_FAILED_TO_WAIT_FOR_FENCES;
17430 res = vkResetFences(app->configuration.device[0], 1, app->configuration.fence);
17431 if (res != VK_SUCCESS) return VKFFT_ERROR_FAILED_TO_RESET_FENCES;
17432 vkFreeCommandBuffers(app->configuration.device[0], app->configuration.commandPool[0], 1, &commandBuffer);
17433 vkDestroyBuffer(app->configuration.device[0], stagingBuffer, 0);
17434 vkFreeMemory(app->configuration.device[0], stagingBufferMemory, 0);
17435 return resFFT;
17436}
17437#endif
17438static inline void deleteAxis(VkFFTApplication* app, VkFFTAxis* axis) {
17439#if(VKFFT_BACKEND==0)
17440 if ((app->configuration.useLUT) && (!axis->referenceLUT)) {
17441 if (axis->bufferLUT != 0) {
17442 vkDestroyBuffer(app->configuration.device[0], axis->bufferLUT, 0);
17443 axis->bufferLUT = 0;
17444 }
17445 if (axis->bufferLUTDeviceMemory != 0) {
17446 vkFreeMemory(app->configuration.device[0], axis->bufferLUTDeviceMemory, 0);
17447 axis->bufferLUTDeviceMemory = 0;
17448 }
17449 }
17450 if (axis->descriptorPool != 0) {
17451 vkDestroyDescriptorPool(app->configuration.device[0], axis->descriptorPool, 0);
17452 axis->descriptorPool = 0;
17453 }
17454 if (axis->descriptorSetLayout != 0) {
17455 vkDestroyDescriptorSetLayout(app->configuration.device[0], axis->descriptorSetLayout, 0);
17456 axis->descriptorSetLayout = 0;
17457 }
17458 if (axis->pipelineLayout != 0) {
17459 vkDestroyPipelineLayout(app->configuration.device[0], axis->pipelineLayout, 0);
17460 axis->pipelineLayout = 0;
17461 }
17462 if (axis->pipeline != 0) {
17463 vkDestroyPipeline(app->configuration.device[0], axis->pipeline, 0);
17464 axis->pipeline = 0;
17465 }
17466#elif(VKFFT_BACKEND==1)
17467 CUresult res = CUDA_SUCCESS;
17468 cudaError_t res_t = cudaSuccess;
17469 if ((app->configuration.useLUT) && (!axis->referenceLUT) && (axis->bufferLUT != 0)) {
17470 res_t = cudaFree(axis->bufferLUT);
17471 axis->bufferLUT = 0;
17472 }
17473 if (axis->VkFFTModule != 0) {
17474 res = cuModuleUnload(axis->VkFFTModule);
17475 axis->VkFFTModule = 0;
17476 }
17477#elif(VKFFT_BACKEND==2)
17478 hipError_t res = hipSuccess;
17479 if ((app->configuration.useLUT) && (!axis->referenceLUT) && (axis->bufferLUT != 0)) {
17480 res = hipFree(axis->bufferLUT);
17481 axis->bufferLUT = 0;
17482 }
17483 if (axis->VkFFTModule != 0) {
17484 res = hipModuleUnload(axis->VkFFTModule);
17485 axis->VkFFTModule = 0;
17486 }
17487#elif(VKFFT_BACKEND==3)
17488 cl_int res = 0;
17489 if ((app->configuration.useLUT) && (!axis->referenceLUT) && (axis->bufferLUT != 0)) {
17490 res = clReleaseMemObject(axis->bufferLUT);
17491 axis->bufferLUT = 0;
17492 }
17493 if (axis->program != 0) {
17494 res = clReleaseProgram(axis->program);
17495 axis->program = 0;
17496 }
17497 if (axis->kernel != 0) {
17498 res = clReleaseKernel(axis->kernel);
17499 axis->kernel = 0;
17500 }
17501#endif
17502}
// Tear down a VkFFT application: releases backend-specific global state, the
// temporary buffer (unless the user supplied it), the per-dimension Bluestein
// buffers, every axis of the forward and inverse plans (via deleteAxis), and
// finally the plan structures themselves. Handles and pointers are set to 0
// after being released.
//
// NOTE(review): this listing is missing source lines - the embedded line
// numbers skip 17505, 17507, 17540-17541, 17549, 17575, 17595, 17603, 17648,
// 17657, 17665 and 17673-17674 - so the braces below do not balance as shown.
// Check against the upstream header before editing the code.
17503static inline void deleteVkFFT(VkFFTApplication* app) {
17504#if(VKFFT_BACKEND==0)
 // Vulkan: shut down glslang. The statement that opens the block closed on
 // the next numbered line is among the lines missing from this listing.
17506 glslang_finalize_process();
17508 }
17509#elif(VKFFT_BACKEND==1)
 // CUDA: with more than one stream configured, destroy each non-zero stream
 // event and then free the event array itself. Status codes are not checked.
17510 CUresult res = CUDA_SUCCESS;
17511 cudaError_t res_t = cudaSuccess;
17512 if (app->configuration.num_streams > 1) {
17513 for (uint64_t i = 0; i < app->configuration.num_streams; i++) {
17514 if (app->configuration.stream_event[i] != 0) {
17515 res_t = cudaEventDestroy(app->configuration.stream_event[i]);
17516 app->configuration.stream_event[i] = 0;
17517 }
17518 }
17519 if (app->configuration.stream_event != 0) {
17520 free(app->configuration.stream_event);
17521 app->configuration.stream_event = 0;
17522 }
17523 }
17524#elif(VKFFT_BACKEND==2)
 // HIP: same stream-event teardown as the CUDA branch.
17525 hipError_t res_t = hipSuccess;
17526 if (app->configuration.num_streams > 1) {
17527 for (uint64_t i = 0; i < app->configuration.num_streams; i++) {
17528 if (app->configuration.stream_event[i] != 0) {
17529 res_t = hipEventDestroy(app->configuration.stream_event[i]);
17530 app->configuration.stream_event[i] = 0;
17531 }
17532 }
17533 if (app->configuration.stream_event != 0) {
17534 free(app->configuration.stream_event);
17535 app->configuration.stream_event = 0;
17536 }
17537 }
17538#endif
 // The temporary buffer is released only when it was not supplied by the user.
17539 if (!app->configuration.userTempBuffer) {
17542#if(VKFFT_BACKEND==0)
17543 if (app->configuration.tempBuffer[0] != 0) {
17544 vkDestroyBuffer(app->configuration.device[0], app->configuration.tempBuffer[0], 0);
17545 app->configuration.tempBuffer[0] = 0;
17546 }
17547 if (app->configuration.tempBufferDeviceMemory != 0) {
17548 vkFreeMemory(app->configuration.device[0], app->configuration.tempBufferDeviceMemory, 0);
17550 }
17551#elif(VKFFT_BACKEND==1)
17552 if (app->configuration.tempBuffer[0] != 0) {
17553 res_t = cudaFree(app->configuration.tempBuffer[0]);
17554 app->configuration.tempBuffer[0] = 0;
17555 }
17556#elif(VKFFT_BACKEND==2)
17557 if (app->configuration.tempBuffer[0] != 0) {
17558 res_t = hipFree(app->configuration.tempBuffer[0]);
17559 app->configuration.tempBuffer[0] = 0;
17560 }
17561#elif(VKFFT_BACKEND==3)
17562 cl_int res = 0;
17563 if (app->configuration.tempBuffer[0] != 0) {
17564 res = clReleaseMemObject(app->configuration.tempBuffer[0]);
17565 app->configuration.tempBuffer[0] = 0;
17566 }
17567#endif
 // tempBuffer is itself a heap-allocated array of handles; free the array
 // after the handle it holds has been released.
17568 if (app->configuration.tempBuffer != 0) {
17569 free(app->configuration.tempBuffer);
17570 app->configuration.tempBuffer = 0;
17571 }
17572 }
17573 if (app->configuration.tempBufferSize != 0) {
17574 free(app->configuration.tempBufferSize);
17576 }
17577 }
 // Release the Bluestein buffers of every dimension that uses Bluestein's
 // algorithm: the base buffer plus its FFT and IFFT companions (and, on
 // Vulkan, the device memory behind each).
17578 for (uint64_t i = 0; i < app->configuration.FFTdim; i++) {
17579 if (app->useBluesteinFFT[i]) {
17580#if(VKFFT_BACKEND==0)
17581 if (app->bufferBluestein[i] != 0) {
17582 vkDestroyBuffer(app->configuration.device[0], app->bufferBluestein[i], 0);
17583 app->bufferBluestein[i] = 0;
17584 }
17585 if (app->bufferBluesteinDeviceMemory[i] != 0) {
17586 vkFreeMemory(app->configuration.device[0], app->bufferBluesteinDeviceMemory[i], 0);
17587 app->bufferBluesteinDeviceMemory[i] = 0;
17588 }
17589 if (app->bufferBluesteinFFT[i] != 0) {
17590 vkDestroyBuffer(app->configuration.device[0], app->bufferBluesteinFFT[i], 0);
17591 app->bufferBluesteinFFT[i] = 0;
17592 }
17593 if (app->bufferBluesteinFFTDeviceMemory[i] != 0) {
17594 vkFreeMemory(app->configuration.device[0], app->bufferBluesteinFFTDeviceMemory[i], 0);
17596 }
17597 if (app->bufferBluesteinIFFT[i] != 0) {
17598 vkDestroyBuffer(app->configuration.device[0], app->bufferBluesteinIFFT[i], 0);
17599 app->bufferBluesteinIFFT[i] = 0;
17600 }
17601 if (app->bufferBluesteinIFFTDeviceMemory[i] != 0) {
17602 vkFreeMemory(app->configuration.device[0], app->bufferBluesteinIFFTDeviceMemory[i], 0);
17604 }
17605#elif(VKFFT_BACKEND==1)
17606 if (app->bufferBluestein[i] != 0) {
17607 res_t = cudaFree(app->bufferBluestein[i]);
17608 app->bufferBluestein[i] = 0;
17609 }
17610 if (app->bufferBluesteinFFT[i] != 0) {
17611 res_t = cudaFree(app->bufferBluesteinFFT[i]);
17612 app->bufferBluesteinFFT[i] = 0;
17613 }
17614 if (app->bufferBluesteinIFFT[i] != 0) {
17615 res_t = cudaFree(app->bufferBluesteinIFFT[i]);
17616 app->bufferBluesteinIFFT[i] = 0;
17617 }
17618#elif(VKFFT_BACKEND==2)
17619 if (app->bufferBluestein[i] != 0) {
17620 res_t = hipFree(app->bufferBluestein[i]);
17621 app->bufferBluestein[i] = 0;
17622 }
17623 if (app->bufferBluesteinFFT[i] != 0) {
17624 res_t = hipFree(app->bufferBluesteinFFT[i]);
17625 app->bufferBluesteinFFT[i] = 0;
17626 }
17627 if (app->bufferBluesteinIFFT[i] != 0) {
17628 res_t = hipFree(app->bufferBluesteinIFFT[i]);
17629 app->bufferBluesteinIFFT[i] = 0;
17630 }
17631#elif(VKFFT_BACKEND==3)
17632 cl_int res = 0;
17633 if (app->bufferBluestein[i] != 0) {
17634 res = clReleaseMemObject(app->bufferBluestein[i]);
17635 app->bufferBluestein[i] = 0;
17636 }
17637 if (app->bufferBluesteinFFT[i] != 0) {
17638 res = clReleaseMemObject(app->bufferBluesteinFFT[i]);
17639 app->bufferBluesteinFFT[i] = 0;
17640 }
17641 if (app->bufferBluesteinIFFT[i] != 0) {
17642 res = clReleaseMemObject(app->bufferBluesteinIFFT[i]);
17643 app->bufferBluesteinIFFT[i] = 0;
17644 }
17645#endif
17646 }
17647 }
 // Forward plan: delete every uploaded axis of every dimension, then free
 // the plan structure. The body of the multiUploadR2C branch is not present
 // in this listing.
17649 if (app->localFFTPlan != 0) {
17650 for (uint64_t i = 0; i < app->configuration.FFTdim; i++) {
17651 if (app->localFFTPlan->numAxisUploads[i] > 0) {
17652 for (uint64_t j = 0; j < app->localFFTPlan->numAxisUploads[i]; j++)
17653 deleteAxis(app, &app->localFFTPlan->axes[i][j]);
17654 }
17655 }
17656 if (app->localFFTPlan->multiUploadR2C) {
17658 }
17659 if (app->localFFTPlan != 0) {
17660 free(app->localFFTPlan);
17661 app->localFFTPlan = 0;
17662 }
17663 }
17664 }
 // Inverse plan: same teardown as the forward plan.
17666 if (app->localFFTPlan_inverse != 0) {
17667 for (uint64_t i = 0; i < app->configuration.FFTdim; i++) {
17668 if (app->localFFTPlan_inverse->numAxisUploads[i] > 0) {
17669 for (uint64_t j = 0; j < app->localFFTPlan_inverse->numAxisUploads[i]; j++)
17670 deleteAxis(app, &app->localFFTPlan_inverse->axes[i][j]);
17671 }
17672 }
17675 }
17676 if (app->localFFTPlan_inverse != 0) {
17677 free(app->localFFTPlan_inverse);
17678 app->localFFTPlan_inverse = 0;
17679 }
17680 }
17681 }
17682}
17683static inline VkFFTResult VkFFTGetRegistersPerThread(uint64_t* loc_multipliers, uint64_t* registers_per_thread_per_radix, uint64_t* registers_per_thread, uint64_t* min_registers_per_thread, uint64_t* isGoodSequence) {
17684 for (uint64_t i = 0; i < 14; i++) {
17685 registers_per_thread_per_radix[i] = 0;
17686 }
17687 registers_per_thread[0] = 0;
17688 min_registers_per_thread[0] = -1;
17689
17690 if (loc_multipliers[2] > 0) {
17691 if (loc_multipliers[3] > 0) {
17692 if (loc_multipliers[5] > 0) {
17693 if (loc_multipliers[7] > 0) {
17694 if (loc_multipliers[11] > 0) {
17695 if (loc_multipliers[13] > 0) {
17696 switch (loc_multipliers[2]) {
17697 case 1:
17698 registers_per_thread_per_radix[2] = 14;
17699 registers_per_thread_per_radix[3] = 15;
17700 break;
17701 case 2:
17702 registers_per_thread_per_radix[2] = 12;
17703 registers_per_thread_per_radix[3] = 12;
17704 break;
17705 case 3:
17706 registers_per_thread_per_radix[2] = 12;
17707 registers_per_thread_per_radix[3] = 12;
17708 break;
17709 default:
17710 registers_per_thread_per_radix[2] = 16;
17711 registers_per_thread_per_radix[3] = 12;
17712 break;
17713 }
17714 registers_per_thread_per_radix[5] = 15;
17715 registers_per_thread_per_radix[7] = 14;
17716 registers_per_thread_per_radix[11] = 11;
17717 registers_per_thread_per_radix[13] = 13;
17718 }
17719 else {
17720 switch (loc_multipliers[2]) {
17721 case 1:
17722 registers_per_thread_per_radix[2] = 14;
17723 registers_per_thread_per_radix[3] = 15;
17724 break;
17725 case 2:
17726 registers_per_thread_per_radix[2] = 12;
17727 registers_per_thread_per_radix[3] = 12;
17728 break;
17729 case 3:
17730 registers_per_thread_per_radix[2] = 12;
17731 registers_per_thread_per_radix[3] = 12;
17732 break;
17733 default:
17734 registers_per_thread_per_radix[2] = 16;
17735 registers_per_thread_per_radix[3] = 12;
17736 break;
17737 }
17738 registers_per_thread_per_radix[5] = 15;
17739 registers_per_thread_per_radix[7] = 14;
17740 registers_per_thread_per_radix[11] = 11;
17741 registers_per_thread_per_radix[13] = 0;
17742 }
17743 }
17744 else {
17745 if (loc_multipliers[13] > 0) {
17746 switch (loc_multipliers[2]) {
17747 case 1:
17748 registers_per_thread_per_radix[2] = 14;
17749 registers_per_thread_per_radix[3] = 15;
17750 break;
17751 case 2:
17752 registers_per_thread_per_radix[2] = 12;
17753 registers_per_thread_per_radix[3] = 12;
17754 break;
17755 case 3:
17756 registers_per_thread_per_radix[2] = 12;
17757 registers_per_thread_per_radix[3] = 12;
17758 break;
17759 default:
17760 registers_per_thread_per_radix[2] = 16;
17761 registers_per_thread_per_radix[3] = 12;
17762 break;
17763 }
17764 registers_per_thread_per_radix[5] = 15;
17765 registers_per_thread_per_radix[7] = 14;
17766 registers_per_thread_per_radix[11] = 0;
17767 registers_per_thread_per_radix[13] = 13;
17768
17769 }
17770 else {
17771 switch (loc_multipliers[2]) {
17772 case 1:
17773 registers_per_thread_per_radix[2] = 14;
17774 registers_per_thread_per_radix[3] = 15;
17775
17776 break;
17777 case 2:
17778 registers_per_thread_per_radix[2] = 12;
17779 registers_per_thread_per_radix[3] = 12;
17780 break;
17781 case 3:
17782 registers_per_thread_per_radix[2] = 12;
17783 registers_per_thread_per_radix[3] = 12;
17784 break;
17785 default:
17786 registers_per_thread_per_radix[2] = 16;
17787 registers_per_thread_per_radix[3] = 12;
17788 break;
17789 }
17790 registers_per_thread_per_radix[5] = 15;
17791 registers_per_thread_per_radix[7] = 14;
17792 registers_per_thread_per_radix[11] = 0;
17793 registers_per_thread_per_radix[13] = 0;
17794 }
17795 }
17796 }
17797 else {
17798 if (loc_multipliers[11] > 0) {
17799 if (loc_multipliers[13] > 0) {
17800 switch (loc_multipliers[2]) {
17801 case 1:
17802 registers_per_thread_per_radix[2] = 10;
17803 registers_per_thread_per_radix[3] = 15;
17804 break;
17805 case 2:
17806 registers_per_thread_per_radix[2] = 12;
17807 registers_per_thread_per_radix[3] = 12;
17808 break;
17809 default:
17810 registers_per_thread_per_radix[2] = 12;
17811 registers_per_thread_per_radix[3] = 12;
17812 break;
17813 }
17814 registers_per_thread_per_radix[5] = 10;
17815 registers_per_thread_per_radix[7] = 0;
17816 registers_per_thread_per_radix[11] = 11;
17817 registers_per_thread_per_radix[13] = 13;
17818 }
17819 else {
17820 switch (loc_multipliers[2]) {
17821 case 1:
17822 registers_per_thread_per_radix[2] = 10;
17823 registers_per_thread_per_radix[3] = 15;
17824 break;
17825 case 2:
17826 registers_per_thread_per_radix[2] = 12;
17827 registers_per_thread_per_radix[3] = 12;
17828 break;
17829 default:
17830 registers_per_thread_per_radix[2] = 12;
17831 registers_per_thread_per_radix[3] = 12;
17832 break;
17833 }
17834 registers_per_thread_per_radix[5] = 10;
17835 registers_per_thread_per_radix[7] = 0;
17836 registers_per_thread_per_radix[11] = 11;
17837 registers_per_thread_per_radix[13] = 0;
17838 }
17839 }
17840 else {
17841 if (loc_multipliers[13] > 0) {
17842 switch (loc_multipliers[2]) {
17843 case 1:
17844 registers_per_thread_per_radix[2] = 10;
17845 registers_per_thread_per_radix[3] = 15;
17846 break;
17847 case 2:
17848 registers_per_thread_per_radix[2] = 12;
17849 registers_per_thread_per_radix[3] = 12;
17850 break;
17851 default:
17852 registers_per_thread_per_radix[2] = 12;
17853 registers_per_thread_per_radix[3] = 12;
17854 break;
17855 }
17856 registers_per_thread_per_radix[5] = 10;
17857 registers_per_thread_per_radix[7] = 0;
17858 registers_per_thread_per_radix[11] = 0;
17859 registers_per_thread_per_radix[13] = 13;
17860 }
17861 else {
17862 switch (loc_multipliers[2]) {
17863 case 1:
17864 registers_per_thread_per_radix[2] = 6;
17865 registers_per_thread_per_radix[3] = 6;
17866 registers_per_thread_per_radix[5] = 5;
17867 break;
17868 case 2:
17869 registers_per_thread_per_radix[2] = 12;
17870 registers_per_thread_per_radix[3] = 12;
17871 registers_per_thread_per_radix[5] = 10;
17872 break;
17873 default:
17874 registers_per_thread_per_radix[2] = 12;
17875 registers_per_thread_per_radix[3] = 12;
17876 registers_per_thread_per_radix[5] = 10;
17877 break;
17878 }
17879 registers_per_thread_per_radix[7] = 0;
17880 registers_per_thread_per_radix[11] = 0;
17881 registers_per_thread_per_radix[13] = 0;
17882
17883 }
17884 }
17885 }
17886 }
17887 else
17888 {
17889 if (loc_multipliers[7] > 0) {
17890 if (loc_multipliers[11] > 0) {
17891 if (loc_multipliers[13] > 0) {
17892 switch (loc_multipliers[2]) {
17893 case 1:
17894 registers_per_thread_per_radix[2] = 22;
17895 registers_per_thread_per_radix[3] = 21;
17896 registers_per_thread_per_radix[5] = 0;
17897 registers_per_thread_per_radix[7] = 21;
17898 registers_per_thread_per_radix[11] = 22;
17899 registers_per_thread_per_radix[13] = 26;
17900 break;
17901 case 2:
17902 registers_per_thread_per_radix[2] = 12;
17903 registers_per_thread_per_radix[3] = 12;
17904 registers_per_thread_per_radix[5] = 0;
17905 registers_per_thread_per_radix[7] = 14;
17906 registers_per_thread_per_radix[11] = 11;
17907 registers_per_thread_per_radix[13] = 13;
17908 break;
17909 default:
17910 registers_per_thread_per_radix[2] = 12;
17911 registers_per_thread_per_radix[3] = 12;
17912 registers_per_thread_per_radix[5] = 0;
17913 registers_per_thread_per_radix[7] = 14;
17914 registers_per_thread_per_radix[11] = 11;
17915 registers_per_thread_per_radix[13] = 13;
17916 break;
17917 }
17918 }
17919 else {
17920 switch (loc_multipliers[2]) {
17921 case 1:
17922 registers_per_thread_per_radix[2] = 22;
17923 registers_per_thread_per_radix[3] = 21;
17924 registers_per_thread_per_radix[5] = 0;
17925 registers_per_thread_per_radix[7] = 21;
17926 registers_per_thread_per_radix[11] = 22;
17927 registers_per_thread_per_radix[13] = 0;
17928 break;
17929 case 2:
17930 registers_per_thread_per_radix[2] = 12;
17931 registers_per_thread_per_radix[3] = 12;
17932 registers_per_thread_per_radix[5] = 0;
17933 registers_per_thread_per_radix[7] = 14;
17934 registers_per_thread_per_radix[11] = 11;
17935 registers_per_thread_per_radix[13] = 0;
17936 break;
17937 default:
17938 registers_per_thread_per_radix[2] = 12;
17939 registers_per_thread_per_radix[3] = 12;
17940 registers_per_thread_per_radix[5] = 0;
17941 registers_per_thread_per_radix[7] = 14;
17942 registers_per_thread_per_radix[11] = 11;
17943 registers_per_thread_per_radix[13] = 0;
17944 break;
17945 }
17946 }
17947 }
17948 else {
17949 if (loc_multipliers[13] > 0) {
17950 switch (loc_multipliers[2]) {
17951 case 1:
17952 registers_per_thread_per_radix[2] = 26;
17953 registers_per_thread_per_radix[3] = 21;
17954 registers_per_thread_per_radix[5] = 0;
17955 registers_per_thread_per_radix[7] = 21;
17956 registers_per_thread_per_radix[11] = 0;
17957 registers_per_thread_per_radix[13] = 26;
17958 break;
17959 case 2:
17960 registers_per_thread_per_radix[2] = 12;
17961 registers_per_thread_per_radix[3] = 12;
17962 registers_per_thread_per_radix[5] = 0;
17963 registers_per_thread_per_radix[7] = 14;
17964 registers_per_thread_per_radix[11] = 0;
17965 registers_per_thread_per_radix[13] = 13;
17966 break;
17967 default:
17968 registers_per_thread_per_radix[2] = 12;
17969 registers_per_thread_per_radix[3] = 12;
17970 registers_per_thread_per_radix[5] = 0;
17971 registers_per_thread_per_radix[7] = 14;
17972 registers_per_thread_per_radix[11] = 0;
17973 registers_per_thread_per_radix[13] = 13;
17974 break;
17975 }
17976 }
17977 else {
17978 switch (loc_multipliers[2]) {
17979 case 1:
17980 registers_per_thread_per_radix[2] = 6;
17981 registers_per_thread_per_radix[3] = 6;
17982 registers_per_thread_per_radix[5] = 0;
17983 registers_per_thread_per_radix[7] = 7;
17984 registers_per_thread_per_radix[11] = 0;
17985 registers_per_thread_per_radix[13] = 0;
17986 break;
17987 case 2:
17988 registers_per_thread_per_radix[2] = 6;
17989 registers_per_thread_per_radix[3] = 6;
17990 registers_per_thread_per_radix[5] = 0;
17991 registers_per_thread_per_radix[7] = 7;
17992 registers_per_thread_per_radix[11] = 0;
17993 registers_per_thread_per_radix[13] = 0;
17994 break;
17995 default:
17996 registers_per_thread_per_radix[2] = 8;
17997 registers_per_thread_per_radix[3] = 6;
17998 registers_per_thread_per_radix[5] = 0;
17999 registers_per_thread_per_radix[7] = 7;
18000 registers_per_thread_per_radix[11] = 0;
18001 registers_per_thread_per_radix[13] = 0;
18002 break;
18003 }
18004 }
18005 }
18006 }
18007 else {
18008 if (loc_multipliers[11] > 0) {
18009 if (loc_multipliers[13] > 0) {
18010 switch (loc_multipliers[2]) {
18011 case 1:
18012 registers_per_thread_per_radix[2] = 6;
18013 registers_per_thread_per_radix[3] = 6;
18014 registers_per_thread_per_radix[5] = 0;
18015 registers_per_thread_per_radix[7] = 0;
18016 registers_per_thread_per_radix[11] = 11;
18017 registers_per_thread_per_radix[13] = 13;
18018 break;
18019 case 2:
18020 registers_per_thread_per_radix[2] = 12;
18021 registers_per_thread_per_radix[3] = 12;
18022 registers_per_thread_per_radix[5] = 0;
18023 registers_per_thread_per_radix[7] = 0;
18024 registers_per_thread_per_radix[11] = 11;
18025 registers_per_thread_per_radix[13] = 13;
18026 break;
18027 default:
18028 registers_per_thread_per_radix[2] = 12;
18029 registers_per_thread_per_radix[3] = 12;
18030 registers_per_thread_per_radix[5] = 0;
18031 registers_per_thread_per_radix[7] = 0;
18032 registers_per_thread_per_radix[11] = 11;
18033 registers_per_thread_per_radix[13] = 13;
18034 break;
18035 }
18036 }
18037 else {
18038 switch (loc_multipliers[2]) {
18039 case 1:
18040 registers_per_thread_per_radix[2] = 6;
18041 registers_per_thread_per_radix[3] = 6;
18042 registers_per_thread_per_radix[5] = 0;
18043 registers_per_thread_per_radix[7] = 0;
18044 registers_per_thread_per_radix[11] = 11;
18045 registers_per_thread_per_radix[13] = 0;
18046 break;
18047 case 2:
18048 registers_per_thread_per_radix[2] = 12;
18049 registers_per_thread_per_radix[3] = 12;
18050 registers_per_thread_per_radix[5] = 0;
18051 registers_per_thread_per_radix[7] = 0;
18052 registers_per_thread_per_radix[11] = 11;
18053 registers_per_thread_per_radix[13] = 0;
18054 break;
18055 default:
18056 registers_per_thread_per_radix[2] = 12;
18057 registers_per_thread_per_radix[3] = 12;
18058 registers_per_thread_per_radix[5] = 0;
18059 registers_per_thread_per_radix[7] = 0;
18060 registers_per_thread_per_radix[11] = 11;
18061 registers_per_thread_per_radix[13] = 0;
18062 break;
18063 }
18064 }
18065 }
18066 else {
18067 if (loc_multipliers[13] > 0) {
18068 switch (loc_multipliers[2]) {
18069 case 1:
18070 registers_per_thread_per_radix[2] = 6;
18071 registers_per_thread_per_radix[3] = 6;
18072 registers_per_thread_per_radix[5] = 0;
18073 registers_per_thread_per_radix[7] = 0;
18074 registers_per_thread_per_radix[11] = 0;
18075 registers_per_thread_per_radix[13] = 13;
18076 break;
18077 case 2:
18078 registers_per_thread_per_radix[2] = 12;
18079 registers_per_thread_per_radix[3] = 12;
18080 registers_per_thread_per_radix[5] = 0;
18081 registers_per_thread_per_radix[7] = 0;
18082 registers_per_thread_per_radix[11] = 0;
18083 registers_per_thread_per_radix[13] = 13;
18084 break;
18085 default:
18086 registers_per_thread_per_radix[2] = 12;
18087 registers_per_thread_per_radix[3] = 12;
18088 registers_per_thread_per_radix[5] = 0;
18089 registers_per_thread_per_radix[7] = 0;
18090 registers_per_thread_per_radix[11] = 0;
18091 registers_per_thread_per_radix[13] = 13;
18092 break;
18093 }
18094 }
18095 else {
18096 switch (loc_multipliers[2]) {
18097 case 1:
18098 registers_per_thread_per_radix[2] = 6;
18099 registers_per_thread_per_radix[3] = 6;
18100 registers_per_thread_per_radix[5] = 0;
18101 registers_per_thread_per_radix[7] = 0;
18102 registers_per_thread_per_radix[11] = 0;
18103 registers_per_thread_per_radix[13] = 0;
18104 break;
18105 case 2:
18106 registers_per_thread_per_radix[2] = 12;
18107 registers_per_thread_per_radix[3] = 12;
18108 registers_per_thread_per_radix[5] = 0;
18109 registers_per_thread_per_radix[7] = 0;
18110 registers_per_thread_per_radix[11] = 0;
18111 registers_per_thread_per_radix[13] = 0;
18112 break;
18113 default:
18114 registers_per_thread_per_radix[2] = 12;
18115 registers_per_thread_per_radix[3] = 12;
18116 registers_per_thread_per_radix[5] = 0;
18117 registers_per_thread_per_radix[7] = 0;
18118 registers_per_thread_per_radix[11] = 0;
18119 registers_per_thread_per_radix[13] = 0;
18120 break;
18121 }
18122 }
18123 }
18124 }
18125 }
18126 }
18127 else {
18128 if (loc_multipliers[5] > 0) {
18129 if (loc_multipliers[7] > 0) {
18130 if (loc_multipliers[11] > 0) {
18131 if (loc_multipliers[13] > 0) {
18132 switch (loc_multipliers[2]) {
18133 case 1:
18134 registers_per_thread_per_radix[2] = 10;
18135 registers_per_thread_per_radix[3] = 0;
18136 registers_per_thread_per_radix[5] = 10;
18137 registers_per_thread_per_radix[7] = 14;
18138 registers_per_thread_per_radix[11] = 11;
18139 registers_per_thread_per_radix[13] = 13;
18140 break;
18141 case 2:
18142 registers_per_thread_per_radix[2] = 10;
18143 registers_per_thread_per_radix[3] = 0;
18144 registers_per_thread_per_radix[5] = 10;
18145 registers_per_thread_per_radix[7] = 14;
18146 registers_per_thread_per_radix[11] = 11;
18147 registers_per_thread_per_radix[13] = 13;
18148 break;
18149 case 3:
18150 registers_per_thread_per_radix[2] = 8;
18151 registers_per_thread_per_radix[3] = 0;
18152 registers_per_thread_per_radix[5] = 10;
18153 registers_per_thread_per_radix[7] = 14;
18154 registers_per_thread_per_radix[11] = 11;
18155 registers_per_thread_per_radix[13] = 13;
18156 break;
18157 default:
18158 registers_per_thread_per_radix[2] = 16;
18159 registers_per_thread_per_radix[3] = 0;
18160 registers_per_thread_per_radix[5] = 10;
18161 registers_per_thread_per_radix[7] = 14;
18162 registers_per_thread_per_radix[11] = 11;
18163 registers_per_thread_per_radix[13] = 13;
18164 break;
18165 }
18166 }
18167 else {
18168 switch (loc_multipliers[2]) {
18169 case 1:
18170 registers_per_thread_per_radix[2] = 10;
18171 registers_per_thread_per_radix[3] = 0;
18172 registers_per_thread_per_radix[5] = 10;
18173 registers_per_thread_per_radix[7] = 14;
18174 registers_per_thread_per_radix[11] = 11;
18175 registers_per_thread_per_radix[13] = 0;
18176 break;
18177 case 2:
18178 registers_per_thread_per_radix[2] = 10;
18179 registers_per_thread_per_radix[3] = 0;
18180 registers_per_thread_per_radix[5] = 10;
18181 registers_per_thread_per_radix[7] = 14;
18182 registers_per_thread_per_radix[11] = 11;
18183 registers_per_thread_per_radix[13] = 0;
18184 break;
18185 case 3:
18186 registers_per_thread_per_radix[2] = 8;
18187 registers_per_thread_per_radix[3] = 0;
18188 registers_per_thread_per_radix[5] = 10;
18189 registers_per_thread_per_radix[7] = 14;
18190 registers_per_thread_per_radix[11] = 11;
18191 registers_per_thread_per_radix[13] = 0;
18192 break;
18193 default:
18194 registers_per_thread_per_radix[2] = 16;
18195 registers_per_thread_per_radix[3] = 0;
18196 registers_per_thread_per_radix[5] = 10;
18197 registers_per_thread_per_radix[7] = 14;
18198 registers_per_thread_per_radix[11] = 11;
18199 registers_per_thread_per_radix[13] = 0;
18200 break;
18201 }
18202 }
18203 }
18204 else {
18205 if (loc_multipliers[13] > 0) {
18206 switch (loc_multipliers[2]) {
18207 case 1:
18208 registers_per_thread_per_radix[2] = 10;
18209 registers_per_thread_per_radix[3] = 0;
18210 registers_per_thread_per_radix[5] = 10;
18211 registers_per_thread_per_radix[7] = 14;
18212 registers_per_thread_per_radix[11] = 0;
18213 registers_per_thread_per_radix[13] = 13;
18214 break;
18215 case 2:
18216 registers_per_thread_per_radix[2] = 10;
18217 registers_per_thread_per_radix[3] = 0;
18218 registers_per_thread_per_radix[5] = 10;
18219 registers_per_thread_per_radix[7] = 14;
18220 registers_per_thread_per_radix[11] = 0;
18221 registers_per_thread_per_radix[13] = 13;
18222 break;
18223 case 3:
18224 registers_per_thread_per_radix[2] = 8;
18225 registers_per_thread_per_radix[3] = 0;
18226 registers_per_thread_per_radix[5] = 10;
18227 registers_per_thread_per_radix[7] = 14;
18228 registers_per_thread_per_radix[11] = 0;
18229 registers_per_thread_per_radix[13] = 13;
18230 break;
18231 default:
18232 registers_per_thread_per_radix[2] = 16;
18233 registers_per_thread_per_radix[3] = 0;
18234 registers_per_thread_per_radix[5] = 10;
18235 registers_per_thread_per_radix[7] = 14;
18236 registers_per_thread_per_radix[11] = 0;
18237 registers_per_thread_per_radix[13] = 13;
18238 break;
18239 }
18240 }
18241 else {
18242 switch (loc_multipliers[2]) {
18243 case 1:
18244 registers_per_thread_per_radix[2] = 10;
18245 registers_per_thread_per_radix[3] = 0;
18246 registers_per_thread_per_radix[5] = 10;
18247 registers_per_thread_per_radix[7] = 7;
18248 registers_per_thread_per_radix[11] = 0;
18249 registers_per_thread_per_radix[13] = 0;
18250 break;
18251 case 2:
18252 registers_per_thread_per_radix[2] = 10;
18253 registers_per_thread_per_radix[3] = 0;
18254 registers_per_thread_per_radix[5] = 10;
18255 registers_per_thread_per_radix[7] = 7;
18256 registers_per_thread_per_radix[11] = 0;
18257 registers_per_thread_per_radix[13] = 0;
18258 break;
18259 default:
18260 registers_per_thread_per_radix[2] = 8;
18261 registers_per_thread_per_radix[3] = 0;
18262 registers_per_thread_per_radix[5] = 10;
18263 registers_per_thread_per_radix[7] = 7;
18264 registers_per_thread_per_radix[11] = 0;
18265 registers_per_thread_per_radix[13] = 0;
18266 break;
18267 }
18268 }
18269 }
18270 }
18271 else {
18272 if (loc_multipliers[11] > 0) {
18273 if (loc_multipliers[13] > 0) {
18274 switch (loc_multipliers[2]) {
18275 case 1:
18276 registers_per_thread_per_radix[2] = 10;
18277 registers_per_thread_per_radix[3] = 0;
18278 registers_per_thread_per_radix[5] = 10;
18279 registers_per_thread_per_radix[7] = 0;
18280 registers_per_thread_per_radix[11] = 11;
18281 registers_per_thread_per_radix[13] = 13;
18282 break;
18283 case 2:
18284 registers_per_thread_per_radix[2] = 10;
18285 registers_per_thread_per_radix[3] = 0;
18286 registers_per_thread_per_radix[5] = 10;
18287 registers_per_thread_per_radix[7] = 0;
18288 registers_per_thread_per_radix[11] = 11;
18289 registers_per_thread_per_radix[13] = 13;
18290 break;
18291 default:
18292 registers_per_thread_per_radix[2] = 8;
18293 registers_per_thread_per_radix[3] = 0;
18294 registers_per_thread_per_radix[5] = 10;
18295 registers_per_thread_per_radix[7] = 0;
18296 registers_per_thread_per_radix[11] = 11;
18297 registers_per_thread_per_radix[13] = 13;
18298 break;
18299 }
18300 }
18301 else {
18302 switch (loc_multipliers[2]) {
18303 case 1:
18304 registers_per_thread_per_radix[2] = 10;
18305 registers_per_thread_per_radix[3] = 0;
18306 registers_per_thread_per_radix[5] = 10;
18307 registers_per_thread_per_radix[7] = 0;
18308 registers_per_thread_per_radix[11] = 11;
18309 registers_per_thread_per_radix[13] = 0;
18310 break;
18311 case 2:
18312 registers_per_thread_per_radix[2] = 10;
18313 registers_per_thread_per_radix[3] = 0;
18314 registers_per_thread_per_radix[5] = 10;
18315 registers_per_thread_per_radix[7] = 0;
18316 registers_per_thread_per_radix[11] = 11;
18317 registers_per_thread_per_radix[13] = 0;
18318 break;
18319 default:
18320 registers_per_thread_per_radix[2] = 8;
18321 registers_per_thread_per_radix[3] = 0;
18322 registers_per_thread_per_radix[5] = 10;
18323 registers_per_thread_per_radix[7] = 0;
18324 registers_per_thread_per_radix[11] = 11;
18325 registers_per_thread_per_radix[13] = 0;
18326 break;
18327 }
18328 }
18329 }
18330 else {
18331 if (loc_multipliers[13] > 0) {
18332 switch (loc_multipliers[2]) {
18333 case 1:
18334 registers_per_thread_per_radix[2] = 10;
18335 registers_per_thread_per_radix[3] = 0;
18336 registers_per_thread_per_radix[5] = 10;
18337 registers_per_thread_per_radix[7] = 0;
18338 registers_per_thread_per_radix[11] = 0;
18339 registers_per_thread_per_radix[13] = 13;
18340 break;
18341 case 2:
18342 registers_per_thread_per_radix[2] = 10;
18343 registers_per_thread_per_radix[3] = 0;
18344 registers_per_thread_per_radix[5] = 10;
18345 registers_per_thread_per_radix[7] = 0;
18346 registers_per_thread_per_radix[11] = 0;
18347 registers_per_thread_per_radix[13] = 13;
18348 break;
18349 default:
18350 registers_per_thread_per_radix[2] = 8;
18351 registers_per_thread_per_radix[3] = 0;
18352 registers_per_thread_per_radix[5] = 10;
18353 registers_per_thread_per_radix[7] = 0;
18354 registers_per_thread_per_radix[11] = 0;
18355 registers_per_thread_per_radix[13] = 13;
18356 break;
18357 }
18358 }
18359 else {
18360 switch (loc_multipliers[2]) {
18361 case 1:
18362 registers_per_thread_per_radix[2] = 10;
18363 registers_per_thread_per_radix[3] = 0;
18364 registers_per_thread_per_radix[5] = 10;
18365 registers_per_thread_per_radix[7] = 0;
18366 registers_per_thread_per_radix[11] = 0;
18367 registers_per_thread_per_radix[13] = 0;
18368 break;
18369 case 2:
18370 registers_per_thread_per_radix[2] = 10;
18371 registers_per_thread_per_radix[3] = 0;
18372 registers_per_thread_per_radix[5] = 10;
18373 registers_per_thread_per_radix[7] = 0;
18374 registers_per_thread_per_radix[11] = 0;
18375 registers_per_thread_per_radix[13] = 0;
18376 break;
18377 default:
18378 registers_per_thread_per_radix[2] = 8;
18379 registers_per_thread_per_radix[3] = 0;
18380 registers_per_thread_per_radix[5] = 10;
18381 registers_per_thread_per_radix[7] = 0;
18382 registers_per_thread_per_radix[11] = 0;
18383 registers_per_thread_per_radix[13] = 0;
18384 break;
18385 }
18386 }
18387 }
18388 }
18389 }
18390 else
18391 {
18392 if (loc_multipliers[7] > 0) {
18393 if (loc_multipliers[11] > 0) {
18394 if (loc_multipliers[13] > 0) {
18395 switch (loc_multipliers[2]) {
18396 case 1:
18397 registers_per_thread_per_radix[2] = 14;
18398 registers_per_thread_per_radix[3] = 0;
18399 registers_per_thread_per_radix[5] = 0;
18400 registers_per_thread_per_radix[7] = 14;
18401 registers_per_thread_per_radix[11] = 11;
18402 registers_per_thread_per_radix[13] = 13;
18403 break;
18404 case 2:
18405 registers_per_thread_per_radix[2] = 14;
18406 registers_per_thread_per_radix[3] = 0;
18407 registers_per_thread_per_radix[5] = 0;
18408 registers_per_thread_per_radix[7] = 14;
18409 registers_per_thread_per_radix[11] = 11;
18410 registers_per_thread_per_radix[13] = 13;
18411 break;
18412 case 3:
18413 registers_per_thread_per_radix[2] = 8;
18414 registers_per_thread_per_radix[3] = 0;
18415 registers_per_thread_per_radix[5] = 0;
18416 registers_per_thread_per_radix[7] = 14;
18417 registers_per_thread_per_radix[11] = 11;
18418 registers_per_thread_per_radix[13] = 13;
18419 break;
18420 default:
18421 registers_per_thread_per_radix[2] = 16;
18422 registers_per_thread_per_radix[3] = 0;
18423 registers_per_thread_per_radix[5] = 0;
18424 registers_per_thread_per_radix[7] = 14;
18425 registers_per_thread_per_radix[11] = 11;
18426 registers_per_thread_per_radix[13] = 13;
18427 break;
18428 }
18429 }
18430 else {
18431 switch (loc_multipliers[2]) {
18432 case 1:
18433 registers_per_thread_per_radix[2] = 14;
18434 registers_per_thread_per_radix[3] = 0;
18435 registers_per_thread_per_radix[5] = 0;
18436 registers_per_thread_per_radix[7] = 14;
18437 registers_per_thread_per_radix[11] = 11;
18438 registers_per_thread_per_radix[13] = 0;
18439 break;
18440 case 2:
18441 registers_per_thread_per_radix[2] = 14;
18442 registers_per_thread_per_radix[3] = 0;
18443 registers_per_thread_per_radix[5] = 0;
18444 registers_per_thread_per_radix[7] = 14;
18445 registers_per_thread_per_radix[11] = 11;
18446 registers_per_thread_per_radix[13] = 0;
18447 break;
18448 case 3:
18449 registers_per_thread_per_radix[2] = 8;
18450 registers_per_thread_per_radix[3] = 0;
18451 registers_per_thread_per_radix[5] = 0;
18452 registers_per_thread_per_radix[7] = 14;
18453 registers_per_thread_per_radix[11] = 11;
18454 registers_per_thread_per_radix[13] = 0;
18455 break;
18456 default:
18457 registers_per_thread_per_radix[2] = 16;
18458 registers_per_thread_per_radix[3] = 0;
18459 registers_per_thread_per_radix[5] = 0;
18460 registers_per_thread_per_radix[7] = 14;
18461 registers_per_thread_per_radix[11] = 11;
18462 registers_per_thread_per_radix[13] = 0;
18463 break;
18464 }
18465 }
18466 }
18467 else {
18468 if (loc_multipliers[13] > 0) {
18469 switch (loc_multipliers[2]) {
18470 case 1:
18471 registers_per_thread_per_radix[2] = 14;
18472 registers_per_thread_per_radix[3] = 0;
18473 registers_per_thread_per_radix[5] = 0;
18474 registers_per_thread_per_radix[7] = 14;
18475 registers_per_thread_per_radix[11] = 0;
18476 registers_per_thread_per_radix[13] = 13;
18477 break;
18478 case 2:
18479 registers_per_thread_per_radix[2] = 14;
18480 registers_per_thread_per_radix[3] = 0;
18481 registers_per_thread_per_radix[5] = 0;
18482 registers_per_thread_per_radix[7] = 14;
18483 registers_per_thread_per_radix[11] = 0;
18484 registers_per_thread_per_radix[13] = 13;
18485 break;
18486 case 3:
18487 registers_per_thread_per_radix[2] = 8;
18488 registers_per_thread_per_radix[3] = 0;
18489 registers_per_thread_per_radix[5] = 0;
18490 registers_per_thread_per_radix[7] = 14;
18491 registers_per_thread_per_radix[11] = 0;
18492 registers_per_thread_per_radix[13] = 13;
18493 break;
18494 default:
18495 registers_per_thread_per_radix[2] = 16;
18496 registers_per_thread_per_radix[3] = 0;
18497 registers_per_thread_per_radix[5] = 0;
18498 registers_per_thread_per_radix[7] = 14;
18499 registers_per_thread_per_radix[11] = 0;
18500 registers_per_thread_per_radix[13] = 13;
18501 break;
18502 }
18503 }
18504 else {
18505 switch (loc_multipliers[2]) {
18506 case 1:
18507 registers_per_thread_per_radix[2] = 14;
18508 registers_per_thread_per_radix[3] = 0;
18509 registers_per_thread_per_radix[5] = 0;
18510 registers_per_thread_per_radix[7] = 14;
18511 registers_per_thread_per_radix[11] = 0;
18512 registers_per_thread_per_radix[13] = 0;
18513 break;
18514 case 2:
18515 registers_per_thread_per_radix[2] = 14;
18516 registers_per_thread_per_radix[3] = 0;
18517 registers_per_thread_per_radix[5] = 0;
18518 registers_per_thread_per_radix[7] = 14;
18519 registers_per_thread_per_radix[11] = 0;
18520 registers_per_thread_per_radix[13] = 0;
18521 break;
18522 case 3:
18523 registers_per_thread_per_radix[2] = 14;
18524 registers_per_thread_per_radix[3] = 0;
18525 registers_per_thread_per_radix[5] = 0;
18526 registers_per_thread_per_radix[7] = 14;
18527 registers_per_thread_per_radix[11] = 0;
18528 registers_per_thread_per_radix[13] = 0;
18529 break;
18530 default:
18531 registers_per_thread_per_radix[2] = 14;
18532 registers_per_thread_per_radix[3] = 0;
18533 registers_per_thread_per_radix[5] = 0;
18534 registers_per_thread_per_radix[7] = 14;
18535 registers_per_thread_per_radix[11] = 0;
18536 registers_per_thread_per_radix[13] = 0;
18537 break;
18538 }
18539 }
18540 }
18541 }
18542 else {
18543 if (loc_multipliers[11] > 0) {
18544 if (loc_multipliers[13] > 0) {
18545 switch (loc_multipliers[2]) {
18546 case 1:
18547 registers_per_thread_per_radix[2] = 22;
18548 registers_per_thread_per_radix[3] = 0;
18549 registers_per_thread_per_radix[5] = 0;
18550 registers_per_thread_per_radix[7] = 0;
18551 registers_per_thread_per_radix[11] = 22;
18552 registers_per_thread_per_radix[13] = 26;
18553 break;
18554 case 2:
18555 registers_per_thread_per_radix[2] = 22;
18556 registers_per_thread_per_radix[3] = 0;
18557 registers_per_thread_per_radix[5] = 0;
18558 registers_per_thread_per_radix[7] = 0;
18559 registers_per_thread_per_radix[11] = 22;
18560 registers_per_thread_per_radix[13] = 26;
18561 break;
18562 default:
18563 registers_per_thread_per_radix[2] = 8;
18564 registers_per_thread_per_radix[3] = 0;
18565 registers_per_thread_per_radix[5] = 0;
18566 registers_per_thread_per_radix[7] = 0;
18567 registers_per_thread_per_radix[11] = 11;
18568 registers_per_thread_per_radix[13] = 13;
18569 break;
18570 }
18571 }
18572 else {
18573 switch (loc_multipliers[2]) {
18574 case 1:
18575 registers_per_thread_per_radix[2] = 22;
18576 registers_per_thread_per_radix[3] = 0;
18577 registers_per_thread_per_radix[5] = 0;
18578 registers_per_thread_per_radix[7] = 0;
18579 registers_per_thread_per_radix[11] = 22;
18580 registers_per_thread_per_radix[13] = 0;
18581 break;
18582 case 2:
18583 registers_per_thread_per_radix[2] = 22;
18584 registers_per_thread_per_radix[3] = 0;
18585 registers_per_thread_per_radix[5] = 0;
18586 registers_per_thread_per_radix[7] = 0;
18587 registers_per_thread_per_radix[11] = 22;
18588 registers_per_thread_per_radix[13] = 0;
18589 break;
18590 case 3:
18591 registers_per_thread_per_radix[2] = 8;
18592 registers_per_thread_per_radix[3] = 0;
18593 registers_per_thread_per_radix[5] = 0;
18594 registers_per_thread_per_radix[7] = 0;
18595 registers_per_thread_per_radix[11] = 11;
18596 registers_per_thread_per_radix[13] = 0;
18597 break;
18598 default:
18599 registers_per_thread_per_radix[2] = 8;
18600 registers_per_thread_per_radix[3] = 0;
18601 registers_per_thread_per_radix[5] = 0;
18602 registers_per_thread_per_radix[7] = 0;
18603 registers_per_thread_per_radix[11] = 11;
18604 registers_per_thread_per_radix[13] = 0;
18605 break;
18606 }
18607 }
18608 }
18609 else {
18610 if (loc_multipliers[13] > 0) {
18611 switch (loc_multipliers[2]) {
18612 case 1:
18613 registers_per_thread_per_radix[2] = 26;
18614 registers_per_thread_per_radix[3] = 0;
18615 registers_per_thread_per_radix[5] = 0;
18616 registers_per_thread_per_radix[7] = 0;
18617 registers_per_thread_per_radix[11] = 0;
18618 registers_per_thread_per_radix[13] = 26;
18619 break;
18620 case 2:
18621 registers_per_thread_per_radix[2] = 26;
18622 registers_per_thread_per_radix[3] = 0;
18623 registers_per_thread_per_radix[5] = 0;
18624 registers_per_thread_per_radix[7] = 0;
18625 registers_per_thread_per_radix[11] = 0;
18626 registers_per_thread_per_radix[13] = 26;
18627 break;
18628 default:
18629 registers_per_thread_per_radix[2] = 8;
18630 registers_per_thread_per_radix[3] = 0;
18631 registers_per_thread_per_radix[5] = 0;
18632 registers_per_thread_per_radix[7] = 0;
18633 registers_per_thread_per_radix[11] = 0;
18634 registers_per_thread_per_radix[13] = 13;
18635 break;
18636 }
18637 }
18638 else {
18639 registers_per_thread_per_radix[2] = (loc_multipliers[2] > 2) ? 8 : (uint64_t)pow(2, loc_multipliers[2]);
18640 registers_per_thread_per_radix[3] = 0;
18641 registers_per_thread_per_radix[5] = 0;
18642 registers_per_thread_per_radix[7] = 0;
18643 registers_per_thread_per_radix[11] = 0;
18644 registers_per_thread_per_radix[13] = 0;
18645 }
18646 }
18647 }
18648 }
18649 }
18650 }
18651 else {
18652 if (loc_multipliers[3] > 0) {
18653 if (loc_multipliers[5] > 0) {
18654 if (loc_multipliers[7] > 0) {
18655 if (loc_multipliers[11] > 0) {
18656 if (loc_multipliers[13] > 0) {
18657 registers_per_thread_per_radix[2] = 0;
18658 registers_per_thread_per_radix[3] = 15;
18659 registers_per_thread_per_radix[5] = 15;
18660 registers_per_thread_per_radix[7] = 21;
18661 registers_per_thread_per_radix[11] = 11;
18662 registers_per_thread_per_radix[13] = 13;
18663 }
18664 else {
18665 registers_per_thread_per_radix[2] = 0;
18666 registers_per_thread_per_radix[3] = 15;
18667 registers_per_thread_per_radix[5] = 15;
18668 registers_per_thread_per_radix[7] = 21;
18669 registers_per_thread_per_radix[11] = 11;
18670 registers_per_thread_per_radix[13] = 0;
18671 }
18672 }
18673 else {
18674 if (loc_multipliers[13] > 0) {
18675 registers_per_thread_per_radix[2] = 0;
18676 registers_per_thread_per_radix[3] = 15;
18677 registers_per_thread_per_radix[5] = 15;
18678 registers_per_thread_per_radix[7] = 21;
18679 registers_per_thread_per_radix[11] = 0;
18680 registers_per_thread_per_radix[13] = 13;
18681 }
18682 else {
18683 registers_per_thread_per_radix[2] = 0;
18684 registers_per_thread_per_radix[3] = 15;
18685 registers_per_thread_per_radix[5] = 15;
18686 registers_per_thread_per_radix[7] = 21;
18687 registers_per_thread_per_radix[11] = 0;
18688 registers_per_thread_per_radix[13] = 0;
18689 }
18690 }
18691 }
18692 else {
18693 if (loc_multipliers[11] > 0) {
18694 if (loc_multipliers[13] > 0) {
18695 registers_per_thread_per_radix[2] = 0;
18696 registers_per_thread_per_radix[3] = 15;
18697 registers_per_thread_per_radix[5] = 15;
18698 registers_per_thread_per_radix[7] = 0;
18699 registers_per_thread_per_radix[11] = 11;
18700 registers_per_thread_per_radix[13] = 13;
18701 }
18702 else {
18703 registers_per_thread_per_radix[2] = 0;
18704 registers_per_thread_per_radix[3] = 15;
18705 registers_per_thread_per_radix[5] = 15;
18706 registers_per_thread_per_radix[7] = 0;
18707 registers_per_thread_per_radix[11] = 11;
18708 registers_per_thread_per_radix[13] = 0;
18709 }
18710 }
18711 else {
18712 if (loc_multipliers[13] > 0) {
18713 registers_per_thread_per_radix[2] = 0;
18714 registers_per_thread_per_radix[3] = 15;
18715 registers_per_thread_per_radix[5] = 15;
18716 registers_per_thread_per_radix[7] = 0;
18717 registers_per_thread_per_radix[11] = 0;
18718 registers_per_thread_per_radix[13] = 13;
18719 }
18720 else {
18721 registers_per_thread_per_radix[2] = 0;
18722 registers_per_thread_per_radix[3] = 15;
18723 registers_per_thread_per_radix[5] = 15;
18724 registers_per_thread_per_radix[7] = 0;
18725 registers_per_thread_per_radix[11] = 0;
18726 registers_per_thread_per_radix[13] = 0;
18727 }
18728 }
18729 }
18730 }
18731 else
18732 {
18733 if (loc_multipliers[7] > 0) {
18734 if (loc_multipliers[3] == 1) {
18735 if (loc_multipliers[11] > 0) {
18736 if (loc_multipliers[13] > 0) {
18737 registers_per_thread_per_radix[2] = 0;
18738 registers_per_thread_per_radix[3] = 21;
18739 registers_per_thread_per_radix[5] = 0;
18740 registers_per_thread_per_radix[7] = 21;
18741 registers_per_thread_per_radix[11] = 11;
18742 registers_per_thread_per_radix[13] = 13;
18743 }
18744 else {
18745 registers_per_thread_per_radix[2] = 0;
18746 registers_per_thread_per_radix[3] = 21;
18747 registers_per_thread_per_radix[5] = 0;
18748 registers_per_thread_per_radix[7] = 21;
18749 registers_per_thread_per_radix[11] = 11;
18750 registers_per_thread_per_radix[13] = 0;
18751 }
18752 }
18753 else {
18754 if (loc_multipliers[13] > 0) {
18755 registers_per_thread_per_radix[2] = 0;
18756 registers_per_thread_per_radix[3] = 21;
18757 registers_per_thread_per_radix[5] = 0;
18758 registers_per_thread_per_radix[7] = 21;
18759 registers_per_thread_per_radix[11] = 0;
18760 registers_per_thread_per_radix[13] = 13;
18761 }
18762 else {
18763 registers_per_thread_per_radix[2] = 0;
18764 registers_per_thread_per_radix[3] = 21;
18765 registers_per_thread_per_radix[5] = 0;
18766 registers_per_thread_per_radix[7] = 21;
18767 registers_per_thread_per_radix[11] = 0;
18768 registers_per_thread_per_radix[13] = 0;
18769 }
18770 }
18771 }
18772 else {
18773 if (loc_multipliers[11] > 0) {
18774 if (loc_multipliers[13] > 0) {
18775 registers_per_thread_per_radix[2] = 0;
18776 registers_per_thread_per_radix[3] = 9;
18777 registers_per_thread_per_radix[5] = 0;
18778 registers_per_thread_per_radix[7] = 7;
18779 registers_per_thread_per_radix[11] = 11;
18780 registers_per_thread_per_radix[13] = 13;
18781 }
18782 else {
18783 registers_per_thread_per_radix[2] = 0;
18784 registers_per_thread_per_radix[3] = 9;
18785 registers_per_thread_per_radix[5] = 0;
18786 registers_per_thread_per_radix[7] = 7;
18787 registers_per_thread_per_radix[11] = 11;
18788 registers_per_thread_per_radix[13] = 0;
18789 }
18790 }
18791 else {
18792 if (loc_multipliers[13] > 0) {
18793 registers_per_thread_per_radix[2] = 0;
18794 registers_per_thread_per_radix[3] = 9;
18795 registers_per_thread_per_radix[5] = 0;
18796 registers_per_thread_per_radix[7] = 7;
18797 registers_per_thread_per_radix[11] = 0;
18798 registers_per_thread_per_radix[13] = 13;
18799 }
18800 else {
18801 registers_per_thread_per_radix[2] = 0;
18802 registers_per_thread_per_radix[3] = 9;
18803 registers_per_thread_per_radix[5] = 0;
18804 registers_per_thread_per_radix[7] = 7;
18805 registers_per_thread_per_radix[11] = 0;
18806 registers_per_thread_per_radix[13] = 0;
18807 }
18808 }
18809 }
18810 }
18811 else {
18812 if (loc_multipliers[3] == 1) {
18813 if (loc_multipliers[11] > 0) {
18814 if (loc_multipliers[13] > 0) {
18815 registers_per_thread_per_radix[2] = 0;
18816 registers_per_thread_per_radix[3] = 33;
18817 registers_per_thread_per_radix[5] = 0;
18818 registers_per_thread_per_radix[7] = 0;
18819 registers_per_thread_per_radix[11] = 33;
18820 registers_per_thread_per_radix[13] = 39;
18821 }
18822 else {
18823 registers_per_thread_per_radix[2] = 0;
18824 registers_per_thread_per_radix[3] = 33;
18825 registers_per_thread_per_radix[5] = 0;
18826 registers_per_thread_per_radix[7] = 0;
18827 registers_per_thread_per_radix[11] = 33;
18828 registers_per_thread_per_radix[13] = 0;
18829 }
18830 }
18831 else {
18832 if (loc_multipliers[13] > 0) {
18833 registers_per_thread_per_radix[2] = 0;
18834 registers_per_thread_per_radix[3] = 39;
18835 registers_per_thread_per_radix[5] = 0;
18836 registers_per_thread_per_radix[7] = 0;
18837 registers_per_thread_per_radix[11] = 0;
18838 registers_per_thread_per_radix[13] = 39;
18839 }
18840 else {
18841 registers_per_thread_per_radix[2] = 0;
18842 registers_per_thread_per_radix[3] = 3;
18843 registers_per_thread_per_radix[5] = 0;
18844 registers_per_thread_per_radix[7] = 0;
18845 registers_per_thread_per_radix[11] = 0;
18846 registers_per_thread_per_radix[13] = 0;
18847 }
18848 }
18849 }
18850 else {
18851 if (loc_multipliers[11] > 0) {
18852 if (loc_multipliers[13] > 0) {
18853 registers_per_thread_per_radix[2] = 0;
18854 registers_per_thread_per_radix[3] = 9;
18855 registers_per_thread_per_radix[5] = 0;
18856 registers_per_thread_per_radix[7] = 0;
18857 registers_per_thread_per_radix[11] = 11;
18858 registers_per_thread_per_radix[13] = 13;
18859 }
18860 else {
18861 registers_per_thread_per_radix[2] = 0;
18862 registers_per_thread_per_radix[3] = 9;
18863 registers_per_thread_per_radix[5] = 0;
18864 registers_per_thread_per_radix[7] = 0;
18865 registers_per_thread_per_radix[11] = 11;
18866 registers_per_thread_per_radix[13] = 0;
18867 }
18868 }
18869 else {
18870 if (loc_multipliers[13] > 0) {
18871 registers_per_thread_per_radix[2] = 0;
18872 registers_per_thread_per_radix[3] = 9;
18873 registers_per_thread_per_radix[5] = 0;
18874 registers_per_thread_per_radix[7] = 0;
18875 registers_per_thread_per_radix[11] = 0;
18876 registers_per_thread_per_radix[13] = 13;
18877 }
18878 else {
18879 registers_per_thread_per_radix[2] = 0;
18880 registers_per_thread_per_radix[3] = 9;
18881 registers_per_thread_per_radix[5] = 0;
18882 registers_per_thread_per_radix[7] = 0;
18883 registers_per_thread_per_radix[11] = 0;
18884 registers_per_thread_per_radix[13] = 0;
18885 }
18886 }
18887 }
18888 }
18889 }
18890 }
18891 else {
18892 if (loc_multipliers[5] > 0) {
18893 if (loc_multipliers[7] > 0) {
18894 if (loc_multipliers[11] > 0) {
18895 if (loc_multipliers[13] > 0) {
18896 registers_per_thread_per_radix[2] = 0;
18897 registers_per_thread_per_radix[3] = 0;
18898 registers_per_thread_per_radix[5] = 5;
18899 registers_per_thread_per_radix[7] = 7;
18900 registers_per_thread_per_radix[11] = 11;
18901 registers_per_thread_per_radix[13] = 13;
18902 }
18903 else {
18904 registers_per_thread_per_radix[2] = 0;
18905 registers_per_thread_per_radix[3] = 0;
18906 registers_per_thread_per_radix[5] = 5;
18907 registers_per_thread_per_radix[7] = 7;
18908 registers_per_thread_per_radix[11] = 11;
18909 registers_per_thread_per_radix[13] = 0;
18910 }
18911 }
18912 else {
18913 if (loc_multipliers[13] > 0) {
18914 registers_per_thread_per_radix[2] = 0;
18915 registers_per_thread_per_radix[3] = 0;
18916 registers_per_thread_per_radix[5] = 5;
18917 registers_per_thread_per_radix[7] = 7;
18918 registers_per_thread_per_radix[11] = 0;
18919 registers_per_thread_per_radix[13] = 13;
18920 }
18921 else {
18922 registers_per_thread_per_radix[2] = 0;
18923 registers_per_thread_per_radix[3] = 0;
18924 registers_per_thread_per_radix[5] = 5;
18925 registers_per_thread_per_radix[7] = 7;
18926 registers_per_thread_per_radix[11] = 0;
18927 registers_per_thread_per_radix[13] = 0;
18928 }
18929 }
18930 }
18931 else {
18932 if (loc_multipliers[11] > 0) {
18933 if (loc_multipliers[13] > 0) {
18934 registers_per_thread_per_radix[2] = 0;
18935 registers_per_thread_per_radix[3] = 0;
18936 registers_per_thread_per_radix[5] = 5;
18937 registers_per_thread_per_radix[7] = 0;
18938 registers_per_thread_per_radix[11] = 11;
18939 registers_per_thread_per_radix[13] = 13;
18940 }
18941 else {
18942 registers_per_thread_per_radix[2] = 0;
18943 registers_per_thread_per_radix[3] = 0;
18944 registers_per_thread_per_radix[5] = 5;
18945 registers_per_thread_per_radix[7] = 0;
18946 registers_per_thread_per_radix[11] = 11;
18947 registers_per_thread_per_radix[13] = 0;
18948 }
18949 }
18950 else {
18951 if (loc_multipliers[13] > 0) {
18952 registers_per_thread_per_radix[2] = 0;
18953 registers_per_thread_per_radix[3] = 0;
18954 registers_per_thread_per_radix[5] = 5;
18955 registers_per_thread_per_radix[7] = 0;
18956 registers_per_thread_per_radix[11] = 0;
18957 registers_per_thread_per_radix[13] = 13;
18958 }
18959 else {
18960 registers_per_thread_per_radix[2] = 0;
18961 registers_per_thread_per_radix[3] = 0;
18962 registers_per_thread_per_radix[5] = 5;
18963 registers_per_thread_per_radix[7] = 0;
18964 registers_per_thread_per_radix[11] = 0;
18965 registers_per_thread_per_radix[13] = 0;
18966 }
18967 }
18968 }
18969 }
18970 else
18971 {
18972 if (loc_multipliers[7] > 0) {
18973 if (loc_multipliers[11] > 0) {
18974 if (loc_multipliers[13] > 0) {
18975 registers_per_thread_per_radix[2] = 0;
18976 registers_per_thread_per_radix[3] = 0;
18977 registers_per_thread_per_radix[5] = 0;
18978 registers_per_thread_per_radix[7] = 7;
18979 registers_per_thread_per_radix[11] = 11;
18980 registers_per_thread_per_radix[13] = 13;
18981 }
18982 else {
18983 registers_per_thread_per_radix[2] = 0;
18984 registers_per_thread_per_radix[3] = 0;
18985 registers_per_thread_per_radix[5] = 0;
18986 registers_per_thread_per_radix[7] = 7;
18987 registers_per_thread_per_radix[11] = 11;
18988 registers_per_thread_per_radix[13] = 0;
18989 }
18990 }
18991 else {
18992 if (loc_multipliers[13] > 0) {
18993 registers_per_thread_per_radix[2] = 0;
18994 registers_per_thread_per_radix[3] = 0;
18995 registers_per_thread_per_radix[5] = 0;
18996 registers_per_thread_per_radix[7] = 7;
18997 registers_per_thread_per_radix[11] = 0;
18998 registers_per_thread_per_radix[13] = 13;
18999 }
19000 else {
19001 registers_per_thread_per_radix[2] = 0;
19002 registers_per_thread_per_radix[3] = 0;
19003 registers_per_thread_per_radix[5] = 0;
19004 registers_per_thread_per_radix[7] = 7;
19005 registers_per_thread_per_radix[11] = 0;
19006 registers_per_thread_per_radix[13] = 0;
19007 }
19008 }
19009 }
19010 else {
19011 if (loc_multipliers[11] > 0) {
19012 if (loc_multipliers[13] > 0) {
19013 registers_per_thread_per_radix[2] = 0;
19014 registers_per_thread_per_radix[3] = 0;
19015 registers_per_thread_per_radix[5] = 0;
19016 registers_per_thread_per_radix[7] = 0;
19017 registers_per_thread_per_radix[11] = 11;
19018 registers_per_thread_per_radix[13] = 13;
19019 }
19020 else {
19021 registers_per_thread_per_radix[2] = 0;
19022 registers_per_thread_per_radix[3] = 0;
19023 registers_per_thread_per_radix[5] = 0;
19024 registers_per_thread_per_radix[7] = 0;
19025 registers_per_thread_per_radix[11] = 11;
19026 registers_per_thread_per_radix[13] = 0;
19027 }
19028 }
19029 else {
19030 if (loc_multipliers[13] > 0) {
19031 registers_per_thread_per_radix[2] = 0;
19032 registers_per_thread_per_radix[3] = 0;
19033 registers_per_thread_per_radix[5] = 0;
19034 registers_per_thread_per_radix[7] = 0;
19035 registers_per_thread_per_radix[11] = 0;
19036 registers_per_thread_per_radix[13] = 13;
19037 }
19038 else {
19040 }
19041 }
19042 }
19043 }
19044 }
19045
19046 }
19047 for (uint64_t i = 0; i < 14; i++) {
19048 if ((registers_per_thread_per_radix[i] != 0) && (registers_per_thread_per_radix[i] < min_registers_per_thread[0])) min_registers_per_thread[0] = registers_per_thread_per_radix[i];
19049 if ((registers_per_thread_per_radix[i] != 0) && (registers_per_thread_per_radix[i] > registers_per_thread[0])) registers_per_thread[0] = registers_per_thread_per_radix[i];
19050 }
19051 if ((registers_per_thread[0] > 10) || (registers_per_thread[0] >= 2 * min_registers_per_thread[0])) isGoodSequence[0] = 0;
19052 else isGoodSequence[0] = 1;
19053 return VKFFT_SUCCESS;
19054}
19055static inline VkFFTResult VkFFTScheduler(VkFFTApplication* app, VkFFTPlan* FFTPlan, uint64_t axis_id, uint64_t supportAxis) {
19057 VkFFTAxis* axes = FFTPlan->axes[axis_id];
19058
19059 uint64_t complexSize;
19061 complexSize = (2 * sizeof(double));
19062 else
19064 complexSize = (2 * sizeof(float));
19065 else
19066 complexSize = (2 * sizeof(float));
19067 uint64_t maxSequenceLengthSharedMemory = app->configuration.sharedMemorySize / complexSize;
19068 uint64_t maxSingleSizeNonStrided = maxSequenceLengthSharedMemory;
19069 uint64_t nonStridedAxisId = (app->configuration.considerAllAxesStrided) ? -1 : 0;
19070 for (uint64_t i = 0; i < 3; i++) {
19071 FFTPlan->actualFFTSizePerAxis[axis_id][i] = app->configuration.size[i];
19072 }
19073 FFTPlan->actualPerformR2CPerAxis[axis_id] = app->configuration.performR2C;
19074 if ((axis_id == 0) && (app->configuration.performR2C) && (app->configuration.size[axis_id] > maxSingleSizeNonStrided)) {
19075 FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] = app->configuration.size[axis_id] / 2; // now in actualFFTSize - modified dimension size for R2C/DCT
19076 FFTPlan->actualPerformR2CPerAxis[axis_id] = 0;
19077 FFTPlan->multiUploadR2C = 1;
19078 }
19079 if (app->configuration.performDCT == 1) {
19080 FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] = 2 * app->configuration.size[axis_id] - 2; // now in actualFFTSize - modified dimension size for R2C/DCT
19081 }
19082 if ((app->configuration.performDCT == 4) && (app->configuration.size[axis_id] % 2 == 0)) {
19083 FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] = app->configuration.size[axis_id] / 2; // now in actualFFTSize - modified dimension size for R2C/DCT
19084 //FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] = app->configuration.size[axis_id] * 8; // now in actualFFTSize - modified dimension size for R2C/DCT
19085 }
19086 if ((axis_id > 0) && (app->configuration.performR2C)) {
19087 FFTPlan->actualFFTSizePerAxis[axis_id][0] = FFTPlan->actualFFTSizePerAxis[axis_id][0] / 2 + 1;
19088 }
19089 if (axis_id != nonStridedAxisId) {
19092 }
19093 uint64_t multipliers[20] = { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 };//split the sequence
19094 uint64_t tempSequence = FFTPlan->actualFFTSizePerAxis[axis_id][axis_id];
19095 for (uint64_t i = 2; i < 14; i++) {
19096 if (tempSequence % i == 0) {
19097 tempSequence /= i;
19098 multipliers[i]++;
19099 i--;
19100 }
19101 }
19102 if (tempSequence != 1) {
19103 app->useBluesteinFFT[axis_id] = 1;
19104 if (axis_id != nonStridedAxisId) {
19107 }
19109 tempSequence = 2 * FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] - 1;
19110 uint64_t FFTSizeSelected = 0;
19111 if (app->configuration.fixMaxRadixBluestein > 0) {
19112 while (!FFTSizeSelected) {
19113 uint64_t testSequence = tempSequence;
19114 for (uint64_t i = 0; i < 20; i++) {
19115 multipliers[i] = 0;
19116 }
19117 for (uint64_t i = 2; i < app->configuration.fixMaxRadixBluestein + 1; i++) {
19118 if (testSequence % i == 0) {
19119 testSequence /= i;
19120 multipliers[i]++;
19121 i--;
19122 }
19123 }
19124 if (testSequence == 1) FFTSizeSelected = 1;
19125 else tempSequence++;
19126 }
19127 }
19128 else {
19129 while (!FFTSizeSelected) {
19130 if (axis_id == nonStridedAxisId) {
19131 if ((FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] < 128) || ((((uint64_t)pow(2, (uint64_t)ceil(log2(tempSequence))) * 0.75) <= tempSequence) && (((uint64_t)pow(2, (uint64_t)ceil(log2(tempSequence))) <= maxSequenceLengthSharedMemory) || ((2 * FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] - 1) > maxSequenceLengthSharedMemory)))) tempSequence = (uint64_t)pow(2, (uint64_t)ceil(log2(tempSequence)));
19132 }
19133 else {
19134 uint64_t maxSequenceLengthSharedMemoryStrided_temp = (app->configuration.coalescedMemory > complexSize) ? app->configuration.sharedMemorySize / (app->configuration.coalescedMemory) : app->configuration.sharedMemorySize / complexSize;
19135 if ((FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] < 128) || ((((uint64_t)pow(2, (uint64_t)ceil(log2(tempSequence))) * 0.75) <= tempSequence) && (((uint64_t)pow(2, (uint64_t)ceil(log2(tempSequence))) <= maxSequenceLengthSharedMemoryStrided_temp) || ((2 * FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] - 1) > maxSequenceLengthSharedMemoryStrided_temp)))) tempSequence = (uint64_t)pow(2, (uint64_t)ceil(log2(tempSequence)));
19136 }
19137 uint64_t testSequence = tempSequence;
19138 for (uint64_t i = 0; i < 20; i++) {
19139 multipliers[i] = 0;
19140 }
19141 for (uint64_t i = 2; i < 8; i++) {
19142 if (testSequence % i == 0) {
19143 testSequence /= i;
19144 multipliers[i]++;
19145 i--;
19146 }
19147 }
19148 if (testSequence != 1) tempSequence++;
19149 else {
19150 uint64_t registers_per_thread_per_radix[14];
19151 uint64_t registers_per_thread = 0;
19152 uint64_t min_registers_per_thread = -1;
19153 uint64_t isGoodSequence = 0;
19154 res = VkFFTGetRegistersPerThread(multipliers, registers_per_thread_per_radix, &registers_per_thread, &min_registers_per_thread, &isGoodSequence);
19155 if (res != VKFFT_SUCCESS) return res;
19156 if (isGoodSequence) FFTSizeSelected = 1;
19157 else tempSequence++;
19158 }
19159 }
19160 }
19161 FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] = tempSequence;
19162 //check whether the padded system is still a single upload for R2C - otherwise redo the optimization
19163 if ((axis_id == 0) && (app->configuration.performR2C) && (!FFTPlan->multiUploadR2C) && (FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] > maxSingleSizeNonStrided)) {
19164 FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] = app->configuration.size[axis_id] / 2; // now in actualFFTSize - modified dimension size for R2C/DCT
19165 FFTPlan->actualPerformR2CPerAxis[axis_id] = 0;
19166 FFTPlan->multiUploadR2C = 1;
19167 tempSequence = 2 * FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] - 1;
19168 uint64_t FFTSizeSelected = 0;
19169 if (app->configuration.fixMaxRadixBluestein > 0) {
19170 while (!FFTSizeSelected) {
19171 uint64_t testSequence = tempSequence;
19172 for (uint64_t i = 0; i < 20; i++) {
19173 multipliers[i] = 0;
19174 }
19175 for (uint64_t i = 2; i < app->configuration.fixMaxRadixBluestein + 1; i++) {
19176 if (testSequence % i == 0) {
19177 testSequence /= i;
19178 multipliers[i]++;
19179 i--;
19180 }
19181 }
19182 if (testSequence == 1) FFTSizeSelected = 1;
19183 else tempSequence++;
19184 }
19185 }
19186 else {
19187 while (!FFTSizeSelected) {
19188 if (axis_id == nonStridedAxisId) {
19189 if ((FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] < 128) || ((((uint64_t)pow(2, (uint64_t)ceil(log2(tempSequence))) * 0.75) <= tempSequence) && (((uint64_t)pow(2, (uint64_t)ceil(log2(tempSequence))) <= maxSequenceLengthSharedMemory) || ((2 * FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] - 1) > maxSequenceLengthSharedMemory)))) tempSequence = (uint64_t)pow(2, (uint64_t)ceil(log2(tempSequence)));
19190 }
19191 else {
19192 uint64_t maxSequenceLengthSharedMemoryStrided_temp = (app->configuration.coalescedMemory > complexSize) ? app->configuration.sharedMemorySize / (app->configuration.coalescedMemory) : app->configuration.sharedMemorySize / complexSize;
19193 if ((FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] < 128) || ((((uint64_t)pow(2, (uint64_t)ceil(log2(tempSequence))) * 0.75) <= tempSequence) && (((uint64_t)pow(2, (uint64_t)ceil(log2(tempSequence))) <= maxSequenceLengthSharedMemoryStrided_temp) || ((2 * FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] - 1) > maxSequenceLengthSharedMemoryStrided_temp)))) tempSequence = (uint64_t)pow(2, (uint64_t)ceil(log2(tempSequence)));
19194 }
19195 uint64_t testSequence = tempSequence;
19196 for (uint64_t i = 0; i < 20; i++) {
19197 multipliers[i] = 0;
19198 }
19199 for (uint64_t i = 2; i < 8; i++) {
19200 if (testSequence % i == 0) {
19201 testSequence /= i;
19202 multipliers[i]++;
19203 i--;
19204 }
19205 }
19206 if (testSequence != 1) tempSequence++;
19207 else {
19208 uint64_t registers_per_thread_per_radix[14];
19209 uint64_t registers_per_thread = 0;
19210 uint64_t min_registers_per_thread = -1;
19211 uint64_t isGoodSequence = 0;
19212 res = VkFFTGetRegistersPerThread(multipliers, registers_per_thread_per_radix, &registers_per_thread, &min_registers_per_thread, &isGoodSequence);
19213 if (res != VKFFT_SUCCESS) return res;
19214 if (isGoodSequence) FFTSizeSelected = 1;
19215 else tempSequence++;
19216 }
19217 }
19218 }
19219 FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] = tempSequence;
19220 }
19221 if ((FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] & (FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] - 1)) == 0) {
19223 maxSequenceLengthSharedMemory = app->configuration.sharedMemorySize / complexSize;
19224 maxSingleSizeNonStrided = maxSequenceLengthSharedMemory;
19225 }
19226 }
19227 uint64_t isPowOf2 = (pow(2, (uint64_t)log2(FFTPlan->actualFFTSizePerAxis[axis_id][axis_id])) == FFTPlan->actualFFTSizePerAxis[axis_id][axis_id]) ? 1 : 0;
19228 if (app->configuration.tempBufferSize[0] == 0) {
19229 if ((app->configuration.performR2C) && (axis_id == 0)) {
19230 if (FFTPlan->multiUploadR2C)
19231 app->configuration.tempBufferSize[0] = (FFTPlan->actualFFTSizePerAxis[axis_id][0] + 1) * FFTPlan->actualFFTSizePerAxis[axis_id][1] * FFTPlan->actualFFTSizePerAxis[axis_id][2] * app->configuration.coordinateFeatures * app->configuration.numberBatches * app->configuration.numberKernels * complexSize;
19232 }
19233 else {
19234 app->configuration.tempBufferSize[0] = FFTPlan->actualFFTSizePerAxis[axis_id][0] * FFTPlan->actualFFTSizePerAxis[axis_id][1] * FFTPlan->actualFFTSizePerAxis[axis_id][2] * app->configuration.coordinateFeatures * app->configuration.numberBatches * app->configuration.numberKernels * complexSize;
19235 }
19236 }
19237 if (app->useBluesteinFFT[axis_id]) {
19238 if ((app->configuration.performR2C) && (axis_id == 0)) {
19239 if (FFTPlan->multiUploadR2C) {
19240 if ((FFTPlan->actualFFTSizePerAxis[axis_id][0] + 1) * FFTPlan->actualFFTSizePerAxis[axis_id][1] * FFTPlan->actualFFTSizePerAxis[axis_id][2] * app->configuration.coordinateFeatures * app->configuration.numberBatches * app->configuration.numberKernels * complexSize > app->configuration.tempBufferSize[0]) app->configuration.tempBufferSize[0] = (FFTPlan->actualFFTSizePerAxis[axis_id][0] + 1) * FFTPlan->actualFFTSizePerAxis[axis_id][1] * FFTPlan->actualFFTSizePerAxis[axis_id][2] * app->configuration.coordinateFeatures * app->configuration.numberBatches * app->configuration.numberKernels * complexSize;
19241 }
19242 }
19243 else {
19244 if (FFTPlan->actualFFTSizePerAxis[axis_id][0] * FFTPlan->actualFFTSizePerAxis[axis_id][1] * FFTPlan->actualFFTSizePerAxis[axis_id][2] * app->configuration.coordinateFeatures * app->configuration.numberBatches * app->configuration.numberKernels * complexSize > app->configuration.tempBufferSize[0]) app->configuration.tempBufferSize[0] = FFTPlan->actualFFTSizePerAxis[axis_id][0] * FFTPlan->actualFFTSizePerAxis[axis_id][1] * FFTPlan->actualFFTSizePerAxis[axis_id][2] * app->configuration.coordinateFeatures * app->configuration.numberBatches * app->configuration.numberKernels * complexSize;
19245 }
19246 }
19247 //return VKFFT_ERROR_UNSUPPORTED_RADIX;
19248 uint64_t registerBoost = 1;
19249 for (uint64_t i = 1; i <= app->configuration.registerBoost; i++) {
19250 if (FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] % (i * i) == 0)
19251 registerBoost = i;
19252 }
19253 if ((axis_id == nonStridedAxisId) && (!app->configuration.performConvolution)) maxSingleSizeNonStrided *= registerBoost;
19254 uint64_t maxSequenceLengthSharedMemoryStrided = (app->configuration.coalescedMemory > complexSize) ? app->configuration.sharedMemorySize / (app->configuration.coalescedMemory) : app->configuration.sharedMemorySize / complexSize;
19255 uint64_t maxSingleSizeStrided = (!app->configuration.performConvolution) ? maxSequenceLengthSharedMemoryStrided * registerBoost : maxSequenceLengthSharedMemoryStrided;
19256 uint64_t numPasses = 1;
19257 uint64_t numPassesHalfBandwidth = 1;
19258 uint64_t temp;
19259 temp = (axis_id == nonStridedAxisId) ? (uint64_t)ceil(FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] / (double)maxSingleSizeNonStrided) : (uint64_t)ceil(FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] / (double)maxSingleSizeStrided);
19260 if (temp > 1) {//more passes than one
19261 for (uint64_t i = 1; i <= app->configuration.registerBoost4Step; i++) {
19262 if (FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] % (i * i) == 0) {
19263 registerBoost = i;
19264 }
19265 }
19266 if ((!app->configuration.performConvolution)) maxSingleSizeNonStrided = maxSequenceLengthSharedMemory * registerBoost;
19267 if ((!app->configuration.performConvolution)) maxSingleSizeStrided = maxSequenceLengthSharedMemoryStrided * registerBoost;
19268 temp = ((axis_id == nonStridedAxisId) && ((!app->configuration.reorderFourStep) || (app->useBluesteinFFT[axis_id]))) ? FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] / maxSingleSizeNonStrided : FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] / maxSingleSizeStrided;
19269 if (app->configuration.reorderFourStep && (!app->useBluesteinFFT[axis_id]))
19270 numPasses = (uint64_t)ceil(log2(FFTPlan->actualFFTSizePerAxis[axis_id][axis_id]) / log2(maxSingleSizeStrided));
19271 else
19272 numPasses += (uint64_t)ceil(log2(temp) / log2(maxSingleSizeStrided));
19273 }
19274 registerBoost = ((axis_id == nonStridedAxisId) && ((app->useBluesteinFFT[axis_id]) || (!app->configuration.reorderFourStep) || (numPasses == 1))) ? (uint64_t)ceil(FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] / (double)(pow(maxSequenceLengthSharedMemoryStrided, numPasses - 1) * maxSequenceLengthSharedMemory)) : (uint64_t)ceil(FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] / (double)pow(maxSequenceLengthSharedMemoryStrided, numPasses));
19275 uint64_t canBoost = 0;
19276 for (uint64_t i = registerBoost; i <= app->configuration.registerBoost; i++) {
19277 if (FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] % (i * i) == 0) {
19278 registerBoost = i;
19279 i = app->configuration.registerBoost + 1;
19280 canBoost = 1;
19281 }
19282 }
19283 if (((canBoost == 0) || (((FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] & (FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] - 1)) != 0) && (!app->configuration.registerBoostNonPow2))) && (registerBoost > 1)) {
19284 registerBoost = 1;
19285 numPasses++;
19286 }
19287 maxSingleSizeNonStrided = maxSequenceLengthSharedMemory * registerBoost;
19288 maxSingleSizeStrided = maxSequenceLengthSharedMemoryStrided * registerBoost;
19289 uint64_t maxSingleSizeStridedHalfBandwidth = maxSingleSizeStrided;
19292 temp = (axis_id == nonStridedAxisId) ? (uint64_t)ceil(FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] / (double)maxSingleSizeNonStrided) : (uint64_t)ceil(FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] / (double)maxSingleSizeStridedHalfBandwidth);
19293 //temp = FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] / maxSingleSizeNonStrided;
19294 if (temp > 1) {//more passes than two
19295 temp = ((!app->configuration.reorderFourStep) || (app->useBluesteinFFT[axis_id])) ? (uint64_t)ceil(FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] / (double)maxSingleSizeNonStrided) : (uint64_t)ceil(FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] / (double)maxSingleSizeStridedHalfBandwidth);
19296 for (uint64_t i = 0; i < 5; i++) {
19297 temp = (uint64_t)ceil(temp / (double)maxSingleSizeStrided);
19298 numPassesHalfBandwidth++;
19299 if (temp == 1) i = 5;
19300 }
19301 /*
19302 temp = ((axis_id == 0) && (!app->configuration.reorderFourStep)) ? FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] / maxSingleSizeNonStrided : FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] / maxSingleSizeStridedHalfBandwidth;
19303
19304 if (app->configuration.reorderFourStep)
19305 numPassesHalfBandwidth = (uint64_t)ceil(log2(FFTPlan->actualFFTSizePerAxis[axis_id][axis_id]) / log2(maxSingleSizeStridedHalfBandwidth));
19306 else
19307 numPassesHalfBandwidth = 1 + (uint64_t)ceil(log2(temp) / log2(maxSingleSizeStridedHalfBandwidth));
19308 if ((numPassesHalfBandwidth == 2)&& (!app->configuration.reorderFourStep)&&(registerBoost>1)) //switch back for two step and don't do half bandwidth on strided accesses if register boost and no 4-step reordering
19309 */
19310 }
19311 if (numPassesHalfBandwidth < numPasses) numPasses = numPassesHalfBandwidth;
19312 else maxSingleSizeStridedHalfBandwidth = maxSingleSizeStrided;
19313 }
19314 if (((uint64_t)log2(FFTPlan->actualFFTSizePerAxis[axis_id][axis_id]) >= app->configuration.swapTo3Stage4Step) && (app->configuration.swapTo3Stage4Step >= 17)) numPasses = 3;//Force set to 3 stage 4 step algorithm
19315 uint64_t* locAxisSplit = FFTPlan->axisSplit[axis_id];
19316 if (numPasses == 1) {
19317 locAxisSplit[0] = FFTPlan->actualFFTSizePerAxis[axis_id][axis_id];
19318 }
19319 if (numPasses == 2) {
19320 if (isPowOf2) {
19321 if ((axis_id == nonStridedAxisId) && ((!app->configuration.reorderFourStep) || (app->useBluesteinFFT[axis_id]))) {
19322 uint64_t maxPow8SharedMemory = (uint64_t)pow(8, ((uint64_t)log2(maxSequenceLengthSharedMemory)) / 3);
19323 //unit stride
19324 if (FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] / maxPow8SharedMemory <= maxSingleSizeStrided) {
19325 locAxisSplit[0] = maxPow8SharedMemory;
19326 }
19327 else {
19328 if (FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] / maxSequenceLengthSharedMemory <= maxSingleSizeStrided) {
19329 locAxisSplit[0] = maxSequenceLengthSharedMemory;
19330 }
19331 else {
19332 if (FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] / (maxSequenceLengthSharedMemory * registerBoost) < maxSingleSizeStridedHalfBandwidth) {
19333 for (uint64_t i = 1; i <= (uint64_t)log2(registerBoost); i++) {
19334 if (FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] / (maxSequenceLengthSharedMemory * (uint64_t)pow(2, i)) <= maxSingleSizeStrided) {
19335 locAxisSplit[0] = (maxSequenceLengthSharedMemory * (uint64_t)pow(2, i));
19336 i = (uint64_t)log2(registerBoost) + 1;
19337 }
19338 }
19339 }
19340 else {
19341 locAxisSplit[0] = (maxSequenceLengthSharedMemory * registerBoost);
19342 }
19343 }
19344 }
19345 }
19346 else {
19347 uint64_t maxPow8Strided = (uint64_t)pow(8, ((uint64_t)log2(maxSingleSizeStrided)) / 3);
19348 //all FFTs are considered as non-unit stride
19349 if (FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] / maxPow8Strided <= maxSingleSizeStrided) {
19350 locAxisSplit[0] = maxPow8Strided;
19351 }
19352 else {
19353 if (FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] / maxSingleSizeStrided < maxSingleSizeStridedHalfBandwidth) {
19354 locAxisSplit[0] = maxSingleSizeStrided;
19355 }
19356 else {
19357 locAxisSplit[0] = maxSingleSizeStridedHalfBandwidth;
19358 }
19359 }
19360 }
19361 locAxisSplit[1] = FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] / locAxisSplit[0];
19362 if (locAxisSplit[1] < 64) {
19363 locAxisSplit[0] = (locAxisSplit[1] == 0) ? locAxisSplit[0] / (64) : locAxisSplit[0] / (64 / locAxisSplit[1]);
19364 locAxisSplit[1] = 64;
19365 }
19366 if (locAxisSplit[1] > locAxisSplit[0]) {
19367 uint64_t swap = locAxisSplit[0];
19368 locAxisSplit[0] = locAxisSplit[1];
19369 locAxisSplit[1] = swap;
19370 }
19371 }
19372 else {
19373 uint64_t successSplit = 0;
19374 if ((axis_id == nonStridedAxisId) && ((!app->configuration.reorderFourStep) || (app->useBluesteinFFT[axis_id]))) {
19375 /*for (uint64_t i = 0; i < maxSequenceLengthSharedMemory; i++) {
19376 if (FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] % (maxSequenceLengthSharedMemory - i) == 0) {
19377 if (((maxSequenceLengthSharedMemory - i) <= maxSequenceLengthSharedMemory) && (FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] / (maxSequenceLengthSharedMemory - i) <= maxSingleSizeStrided)) {
19378 locAxisSplit[0] = (maxSequenceLengthSharedMemory - i);
19379 locAxisSplit[1] = FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] / (maxSequenceLengthSharedMemory - i);
19380 i = maxSequenceLengthSharedMemory;
19381 successSplit = 1;
19382 }
19383 }
19384 }*/
19385 uint64_t sqrtSequence = (uint64_t)ceil(sqrt(FFTPlan->actualFFTSizePerAxis[axis_id][axis_id]));
19386 for (uint64_t i = 0; i < sqrtSequence; i++) {
19387 if (FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] % (sqrtSequence - i) == 0) {
19388 if ((sqrtSequence - i <= maxSingleSizeStrided) && (FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] / (sqrtSequence - i) <= maxSequenceLengthSharedMemory)) {
19389 locAxisSplit[0] = FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] / (sqrtSequence - i);
19390 locAxisSplit[1] = sqrtSequence - i;
19391 i = sqrtSequence;
19392 successSplit = 1;
19393 }
19394 }
19395 }
19396 }
19397 else {
19398 uint64_t sqrtSequence = (uint64_t)ceil(sqrt(FFTPlan->actualFFTSizePerAxis[axis_id][axis_id]));
19399 for (uint64_t i = 0; i < sqrtSequence; i++) {
19400 if (FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] % (sqrtSequence - i) == 0) {
19401 if ((sqrtSequence - i <= maxSingleSizeStrided) && (FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] / (sqrtSequence - i) <= maxSingleSizeStridedHalfBandwidth)) {
19402 locAxisSplit[0] = FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] / (sqrtSequence - i);
19403 locAxisSplit[1] = sqrtSequence - i;
19404 i = sqrtSequence;
19405 successSplit = 1;
19406 }
19407 }
19408 }
19409 }
19410 if (successSplit == 0)
19411 numPasses = 3;
19412 }
19413 }
19414 if (numPasses == 3) {
19415 if (isPowOf2) {
19416 uint64_t maxPow8Strided = (uint64_t)pow(8, ((uint64_t)log2(maxSingleSizeStrided)) / 3);
19417 if ((axis_id == nonStridedAxisId) && ((!app->configuration.reorderFourStep) || (app->useBluesteinFFT[axis_id]))) {
19418 //unit stride
19419 uint64_t maxPow8SharedMemory = (uint64_t)pow(8, ((uint64_t)log2(maxSequenceLengthSharedMemory)) / 3);
19420 if (FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] / maxPow8SharedMemory <= maxPow8Strided * maxPow8Strided)
19421 locAxisSplit[0] = maxPow8SharedMemory;
19422 else {
19423 if (FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] / maxSequenceLengthSharedMemory <= maxSingleSizeStrided * maxSingleSizeStrided)
19424 locAxisSplit[0] = maxSequenceLengthSharedMemory;
19425 else {
19426 if (FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] / (maxSequenceLengthSharedMemory * registerBoost) <= maxSingleSizeStrided * maxSingleSizeStrided) {
19427 for (uint64_t i = 0; i <= (uint64_t)log2(registerBoost); i++) {
19428 if (FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] / (maxSequenceLengthSharedMemory * (uint64_t)pow(2, i)) <= maxSingleSizeStrided * maxSingleSizeStrided) {
19429 locAxisSplit[0] = (maxSequenceLengthSharedMemory * (uint64_t)pow(2, i));
19430 i = (uint64_t)log2(registerBoost) + 1;
19431 }
19432 }
19433 }
19434 else {
19435 locAxisSplit[0] = (maxSequenceLengthSharedMemory * registerBoost);
19436 }
19437 }
19438 }
19439 }
19440 else {
19441 //to account for TLB misses, it is best to coalesce the unit-strided stage to 128 bytes
19442 /*uint64_t log2axis = (uint64_t)log2(FFTPlan->actualFFTSizePerAxis[axis_id][axis_id]);
19443 locAxisSplit[0] = (uint64_t)pow(2, (uint64_t)log2axis / 3);
19444 if (log2axis % 3 > 0) locAxisSplit[0] *= 2;
19445 locAxisSplit[1] = (uint64_t)pow(2, (uint64_t)log2axis / 3);
19446 if (log2axis % 3 > 1) locAxisSplit[1] *= 2;
19447 locAxisSplit[2] = FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] / locAxisSplit[0] / locAxisSplit[1];*/
19448 uint64_t maxSingleSizeStrided128 = app->configuration.sharedMemorySize / (128);
19449 uint64_t maxPow8_128 = (uint64_t)pow(8, ((uint64_t)log2(maxSingleSizeStrided128)) / 3);
19450 //unit stride
19451 if (FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] / maxPow8_128 <= maxPow8Strided * maxSingleSizeStrided)
19452 locAxisSplit[0] = maxPow8_128;
19453 //non-unit stride
19454 else {
19455
19456 if ((FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] / (maxPow8_128 * 2) <= maxPow8Strided * maxSingleSizeStrided) && (maxPow8_128 * 2 <= maxSingleSizeStrided128)) {
19457 locAxisSplit[0] = maxPow8_128 * 2;
19458 }
19459 else {
19460 if ((FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] / (maxPow8_128 * 4) <= maxPow8Strided * maxSingleSizeStrided) && (maxPow8_128 * 4 <= maxSingleSizeStrided128)) {
19461 locAxisSplit[0] = maxPow8_128 * 4;
19462 }
19463 else {
19464 if (FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] / maxSingleSizeStrided <= maxSingleSizeStrided * maxSingleSizeStrided) {
19465 for (uint64_t i = 0; i <= (uint64_t)log2(maxSingleSizeStrided / maxSingleSizeStrided128); i++) {
19466 if (FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] / (maxSingleSizeStrided128 * (uint64_t)pow(2, i)) <= maxSingleSizeStrided * maxSingleSizeStrided) {
19467 locAxisSplit[0] = (maxSingleSizeStrided128 * (uint64_t)pow(2, i));
19468 i = (uint64_t)log2(maxSingleSizeStrided / maxSingleSizeStrided128) + 1;
19469 }
19470 }
19471 }
19472 else
19473 locAxisSplit[0] = maxSingleSizeStridedHalfBandwidth;
19474 }
19475 }
19476 }
19477 }
19478 if (FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] / locAxisSplit[0] / maxPow8Strided <= maxSingleSizeStrided) {
19479 locAxisSplit[1] = maxPow8Strided;
19480 locAxisSplit[2] = FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] / locAxisSplit[1] / locAxisSplit[0];
19481 }
19482 else {
19483 if (FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] / locAxisSplit[0] / maxSingleSizeStrided <= maxSingleSizeStrided) {
19484 locAxisSplit[1] = maxSingleSizeStrided;
19485 locAxisSplit[2] = FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] / locAxisSplit[1] / locAxisSplit[0];
19486 }
19487 else {
19488 locAxisSplit[1] = maxSingleSizeStridedHalfBandwidth;
19489 locAxisSplit[2] = FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] / locAxisSplit[1] / locAxisSplit[0];
19490 }
19491 }
19492 if (locAxisSplit[2] < 64) {
19493 locAxisSplit[1] = (locAxisSplit[2] == 0) ? locAxisSplit[1] / (64) : locAxisSplit[1] / (64 / locAxisSplit[2]);
19494 locAxisSplit[2] = 64;
19495 }
19496 if (locAxisSplit[2] > locAxisSplit[1]) {
19497 uint64_t swap = locAxisSplit[1];
19498 locAxisSplit[1] = locAxisSplit[2];
19499 locAxisSplit[2] = swap;
19500 }
19501 }
19502 else {
19503 uint64_t successSplit = 0;
19504 if ((axis_id == nonStridedAxisId) && ((!app->configuration.reorderFourStep) || (app->useBluesteinFFT[axis_id]))) {
19505 for (uint64_t i = 0; i < maxSequenceLengthSharedMemory; i++) {
19506 if (FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] % (maxSequenceLengthSharedMemory - i) == 0) {
19507 uint64_t sqrt3Sequence = (uint64_t)ceil(sqrt(FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] / (maxSequenceLengthSharedMemory - i)));
19508 for (uint64_t j = 0; j < sqrt3Sequence; j++) {
19509 if ((FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] / (maxSequenceLengthSharedMemory - i)) % (sqrt3Sequence - j) == 0) {
19510 if (((maxSequenceLengthSharedMemory - i) <= maxSequenceLengthSharedMemory) && (sqrt3Sequence - j <= maxSingleSizeStrided) && (FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] / (maxSequenceLengthSharedMemory - i) / (sqrt3Sequence - j) <= maxSingleSizeStrided)) {
19511 locAxisSplit[0] = (maxSequenceLengthSharedMemory - i);
19512 locAxisSplit[1] = sqrt3Sequence - j;
19513 locAxisSplit[2] = FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] / (maxSequenceLengthSharedMemory - i) / (sqrt3Sequence - j);
19514 i = maxSequenceLengthSharedMemory;
19515 j = sqrt3Sequence;
19516 successSplit = 1;
19517 }
19518 }
19519 }
19520 }
19521 }
19522 }
19523 else {
19524 uint64_t sqrt3Sequence = (uint64_t)ceil(pow(FFTPlan->actualFFTSizePerAxis[axis_id][axis_id], 1.0 / 3.0));
19525 for (uint64_t i = 0; i < sqrt3Sequence; i++) {
19526 if (FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] % (sqrt3Sequence - i) == 0) {
19527 uint64_t sqrt2Sequence = (uint64_t)ceil(sqrt(FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] / (sqrt3Sequence - i)));
19528 for (uint64_t j = 0; j < sqrt2Sequence; j++) {
19529 if ((FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] / (sqrt3Sequence - i)) % (sqrt2Sequence - j) == 0) {
19530 if ((sqrt3Sequence - i <= maxSingleSizeStrided) && (sqrt2Sequence - j <= maxSingleSizeStrided) && (FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] / (sqrt3Sequence - i) / (sqrt2Sequence - j) <= maxSingleSizeStridedHalfBandwidth)) {
19531 locAxisSplit[0] = FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] / (sqrt3Sequence - i) / (sqrt2Sequence - j);
19532 locAxisSplit[1] = sqrt3Sequence - i;
19533 locAxisSplit[2] = sqrt2Sequence - j;
19534 i = sqrt3Sequence;
19535 j = sqrt2Sequence;
19536 successSplit = 1;
19537 }
19538 }
19539 }
19540 }
19541 }
19542 }
19543 if (successSplit == 0)
19544 numPasses = 4;
19545 }
19546 }
19547 if (numPasses > 3) {
19548 //printf("sequence length exceeds boundaries\n");
19550 }
19551 if ((numPasses > 1) && (app->configuration.performDCT > 0)) {
19552 //printf("sequence length exceeds boundaries\n");
19554 }
19555 if ((numPasses > 1) && (app->configuration.performR2C > 0) && (axis_id == 0) && (FFTPlan->actualFFTSizePerAxis[0][0] % 2 != 0)) {
19556 //printf("sequence length exceeds boundaries\n");
19558 }
19559 if (((app->configuration.reorderFourStep) && (!app->useBluesteinFFT[axis_id]))) {
19560 for (uint64_t i = 0; i < numPasses; i++) {
19561 if ((locAxisSplit[0] % 2 != 0) && (locAxisSplit[i] % 2 == 0)) {
19562 uint64_t swap = locAxisSplit[0];
19563 locAxisSplit[0] = locAxisSplit[i];
19564 locAxisSplit[i] = swap;
19565 }
19566 }
19567 for (uint64_t i = 0; i < numPasses; i++) {
19568 if ((locAxisSplit[0] % 4 != 0) && (locAxisSplit[i] % 4 == 0)) {
19569 uint64_t swap = locAxisSplit[0];
19570 locAxisSplit[0] = locAxisSplit[i];
19571 locAxisSplit[i] = swap;
19572 }
19573 }
19574 for (uint64_t i = 0; i < numPasses; i++) {
19575 if ((locAxisSplit[0] % 8 != 0) && (locAxisSplit[i] % 8 == 0)) {
19576 uint64_t swap = locAxisSplit[0];
19577 locAxisSplit[0] = locAxisSplit[i];
19578 locAxisSplit[i] = swap;
19579 }
19580 }
19581 }
19582 FFTPlan->numAxisUploads[axis_id] = numPasses;
19583 for (uint64_t k = 0; k < numPasses; k++) {
19584 tempSequence = locAxisSplit[k];
19585 uint64_t loc_multipliers[20] = { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 }; //split the smaller sequence
19586 for (uint64_t i = 2; i < 14; i++) {
19587 if (tempSequence % i == 0) {
19588 tempSequence /= i;
19589 loc_multipliers[i]++;
19590 i--;
19591 }
19592 }
19593 uint64_t registers_per_thread_per_radix[14];
19594 uint64_t registers_per_thread = 0;
19595 uint64_t min_registers_per_thread = -1;
19596 uint64_t isGoodSequence = 0;
19597 res = VkFFTGetRegistersPerThread(loc_multipliers, registers_per_thread_per_radix, &registers_per_thread, &min_registers_per_thread, &isGoodSequence);
19598 if (res != VKFFT_SUCCESS) return res;
19599 registers_per_thread_per_radix[8] = registers_per_thread_per_radix[2];
19600 registers_per_thread_per_radix[4] = registers_per_thread_per_radix[2];
19601 if ((registerBoost == 4) && (registers_per_thread % 4 != 0)) {
19602 registers_per_thread *= 2;
19603 for (uint64_t i = 2; i < 14; i++) {
19604 registers_per_thread_per_radix[i] *= 2;
19605 }
19606 min_registers_per_thread *= 2;
19607 }
19608 if (registers_per_thread_per_radix[8] % 8 == 0) {
19609 loc_multipliers[8] = loc_multipliers[2] / 3;
19610 loc_multipliers[2] = loc_multipliers[2] - loc_multipliers[8] * 3;
19611 }
19612 if (registers_per_thread_per_radix[4] % 4 == 0) {
19613 loc_multipliers[4] = loc_multipliers[2] / 2;
19614 loc_multipliers[2] = loc_multipliers[2] - loc_multipliers[4] * 2;
19615 }
19616 if ((registerBoost == 2) && (loc_multipliers[2] == 0)) {
19617 if (loc_multipliers[4] > 0) {
19618 loc_multipliers[4]--;
19619 loc_multipliers[2] = 2;
19620 }
19621 else {
19622 loc_multipliers[8]--;
19623 loc_multipliers[4]++;
19624 loc_multipliers[2]++;
19625 }
19626 }
19627 if ((registerBoost == 4) && (loc_multipliers[4] == 0)) {
19628 loc_multipliers[8]--;
19629 loc_multipliers[4]++;
19630 loc_multipliers[2]++;
19631 }
19632 uint64_t maxBatchCoalesced = ((axis_id == 0) && (((k == 0) && ((!app->configuration.reorderFourStep) || (app->useBluesteinFFT[axis_id]))) || (numPasses == 1))) ? 1 : app->configuration.coalescedMemory / complexSize;
19633 if (maxBatchCoalesced * locAxisSplit[k] / (min_registers_per_thread * registerBoost) > app->configuration.maxThreadsNum)
19634 {
19635 uint64_t scaleRegistersNum = 1;
19636 while ((maxBatchCoalesced * locAxisSplit[k] / (min_registers_per_thread * registerBoost * scaleRegistersNum)) > app->configuration.maxThreadsNum) {
19637 for (uint64_t i = 2; i < 14; i++) {
19638 if (locAxisSplit[k] / (min_registers_per_thread * registerBoost * scaleRegistersNum) % i == 0) {
19639 scaleRegistersNum *= i;
19640 i = 14;
19641 }
19642 }
19643 }
19644 min_registers_per_thread *= scaleRegistersNum;
19645 uint64_t temp_scaleRegistersNum = scaleRegistersNum;
19646 while ((maxBatchCoalesced * locAxisSplit[k] / (registers_per_thread * registerBoost)) % temp_scaleRegistersNum != 0) temp_scaleRegistersNum++;
19647 registers_per_thread *= temp_scaleRegistersNum;
19648 for (uint64_t i = 2; i < 14; i++) {
19649 if (registers_per_thread_per_radix[i] != 0) {
19650 temp_scaleRegistersNum = scaleRegistersNum;
19651 while ((maxBatchCoalesced * locAxisSplit[k] / (registers_per_thread_per_radix[i] * registerBoost)) % temp_scaleRegistersNum != 0) temp_scaleRegistersNum++;
19652 registers_per_thread_per_radix[i] *= temp_scaleRegistersNum;
19653 }
19654 }
19655
19656 if (min_registers_per_thread > registers_per_thread) {
19657 uint64_t temp = min_registers_per_thread;
19658 min_registers_per_thread = registers_per_thread;
19659 registers_per_thread = temp;
19660 }
19661 for (uint64_t i = 2; i < 14; i++) {
19662 if (registers_per_thread_per_radix[i] > registers_per_thread) {
19663 registers_per_thread = registers_per_thread_per_radix[i];
19664 }
19665 if ((registers_per_thread_per_radix[i] > 0) && (registers_per_thread_per_radix[i] < min_registers_per_thread)) {
19666 min_registers_per_thread = registers_per_thread_per_radix[i];
19667 }
19668 }
19669 }
19670 uint64_t j = 0;
19671 axes[k].specializationConstants.registerBoost = registerBoost;
19672 axes[k].specializationConstants.registers_per_thread = registers_per_thread;
19673 axes[k].specializationConstants.min_registers_per_thread = min_registers_per_thread;
19674 for (uint64_t i = 2; i < 14; i++) {
19675 axes[k].specializationConstants.registers_per_thread_per_radix[i] = registers_per_thread_per_radix[i];
19676 }
19678 axes[k].specializationConstants.fftDim = locAxisSplit[k];
19679 uint64_t tempRegisterBoost = registerBoost;// ((axis_id == nonStridedAxisId) && ((!app->configuration.reorderFourStep)||(app->useBluesteinFFT[axis_id]))) ? (uint64_t)ceil(axes[k].specializationConstants.fftDim / (double)maxSingleSizeNonStrided) : (uint64_t)ceil(axes[k].specializationConstants.fftDim / (double)maxSingleSizeStrided);
19680 uint64_t switchRegisterBoost = 0;
19681 if (tempRegisterBoost > 1) {
19682 if (loc_multipliers[tempRegisterBoost] > 0) {
19683 loc_multipliers[tempRegisterBoost]--;
19684 switchRegisterBoost = tempRegisterBoost;
19685 }
19686 else {
19687 for (uint64_t i = 14; i > 1; i--) {
19688 if (loc_multipliers[i] > 0) {
19689 loc_multipliers[i]--;
19690 switchRegisterBoost = i;
19691 i = 1;
19692 }
19693 }
19694 }
19695 }
19696 for (uint64_t i = 14; i > 1; i--) {
19697 if (loc_multipliers[i] > 0) {
19698 axes[k].specializationConstants.stageRadix[j] = i;
19699 loc_multipliers[i]--;
19700 i++;
19701 j++;
19703 }
19704 }
19705 if (switchRegisterBoost > 0) {
19706 axes[k].specializationConstants.stageRadix[axes[k].specializationConstants.numStages] = switchRegisterBoost;
19708 }
19709 else {
19710 if (min_registers_per_thread != registers_per_thread) {
19711 for (uint64_t i = 0; i < axes[k].specializationConstants.numStages; i++) {
19712 if (axes[k].specializationConstants.registers_per_thread_per_radix[axes[k].specializationConstants.stageRadix[i]] == min_registers_per_thread) {
19713 j = axes[k].specializationConstants.stageRadix[i];
19715 axes[k].specializationConstants.stageRadix[0] = j;
19717 }
19718 }
19719 }
19720 }
19721 }
19722 return VKFFT_SUCCESS;
19723}
19724static inline VkFFTResult VkFFTGeneratePhaseVectors(VkFFTApplication* app, VkFFTPlan* FFTPlan, uint64_t axis_id, uint64_t supportAxis) {
19725 //generate two arrays used for Bluestein convolution and post-convolution multiplication
19726 double double_PI = 3.1415926535897932384626433832795;
19727 VkFFTResult resFFT = VKFFT_SUCCESS;
19728 VkFFTApplication kernelPreparationApplication = {};
19729 VkFFTConfiguration kernelPreparationConfiguration = {};
19730
19731 kernelPreparationConfiguration.FFTdim = 1;
19732 kernelPreparationConfiguration.size[0] = FFTPlan->actualFFTSizePerAxis[axis_id][axis_id];
19733 kernelPreparationConfiguration.size[1] = 1;
19734 kernelPreparationConfiguration.size[2] = 1;
19735 kernelPreparationConfiguration.doublePrecision = app->configuration.doublePrecision;
19736 kernelPreparationConfiguration.useLUT = 1;
19737 kernelPreparationConfiguration.registerBoost = 1;
19738 kernelPreparationConfiguration.disableReorderFourStep = 1;
19739 kernelPreparationConfiguration.performBandwidthBoost = (app->configuration.performBandwidthBoost>0) ? app->configuration.performBandwidthBoost : 2;
19740 if (axis_id == 0) kernelPreparationConfiguration.performBandwidthBoost = 0;
19741 if (axis_id > 0) kernelPreparationConfiguration.considerAllAxesStrided = 1;
19742 if (app->configuration.tempBuffer) {
19743 kernelPreparationConfiguration.userTempBuffer = 1;
19744 kernelPreparationConfiguration.tempBuffer = app->configuration.tempBuffer;
19745 kernelPreparationConfiguration.tempBufferSize = app->configuration.tempBufferSize;
19746 kernelPreparationConfiguration.tempBufferNum = app->configuration.tempBufferNum;
19747 }
19748 kernelPreparationConfiguration.device = app->configuration.device;
19749#if(VKFFT_BACKEND==0)
19750 kernelPreparationConfiguration.queue = app->configuration.queue; //to allocate memory for LUT, we have to pass a queue, vkGPU->fence, commandPool and physicalDevice pointers
19751 kernelPreparationConfiguration.fence = app->configuration.fence;
19752 kernelPreparationConfiguration.commandPool = app->configuration.commandPool;
19753 kernelPreparationConfiguration.physicalDevice = app->configuration.physicalDevice;
19754 kernelPreparationConfiguration.isCompilerInitialized = 1;//compiler can be initialized before VkFFT plan creation. if not, VkFFT will create and destroy one after initialization
19755 kernelPreparationConfiguration.tempBufferDeviceMemory = app->configuration.tempBufferDeviceMemory;
19756#elif(VKFFT_BACKEND==3)
19757 kernelPreparationConfiguration.platform = app->configuration.platform;
19758 kernelPreparationConfiguration.context = app->configuration.context;
19759#endif
19760
19761 uint64_t bufferSize = (uint64_t)sizeof(float) * 2 * kernelPreparationConfiguration.size[0] * kernelPreparationConfiguration.size[1] * kernelPreparationConfiguration.size[2];
19762 if (kernelPreparationConfiguration.doublePrecision) bufferSize *= sizeof(double) / sizeof(float);
19763 app->bufferBluesteinSize[axis_id] = bufferSize;
19764 kernelPreparationConfiguration.inputBufferSize = &app->bufferBluesteinSize[axis_id];
19765 kernelPreparationConfiguration.bufferSize = &app->bufferBluesteinSize[axis_id];
19766 kernelPreparationConfiguration.isInputFormatted = 1;
19767
19768 resFFT = initializeVkFFT(&kernelPreparationApplication, kernelPreparationConfiguration);
19769 if (resFFT != VKFFT_SUCCESS) return resFFT;
19770
19771#if(VKFFT_BACKEND==0)
19772 VkResult res = VK_SUCCESS;
19773 resFFT = allocateFFTBuffer(app, &app->bufferBluestein[axis_id], &app->bufferBluesteinDeviceMemory[axis_id], VK_BUFFER_USAGE_STORAGE_BUFFER_BIT | VK_BUFFER_USAGE_TRANSFER_SRC_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT, VK_MEMORY_HEAP_DEVICE_LOCAL_BIT, bufferSize);
19774 if (resFFT != VKFFT_SUCCESS) return resFFT;
19776 resFFT = allocateFFTBuffer(app, &app->bufferBluesteinFFT[axis_id], &app->bufferBluesteinFFTDeviceMemory[axis_id], VK_BUFFER_USAGE_STORAGE_BUFFER_BIT | VK_BUFFER_USAGE_TRANSFER_SRC_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT, VK_MEMORY_HEAP_DEVICE_LOCAL_BIT, bufferSize);
19777 if (resFFT != VKFFT_SUCCESS) return resFFT;
19778 }
19780 resFFT = allocateFFTBuffer(app, &app->bufferBluesteinIFFT[axis_id], &app->bufferBluesteinIFFTDeviceMemory[axis_id], VK_BUFFER_USAGE_STORAGE_BUFFER_BIT | VK_BUFFER_USAGE_TRANSFER_SRC_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT, VK_MEMORY_HEAP_DEVICE_LOCAL_BIT, bufferSize);
19781 if (resFFT != VKFFT_SUCCESS) return resFFT;
19782 }
19783#elif(VKFFT_BACKEND==1)
19784 cudaError_t res = cudaSuccess;
19785 res = cudaMalloc((void**)&app->bufferBluestein[axis_id], bufferSize);
19786 if (res != cudaSuccess) return VKFFT_ERROR_FAILED_TO_ALLOCATE;
19788 res = cudaMalloc((void**)&app->bufferBluesteinFFT[axis_id], bufferSize);
19789 if (res != cudaSuccess) return VKFFT_ERROR_FAILED_TO_ALLOCATE;
19790 }
19792 res = cudaMalloc((void**)&app->bufferBluesteinIFFT[axis_id], bufferSize);
19793 if (res != cudaSuccess) return VKFFT_ERROR_FAILED_TO_ALLOCATE;
19794 }
19795#elif(VKFFT_BACKEND==2)
19796 hipError_t res = hipSuccess;
19797 res = hipMalloc((void**)&app->bufferBluestein[axis_id], bufferSize);
19798 if (res != hipSuccess) return VKFFT_ERROR_FAILED_TO_ALLOCATE;
19800 res = hipMalloc((void**)&app->bufferBluesteinFFT[axis_id], bufferSize);
19801 if (res != hipSuccess) return VKFFT_ERROR_FAILED_TO_ALLOCATE;
19802 }
19804 res = hipMalloc((void**)&app->bufferBluesteinIFFT[axis_id], bufferSize);
19805 if (res != hipSuccess) return VKFFT_ERROR_FAILED_TO_ALLOCATE;
19806 }
19807#elif(VKFFT_BACKEND==3)
19808 cl_int res = CL_SUCCESS;
19809 app->bufferBluestein[axis_id] = clCreateBuffer(app->configuration.context[0], CL_MEM_READ_WRITE, bufferSize, 0, &res);
19810 if (res != CL_SUCCESS) return VKFFT_ERROR_FAILED_TO_ALLOCATE;
19812 app->bufferBluesteinFFT[axis_id] = clCreateBuffer(app->configuration.context[0], CL_MEM_READ_WRITE, bufferSize, 0, &res);
19813 if (res != CL_SUCCESS) return VKFFT_ERROR_FAILED_TO_ALLOCATE;
19814 }
19816 app->bufferBluesteinIFFT[axis_id] = clCreateBuffer(app->configuration.context[0], CL_MEM_READ_WRITE, bufferSize, 0, &res);
19817 if (res != CL_SUCCESS) return VKFFT_ERROR_FAILED_TO_ALLOCATE;
19818 }
19819 cl_command_queue commandQueue = clCreateCommandQueue(app->configuration.context[0], app->configuration.device[0], 0, &res);
19820 if (res != CL_SUCCESS) return VKFFT_ERROR_FAILED_TO_CREATE_COMMAND_QUEUE;
19821#endif
19822 void* phaseVectors = malloc(bufferSize);
19823 if (!phaseVectors) {
19824 deleteVkFFT(&kernelPreparationApplication);
19825 deleteVkFFT(app);
19827 }
19828 uint64_t phaseVectorsNonZeroSize = (((app->configuration.performDCT == 4) && (app->configuration.size[axis_id] % 2 == 0)) || (FFTPlan->multiUploadR2C)) ? app->configuration.size[axis_id] / 2 : app->configuration.size[axis_id];
19829 if (app->configuration.performDCT == 1) phaseVectorsNonZeroSize = 2 * app->configuration.size[axis_id] - 2;
19830 if ((FFTPlan->numAxisUploads[axis_id] > 1) && (!app->configuration.makeForwardPlanOnly)) {
19831 if (kernelPreparationConfiguration.doublePrecision) {
19832 double* phaseVectors_cast = (double*)phaseVectors;
19833 for (uint64_t i = 0; i < FFTPlan->actualFFTSizePerAxis[axis_id][axis_id]; i++) {
19834 uint64_t rm = (i * i) % (2 * phaseVectorsNonZeroSize);
19835 double angle = double_PI * rm / phaseVectorsNonZeroSize;
19836 phaseVectors_cast[2 * i] = (i < phaseVectorsNonZeroSize) ? (double)cos(angle) : 0;
19837 phaseVectors_cast[2 * i + 1] = (i < phaseVectorsNonZeroSize) ? (double)-sin(angle) : 0;
19838 }
19839 for (uint64_t i = 1; i < phaseVectorsNonZeroSize; i++) {
19840 phaseVectors_cast[2 * (FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] - i)] = phaseVectors_cast[2 * i];
19841 phaseVectors_cast[2 * (FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] - i) + 1] = phaseVectors_cast[2 * i + 1];
19842 }
19843 }
19844 else {
19845 float* phaseVectors_cast = (float*)phaseVectors;
19846 for (uint64_t i = 0; i < FFTPlan->actualFFTSizePerAxis[axis_id][axis_id]; i++) {
19847 uint64_t rm = (i * i) % (2 * phaseVectorsNonZeroSize);
19848 double angle = double_PI * rm / phaseVectorsNonZeroSize;
19849 phaseVectors_cast[2 * i] = (i < phaseVectorsNonZeroSize) ? (float)cos(angle) : 0;
19850 phaseVectors_cast[2 * i + 1] = (i < phaseVectorsNonZeroSize) ? (float)-sin(angle) : 0;
19851 }
19852 for (uint64_t i = 1; i < phaseVectorsNonZeroSize; i++) {
19853 phaseVectors_cast[2 * (FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] - i)] = phaseVectors_cast[2 * i];
19854 phaseVectors_cast[2 * (FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] - i) + 1] = phaseVectors_cast[2 * i + 1];
19855 }
19856 }
19857#if(VKFFT_BACKEND==0)
19858 resFFT = transferDataFromCPU(&kernelPreparationApplication, phaseVectors, &app->bufferBluestein[axis_id], bufferSize);
19859 if (resFFT != VKFFT_SUCCESS) {
19860 free(phaseVectors);
19861 deleteVkFFT(&kernelPreparationApplication);
19862 return resFFT;
19863 }
19864#elif(VKFFT_BACKEND==1)
19865 res = cudaMemcpy(app->bufferBluestein[axis_id], phaseVectors, bufferSize, cudaMemcpyHostToDevice);
19866 if (res != cudaSuccess) {
19867 free(phaseVectors);
19868 deleteVkFFT(&kernelPreparationApplication);
19870 }
19871#elif(VKFFT_BACKEND==2)
19872 res = hipMemcpy(app->bufferBluestein[axis_id], phaseVectors, bufferSize, hipMemcpyHostToDevice);
19873 if (res != hipSuccess) {
19874 free(phaseVectors);
19875 deleteVkFFT(&kernelPreparationApplication);
19877 }
19878#elif(VKFFT_BACKEND==3)
19879 res = clEnqueueWriteBuffer(commandQueue, app->bufferBluestein[axis_id], CL_TRUE, 0, bufferSize, phaseVectors, 0, NULL, NULL);
19880 if (res != CL_SUCCESS) {
19881 free(phaseVectors);
19882 deleteVkFFT(&kernelPreparationApplication);
19884 }
19885#endif
19886#if(VKFFT_BACKEND==0)
19887 {
19888 VkCommandBufferAllocateInfo commandBufferAllocateInfo = { VK_STRUCTURE_TYPE_COMMAND_BUFFER_ALLOCATE_INFO };
19889 commandBufferAllocateInfo.commandPool = kernelPreparationApplication.configuration.commandPool[0];
19890 commandBufferAllocateInfo.level = VK_COMMAND_BUFFER_LEVEL_PRIMARY;
19891 commandBufferAllocateInfo.commandBufferCount = 1;
19892 VkCommandBuffer commandBuffer = {};
19893 res = vkAllocateCommandBuffers(kernelPreparationApplication.configuration.device[0], &commandBufferAllocateInfo, &commandBuffer);
19894 if (res != 0) {
19895 free(phaseVectors);
19896 deleteVkFFT(&kernelPreparationApplication);
19898 }
19899 VkCommandBufferBeginInfo commandBufferBeginInfo = { VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO };
19900 commandBufferBeginInfo.flags = VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT;
19901 res = vkBeginCommandBuffer(commandBuffer, &commandBufferBeginInfo);
19902 if (res != 0) {
19903 free(phaseVectors);
19904 deleteVkFFT(&kernelPreparationApplication);
19906 }
19907 VkFFTLaunchParams launchParams = {};
19908 launchParams.commandBuffer = &commandBuffer;
19909 launchParams.inputBuffer = &app->bufferBluestein[axis_id];
19910 launchParams.buffer = &app->bufferBluesteinIFFT[axis_id];
19911 //Record commands
19912 resFFT = VkFFTAppend(&kernelPreparationApplication, -1, &launchParams);
19913 if (resFFT != VKFFT_SUCCESS) {
19914 free(phaseVectors);
19915 deleteVkFFT(&kernelPreparationApplication);
19916 return resFFT;
19917 }
19918 res = vkEndCommandBuffer(commandBuffer);
19919 if (res != 0) {
19920 free(phaseVectors);
19921 deleteVkFFT(&kernelPreparationApplication);
19923 }
19924 VkSubmitInfo submitInfo = { VK_STRUCTURE_TYPE_SUBMIT_INFO };
19925 submitInfo.commandBufferCount = 1;
19926 submitInfo.pCommandBuffers = &commandBuffer;
19927 res = vkQueueSubmit(kernelPreparationApplication.configuration.queue[0], 1, &submitInfo, kernelPreparationApplication.configuration.fence[0]);
19928 if (res != 0) {
19929 free(phaseVectors);
19930 deleteVkFFT(&kernelPreparationApplication);
19932 }
19933 res = vkWaitForFences(kernelPreparationApplication.configuration.device[0], 1, kernelPreparationApplication.configuration.fence, VK_TRUE, 100000000000);
19934 if (res != 0) {
19935 free(phaseVectors);
19936 deleteVkFFT(&kernelPreparationApplication);
19938 }
19939 res = vkResetFences(kernelPreparationApplication.configuration.device[0], 1, kernelPreparationApplication.configuration.fence);
19940 if (res != 0) {
19941 free(phaseVectors);
19942 deleteVkFFT(&kernelPreparationApplication);
19944 }
19945 vkFreeCommandBuffers(kernelPreparationApplication.configuration.device[0], kernelPreparationApplication.configuration.commandPool[0], 1, &commandBuffer);
19946 }
19947#elif(VKFFT_BACKEND==1)
19948 VkFFTLaunchParams launchParams = {};
19949 launchParams.inputBuffer = &app->bufferBluestein[axis_id];
19950 launchParams.buffer = &app->bufferBluesteinIFFT[axis_id];
19951 resFFT = VkFFTAppend(&kernelPreparationApplication, -1, &launchParams);
19952 if (resFFT != VKFFT_SUCCESS) {
19953 free(phaseVectors);
19954 deleteVkFFT(&kernelPreparationApplication);
19955 return resFFT;
19956 }
19957 res = cudaDeviceSynchronize();
19958 if (res != cudaSuccess) {
19959 free(phaseVectors);
19960 deleteVkFFT(&kernelPreparationApplication);
19962 }
19963#elif(VKFFT_BACKEND==2)
19964 VkFFTLaunchParams launchParams = {};
19965 launchParams.inputBuffer = &app->bufferBluestein[axis_id];
19966 launchParams.buffer = &app->bufferBluesteinIFFT[axis_id];
19967 resFFT = VkFFTAppend(&kernelPreparationApplication, -1, &launchParams);
19968 if (resFFT != VKFFT_SUCCESS) {
19969 free(phaseVectors);
19970 deleteVkFFT(&kernelPreparationApplication);
19971 return resFFT;
19972 }
19973 res = hipDeviceSynchronize();
19974 if (res != hipSuccess) {
19975 free(phaseVectors);
19976 deleteVkFFT(&kernelPreparationApplication);
19978 }
19979#elif(VKFFT_BACKEND==3)
19980 VkFFTLaunchParams launchParams = {};
19981 launchParams.commandQueue = &commandQueue;
19982 launchParams.inputBuffer = &app->bufferBluestein[axis_id];
19983 launchParams.buffer = &app->bufferBluesteinIFFT[axis_id];
19984 resFFT = VkFFTAppend(&kernelPreparationApplication, -1, &launchParams);
19985 if (resFFT != VKFFT_SUCCESS) {
19986 free(phaseVectors);
19987 deleteVkFFT(&kernelPreparationApplication);
19988 return resFFT;
19989 }
19990 res = clFinish(commandQueue);
19991 if (res != CL_SUCCESS) {
19992 free(phaseVectors);
19993 deleteVkFFT(&kernelPreparationApplication);
19995 }
19996#endif
19997 }
19998 if (kernelPreparationConfiguration.doublePrecision) {
19999 double* phaseVectors_cast = (double*)phaseVectors;
20000 for (uint64_t i = 0; i < FFTPlan->actualFFTSizePerAxis[axis_id][axis_id]; i++) {
20001 uint64_t rm = (i * i) % (2 * phaseVectorsNonZeroSize);
20002 double angle = double_PI * rm / phaseVectorsNonZeroSize;
20003 phaseVectors_cast[2 * i] = (i < phaseVectorsNonZeroSize) ? (double)cos(angle) : 0;
20004 phaseVectors_cast[2 * i + 1] = (i < phaseVectorsNonZeroSize) ? (double)sin(angle) : 0;
20005 }
20006 for (uint64_t i = 1; i < phaseVectorsNonZeroSize; i++) {
20007 phaseVectors_cast[2 * (FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] - i)] = phaseVectors_cast[2 * i];
20008 phaseVectors_cast[2 * (FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] - i) + 1] = phaseVectors_cast[2 * i + 1];
20009 }
20010 }
20011 else {
20012 float* phaseVectors_cast = (float*)phaseVectors;
20013 for (uint64_t i = 0; i < FFTPlan->actualFFTSizePerAxis[axis_id][axis_id]; i++) {
20014 uint64_t rm = (i * i) % (2 * phaseVectorsNonZeroSize);
20015 double angle = double_PI * rm / phaseVectorsNonZeroSize;
20016 phaseVectors_cast[2 * i] = (i < phaseVectorsNonZeroSize) ? (float)cos(angle) : 0;
20017 phaseVectors_cast[2 * i + 1] = (i < phaseVectorsNonZeroSize) ? (float)sin(angle) : 0;
20018 }
20019 for (uint64_t i = 1; i < phaseVectorsNonZeroSize; i++) {
20020 phaseVectors_cast[2 * (FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] - i)] = phaseVectors_cast[2 * i];
20021 phaseVectors_cast[2 * (FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] - i) + 1] = phaseVectors_cast[2 * i + 1];
20022 }
20023 }
20024#if(VKFFT_BACKEND==0)
20025 resFFT = transferDataFromCPU(&kernelPreparationApplication, phaseVectors, &app->bufferBluestein[axis_id], bufferSize);
20026 if (resFFT != VKFFT_SUCCESS) {
20027 free(phaseVectors);
20028 deleteVkFFT(&kernelPreparationApplication);
20029 return resFFT;
20030 }
20031#elif(VKFFT_BACKEND==1)
20032 res = cudaMemcpy(app->bufferBluestein[axis_id], phaseVectors, bufferSize, cudaMemcpyHostToDevice);
20033 if (res != cudaSuccess) {
20034 free(phaseVectors);
20035 deleteVkFFT(&kernelPreparationApplication);
20037 }
20038#elif(VKFFT_BACKEND==2)
20039 res = hipMemcpy(app->bufferBluestein[axis_id], phaseVectors, bufferSize, hipMemcpyHostToDevice);
20040 if (res != hipSuccess) {
20041 free(phaseVectors);
20042 deleteVkFFT(&kernelPreparationApplication);
20044 }
20045#elif(VKFFT_BACKEND==3)
20046 res = clEnqueueWriteBuffer(commandQueue, app->bufferBluestein[axis_id], CL_TRUE, 0, bufferSize, phaseVectors, 0, NULL, NULL);
20047 if (res != CL_SUCCESS) {
20048 free(phaseVectors);
20049 deleteVkFFT(&kernelPreparationApplication);
20051 }
20052#endif
20053#if(VKFFT_BACKEND==0)
20055 VkCommandBufferAllocateInfo commandBufferAllocateInfo = { VK_STRUCTURE_TYPE_COMMAND_BUFFER_ALLOCATE_INFO };
20056 commandBufferAllocateInfo.commandPool = kernelPreparationApplication.configuration.commandPool[0];
20057 commandBufferAllocateInfo.level = VK_COMMAND_BUFFER_LEVEL_PRIMARY;
20058 commandBufferAllocateInfo.commandBufferCount = 1;
20059 VkCommandBuffer commandBuffer = {};
20060 res = vkAllocateCommandBuffers(kernelPreparationApplication.configuration.device[0], &commandBufferAllocateInfo, &commandBuffer);
20061 if (res != 0) {
20062 free(phaseVectors);
20063 deleteVkFFT(&kernelPreparationApplication);
20065 }
20066 VkCommandBufferBeginInfo commandBufferBeginInfo = { VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO };
20067 commandBufferBeginInfo.flags = VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT;
20068 res = vkBeginCommandBuffer(commandBuffer, &commandBufferBeginInfo);
20069 if (res != 0) {
20070 free(phaseVectors);
20071 deleteVkFFT(&kernelPreparationApplication);
20073 }
20074 VkFFTLaunchParams launchParams = {};
20075 launchParams.commandBuffer = &commandBuffer;
20076 launchParams.inputBuffer = &app->bufferBluestein[axis_id];
20077 launchParams.buffer = &app->bufferBluesteinFFT[axis_id];
20078 //Record commands
20079 resFFT = VkFFTAppend(&kernelPreparationApplication, -1, &launchParams);
20080 if (resFFT != VKFFT_SUCCESS) {
20081 free(phaseVectors);
20082 deleteVkFFT(&kernelPreparationApplication);
20083 return resFFT;
20084 }
20085 res = vkEndCommandBuffer(commandBuffer);
20086 if (res != 0) {
20087 free(phaseVectors);
20088 deleteVkFFT(&kernelPreparationApplication);
20090 }
20091 VkSubmitInfo submitInfo = { VK_STRUCTURE_TYPE_SUBMIT_INFO };
20092 submitInfo.commandBufferCount = 1;
20093 submitInfo.pCommandBuffers = &commandBuffer;
20094 res = vkQueueSubmit(kernelPreparationApplication.configuration.queue[0], 1, &submitInfo, kernelPreparationApplication.configuration.fence[0]);
20095 if (res != 0) {
20096 free(phaseVectors);
20097 deleteVkFFT(&kernelPreparationApplication);
20099 }
20100 res = vkWaitForFences(kernelPreparationApplication.configuration.device[0], 1, kernelPreparationApplication.configuration.fence, VK_TRUE, 100000000000);
20101 if (res != 0) {
20102 free(phaseVectors);
20103 deleteVkFFT(&kernelPreparationApplication);
20105 }
20106 res = vkResetFences(kernelPreparationApplication.configuration.device[0], 1, kernelPreparationApplication.configuration.fence);
20107 if (res != 0) {
20108 free(phaseVectors);
20109 deleteVkFFT(&kernelPreparationApplication);
20111 }
20112 vkFreeCommandBuffers(kernelPreparationApplication.configuration.device[0], kernelPreparationApplication.configuration.commandPool[0], 1, &commandBuffer);
20113 }
20114 if ((FFTPlan->numAxisUploads[axis_id] == 1) && (!app->configuration.makeForwardPlanOnly)) {
20115 VkCommandBufferAllocateInfo commandBufferAllocateInfo = { VK_STRUCTURE_TYPE_COMMAND_BUFFER_ALLOCATE_INFO };
20116 commandBufferAllocateInfo.commandPool = kernelPreparationApplication.configuration.commandPool[0];
20117 commandBufferAllocateInfo.level = VK_COMMAND_BUFFER_LEVEL_PRIMARY;
20118 commandBufferAllocateInfo.commandBufferCount = 1;
20119 VkCommandBuffer commandBuffer = {};
20120 res = vkAllocateCommandBuffers(kernelPreparationApplication.configuration.device[0], &commandBufferAllocateInfo, &commandBuffer);
20121 if (res != 0) {
20122 free(phaseVectors);
20123 deleteVkFFT(&kernelPreparationApplication);
20125 }
20126 VkCommandBufferBeginInfo commandBufferBeginInfo = { VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO };
20127 commandBufferBeginInfo.flags = VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT;
20128 res = vkBeginCommandBuffer(commandBuffer, &commandBufferBeginInfo);
20129 if (res != 0) {
20130 free(phaseVectors);
20131 deleteVkFFT(&kernelPreparationApplication);
20133 }
20134 VkFFTLaunchParams launchParams = {};
20135 launchParams.commandBuffer = &commandBuffer;
20136 launchParams.inputBuffer = &app->bufferBluestein[axis_id];
20137 launchParams.buffer = &app->bufferBluesteinIFFT[axis_id];
20138 //Record commands
20139 resFFT = VkFFTAppend(&kernelPreparationApplication, 1, &launchParams);
20140 if (resFFT != VKFFT_SUCCESS) {
20141 free(phaseVectors);
20142 deleteVkFFT(&kernelPreparationApplication);
20143 return resFFT;
20144 }
20145 res = vkEndCommandBuffer(commandBuffer);
20146 if (res != 0) {
20147 free(phaseVectors);
20148 deleteVkFFT(&kernelPreparationApplication);
20150 }
20151 VkSubmitInfo submitInfo = { VK_STRUCTURE_TYPE_SUBMIT_INFO };
20152 submitInfo.commandBufferCount = 1;
20153 submitInfo.pCommandBuffers = &commandBuffer;
20154 res = vkQueueSubmit(kernelPreparationApplication.configuration.queue[0], 1, &submitInfo, kernelPreparationApplication.configuration.fence[0]);
20155 if (res != 0) {
20156 free(phaseVectors);
20157 deleteVkFFT(&kernelPreparationApplication);
20159 }
20160 res = vkWaitForFences(kernelPreparationApplication.configuration.device[0], 1, kernelPreparationApplication.configuration.fence, VK_TRUE, 100000000000);
20161 if (res != 0) {
20162 free(phaseVectors);
20163 deleteVkFFT(&kernelPreparationApplication);
20165 }
20166 res = vkResetFences(kernelPreparationApplication.configuration.device[0], 1, kernelPreparationApplication.configuration.fence);
20167 if (res != 0) {
20168 free(phaseVectors);
20169 deleteVkFFT(&kernelPreparationApplication);
20171 }
20172 vkFreeCommandBuffers(kernelPreparationApplication.configuration.device[0], kernelPreparationApplication.configuration.commandPool[0], 1, &commandBuffer);
20173 }
20174#elif(VKFFT_BACKEND==1)
20175 VkFFTLaunchParams launchParams = {};
20176 launchParams.inputBuffer = &app->bufferBluestein[axis_id];
20178 launchParams.buffer = &app->bufferBluesteinFFT[axis_id];
20179 resFFT = VkFFTAppend(&kernelPreparationApplication, -1, &launchParams);
20180 if (resFFT != VKFFT_SUCCESS) {
20181 free(phaseVectors);
20182 deleteVkFFT(&kernelPreparationApplication);
20183 return resFFT;
20184 }
20185 res = cudaDeviceSynchronize();
20186 if (res != cudaSuccess) {
20187 free(phaseVectors);
20188 deleteVkFFT(&kernelPreparationApplication);
20190 }
20191 }
20192 if ((FFTPlan->numAxisUploads[axis_id] == 1) && (!app->configuration.makeForwardPlanOnly)) {
20193 launchParams.buffer = &app->bufferBluesteinIFFT[axis_id];
20194 resFFT = VkFFTAppend(&kernelPreparationApplication, 1, &launchParams);
20195 if (resFFT != VKFFT_SUCCESS) {
20196 free(phaseVectors);
20197 deleteVkFFT(&kernelPreparationApplication);
20198 return resFFT;
20199 }
20200 res = cudaDeviceSynchronize();
20201 if (res != cudaSuccess) {
20202 free(phaseVectors);
20203 deleteVkFFT(&kernelPreparationApplication);
20205 }
20206 }
20207#elif(VKFFT_BACKEND==2)
20208 VkFFTLaunchParams launchParams = {};
20209 launchParams.inputBuffer = &app->bufferBluestein[axis_id];
20211 launchParams.buffer = &app->bufferBluesteinFFT[axis_id];
20212 resFFT = VkFFTAppend(&kernelPreparationApplication, -1, &launchParams);
20213 if (resFFT != VKFFT_SUCCESS) {
20214 free(phaseVectors);
20215 deleteVkFFT(&kernelPreparationApplication);
20216 return resFFT;
20217 }
20218 res = hipDeviceSynchronize();
20219 if (res != hipSuccess) {
20220 free(phaseVectors);
20221 deleteVkFFT(&kernelPreparationApplication);
20223 }
20224 }
20225 if ((FFTPlan->numAxisUploads[axis_id] == 1) && (!app->configuration.makeForwardPlanOnly)) {
20226 launchParams.buffer = &app->bufferBluesteinIFFT[axis_id];
20227 resFFT = VkFFTAppend(&kernelPreparationApplication, 1, &launchParams);
20228 if (resFFT != VKFFT_SUCCESS) {
20229 free(phaseVectors);
20230 deleteVkFFT(&kernelPreparationApplication);
20231 return resFFT;
20232 }
20233 res = hipDeviceSynchronize();
20234 if (res != hipSuccess) {
20235 free(phaseVectors);
20236 deleteVkFFT(&kernelPreparationApplication);
20238 }
20239 }
20240#elif(VKFFT_BACKEND==3)
20241 VkFFTLaunchParams launchParams = {};
20242 launchParams.commandQueue = &commandQueue;
20243 launchParams.inputBuffer = &app->bufferBluestein[axis_id];
20245 launchParams.buffer = &app->bufferBluesteinFFT[axis_id];
20246 resFFT = VkFFTAppend(&kernelPreparationApplication, -1, &launchParams);
20247 if (resFFT != VKFFT_SUCCESS) {
20248 free(phaseVectors);
20249 deleteVkFFT(&kernelPreparationApplication);
20250 return resFFT;
20251 }
20252 res = clFinish(commandQueue);
20253 if (res != CL_SUCCESS) {
20254 free(phaseVectors);
20255 deleteVkFFT(&kernelPreparationApplication);
20257 }
20258 }
20259 if ((FFTPlan->numAxisUploads[axis_id] == 1) && (!app->configuration.makeForwardPlanOnly)) {
20260 launchParams.buffer = &app->bufferBluesteinIFFT[axis_id];
20261 resFFT = VkFFTAppend(&kernelPreparationApplication, 1, &launchParams);
20262 if (resFFT != VKFFT_SUCCESS) {
20263 free(phaseVectors);
20264 deleteVkFFT(&kernelPreparationApplication);
20265 return resFFT;
20266 }
20267 res = clFinish(commandQueue);
20268 if (res != CL_SUCCESS) {
20269 free(phaseVectors);
20270 deleteVkFFT(&kernelPreparationApplication);
20272 }
20273 }
20274#endif
20275 free(phaseVectors);
20276#if(VKFFT_BACKEND==0)
20277 kernelPreparationApplication.configuration.isCompilerInitialized = 0;
20278#elif(VKFFT_BACKEND==3)
20279 res = clReleaseCommandQueue(commandQueue);
20280 if (res != CL_SUCCESS) return VKFFT_ERROR_FAILED_TO_RELEASE_COMMAND_QUEUE;
20281#endif
20282 deleteVkFFT(&kernelPreparationApplication);
20283 return resFFT;
20284}
// VkFFTCheckUpdateBufferSet: decides whether buffer bindings need to be refreshed and records that
// decision in performBufferSetUpdate flags.
// Parameters:
//   app          - application whose configuration buffer pointers are compared and updated
//   axis         - axis whose specializationConstants.performBufferSetUpdate is set when planStage != 0
//   planStage    - non-zero: the update flag starts as requested and is cleared when a required
//                  buffer pointer is still null; zero: launchParams is consulted for changed pointers
//   launchParams - optional (may be 0) per-launch buffer pointers; only read when planStage == 0
// Returns VKFFT_SUCCESS at the end of the visible code.
// NOTE(review): the embedded listing numbers skip in several places (20309-20310, 20332, 20335,
// 20338, 20341, 20344, 20350, 20353, 20356, 20360, 20363, 20366, 20369, 20372-20373). The statements
// on those lines are not visible here, which leaves empty if-bodies, for-headers without a visible
// body and unmatched closing braces. Confirm against the original header.
20285static inline VkFFTResult VkFFTCheckUpdateBufferSet(VkFFTApplication* app, VkFFTAxis* axis, uint64_t planStage, VkFFTLaunchParams* launchParams) {
20286 uint64_t performUpdate = planStage;
// Launch time (planStage == 0): adopt each non-null pointer from launchParams that differs from
// the stored one, and request an update whenever anything changed.
20287 if (!planStage) {
20288 if (launchParams != 0) {
20289 if ((launchParams->buffer != 0) && (app->configuration.buffer != launchParams->buffer)) {
20290 app->configuration.buffer = launchParams->buffer;
20291 performUpdate = 1;
20292 }
20293 if ((launchParams->inputBuffer != 0) && (app->configuration.inputBuffer != launchParams->inputBuffer)) {
20294 app->configuration.inputBuffer = launchParams->inputBuffer;
20295 performUpdate = 1;
20296 }
20297 if ((launchParams->outputBuffer != 0) && (app->configuration.outputBuffer != launchParams->outputBuffer)) {
20298 app->configuration.outputBuffer = launchParams->outputBuffer;
20299 performUpdate = 1;
20300 }
20301 if ((launchParams->tempBuffer != 0) && (app->configuration.tempBuffer != launchParams->tempBuffer)) {
20302 app->configuration.tempBuffer = launchParams->tempBuffer;
20303 performUpdate = 1;
20304 }
20305 if ((launchParams->kernel != 0) && (app->configuration.kernel != launchParams->kernel)) {
20306 app->configuration.kernel = launchParams->kernel;
20307 performUpdate = 1;
20308 }
// NOTE(review): listing lines 20309-20310 are not visible here.
20311 }
20312 }
// Plan time: a missing required buffer cancels the update instead of reporting an error.
20313 if (planStage) {
20314 if (app->configuration.buffer == 0) {
20315 performUpdate = 0;
20316 }
20317 if ((app->configuration.isInputFormatted) && (app->configuration.inputBuffer == 0)) {
20318 performUpdate = 0;
20319 }
20320 if ((app->configuration.isOutputFormatted) && (app->configuration.outputBuffer == 0)) {
20321 performUpdate = 0;
20322 }
20323 if ((app->configuration.userTempBuffer) && (app->configuration.tempBuffer == 0)) {
20324 performUpdate = 0;
20325 }
20326 if ((app->configuration.performConvolution) && (app->configuration.kernel == 0)) {
20327 performUpdate = 0;
20328 }
20329 }
// Launch time: the same null checks, but the body of each branch (one listing line each) is
// not visible. NOTE(review): presumably each returns a dedicated error code - confirm.
20330 else {
20331 if (app->configuration.buffer == 0) {
20333 }
20334 if ((app->configuration.isInputFormatted) && (app->configuration.inputBuffer == 0)) {
20336 }
20337 if ((app->configuration.isOutputFormatted) && (app->configuration.outputBuffer == 0)) {
20339 }
20340 if ((app->configuration.userTempBuffer) && (app->configuration.tempBuffer == 0)) {
20342 }
20343 if ((app->configuration.performConvolution) && (app->configuration.kernel == 0)) {
20345 }
20346 }
20347 if (performUpdate) {
// Plan time: only the axis passed in is flagged.
20348 if (planStage) axis->specializationConstants.performBufferSetUpdate = 1;
20349 else {
// Launch time: walk every dimension and upload of app->localFFTPlan, then of
// app->localFFTPlan_inverse. NOTE(review): the opening lines of the two enclosing blocks
// (20350, 20363) and the statement forming each inner loop body (20353, 20356, 20366, 20369)
// are not visible; as listed, each `for` would take the following statement as its body.
20351 for (uint64_t i = 0; i < app->configuration.FFTdim; i++) {
20352 for (uint64_t j = 0; j < app->localFFTPlan->numAxisUploads[i]; j++)
// Bluestein axes with more than one upload get a second pass starting at upload index 1.
20354 if (app->useBluesteinFFT[i] && (app->localFFTPlan->numAxisUploads[i] > 1)) {
20355 for (uint64_t j = 1; j < app->localFFTPlan->numAxisUploads[i]; j++)
20357 }
20358 }
// NOTE(review): body line 20360 is not visible.
20359 if (app->localFFTPlan->multiUploadR2C) {
20361 }
20362 }
20364 for (uint64_t i = 0; i < app->configuration.FFTdim; i++) {
20365 for (uint64_t j = 0; j < app->localFFTPlan_inverse->numAxisUploads[i]; j++)
20367 if (app->useBluesteinFFT[i] && (app->localFFTPlan_inverse->numAxisUploads[i] > 1)) {
20368 for (uint64_t j = 1; j < app->localFFTPlan_inverse->numAxisUploads[i]; j++)
20370 }
20371 }
// NOTE(review): listing lines 20372-20373 are not visible here.
20374 }
20375 }
20376 }
20377 }
20378 return VKFFT_SUCCESS;
20379}
20380static inline VkFFTResult VkFFTUpdateBufferSet(VkFFTApplication* app, VkFFTPlan* FFTPlan, VkFFTAxis* axis, uint64_t axis_id, uint64_t axis_upload_id, uint64_t inverse) {
20381 if (axis->specializationConstants.performBufferSetUpdate) {
20382#if(VKFFT_BACKEND==0)
20383 const VkDescriptorType descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER;
20384#endif
20385 uint64_t storageComplexSize;
20387 storageComplexSize = (2 * sizeof(double));
20388 else
20390 storageComplexSize = (2 * 2);
20391 else
20392 storageComplexSize = (2 * sizeof(float));
20393 for (uint64_t i = 0; i < axis->numBindings; ++i) {
20394 for (uint64_t j = 0; j < axis->specializationConstants.numBuffersBound[i]; ++j) {
20395#if(VKFFT_BACKEND==0)
20396 VkDescriptorBufferInfo descriptorBufferInfo = { 0 };
20397#endif
20398 if (i == 0) {
20399 if ((axis_upload_id == FFTPlan->numAxisUploads[axis_id] - 1) && (app->configuration.isInputFormatted) && (!axis->specializationConstants.reverseBluesteinMultiUpload) && (
20400 ((axis_id == app->firstAxis) && (!inverse))
20401 || ((axis_id == app->lastAxis) && (inverse) && (!((axis_id == 0) && (axis->specializationConstants.performR2CmultiUpload))) && (!app->configuration.performConvolution) && (!app->configuration.inverseReturnToInputBuffer)))
20402 ) {
20403 uint64_t bufferId = 0;
20404 uint64_t offset = j;
20406 {
20407 for (uint64_t l = 0; l < app->configuration.inputBufferNum; ++l) {
20408 if (offset >= (uint64_t)ceil(app->configuration.inputBufferSize[l] / (double)(axis->specializationConstants.inputBufferBlockSize * storageComplexSize))) {
20409 bufferId++;
20410 offset -= (uint64_t)ceil(app->configuration.inputBufferSize[l] / (double)(axis->specializationConstants.inputBufferBlockSize * storageComplexSize));
20411 }
20412 else {
20414 }
20415
20416 }
20417 }
20418 axis->inputBuffer = app->configuration.inputBuffer;
20419#if(VKFFT_BACKEND==0)
20420 descriptorBufferInfo.buffer = app->configuration.inputBuffer[bufferId];
20421 descriptorBufferInfo.range = (axis->specializationConstants.inputBufferBlockSize * storageComplexSize);
20422 descriptorBufferInfo.offset = offset * (axis->specializationConstants.inputBufferBlockSize * storageComplexSize);
20423#endif
20424 axis->specializationConstants.inputOffset = app->configuration.inputBufferOffset;
20425 }
20426 else {
20427 if ((axis_upload_id == 0) && (app->configuration.numberKernels > 1) && (inverse) && (!app->configuration.performConvolution)) {
20428 uint64_t bufferId = 0;
20429 uint64_t offset = j;
20431 {
20432 for (uint64_t l = 0; l < app->configuration.outputBufferNum; ++l) {
20433 if (offset >= (uint64_t)ceil(app->configuration.outputBufferSize[l] / (double)(axis->specializationConstants.inputBufferBlockSize * storageComplexSize))) {
20434 bufferId++;
20435 offset -= (uint64_t)ceil(app->configuration.outputBufferSize[l] / (double)(axis->specializationConstants.inputBufferBlockSize * storageComplexSize));
20436 }
20437 else {
20439 }
20440
20441 }
20442 }
20443 axis->inputBuffer = app->configuration.outputBuffer;
20444#if(VKFFT_BACKEND==0)
20445 descriptorBufferInfo.buffer = app->configuration.outputBuffer[bufferId];
20446 descriptorBufferInfo.range = (axis->specializationConstants.inputBufferBlockSize * storageComplexSize);
20447 descriptorBufferInfo.offset = offset * (axis->specializationConstants.inputBufferBlockSize * storageComplexSize);
20448#endif
20449 axis->specializationConstants.inputOffset = app->configuration.outputBufferOffset;
20450 }
20451 else {
20452 uint64_t bufferId = 0;
20453 uint64_t offset = j;
20454 if (((axis->specializationConstants.reorderFourStep == 1) || (app->useBluesteinFFT[axis_id])) && (FFTPlan->numAxisUploads[axis_id] > 1)) {
20455 if (((axis->specializationConstants.reorderFourStep == 1) && (axis_upload_id > 0)) || (app->useBluesteinFFT[axis_id] && (axis->specializationConstants.reverseBluesteinMultiUpload == 0) && (axis_upload_id == FFTPlan->numAxisUploads[axis_id] - 1))) {
20456 if (app->configuration.bufferSize)
20457 {
20458 for (uint64_t l = 0; l < app->configuration.bufferNum; ++l) {
20459 if (offset >= (uint64_t)ceil(app->configuration.bufferSize[l] / (double)(axis->specializationConstants.inputBufferBlockSize * storageComplexSize))) {
20460 bufferId++;
20461 offset -= (uint64_t)ceil(app->configuration.bufferSize[l] / (double)(axis->specializationConstants.inputBufferBlockSize * storageComplexSize));
20462 }
20463 else {
20464 l = app->configuration.bufferNum;
20465 }
20466
20467 }
20468 }
20469 axis->inputBuffer = app->configuration.buffer;
20470#if(VKFFT_BACKEND==0)
20471 descriptorBufferInfo.buffer = app->configuration.buffer[bufferId];
20472#endif
20473 axis->specializationConstants.inputOffset = app->configuration.bufferOffset;
20474 }
20475 else {
20476 if (app->configuration.tempBufferSize) {
20477 for (uint64_t l = 0; l < app->configuration.tempBufferNum; ++l) {
20478 if (offset >= (uint64_t)ceil(app->configuration.tempBufferSize[l] / (double)(axis->specializationConstants.inputBufferBlockSize * storageComplexSize))) {
20479 bufferId++;
20480 offset -= (uint64_t)ceil(app->configuration.tempBufferSize[l] / (double)(axis->specializationConstants.inputBufferBlockSize * storageComplexSize));
20481 }
20482 else {
20484 }
20485
20486 }
20487 }
20488 axis->inputBuffer = app->configuration.tempBuffer;
20489#if(VKFFT_BACKEND==0)
20490 descriptorBufferInfo.buffer = app->configuration.tempBuffer[bufferId];
20491#endif
20492 axis->specializationConstants.inputOffset = app->configuration.tempBufferOffset;
20493 }
20494 }
20495 else {
20496 if (app->configuration.bufferSize) {
20497 for (uint64_t l = 0; l < app->configuration.bufferNum; ++l) {
20498 if (offset >= (uint64_t)ceil(app->configuration.bufferSize[l] / (double)(axis->specializationConstants.inputBufferBlockSize * storageComplexSize))) {
20499 bufferId++;
20500 offset -= (uint64_t)ceil(app->configuration.bufferSize[l] / (double)(axis->specializationConstants.inputBufferBlockSize * storageComplexSize));
20501 }
20502 else {
20503 l = app->configuration.bufferNum;
20504 }
20505
20506 }
20507 }
20508 axis->inputBuffer = app->configuration.buffer;
20509#if(VKFFT_BACKEND==0)
20510 descriptorBufferInfo.buffer = app->configuration.buffer[bufferId];
20511#endif
20512 axis->specializationConstants.inputOffset = app->configuration.bufferOffset;
20513 }
20514#if(VKFFT_BACKEND==0)
20515 descriptorBufferInfo.range = (axis->specializationConstants.inputBufferBlockSize * storageComplexSize);
20516 descriptorBufferInfo.offset = offset * (axis->specializationConstants.inputBufferBlockSize * storageComplexSize);
20517#endif
20518 }
20519 }
20520 //descriptorBufferInfo.offset = 0;
20521 }
20522 if (i == 1) {
20523 if (((axis_upload_id == 0) && (!app->useBluesteinFFT[axis_id]) && (app->configuration.isOutputFormatted && (
20524 ((axis_id == app->firstAxis) && (inverse))
20525 || ((axis_id == app->lastAxis) && (!inverse) && (!app->configuration.performConvolution))
20526 || ((axis_id == app->firstAxis) && (app->configuration.performConvolution) && (app->configuration.FFTdim == 1)))
20527 )) ||
20528 ((axis_upload_id == FFTPlan->numAxisUploads[axis_id] - 1) && (app->useBluesteinFFT[axis_id]) && (axis->specializationConstants.reverseBluesteinMultiUpload || (FFTPlan->numAxisUploads[axis_id] == 1)) && (app->configuration.isOutputFormatted && (
20529 ((axis_id == app->firstAxis) && (inverse))
20530 || ((axis_id == app->lastAxis) && (!inverse) && (!app->configuration.performConvolution)))
20531 )) ||
20532 ((app->configuration.numberKernels > 1) && (
20533 (inverse)
20534 || (axis_id == app->lastAxis)))
20535 ) {
20536 uint64_t bufferId = 0;
20537 uint64_t offset = j;
20539 for (uint64_t l = 0; l < app->configuration.outputBufferNum; ++l) {
20540 if (offset >= (uint64_t)ceil(app->configuration.outputBufferSize[l] / (double)(axis->specializationConstants.outputBufferBlockSize * storageComplexSize))) {
20541 bufferId++;
20542 offset -= (uint64_t)ceil(app->configuration.outputBufferSize[l] / (double)(axis->specializationConstants.outputBufferBlockSize * storageComplexSize));
20543 }
20544 else {
20546 }
20547
20548 }
20549 }
20550 axis->outputBuffer = app->configuration.outputBuffer;
20551#if(VKFFT_BACKEND==0)
20552 descriptorBufferInfo.buffer = app->configuration.outputBuffer[bufferId];
20553 descriptorBufferInfo.range = (axis->specializationConstants.outputBufferBlockSize * storageComplexSize);
20554 descriptorBufferInfo.offset = offset * (axis->specializationConstants.outputBufferBlockSize * storageComplexSize);
20555#endif
20556 axis->specializationConstants.outputOffset = app->configuration.outputBufferOffset;
20557 }
20558 else {
20559 uint64_t bufferId = 0;
20560 uint64_t offset = j;
20561
20562 if (((axis->specializationConstants.reorderFourStep == 1) || (app->useBluesteinFFT[axis_id])) && (FFTPlan->numAxisUploads[axis_id] > 1)) {
20563 if ((inverse) && (axis_id == app->firstAxis) && (
20564 ((axis_upload_id == 0) && (app->configuration.isInputFormatted) && (app->configuration.inverseReturnToInputBuffer) && (!app->useBluesteinFFT[axis_id]))
20565 || ((axis_upload_id == FFTPlan->numAxisUploads[axis_id] - 1) && (app->configuration.isInputFormatted) && (axis->specializationConstants.actualInverse) && (app->configuration.inverseReturnToInputBuffer) && (app->useBluesteinFFT[axis_id]) && (axis->specializationConstants.reverseBluesteinMultiUpload || (FFTPlan->numAxisUploads[axis_id] == 1))))
20566 ) {
20567 if (app->configuration.inputBufferSize) {
20568 for (uint64_t l = 0; l < app->configuration.inputBufferNum; ++l) {
20569 if (offset >= (uint64_t)ceil(app->configuration.inputBufferSize[l] / (double)(axis->specializationConstants.inputBufferBlockSize * storageComplexSize))) {
20570 bufferId++;
20571 offset -= (uint64_t)ceil(app->configuration.inputBufferSize[l] / (double)(axis->specializationConstants.inputBufferBlockSize * storageComplexSize));
20572 }
20573 else {
20575 }
20576
20577 }
20578 }
20579 axis->outputBuffer = app->configuration.inputBuffer;
20580#if(VKFFT_BACKEND==0)
20581 descriptorBufferInfo.buffer = app->configuration.inputBuffer[bufferId];
20582#endif
20583 axis->specializationConstants.outputOffset = app->configuration.inputBufferOffset;
20584 }
20585 else {
20586 if (((axis->specializationConstants.reorderFourStep == 1) && (axis_upload_id == 1)) || (app->useBluesteinFFT[axis_id] && (!((axis_upload_id == FFTPlan->numAxisUploads[axis_id] - 1) && (axis->specializationConstants.reverseBluesteinMultiUpload == 1))))) {
20587 if (app->configuration.tempBufferSize) {
20588 for (uint64_t l = 0; l < app->configuration.tempBufferNum; ++l) {
20589 if (offset >= (uint64_t)ceil(app->configuration.tempBufferSize[l] / (double)(axis->specializationConstants.outputBufferBlockSize * storageComplexSize))) {
20590 bufferId++;
20591 offset -= (uint64_t)ceil(app->configuration.tempBufferSize[l] / (double)(axis->specializationConstants.outputBufferBlockSize * storageComplexSize));
20592 }
20593 else {
20595 }
20596
20597 }
20598 }
20599 axis->outputBuffer = app->configuration.tempBuffer;
20600#if(VKFFT_BACKEND==0)
20601 descriptorBufferInfo.buffer = app->configuration.tempBuffer[bufferId];
20602#endif
20603 axis->specializationConstants.outputOffset = app->configuration.tempBufferOffset;
20604 }
20605 else {
20606 if (app->configuration.bufferSize) {
20607 for (uint64_t l = 0; l < app->configuration.bufferNum; ++l) {
20608 if (offset >= (uint64_t)ceil(app->configuration.bufferSize[l] / (double)(axis->specializationConstants.outputBufferBlockSize * storageComplexSize))) {
20609 bufferId++;
20610 offset -= (uint64_t)ceil(app->configuration.bufferSize[l] / (double)(axis->specializationConstants.outputBufferBlockSize * storageComplexSize));
20611 }
20612 else {
20613 l = app->configuration.bufferNum;
20614 }
20615
20616 }
20617 }
20618 axis->outputBuffer = app->configuration.buffer;
20619#if(VKFFT_BACKEND==0)
20620 descriptorBufferInfo.buffer = app->configuration.buffer[bufferId];
20621#endif
20622 axis->specializationConstants.outputOffset = app->configuration.bufferOffset;
20623 }
20624 }
20625 }
20626 else {
20627 if ((inverse) && (axis_id == app->firstAxis) && (axis_upload_id == 0) && (app->configuration.isInputFormatted) && (app->configuration.inverseReturnToInputBuffer)) {
20628 if (app->configuration.inputBufferSize) {
20629 for (uint64_t l = 0; l < app->configuration.inputBufferNum; ++l) {
20630 if (offset >= (uint64_t)ceil(app->configuration.inputBufferSize[l] / (double)(axis->specializationConstants.inputBufferBlockSize * storageComplexSize))) {
20631 bufferId++;
20632 offset -= (uint64_t)ceil(app->configuration.inputBufferSize[l] / (double)(axis->specializationConstants.inputBufferBlockSize * storageComplexSize));
20633 }
20634 else {
20636 }
20637
20638 }
20639 }
20640 axis->outputBuffer = app->configuration.inputBuffer;
20641#if(VKFFT_BACKEND==0)
20642 descriptorBufferInfo.buffer = app->configuration.inputBuffer[bufferId];
20643#endif
20644 axis->specializationConstants.outputOffset = app->configuration.inputBufferOffset;
20645 }
20646 else {
20647 if (app->configuration.bufferSize) {
20648 for (uint64_t l = 0; l < app->configuration.bufferNum; ++l) {
20649 if (offset >= (uint64_t)ceil(app->configuration.bufferSize[l] / (double)(axis->specializationConstants.outputBufferBlockSize * storageComplexSize))) {
20650 bufferId++;
20651 offset -= (uint64_t)ceil(app->configuration.bufferSize[l] / (double)(axis->specializationConstants.outputBufferBlockSize * storageComplexSize));
20652 }
20653 else {
20654 l = app->configuration.bufferNum;
20655 }
20656
20657 }
20658 }
20659 axis->outputBuffer = app->configuration.buffer;
20660#if(VKFFT_BACKEND==0)
20661 descriptorBufferInfo.buffer = app->configuration.buffer[bufferId];
20662#endif
20663 axis->specializationConstants.outputOffset = app->configuration.bufferOffset;
20664 }
20665 }
20666#if(VKFFT_BACKEND==0)
20667 descriptorBufferInfo.range = (axis->specializationConstants.outputBufferBlockSize * storageComplexSize);
20668 descriptorBufferInfo.offset = offset * (axis->specializationConstants.outputBufferBlockSize * storageComplexSize);
20669#endif
20670 }
20671 //descriptorBufferInfo.offset = 0;
20672 }
20673 if ((i == axis->specializationConstants.convolutionBindingID) && (app->configuration.performConvolution)) {
20674 uint64_t bufferId = 0;
20675 uint64_t offset = j;
20676 if (app->configuration.kernelSize) {
20677 for (uint64_t l = 0; l < app->configuration.kernelNum; ++l) {
20678 if (offset >= (uint64_t)ceil(app->configuration.kernelSize[l] / (double)(axis->specializationConstants.outputBufferBlockSize * storageComplexSize))) {
20679 bufferId++;
20680 offset -= (uint64_t)ceil(app->configuration.kernelSize[l] / (double)(axis->specializationConstants.outputBufferBlockSize * storageComplexSize));
20681 }
20682 else {
20683 l = app->configuration.kernelNum;
20684 }
20685
20686 }
20687 }
20688#if(VKFFT_BACKEND==0)
20689 descriptorBufferInfo.buffer = app->configuration.kernel[bufferId];
20690 descriptorBufferInfo.range = (axis->specializationConstants.kernelBlockSize * storageComplexSize);
20691 descriptorBufferInfo.offset = offset * (axis->specializationConstants.kernelBlockSize * storageComplexSize);
20692#endif
20693 axis->specializationConstants.kernelOffset = app->configuration.kernelOffset;
20694 }
20695 if ((i == axis->specializationConstants.LUTBindingID) && (app->configuration.useLUT)) {
20696#if(VKFFT_BACKEND==0)
20697 descriptorBufferInfo.buffer = axis->bufferLUT;
20698 descriptorBufferInfo.offset = 0;
20699 descriptorBufferInfo.range = axis->bufferLUTSize;
20700#endif
20701 }
20702 if ((i == axis->specializationConstants.BluesteinConvolutionBindingID) && (app->useBluesteinFFT[axis_id]) && (axis_upload_id == 0)) {
20703#if(VKFFT_BACKEND==0)
20704 if (axis->specializationConstants.inverseBluestein)
20705 descriptorBufferInfo.buffer = app->bufferBluesteinIFFT[axis_id];
20706 else
20707 descriptorBufferInfo.buffer = app->bufferBluesteinFFT[axis_id];
20708 descriptorBufferInfo.offset = 0;
20709 descriptorBufferInfo.range = app->bufferBluesteinSize[axis_id];
20710#endif
20711 }
20712 if ((i == axis->specializationConstants.BluesteinMultiplicationBindingID) && (app->useBluesteinFFT[axis_id]) && (axis_upload_id == (FFTPlan->numAxisUploads[axis_id] - 1))) {
20713#if(VKFFT_BACKEND==0)
20714 descriptorBufferInfo.buffer = app->bufferBluestein[axis_id];
20715 descriptorBufferInfo.offset = 0;
20716 descriptorBufferInfo.range = app->bufferBluesteinSize[axis_id];
20717#endif
20718 }
20719#if(VKFFT_BACKEND==0)
20720 VkWriteDescriptorSet writeDescriptorSet = { VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET };
20721 writeDescriptorSet.dstSet = axis->descriptorSet;
20722 writeDescriptorSet.dstBinding = (uint32_t)i;
20723 writeDescriptorSet.dstArrayElement = (uint32_t)j;
20724 writeDescriptorSet.descriptorType = descriptorType;
20725 writeDescriptorSet.descriptorCount = 1;
20726 writeDescriptorSet.pBufferInfo = &descriptorBufferInfo;
20727 vkUpdateDescriptorSets(app->configuration.device[0], 1, &writeDescriptorSet, 0, 0);
20728#endif
20729 }
20730 }
20731 axis->specializationConstants.performBufferSetUpdate = 0;
20732 }
20733 return VKFFT_SUCCESS;
20734}
20735static inline VkFFTResult VkFFTUpdateBufferSetR2CMultiUploadDecomposition(VkFFTApplication* app, VkFFTPlan* FFTPlan, VkFFTAxis* axis, uint64_t axis_id, uint64_t axis_upload_id, uint64_t inverse) {
20736 if (axis->specializationConstants.performBufferSetUpdate) {
20737#if(VKFFT_BACKEND==0)
20738 const VkDescriptorType descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER;
20739#endif
20740 uint64_t storageComplexSize;
20742 storageComplexSize = (2 * sizeof(double));
20743 else
20745 storageComplexSize = (2 * 2);
20746 else
20747 storageComplexSize = (2 * sizeof(float));
20748 for (uint64_t i = 0; i < axis->numBindings; ++i) {
20749 for (uint64_t j = 0; j < axis->specializationConstants.numBuffersBound[i]; ++j) {
20750#if(VKFFT_BACKEND==0)
20751 VkDescriptorBufferInfo descriptorBufferInfo = { 0 };
20752#endif
20753 if (i == 0) {
20754 uint64_t bufferId = 0;
20755 uint64_t offset = j;
20756 if (inverse) {
20757 if ((axis_upload_id == FFTPlan->numAxisUploads[axis_id] - 1) && (app->configuration.isInputFormatted) && (!axis->specializationConstants.reverseBluesteinMultiUpload) && (
20758 ((axis_id == app->firstAxis) && (!inverse))
20759 || ((axis_id == app->lastAxis) && (inverse) && (!app->configuration.performConvolution) && (!app->configuration.inverseReturnToInputBuffer)))
20760 ) {
20761 uint64_t bufferId = 0;
20762 uint64_t offset = j;
20763 if (app->configuration.inputBufferSize) {
20764 for (uint64_t l = 0; l < app->configuration.inputBufferNum; ++l) {
20765 if (offset >= (uint64_t)ceil(app->configuration.inputBufferSize[l] / (double)(axis->specializationConstants.inputBufferBlockSize * storageComplexSize))) {
20766 bufferId++;
20767 offset -= (uint64_t)ceil(app->configuration.inputBufferSize[l] / (double)(axis->specializationConstants.inputBufferBlockSize * storageComplexSize));
20768 }
20769 else {
20771 }
20772
20773 }
20774 }
20775 axis->inputBuffer = app->configuration.inputBuffer;
20776#if(VKFFT_BACKEND==0)
20777 descriptorBufferInfo.buffer = app->configuration.inputBuffer[bufferId];
20778 descriptorBufferInfo.range = (axis->specializationConstants.inputBufferBlockSize * storageComplexSize);
20779 descriptorBufferInfo.offset = offset * (axis->specializationConstants.inputBufferBlockSize * storageComplexSize);
20780#endif
20781 axis->specializationConstants.inputOffset = app->configuration.inputBufferOffset;
20782 }
20783 else {
20784 if ((axis_upload_id == 0) && (app->configuration.numberKernels > 1) && (inverse) && (!app->configuration.performConvolution)) {
20785 uint64_t bufferId = 0;
20786 uint64_t offset = j;
20788 for (uint64_t l = 0; l < app->configuration.outputBufferNum; ++l) {
20789 if (offset >= (uint64_t)ceil(app->configuration.outputBufferSize[l] / (double)(axis->specializationConstants.inputBufferBlockSize * storageComplexSize))) {
20790 bufferId++;
20791 offset -= (uint64_t)ceil(app->configuration.outputBufferSize[l] / (double)(axis->specializationConstants.inputBufferBlockSize * storageComplexSize));
20792 }
20793 else {
20795 }
20796
20797 }
20798 }
20799 axis->inputBuffer = app->configuration.outputBuffer;
20800#if(VKFFT_BACKEND==0)
20801 descriptorBufferInfo.buffer = app->configuration.outputBuffer[bufferId];
20802 descriptorBufferInfo.range = (axis->specializationConstants.inputBufferBlockSize * storageComplexSize);
20803 descriptorBufferInfo.offset = offset * (axis->specializationConstants.inputBufferBlockSize * storageComplexSize);
20804#endif
20805 axis->specializationConstants.inputOffset = app->configuration.outputBufferOffset;
20806 }
20807 else {
20808 uint64_t bufferId = 0;
20809 uint64_t offset = j;
20810 if (app->configuration.bufferSize) {
20811 for (uint64_t l = 0; l < app->configuration.bufferNum; ++l) {
20812 if (offset >= (uint64_t)ceil(app->configuration.bufferSize[l] / (double)(axis->specializationConstants.inputBufferBlockSize * storageComplexSize))) {
20813 bufferId++;
20814 offset -= (uint64_t)ceil(app->configuration.bufferSize[l] / (double)(axis->specializationConstants.inputBufferBlockSize * storageComplexSize));
20815 }
20816 else {
20817 l = app->configuration.bufferNum;
20818 }
20819
20820 }
20821 }
20822 axis->inputBuffer = app->configuration.buffer;
20823#if(VKFFT_BACKEND==0)
20824 descriptorBufferInfo.buffer = app->configuration.buffer[bufferId];
20825#endif
20826 axis->specializationConstants.inputOffset = app->configuration.bufferOffset;
20827#if(VKFFT_BACKEND==0)
20828 descriptorBufferInfo.range = (axis->specializationConstants.inputBufferBlockSize * storageComplexSize);
20829 descriptorBufferInfo.offset = offset * (axis->specializationConstants.inputBufferBlockSize * storageComplexSize);
20830#endif
20831 }
20832 }
20833 }
20834 else {
20835 if (((axis_upload_id == 0) && (!app->useBluesteinFFT[axis_id]) && (app->configuration.isOutputFormatted && (
20836 ((axis_id == app->firstAxis) && (inverse))
20837 || ((axis_id == app->lastAxis) && (!inverse) && (!app->configuration.performConvolution))
20838 || ((axis_id == app->firstAxis) && (app->configuration.performConvolution) && (app->configuration.FFTdim == 1)))
20839 )) ||
20840 ((axis_upload_id == FFTPlan->numAxisUploads[axis_id] - 1) && (app->useBluesteinFFT[axis_id]) && (axis->specializationConstants.reverseBluesteinMultiUpload || (FFTPlan->numAxisUploads[axis_id] == 1)) && (app->configuration.isOutputFormatted && (
20841 ((axis_id == app->firstAxis) && (inverse))
20842 || ((axis_id == app->lastAxis) && (!inverse) && (!app->configuration.performConvolution)))
20843 )) ||
20844 ((app->configuration.numberKernels > 1) && (
20845 (inverse)
20846 || (axis_id == app->lastAxis)))
20847 ) {
20848 uint64_t bufferId = 0;
20849 uint64_t offset = j;
20851 for (uint64_t l = 0; l < app->configuration.outputBufferNum; ++l) {
20852 if (offset >= (uint64_t)ceil(app->configuration.outputBufferSize[l] / (double)(axis->specializationConstants.outputBufferBlockSize * storageComplexSize))) {
20853 bufferId++;
20854 offset -= (uint64_t)ceil(app->configuration.outputBufferSize[l] / (double)(axis->specializationConstants.outputBufferBlockSize * storageComplexSize));
20855 }
20856 else {
20858 }
20859
20860 }
20861 }
20862 axis->inputBuffer = app->configuration.outputBuffer;
20863#if(VKFFT_BACKEND==0)
20864 descriptorBufferInfo.buffer = app->configuration.outputBuffer[bufferId];
20865 descriptorBufferInfo.range = (axis->specializationConstants.outputBufferBlockSize * storageComplexSize);
20866 descriptorBufferInfo.offset = offset * (axis->specializationConstants.outputBufferBlockSize * storageComplexSize);
20867#endif
20868 axis->specializationConstants.inputOffset = app->configuration.outputBufferOffset;
20869 }
20870 else {
20871 uint64_t bufferId = 0;
20872 uint64_t offset = j;
20873 if (app->configuration.bufferSize) {
20874 for (uint64_t l = 0; l < app->configuration.bufferNum; ++l) {
20875 if (offset >= (uint64_t)ceil(app->configuration.bufferSize[l] / (double)(axis->specializationConstants.outputBufferBlockSize * storageComplexSize))) {
20876 bufferId++;
20877 offset -= (uint64_t)ceil(app->configuration.bufferSize[l] / (double)(axis->specializationConstants.outputBufferBlockSize * storageComplexSize));
20878 }
20879 else {
20880 l = app->configuration.bufferNum;
20881 }
20882
20883 }
20884 }
20885 axis->inputBuffer = app->configuration.buffer;
20886#if(VKFFT_BACKEND==0)
20887 descriptorBufferInfo.buffer = app->configuration.buffer[bufferId];
20888#endif
20889 axis->specializationConstants.inputOffset = app->configuration.bufferOffset;
20890
20891#if(VKFFT_BACKEND==0)
20892 descriptorBufferInfo.range = (axis->specializationConstants.outputBufferBlockSize * storageComplexSize);
20893 descriptorBufferInfo.offset = offset * (axis->specializationConstants.outputBufferBlockSize * storageComplexSize);
20894#endif
20895 }
20896 }
20897 }
20898 if (i == 1) {
20899 if (inverse) {
20900 if ((axis_upload_id == 0) && (app->configuration.numberKernels > 1) && (inverse) && (!app->configuration.performConvolution)) {
20901 uint64_t bufferId = 0;
20902 uint64_t offset = j;
20904 for (uint64_t l = 0; l < app->configuration.outputBufferNum; ++l) {
20905 if (offset >= (uint64_t)ceil(app->configuration.outputBufferSize[l] / (double)(axis->specializationConstants.outputBufferBlockSize * storageComplexSize))) {
20906 bufferId++;
20907 offset -= (uint64_t)ceil(app->configuration.outputBufferSize[l] / (double)(axis->specializationConstants.outputBufferBlockSize * storageComplexSize));
20908 }
20909 else {
20911 }
20912
20913 }
20914 }
20915 axis->outputBuffer = app->configuration.outputBuffer;
20916#if(VKFFT_BACKEND==0)
20917 descriptorBufferInfo.buffer = app->configuration.outputBuffer[bufferId];
20918 descriptorBufferInfo.range = (axis->specializationConstants.outputBufferBlockSize * storageComplexSize);
20919 descriptorBufferInfo.offset = offset * (axis->specializationConstants.outputBufferBlockSize * storageComplexSize);
20920#endif
20921 axis->specializationConstants.outputOffset = app->configuration.outputBufferOffset;
20922 }
20923 else {
20924 uint64_t bufferId = 0;
20925 uint64_t offset = j;
20926 if (app->configuration.bufferSize) {
20927 for (uint64_t l = 0; l < app->configuration.bufferNum; ++l) {
20928 if (offset >= (uint64_t)ceil(app->configuration.bufferSize[l] / (double)(axis->specializationConstants.outputBufferBlockSize * storageComplexSize))) {
20929 bufferId++;
20930 offset -= (uint64_t)ceil(app->configuration.bufferSize[l] / (double)(axis->specializationConstants.outputBufferBlockSize * storageComplexSize));
20931 }
20932 else {
20933 l = app->configuration.bufferNum;
20934 }
20935
20936 }
20937 }
20938 axis->outputBuffer = app->configuration.buffer;
20939#if(VKFFT_BACKEND==0)
20940 descriptorBufferInfo.buffer = app->configuration.buffer[bufferId];
20941#endif
20942 axis->specializationConstants.outputOffset = app->configuration.bufferOffset;
20943#if(VKFFT_BACKEND==0)
20944 descriptorBufferInfo.range = (axis->specializationConstants.outputBufferBlockSize * storageComplexSize);
20945 descriptorBufferInfo.offset = offset * (axis->specializationConstants.outputBufferBlockSize * storageComplexSize);
20946#endif
20947 }
20948 }
20949 else {
20950 if (((axis_upload_id == 0) && (!app->useBluesteinFFT[axis_id]) && (app->configuration.isOutputFormatted && (
20951 ((axis_id == app->firstAxis) && (inverse))
20952 || ((axis_id == app->lastAxis) && (!inverse) && (!app->configuration.performConvolution))
20953 || ((axis_id == app->firstAxis) && (app->configuration.performConvolution) && (app->configuration.FFTdim == 1)))
20954 )) ||
20955 ((axis_upload_id == FFTPlan->numAxisUploads[axis_id] - 1) && (app->useBluesteinFFT[axis_id]) && (axis->specializationConstants.reverseBluesteinMultiUpload || (FFTPlan->numAxisUploads[axis_id] == 1)) && (app->configuration.isOutputFormatted && (
20956 ((axis_id == app->firstAxis) && (inverse))
20957 || ((axis_id == app->lastAxis) && (!inverse) && (!app->configuration.performConvolution)))
20958 )) ||
20959 ((app->configuration.numberKernels > 1) && (
20960 (inverse)
20961 || (axis_id == app->lastAxis)))
20962 ) {
20963 uint64_t bufferId = 0;
20964 uint64_t offset = j;
20966 for (uint64_t l = 0; l < app->configuration.outputBufferNum; ++l) {
20967 if (offset >= (uint64_t)ceil(app->configuration.outputBufferSize[l] / (double)(axis->specializationConstants.outputBufferBlockSize * storageComplexSize))) {
20968 bufferId++;
20969 offset -= (uint64_t)ceil(app->configuration.outputBufferSize[l] / (double)(axis->specializationConstants.outputBufferBlockSize * storageComplexSize));
20970 }
20971 else {
20973 }
20974
20975 }
20976 }
20977 axis->outputBuffer = app->configuration.outputBuffer;
20978#if(VKFFT_BACKEND==0)
20979 descriptorBufferInfo.buffer = app->configuration.outputBuffer[bufferId];
20980 descriptorBufferInfo.range = (axis->specializationConstants.outputBufferBlockSize * storageComplexSize);
20981 descriptorBufferInfo.offset = offset * (axis->specializationConstants.outputBufferBlockSize * storageComplexSize);
20982#endif
20983 axis->specializationConstants.outputOffset = app->configuration.outputBufferOffset;
20984 }
20985 else {
20986 uint64_t bufferId = 0;
20987 uint64_t offset = j;
20988 if (app->configuration.bufferSize) {
20989 for (uint64_t l = 0; l < app->configuration.bufferNum; ++l) {
20990 if (offset >= (uint64_t)ceil(app->configuration.bufferSize[l] / (double)(axis->specializationConstants.outputBufferBlockSize * storageComplexSize))) {
20991 bufferId++;
20992 offset -= (uint64_t)ceil(app->configuration.bufferSize[l] / (double)(axis->specializationConstants.outputBufferBlockSize * storageComplexSize));
20993 }
20994 else {
20995 l = app->configuration.bufferNum;
20996 }
20997
20998 }
20999 }
21000 axis->outputBuffer = app->configuration.buffer;
21001#if(VKFFT_BACKEND==0)
21002 descriptorBufferInfo.buffer = app->configuration.buffer[bufferId];
21003#endif
21004 axis->specializationConstants.outputOffset = app->configuration.bufferOffset;
21005
21006#if(VKFFT_BACKEND==0)
21007 descriptorBufferInfo.range = (axis->specializationConstants.outputBufferBlockSize * storageComplexSize);
21008 descriptorBufferInfo.offset = offset * (axis->specializationConstants.outputBufferBlockSize * storageComplexSize);
21009#endif
21010 }
21011 }
21012 }
21013 if ((i == 2) && (app->configuration.performConvolution)) {
21014 uint64_t bufferId = 0;
21015 uint64_t offset = j;
21016 if (app->configuration.kernelSize) {
21017 for (uint64_t l = 0; l < app->configuration.kernelNum; ++l) {
21018 if (offset >= (uint64_t)ceil(app->configuration.kernelSize[l] / (double)(axis->specializationConstants.outputBufferBlockSize * storageComplexSize))) {
21019 bufferId++;
21020 offset -= (uint64_t)ceil(app->configuration.kernelSize[l] / (double)(axis->specializationConstants.outputBufferBlockSize * storageComplexSize));
21021 }
21022 else {
21023 l = app->configuration.kernelNum;
21024 }
21025
21026 }
21027 }
21028#if(VKFFT_BACKEND==0)
21029 descriptorBufferInfo.buffer = app->configuration.kernel[bufferId];
21030 descriptorBufferInfo.range = (axis->specializationConstants.kernelBlockSize * storageComplexSize);
21031 descriptorBufferInfo.offset = offset * (axis->specializationConstants.kernelBlockSize * storageComplexSize);
21032#endif
21033 axis->specializationConstants.kernelOffset = app->configuration.kernelOffset;
21034 }
21035 if ((i == axis->numBindings - 1) && (app->configuration.useLUT)) {
21036#if(VKFFT_BACKEND==0)
21037 descriptorBufferInfo.buffer = axis->bufferLUT;
21038 descriptorBufferInfo.offset = 0;
21039 descriptorBufferInfo.range = axis->bufferLUTSize;
21040#endif
21041 }
21042#if(VKFFT_BACKEND==0)
21043 VkWriteDescriptorSet writeDescriptorSet = { VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET };
21044 writeDescriptorSet.dstSet = axis->descriptorSet;
21045 writeDescriptorSet.dstBinding = (uint32_t)i;
21046 writeDescriptorSet.dstArrayElement = (uint32_t)j;
21047 writeDescriptorSet.descriptorType = descriptorType;
21048 writeDescriptorSet.descriptorCount = 1;
21049 writeDescriptorSet.pBufferInfo = &descriptorBufferInfo;
21050 vkUpdateDescriptorSets(app->configuration.device[0], 1, &writeDescriptorSet, 0, 0);
21051#endif
21052 }
21053 }
21054 axis->specializationConstants.performBufferSetUpdate = 0;
21055 }
21056 return VKFFT_SUCCESS;
21057}
21058static inline VkFFTResult VkFFTPlanR2CMultiUploadDecomposition(VkFFTApplication* app, VkFFTPlan* FFTPlan, uint64_t inverse) {
21059 //get radix stages
21060 VkFFTResult resFFT = VKFFT_SUCCESS;
21061#if(VKFFT_BACKEND==0)
21062 VkResult res = VK_SUCCESS;
21063#elif(VKFFT_BACKEND==1)
21064 cudaError_t res = cudaSuccess;
21065#elif(VKFFT_BACKEND==2)
21066 hipError_t res = hipSuccess;
21067#elif(VKFFT_BACKEND==3)
21068 cl_int res = CL_SUCCESS;
21069#endif
21070 VkFFTAxis* axis = &FFTPlan->R2Cdecomposition;
21071 axis->specializationConstants.warpSize = app->configuration.warpSize;
21072 axis->specializationConstants.numSharedBanks = app->configuration.numSharedBanks;
21073 axis->specializationConstants.useUint64 = app->configuration.useUint64;
21074 axis->specializationConstants.numAxisUploads = FFTPlan->numAxisUploads[0];
21075 uint64_t complexSize;
21077 complexSize = (2 * sizeof(double));
21078 else
21080 complexSize = (2 * sizeof(float));
21081 else
21082 complexSize = (2 * sizeof(float));
21083 axis->specializationConstants.complexSize = complexSize;
21084 axis->specializationConstants.supportAxis = 0;
21085 axis->specializationConstants.symmetricKernel = app->configuration.symmetricKernel;
21086 axis->specializationConstants.conjugateConvolution = app->configuration.conjugateConvolution;
21087 axis->specializationConstants.crossPowerSpectrumNormalization = app->configuration.crossPowerSpectrumNormalization;
21088 axis->specializationConstants.fft_dim_full = app->configuration.size[0];
21089 axis->specializationConstants.dispatchZactualFFTSize = 1;
21090 //allocate LUT
21091 if (app->configuration.useLUT) {
21092 double double_PI = 3.1415926535897932384626433832795;
21094 axis->bufferLUTSize = (app->configuration.size[0] / 2) * 2 * sizeof(double);
21095 double* tempLUT = (double*)malloc(axis->bufferLUTSize);
21096 if (!tempLUT) {
21097 deleteVkFFT(app);
21099 }
21100 for (uint64_t i = 0; i < app->configuration.size[0] / 2; i++) {
21101 double angle = double_PI * i / app->configuration.size[0];
21102 tempLUT[2 * i] = (double)cos(angle);
21103 tempLUT[2 * i + 1] = (double)sin(angle);
21104 }
21105 axis->referenceLUT = 0;
21106 if ((!inverse) && (!app->configuration.makeForwardPlanOnly)) {
21107 axis->bufferLUT = app->localFFTPlan_inverse->R2Cdecomposition.bufferLUT;
21108#if(VKFFT_BACKEND==0)
21109 axis->bufferLUTDeviceMemory = app->localFFTPlan_inverse->R2Cdecomposition.bufferLUTDeviceMemory;
21110#endif
21111 axis->bufferLUTSize = app->localFFTPlan_inverse->R2Cdecomposition.bufferLUTSize;
21112 axis->referenceLUT = 1;
21113 }
21114 else {
21115#if(VKFFT_BACKEND==0)
21116 resFFT = allocateFFTBuffer(app, &axis->bufferLUT, &axis->bufferLUTDeviceMemory, VK_BUFFER_USAGE_STORAGE_BUFFER_BIT | VK_BUFFER_USAGE_TRANSFER_SRC_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT, VK_MEMORY_HEAP_DEVICE_LOCAL_BIT, axis->bufferLUTSize);
21117 if (resFFT != VKFFT_SUCCESS) {
21118 deleteVkFFT(app);
21119 free(tempLUT);
21120 tempLUT = 0;
21121 return resFFT;
21122 }
21123 resFFT = transferDataFromCPU(app, tempLUT, &axis->bufferLUT, axis->bufferLUTSize);
21124 if (resFFT != VKFFT_SUCCESS) {
21125 deleteVkFFT(app);
21126 free(tempLUT);
21127 tempLUT = 0;
21128 return resFFT;
21129 }
21130#elif(VKFFT_BACKEND==1)
21131 res = cudaMalloc((void**)&axis->bufferLUT, axis->bufferLUTSize);
21132 if (res != cudaSuccess) {
21133 deleteVkFFT(app);
21134 free(tempLUT);
21135 tempLUT = 0;
21137 }
21138 res = cudaMemcpy(axis->bufferLUT, tempLUT, axis->bufferLUTSize, cudaMemcpyHostToDevice);
21139 if (res != cudaSuccess) {
21140 deleteVkFFT(app);
21141 free(tempLUT);
21142 tempLUT = 0;
21144 }
21145#elif(VKFFT_BACKEND==2)
21146 res = hipMalloc((void**)&axis->bufferLUT, axis->bufferLUTSize);
21147 if (res != hipSuccess) {
21148 deleteVkFFT(app);
21149 free(tempLUT);
21150 tempLUT = 0;
21152 }
21153 res = hipMemcpy(axis->bufferLUT, tempLUT, axis->bufferLUTSize, hipMemcpyHostToDevice);
21154 if (res != hipSuccess) {
21155 deleteVkFFT(app);
21156 free(tempLUT);
21157 tempLUT = 0;
21159 }
21160#elif(VKFFT_BACKEND==3)
21161 axis->bufferLUT = clCreateBuffer(app->configuration.context[0], CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, axis->bufferLUTSize, tempLUT, &res);
21162 if (res != CL_SUCCESS) {
21163 deleteVkFFT(app);
21164 free(tempLUT);
21165 tempLUT = 0;
21167 }
21168#endif
21169 free(tempLUT);
21170 tempLUT = 0;
21171 }
21172 }
21173 else {
21174 axis->bufferLUTSize = (app->configuration.size[0] / 2) * 2 * sizeof(float);
21175 float* tempLUT = (float*)malloc(axis->bufferLUTSize);
21176 if (!tempLUT) {
21177 deleteVkFFT(app);
21179 }
21180 for (uint64_t i = 0; i < app->configuration.size[0] / 2; i++) {
21181 double angle = double_PI * i / (app->configuration.size[0] / 2);
21182 tempLUT[2 * i] = (float)cos(angle);
21183 tempLUT[2 * i + 1] = (float)sin(angle);
21184 }
21185 axis->referenceLUT = 0;
21186 if ((!inverse) && (!app->configuration.makeForwardPlanOnly)) {
21187 axis->bufferLUT = app->localFFTPlan_inverse->R2Cdecomposition.bufferLUT;
21188#if(VKFFT_BACKEND==0)
21189 axis->bufferLUTDeviceMemory = app->localFFTPlan_inverse->R2Cdecomposition.bufferLUTDeviceMemory;
21190#endif
21191 axis->bufferLUTSize = app->localFFTPlan_inverse->R2Cdecomposition.bufferLUTSize;
21192 axis->referenceLUT = 1;
21193 }
21194 else {
21195#if(VKFFT_BACKEND==0)
21196 resFFT = allocateFFTBuffer(app, &axis->bufferLUT, &axis->bufferLUTDeviceMemory, VK_BUFFER_USAGE_STORAGE_BUFFER_BIT | VK_BUFFER_USAGE_TRANSFER_SRC_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT, VK_MEMORY_HEAP_DEVICE_LOCAL_BIT, axis->bufferLUTSize);
21197 if (resFFT != VKFFT_SUCCESS) {
21198 deleteVkFFT(app);
21199 free(tempLUT);
21200 tempLUT = 0;
21201 return resFFT;
21202 }
21203 resFFT = transferDataFromCPU(app, tempLUT, &axis->bufferLUT, axis->bufferLUTSize);
21204 if (resFFT != VKFFT_SUCCESS) {
21205 deleteVkFFT(app);
21206 free(tempLUT);
21207 tempLUT = 0;
21208 return resFFT;
21209 }
21210#elif(VKFFT_BACKEND==1)
21211 res = cudaMalloc((void**)&axis->bufferLUT, axis->bufferLUTSize);
21212 if (res != cudaSuccess) {
21213 deleteVkFFT(app);
21214 free(tempLUT);
21215 tempLUT = 0;
21217 }
21218 res = cudaMemcpy(axis->bufferLUT, tempLUT, axis->bufferLUTSize, cudaMemcpyHostToDevice);
21219 if (res != cudaSuccess) {
21220 deleteVkFFT(app);
21221 free(tempLUT);
21222 tempLUT = 0;
21224 }
21225#elif(VKFFT_BACKEND==2)
21226 res = hipMalloc((void**)&axis->bufferLUT, axis->bufferLUTSize);
21227 if (res != hipSuccess) {
21228 deleteVkFFT(app);
21229 free(tempLUT);
21230 tempLUT = 0;
21232 }
21233 res = hipMemcpy(axis->bufferLUT, tempLUT, axis->bufferLUTSize, hipMemcpyHostToDevice);
21234 if (res != hipSuccess) {
21235 deleteVkFFT(app);
21236 free(tempLUT);
21237 tempLUT = 0;
21239 }
21240#elif(VKFFT_BACKEND==3)
21241 axis->bufferLUT = clCreateBuffer(app->configuration.context[0], CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, axis->bufferLUTSize, tempLUT, &res);
21242 if (res != CL_SUCCESS) {
21243 deleteVkFFT(app);
21244 free(tempLUT);
21245 tempLUT = 0;
21247 }
21248#endif
21249 free(tempLUT);
21250 tempLUT = 0;
21251 }
21252 }
21253 }
21254 //configure strides
21255 uint64_t* axisStride = axis->specializationConstants.inputStride;
21256 uint64_t* usedStride = 0;
21257 if (app->useBluesteinFFT[0] && (FFTPlan->numAxisUploads[0] > 1)) {
21258 if (inverse)
21259 usedStride = FFTPlan->axes[0][FFTPlan->numAxisUploads[0] - 1].specializationConstants.inputStride;
21260 else
21261 usedStride = FFTPlan->inverseBluesteinAxes[0][FFTPlan->numAxisUploads[0] - 1].specializationConstants.outputStride;
21262 }
21263 else {
21264 if (inverse)
21265 usedStride = FFTPlan->axes[0][FFTPlan->numAxisUploads[0] - 1].specializationConstants.inputStride;
21266 else
21267 usedStride = FFTPlan->axes[0][0].specializationConstants.outputStride;
21268 }
21269 axisStride[0] = usedStride[0];
21270 axisStride[1] = usedStride[1];
21271 axisStride[2] = usedStride[2];
21272 axisStride[3] = usedStride[3];
21273 axisStride[4] = usedStride[4];
21274
21275 axisStride = axis->specializationConstants.outputStride;
21276 usedStride = axis->specializationConstants.inputStride;
21277
21278 axisStride[0] = usedStride[0];
21279 axisStride[1] = usedStride[1];
21280 axisStride[2] = usedStride[2];
21281 axisStride[3] = usedStride[3];
21282 axisStride[4] = usedStride[4];
21283
21284 axis->specializationConstants.inverse = inverse;
21285
21286 uint64_t storageComplexSize;
21288 storageComplexSize = (2 * sizeof(double));
21289 else
21291 storageComplexSize = (2 * 2);
21292 else
21293 storageComplexSize = (2 * sizeof(float));
21294
21295 uint64_t initPageSize = -1;
21296 uint64_t locBufferNum = 1;
21297 uint64_t locBufferSize = 0;
21298 /*for (uint64_t i = 0; i < app->configuration.bufferNum; i++) {
21299 initPageSize += app->configuration.bufferSize[i];
21300 }
21301 if (app->configuration.performConvolution) {
21302 uint64_t initPageSizeKernel = 0;
21303 for (uint64_t i = 0; i < app->configuration.kernelNum; i++) {
21304 initPageSizeKernel += app->configuration.kernelSize[i];
21305 }
21306 if (initPageSizeKernel > initPageSize) initPageSize = initPageSizeKernel;
21307 }
21308 if ((!((!app->configuration.reorderFourStep))) && (axis->specializationConstants.inputStride[1] * storageComplexSize > app->configuration.devicePageSize * 1024) && (app->configuration.devicePageSize > 0)) {
21309 initPageSize = app->configuration.localPageSize * 1024;
21310 }*/
21311 uint64_t axis_id = 0;
21312 uint64_t axis_upload_id = 0;
21313
21314 {
21315 uint64_t totalSize = 0;
21316 uint64_t locPageSize = initPageSize;
21317 if (inverse) {
21318 if ((axis_upload_id == FFTPlan->numAxisUploads[axis_id] - 1) && (app->configuration.isInputFormatted) && (!axis->specializationConstants.reverseBluesteinMultiUpload) && (
21319 ((axis_id == app->firstAxis) && (!inverse))
21320 || ((axis_id == app->lastAxis) && (inverse) && (!app->configuration.performConvolution) && (!app->configuration.inverseReturnToInputBuffer)))
21321 ) {
21322 uint64_t totalSize = 0;
21323 uint64_t locPageSize = initPageSize;
21324 locBufferNum = app->configuration.inputBufferNum;
21325 if (app->configuration.inputBufferSize) {
21326 locBufferSize = (uint64_t)ceil(app->configuration.inputBufferSize[0] / (double)storageComplexSize);
21327 for (uint64_t i = 0; i < app->configuration.inputBufferNum; i++) {
21328 totalSize += app->configuration.inputBufferSize[i];
21329 if (app->configuration.inputBufferSize[i] < locPageSize) locPageSize = app->configuration.inputBufferSize[i];
21330 }
21331 }
21332 axis->specializationConstants.inputBufferBlockSize = (locBufferNum == 1) ? locBufferSize : (uint64_t)ceil(locPageSize / (double)storageComplexSize);
21333 axis->specializationConstants.inputBufferBlockNum = (locBufferNum == 1) ? 1 : (uint64_t)ceil(totalSize / (double)(axis->specializationConstants.inputBufferBlockSize * storageComplexSize));
21334 //if (axis->specializationConstants.inputBufferBlockNum == 1) axis->specializationConstants.inputBufferBlockSize = totalSize / storageComplexSize;
21335
21336 }
21337 else {
21338 if ((axis_upload_id == 0) && (app->configuration.numberKernels > 1) && (inverse) && (!app->configuration.performConvolution)) {
21339 uint64_t totalSize = 0;
21340 uint64_t locPageSize = initPageSize;
21341 locBufferNum = app->configuration.outputBufferNum;
21343 locBufferSize = (uint64_t)ceil(app->configuration.outputBufferSize[0] / (double)storageComplexSize);
21344 for (uint64_t i = 0; i < app->configuration.outputBufferNum; i++) {
21345 totalSize += app->configuration.outputBufferSize[i];
21346 if (app->configuration.outputBufferSize[i] < locPageSize) locPageSize = app->configuration.outputBufferSize[i];
21347 }
21348 }
21349 axis->specializationConstants.inputBufferBlockSize = (locBufferNum == 1) ? locBufferSize : (uint64_t)ceil(locPageSize / (double)storageComplexSize);
21350 axis->specializationConstants.inputBufferBlockNum = (locBufferNum == 1) ? 1 : (uint64_t)ceil(totalSize / (double)(axis->specializationConstants.inputBufferBlockSize * storageComplexSize));
21351 //if (axis->specializationConstants.inputBufferBlockNum == 1) axis->specializationConstants.outputBufferBlockSize = totalSize / storageComplexSize;
21352
21353 }
21354 else {
21355 uint64_t totalSize = 0;
21356 uint64_t locPageSize = initPageSize;
21357 locBufferNum = app->configuration.bufferNum;
21358 if (app->configuration.bufferSize) {
21359 locBufferSize = (uint64_t)ceil(app->configuration.bufferSize[0] / (double)storageComplexSize);
21360 for (uint64_t i = 0; i < app->configuration.bufferNum; i++) {
21361 totalSize += app->configuration.bufferSize[i];
21362 if (app->configuration.bufferSize[i] < locPageSize) locPageSize = app->configuration.bufferSize[i];
21363
21364 }
21365 }
21366 axis->specializationConstants.inputBufferBlockSize = (locBufferNum == 1) ? locBufferSize : (uint64_t)ceil(locPageSize / (double)storageComplexSize);
21367 axis->specializationConstants.inputBufferBlockNum = (locBufferNum == 1) ? 1 : (uint64_t)ceil(totalSize / (double)(axis->specializationConstants.inputBufferBlockSize * storageComplexSize));
21368 //if (axis->specializationConstants.inputBufferBlockNum == 1) axis->specializationConstants.inputBufferBlockSize = totalSize / storageComplexSize;
21369
21370 }
21371 }
21372 }
21373 else {
21374 if (((axis_upload_id == 0) && (!app->useBluesteinFFT[axis_id]) && (app->configuration.isOutputFormatted && (
21375 ((axis_id == app->firstAxis) && (inverse))
21376 || ((axis_id == app->lastAxis) && (!inverse) && (!app->configuration.performConvolution))
21377 || ((axis_id == app->firstAxis) && (app->configuration.performConvolution) && (app->configuration.FFTdim == 1)))
21378 )) ||
21379 ((axis_upload_id == FFTPlan->numAxisUploads[axis_id] - 1) && (app->useBluesteinFFT[axis_id]) && (axis->specializationConstants.reverseBluesteinMultiUpload || (FFTPlan->numAxisUploads[axis_id] == 1)) && (app->configuration.isOutputFormatted && (
21380 ((axis_id == app->firstAxis) && (inverse))
21381 || ((axis_id == app->lastAxis) && (!inverse) && (!app->configuration.performConvolution)))
21382 )) ||
21383 ((app->configuration.numberKernels > 1) && (
21384 (inverse)
21385 || (axis_id == app->lastAxis)))
21386 ) {
21387 uint64_t totalSize = 0;
21388 uint64_t locPageSize = initPageSize;
21389 locBufferNum = app->configuration.outputBufferNum;
21391 locBufferSize = (uint64_t)ceil(app->configuration.outputBufferSize[0] / (double)storageComplexSize);
21392 for (uint64_t i = 0; i < app->configuration.outputBufferNum; i++) {
21393 totalSize += app->configuration.outputBufferSize[i];
21394 if (app->configuration.outputBufferSize[i] < locPageSize) locPageSize = app->configuration.outputBufferSize[i];
21395 }
21396 }
21397 axis->specializationConstants.inputBufferBlockSize = (locBufferNum == 1) ? locBufferSize : (uint64_t)ceil(locPageSize / (double)storageComplexSize);
21398 axis->specializationConstants.inputBufferBlockSize = (locBufferNum == 1) ? 1 : (uint64_t)ceil(totalSize / (double)(axis->specializationConstants.inputBufferBlockSize * storageComplexSize));
21399 //if (axis->specializationConstants.outputBufferBlockNum == 1) axis->specializationConstants.outputBufferBlockSize = totalSize / storageComplexSize;
21400
21401 }
21402 else {
21403 uint64_t totalSize = 0;
21404 uint64_t locPageSize = initPageSize;
21405
21406 locBufferNum = app->configuration.bufferNum;
21407 if (app->configuration.bufferSize) {
21408 locBufferSize = (uint64_t)ceil(app->configuration.bufferSize[0] / (double)storageComplexSize);
21409 for (uint64_t i = 0; i < app->configuration.bufferNum; i++) {
21410 totalSize += app->configuration.bufferSize[i];
21411 if (app->configuration.bufferSize[i] < locPageSize) locPageSize = app->configuration.bufferSize[i];
21412 }
21413 }
21414 axis->specializationConstants.inputBufferBlockSize = (locBufferNum == 1) ? locBufferSize : (uint64_t)ceil(locPageSize / (double)storageComplexSize);
21415 axis->specializationConstants.inputBufferBlockSize = (locBufferNum == 1) ? 1 : (uint64_t)ceil(totalSize / (double)(axis->specializationConstants.inputBufferBlockSize * storageComplexSize));
21416 //if (axis->specializationConstants.outputBufferBlockNum == 1) axis->specializationConstants.outputBufferBlockSize = totalSize / storageComplexSize;
21417
21418 }
21419 }
21420 }
21421 initPageSize = -1;
21422 locBufferNum = 1;
21423 locBufferSize = -1;
21424 {
21425 if (inverse) {
21426 if ((axis_upload_id == 0) && (app->configuration.numberKernels > 1) && (inverse) && (!app->configuration.performConvolution)) {
21427 uint64_t totalSize = 0;
21428 uint64_t locPageSize = initPageSize;
21429 locBufferNum = app->configuration.outputBufferNum;
21431 locBufferSize = (uint64_t)ceil(app->configuration.outputBufferSize[0] / (double)storageComplexSize);
21432 for (uint64_t i = 0; i < app->configuration.outputBufferNum; i++) {
21433 totalSize += app->configuration.outputBufferSize[i];
21434 if (app->configuration.outputBufferSize[i] < locPageSize) locPageSize = app->configuration.outputBufferSize[i];
21435 }
21436 }
21437 axis->specializationConstants.outputBufferBlockSize = (locBufferNum == 1) ? locBufferSize : (uint64_t)ceil(locPageSize / (double)storageComplexSize);
21438 axis->specializationConstants.outputBufferBlockNum = (locBufferNum == 1) ? 1 : (uint64_t)ceil(totalSize / (double)(axis->specializationConstants.outputBufferBlockSize * storageComplexSize));
21439 //if (axis->specializationConstants.outputBufferBlockNum == 1) axis->specializationConstants.outputBufferBlockSize = totalSize / storageComplexSize;
21440
21441 }
21442 else {
21443 uint64_t totalSize = 0;
21444 uint64_t locPageSize = initPageSize;
21445 locBufferNum = app->configuration.bufferNum;
21446 if (app->configuration.bufferSize) {
21447 locBufferSize = (uint64_t)ceil(app->configuration.bufferSize[0] / (double)storageComplexSize);
21448 for (uint64_t i = 0; i < app->configuration.bufferNum; i++) {
21449 totalSize += app->configuration.bufferSize[i];
21450 if (app->configuration.bufferSize[i] < locPageSize) locPageSize = app->configuration.bufferSize[i];
21451
21452 }
21453 }
21454 axis->specializationConstants.outputBufferBlockSize = (locBufferNum == 1) ? locBufferSize : (uint64_t)ceil(locPageSize / (double)storageComplexSize);
21455 axis->specializationConstants.outputBufferBlockNum = (locBufferNum == 1) ? 1 : (uint64_t)ceil(totalSize / (double)(axis->specializationConstants.outputBufferBlockSize * storageComplexSize));
21456 //if (axis->specializationConstants.outputBufferBlockNum == 1) axis->specializationConstants.outputBufferBlockSize = totalSize / storageComplexSize;
21457
21458 }
21459 }
21460 else {
21461 if (((axis_upload_id == 0) && (!app->useBluesteinFFT[axis_id]) && (app->configuration.isOutputFormatted && (
21462 ((axis_id == app->firstAxis) && (inverse))
21463 || ((axis_id == app->lastAxis) && (!inverse) && (!app->configuration.performConvolution))
21464 || ((axis_id == app->firstAxis) && (app->configuration.performConvolution) && (app->configuration.FFTdim == 1)))
21465 )) ||
21466 ((axis_upload_id == FFTPlan->numAxisUploads[axis_id] - 1) && (app->useBluesteinFFT[axis_id]) && (axis->specializationConstants.reverseBluesteinMultiUpload || (FFTPlan->numAxisUploads[axis_id] == 1)) && (app->configuration.isOutputFormatted && (
21467 ((axis_id == app->firstAxis) && (inverse))
21468 || ((axis_id == app->lastAxis) && (!inverse) && (!app->configuration.performConvolution)))
21469 )) ||
21470 ((app->configuration.numberKernels > 1) && (
21471 (inverse)
21472 || (axis_id == app->lastAxis)))
21473 ) {
21474 uint64_t totalSize = 0;
21475 uint64_t locPageSize = initPageSize;
21476 locBufferNum = app->configuration.outputBufferNum;
21478 locBufferSize = (uint64_t)ceil(app->configuration.outputBufferSize[0] / (double)storageComplexSize);
21479 for (uint64_t i = 0; i < app->configuration.outputBufferNum; i++) {
21480 totalSize += app->configuration.outputBufferSize[i];
21481 if (app->configuration.outputBufferSize[i] < locPageSize) locPageSize = app->configuration.outputBufferSize[i];
21482 }
21483 }
21484 axis->specializationConstants.outputBufferBlockSize = (locBufferNum == 1) ? locBufferSize : (uint64_t)ceil(locPageSize / (double)storageComplexSize);
21485 axis->specializationConstants.outputBufferBlockNum = (locBufferNum == 1) ? 1 : (uint64_t)ceil(totalSize / (double)(axis->specializationConstants.outputBufferBlockSize * storageComplexSize));
21486 //if (axis->specializationConstants.outputBufferBlockNum == 1) axis->specializationConstants.outputBufferBlockSize = totalSize / storageComplexSize;
21487
21488 }
21489 else {
21490 uint64_t totalSize = 0;
21491 uint64_t locPageSize = initPageSize;
21492
21493 locBufferNum = app->configuration.bufferNum;
21494 if (app->configuration.bufferSize) {
21495 locBufferSize = (uint64_t)ceil(app->configuration.bufferSize[0] / (double)storageComplexSize);
21496 for (uint64_t i = 0; i < app->configuration.bufferNum; i++) {
21497 totalSize += app->configuration.bufferSize[i];
21498 if (app->configuration.bufferSize[i] < locPageSize) locPageSize = app->configuration.bufferSize[i];
21499 }
21500 }
21501 axis->specializationConstants.outputBufferBlockSize = (locBufferNum == 1) ? locBufferSize : (uint64_t)ceil(locPageSize / (double)storageComplexSize);
21502 axis->specializationConstants.outputBufferBlockNum = (locBufferNum == 1) ? 1 : (uint64_t)ceil(totalSize / (double)(axis->specializationConstants.outputBufferBlockSize * storageComplexSize));
21503 //if (axis->specializationConstants.outputBufferBlockNum == 1) axis->specializationConstants.outputBufferBlockSize = totalSize / storageComplexSize;
21504
21505 }
21506 }
21507 }
21508
21509 if (axis->specializationConstants.inputBufferBlockNum == 0) axis->specializationConstants.inputBufferBlockNum = 1;
21510 if (axis->specializationConstants.outputBufferBlockNum == 0) axis->specializationConstants.outputBufferBlockNum = 1;
21512 //need fixing (not used now)
21513 uint64_t totalSize = 0;
21514 uint64_t locPageSize = initPageSize;
21515 if (app->configuration.kernelSize) {
21516 for (uint64_t i = 0; i < app->configuration.kernelNum; i++) {
21517 totalSize += app->configuration.kernelSize[i];
21518 if (app->configuration.kernelSize[i] < locPageSize) locPageSize = app->configuration.kernelSize[i];
21519 }
21520 }
21521 axis->specializationConstants.kernelBlockSize = (uint64_t)ceil(locPageSize / (double)storageComplexSize);
21522 axis->specializationConstants.kernelBlockNum = (uint64_t)ceil(totalSize / (double)(axis->specializationConstants.kernelBlockSize * storageComplexSize));
21523 //if (axis->specializationConstants.kernelBlockNum == 1) axis->specializationConstants.inputBufferBlockSize = totalSize / storageComplexSize;
21524 if (axis->specializationConstants.kernelBlockNum == 0) axis->specializationConstants.kernelBlockNum = 1;
21525 }
21526 else {
21527 axis->specializationConstants.kernelBlockSize = 0;
21528 axis->specializationConstants.kernelBlockNum = 0;
21529 }
21530 axis->numBindings = 2;
21531 axis->specializationConstants.numBuffersBound[0] = axis->specializationConstants.inputBufferBlockNum;
21532 axis->specializationConstants.numBuffersBound[1] = axis->specializationConstants.outputBufferBlockNum;
21533 axis->specializationConstants.numBuffersBound[2] = 0;
21534 axis->specializationConstants.numBuffersBound[3] = 0;
21535
21536#if(VKFFT_BACKEND==0)
21537 VkDescriptorPoolSize descriptorPoolSize = { VK_DESCRIPTOR_TYPE_STORAGE_BUFFER };
21538 descriptorPoolSize.descriptorCount = (uint32_t)(axis->specializationConstants.numBuffersBound[0] + axis->specializationConstants.numBuffersBound[1]);
21539#endif
21540 if ((axis_id == 0) && (axis_upload_id == 0) && (app->configuration.FFTdim == 1) && (app->configuration.performConvolution)) {
21541 axis->specializationConstants.numBuffersBound[axis->numBindings] = axis->specializationConstants.kernelBlockNum;
21542#if(VKFFT_BACKEND==0)
21543 descriptorPoolSize.descriptorCount += (uint32_t)axis->specializationConstants.kernelBlockNum;
21544#endif
21545 axis->numBindings++;
21546 }
21547
21548 if (app->configuration.useLUT) {
21549 axis->specializationConstants.numBuffersBound[axis->numBindings] = 1;
21550#if(VKFFT_BACKEND==0)
21551 descriptorPoolSize.descriptorCount++;
21552#endif
21553 axis->numBindings++;
21554 }
21555#if(VKFFT_BACKEND==0)
21556 VkDescriptorPoolCreateInfo descriptorPoolCreateInfo = { VK_STRUCTURE_TYPE_DESCRIPTOR_POOL_CREATE_INFO };
21557 descriptorPoolCreateInfo.poolSizeCount = 1;
21558 descriptorPoolCreateInfo.pPoolSizes = &descriptorPoolSize;
21559 descriptorPoolCreateInfo.maxSets = 1;
21560 res = vkCreateDescriptorPool(app->configuration.device[0], &descriptorPoolCreateInfo, 0, &axis->descriptorPool);
21561 if (res != VK_SUCCESS) {
21562 deleteVkFFT(app);
21564 }
21565 const VkDescriptorType descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER;
21566 VkDescriptorSetLayoutBinding* descriptorSetLayoutBindings;
21567 descriptorSetLayoutBindings = (VkDescriptorSetLayoutBinding*)malloc(axis->numBindings * sizeof(VkDescriptorSetLayoutBinding));
21568 if (!descriptorSetLayoutBindings) {
21569 deleteVkFFT(app);
21571 }
21572 for (uint64_t i = 0; i < axis->numBindings; ++i) {
21573 descriptorSetLayoutBindings[i].binding = (uint32_t)i;
21574 descriptorSetLayoutBindings[i].descriptorType = descriptorType;
21575 descriptorSetLayoutBindings[i].descriptorCount = (uint32_t)axis->specializationConstants.numBuffersBound[i];
21576 descriptorSetLayoutBindings[i].stageFlags = VK_SHADER_STAGE_COMPUTE_BIT;
21577 }
21578
21579 VkDescriptorSetLayoutCreateInfo descriptorSetLayoutCreateInfo = { VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_CREATE_INFO };
21580 descriptorSetLayoutCreateInfo.bindingCount = (uint32_t)axis->numBindings;
21581 descriptorSetLayoutCreateInfo.pBindings = descriptorSetLayoutBindings;
21582
21583 res = vkCreateDescriptorSetLayout(app->configuration.device[0], &descriptorSetLayoutCreateInfo, 0, &axis->descriptorSetLayout);
21584 if (res != VK_SUCCESS) {
21585 deleteVkFFT(app);
21587 }
21588 free(descriptorSetLayoutBindings);
21589 descriptorSetLayoutBindings = 0;
21590 VkDescriptorSetAllocateInfo descriptorSetAllocateInfo = { VK_STRUCTURE_TYPE_DESCRIPTOR_SET_ALLOCATE_INFO };
21591 descriptorSetAllocateInfo.descriptorPool = axis->descriptorPool;
21592 descriptorSetAllocateInfo.descriptorSetCount = 1;
21593 descriptorSetAllocateInfo.pSetLayouts = &axis->descriptorSetLayout;
21594 res = vkAllocateDescriptorSets(app->configuration.device[0], &descriptorSetAllocateInfo, &axis->descriptorSet);
21595 if (res != VK_SUCCESS) {
21596 deleteVkFFT(app);
21598 }
21599#endif
21600 resFFT = VkFFTCheckUpdateBufferSet(app, axis, 1, 0);
21601 if (resFFT != VKFFT_SUCCESS) {
21602 deleteVkFFT(app);
21603 return resFFT;
21604 }
21605 resFFT = VkFFTUpdateBufferSetR2CMultiUploadDecomposition(app, FFTPlan, axis, axis_id, axis_upload_id, inverse);
21606 if (resFFT != VKFFT_SUCCESS) {
21607 deleteVkFFT(app);
21608 return resFFT;
21609 }
21610 {
21611#if(VKFFT_BACKEND==0)
21612 VkPipelineLayoutCreateInfo pipelineLayoutCreateInfo = { VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO };
21613 pipelineLayoutCreateInfo.setLayoutCount = 1;
21614 pipelineLayoutCreateInfo.pSetLayouts = &axis->descriptorSetLayout;
21615
21616 VkPushConstantRange pushConstantRange = { VK_SHADER_STAGE_COMPUTE_BIT };
21617 pushConstantRange.offset = 0;
21618 pushConstantRange.size = (app->configuration.useUint64) ? sizeof(VkFFTPushConstantsLayoutUint64) : sizeof(VkFFTPushConstantsLayoutUint32);
21619 // Push constant ranges are part of the pipeline layout
21620 pipelineLayoutCreateInfo.pushConstantRangeCount = 1;
21621 pipelineLayoutCreateInfo.pPushConstantRanges = &pushConstantRange;
21622
21623 res = vkCreatePipelineLayout(app->configuration.device[0], &pipelineLayoutCreateInfo, 0, &axis->pipelineLayout);
21624 if (res != VK_SUCCESS) {
21625 deleteVkFFT(app);
21627 }
21628#endif
21629 axis->axisBlock[0] = 128;
21630 if (axis->axisBlock[0] > app->configuration.maxThreadsNum) axis->axisBlock[0] = app->configuration.maxThreadsNum;
21631 axis->axisBlock[1] = 1;
21632 axis->axisBlock[2] = 1;
21633
21634 uint64_t tempSize[3] = { (uint64_t)ceil((app->configuration.size[0] * app->configuration.size[1] * app->configuration.size[2]) / (double)(2 * axis->axisBlock[0])), 1, 1 };
21636
21637 if (tempSize[0] > app->configuration.maxComputeWorkGroupCount[0]) axis->specializationConstants.performWorkGroupShift[0] = 1;
21638 else axis->specializationConstants.performWorkGroupShift[0] = 0;
21639 if (tempSize[1] > app->configuration.maxComputeWorkGroupCount[1]) axis->specializationConstants.performWorkGroupShift[1] = 1;
21640 else axis->specializationConstants.performWorkGroupShift[1] = 0;
21641 if (tempSize[2] > app->configuration.maxComputeWorkGroupCount[2]) axis->specializationConstants.performWorkGroupShift[2] = 1;
21642 else axis->specializationConstants.performWorkGroupShift[2] = 0;
21643
21644 axis->specializationConstants.localSize[0] = axis->axisBlock[0];
21645 axis->specializationConstants.localSize[1] = axis->axisBlock[1];
21646 axis->specializationConstants.localSize[2] = axis->axisBlock[2];
21647
21648 axis->specializationConstants.numCoordinates = (app->configuration.matrixConvolution > 1) ? 1 : app->configuration.coordinateFeatures;
21649 axis->specializationConstants.matrixConvolution = app->configuration.matrixConvolution;
21650 axis->specializationConstants.size[0] = app->configuration.size[0];
21651 axis->specializationConstants.size[1] = app->configuration.size[1];
21652 axis->specializationConstants.size[2] = app->configuration.size[2];
21653
21654 axis->specializationConstants.numBatches = app->configuration.numberBatches;
21655 if ((app->configuration.FFTdim == 1) && (app->configuration.size[1] == 1) && ((app->configuration.numberBatches == 1) && (app->actualNumBatches > 1)) && (!app->configuration.performConvolution) && (app->configuration.coordinateFeatures == 1)) {
21656 axis->specializationConstants.numBatches = app->actualNumBatches;
21657 }
21658
21659 axis->specializationConstants.numKernels = app->configuration.numberKernels;
21660 axis->specializationConstants.sharedMemSize = app->configuration.sharedMemorySize;
21661 axis->specializationConstants.sharedMemSizePow2 = app->configuration.sharedMemorySizePow2;
21662 axis->specializationConstants.normalize = app->configuration.normalize;
21663 axis->specializationConstants.axis_id = 0;
21664 axis->specializationConstants.axis_upload_id = 0;
21665
21666 for (uint64_t i = 0; i < 3; i++) {
21667 axis->specializationConstants.frequencyZeropadding = app->configuration.frequencyZeroPadding;
21668 axis->specializationConstants.performZeropaddingFull[i] = app->configuration.performZeropadding[i]; // don't read if input is zeropadded (0 - off, 1 - on)
21669 axis->specializationConstants.fft_zeropad_left_full[i] = app->configuration.fft_zeropad_left[i];
21670 axis->specializationConstants.fft_zeropad_right_full[i] = app->configuration.fft_zeropad_right[i];
21671 }
21672 if ((inverse)) {
21673 if ((app->configuration.frequencyZeroPadding) && (((!app->configuration.reorderFourStep) && (axis_upload_id == 0)) || ((app->configuration.reorderFourStep) && (axis_upload_id == FFTPlan->numAxisUploads[axis_id] - 1)))) {
21674 axis->specializationConstants.zeropad[0] = app->configuration.performZeropadding[axis_id];
21675 axis->specializationConstants.fft_zeropad_left_read[axis_id] = app->configuration.fft_zeropad_left[axis_id];
21676 axis->specializationConstants.fft_zeropad_right_read[axis_id] = app->configuration.fft_zeropad_right[axis_id];
21677 }
21678 else
21679 axis->specializationConstants.zeropad[0] = 0;
21680 if ((!app->configuration.frequencyZeroPadding) && (((!app->configuration.reorderFourStep) && (axis_upload_id == FFTPlan->numAxisUploads[axis_id] - 1)) || ((app->configuration.reorderFourStep) && (axis_upload_id == 0)))) {
21681 axis->specializationConstants.zeropad[1] = app->configuration.performZeropadding[axis_id];
21682 axis->specializationConstants.fft_zeropad_left_write[axis_id] = app->configuration.fft_zeropad_left[axis_id];
21683 axis->specializationConstants.fft_zeropad_right_write[axis_id] = app->configuration.fft_zeropad_right[axis_id];
21684 }
21685 else
21686 axis->specializationConstants.zeropad[1] = 0;
21687 }
21688 else {
21689 if ((!app->configuration.frequencyZeroPadding) && (axis_upload_id == FFTPlan->numAxisUploads[axis_id] - 1)) {
21690 axis->specializationConstants.zeropad[0] = app->configuration.performZeropadding[axis_id];
21691 axis->specializationConstants.fft_zeropad_left_read[axis_id] = app->configuration.fft_zeropad_left[axis_id];
21692 axis->specializationConstants.fft_zeropad_right_read[axis_id] = app->configuration.fft_zeropad_right[axis_id];
21693 }
21694 else
21695 axis->specializationConstants.zeropad[0] = 0;
21696 if (((app->configuration.frequencyZeroPadding) && (axis_upload_id == 0)) || (((app->configuration.FFTdim - 1 == axis_id) && (axis_upload_id == 0) && (app->configuration.performConvolution)))) {
21697 axis->specializationConstants.zeropad[1] = app->configuration.performZeropadding[axis_id];
21698 axis->specializationConstants.fft_zeropad_left_write[axis_id] = app->configuration.fft_zeropad_left[axis_id];
21699 axis->specializationConstants.fft_zeropad_right_write[axis_id] = app->configuration.fft_zeropad_right[axis_id];
21700 }
21701 else
21702 axis->specializationConstants.zeropad[1] = 0;
21703 }
21704 if ((app->configuration.FFTdim - 1 == axis_id) && (axis_upload_id == 0) && (app->configuration.performConvolution)) {
21705 axis->specializationConstants.convolutionStep = 1;
21706 }
21707 else
21708 axis->specializationConstants.convolutionStep = 0;
21709 char floatTypeInputMemory[10];
21710 char floatTypeOutputMemory[10];
21711 char floatTypeKernelMemory[10];
21712 char floatType[10];
21713 axis->specializationConstants.unroll = 1;
21714 axis->specializationConstants.LUT = app->configuration.useLUT;
21715 if (app->configuration.doublePrecision) {
21716 sprintf(floatType, "double");
21717 sprintf(floatTypeInputMemory, "double");
21718 sprintf(floatTypeOutputMemory, "double");
21719 sprintf(floatTypeKernelMemory, "double");
21720 //axis->specializationConstants.unroll = 1;
21721 }
21722 else {
21723 //axis->specializationConstants.unroll = 0;
21724 if (app->configuration.halfPrecision) {
21725 sprintf(floatType, "float");
21727 //only out of place mode, input/output buffer must be different
21728 sprintf(floatTypeInputMemory, "float");
21729 sprintf(floatTypeOutputMemory, "float");
21730 sprintf(floatTypeKernelMemory, "float");
21731 }
21732 else {
21733 sprintf(floatTypeInputMemory, "half");
21734 sprintf(floatTypeOutputMemory, "half");
21735 sprintf(floatTypeKernelMemory, "half");
21736 }
21737
21738 }
21739 else {
21741 sprintf(floatType, "double");
21742 sprintf(floatTypeInputMemory, "float");
21743 sprintf(floatTypeOutputMemory, "float");
21744 sprintf(floatTypeKernelMemory, "float");
21745 }
21746 else {
21747 sprintf(floatType, "float");
21748 sprintf(floatTypeInputMemory, "float");
21749 sprintf(floatTypeOutputMemory, "float");
21750 sprintf(floatTypeKernelMemory, "float");
21751 }
21752 }
21753 }
21754 char uintType[20] = "";
21755 if (!app->configuration.useUint64) {
21756#if(VKFFT_BACKEND==0)
21757 sprintf(uintType, "uint");
21758#elif(VKFFT_BACKEND==1)
21759 sprintf(uintType, "unsigned int");
21760#elif(VKFFT_BACKEND==2)
21761 sprintf(uintType, "unsigned int");
21762#elif(VKFFT_BACKEND==3)
21763 sprintf(uintType, "unsigned int");
21764#endif
21765 }
21766 else {
21767#if(VKFFT_BACKEND==0)
21768 sprintf(uintType, "uint64_t");
21769#elif(VKFFT_BACKEND==1)
21770 sprintf(uintType, "unsigned long long");
21771#elif(VKFFT_BACKEND==2)
21772 sprintf(uintType, "unsigned long long");
21773#elif(VKFFT_BACKEND==3)
21774 sprintf(uintType, "unsigned long");
21775#endif
21776 }
21777 //uint64_t LUT = app->configuration.useLUT;
21778 uint64_t type = 0;
21779
21780 axis->specializationConstants.maxCodeLength = app->configuration.maxCodeLength;
21781 axis->specializationConstants.maxTempLength = app->configuration.maxTempLength;
21782 axis->specializationConstants.code0 = (char*)malloc(sizeof(char) * app->configuration.maxCodeLength);
21783 char* code0 = axis->specializationConstants.code0;
21784 if (!code0) {
21785 deleteVkFFT(app);
21787 }
21788 resFFT = shaderGenVkFFT_R2C_decomposition(code0, &axis->specializationConstants, floatType, floatTypeInputMemory, floatTypeOutputMemory, floatTypeKernelMemory, uintType, type);
21789 freeShaderGenVkFFT(&axis->specializationConstants);
21790 if (resFFT != VKFFT_SUCCESS) {
21791 deleteVkFFT(app);
21792 return resFFT;
21793 }
21794#if(VKFFT_BACKEND==0)
21795 const glslang_resource_t default_resource = {
21796 /* .MaxLights = */ 32,
21797 /* .MaxClipPlanes = */ 6,
21798 /* .MaxTextureUnits = */ 32,
21799 /* .MaxTextureCoords = */ 32,
21800 /* .MaxVertexAttribs = */ 64,
21801 /* .MaxVertexUniformComponents = */ 4096,
21802 /* .MaxVaryingFloats = */ 64,
21803 /* .MaxVertexTextureImageUnits = */ 32,
21804 /* .MaxCombinedTextureImageUnits = */ 80,
21805 /* .MaxTextureImageUnits = */ 32,
21806 /* .MaxFragmentUniformComponents = */ 4096,
21807 /* .MaxDrawBuffers = */ 32,
21808 /* .MaxVertexUniformVectors = */ 128,
21809 /* .MaxVaryingVectors = */ 8,
21810 /* .MaxFragmentUniformVectors = */ 16,
21811 /* .MaxVertexOutputVectors = */ 16,
21812 /* .MaxFragmentInputVectors = */ 15,
21813 /* .MinProgramTexelOffset = */ -8,
21814 /* .MaxProgramTexelOffset = */ 7,
21815 /* .MaxClipDistances = */ 8,
21816 /* .MaxComputeWorkGroupCountX = */ 65535,
21817 /* .MaxComputeWorkGroupCountY = */ 65535,
21818 /* .MaxComputeWorkGroupCountZ = */ 65535,
21819 /* .MaxComputeWorkGroupSizeX = */ 1024,
21820 /* .MaxComputeWorkGroupSizeY = */ 1024,
21821 /* .MaxComputeWorkGroupSizeZ = */ 64,
21822 /* .MaxComputeUniformComponents = */ 1024,
21823 /* .MaxComputeTextureImageUnits = */ 16,
21824 /* .MaxComputeImageUniforms = */ 8,
21825 /* .MaxComputeAtomicCounters = */ 8,
21826 /* .MaxComputeAtomicCounterBuffers = */ 1,
21827 /* .MaxVaryingComponents = */ 60,
21828 /* .MaxVertexOutputComponents = */ 64,
21829 /* .MaxGeometryInputComponents = */ 64,
21830 /* .MaxGeometryOutputComponents = */ 128,
21831 /* .MaxFragmentInputComponents = */ 128,
21832 /* .MaxImageUnits = */ 8,
21833 /* .MaxCombinedImageUnitsAndFragmentOutputs = */ 8,
21834 /* .MaxCombinedShaderOutputResources = */ 8,
21835 /* .MaxImageSamples = */ 0,
21836 /* .MaxVertexImageUniforms = */ 0,
21837 /* .MaxTessControlImageUniforms = */ 0,
21838 /* .MaxTessEvaluationImageUniforms = */ 0,
21839 /* .MaxGeometryImageUniforms = */ 0,
21840 /* .MaxFragmentImageUniforms = */ 8,
21841 /* .MaxCombinedImageUniforms = */ 8,
21842 /* .MaxGeometryTextureImageUnits = */ 16,
21843 /* .MaxGeometryOutputVertices = */ 256,
21844 /* .MaxGeometryTotalOutputComponents = */ 1024,
21845 /* .MaxGeometryUniformComponents = */ 1024,
21846 /* .MaxGeometryVaryingComponents = */ 64,
21847 /* .MaxTessControlInputComponents = */ 128,
21848 /* .MaxTessControlOutputComponents = */ 128,
21849 /* .MaxTessControlTextureImageUnits = */ 16,
21850 /* .MaxTessControlUniformComponents = */ 1024,
21851 /* .MaxTessControlTotalOutputComponents = */ 4096,
21852 /* .MaxTessEvaluationInputComponents = */ 128,
21853 /* .MaxTessEvaluationOutputComponents = */ 128,
21854 /* .MaxTessEvaluationTextureImageUnits = */ 16,
21855 /* .MaxTessEvaluationUniformComponents = */ 1024,
21856 /* .MaxTessPatchComponents = */ 120,
21857 /* .MaxPatchVertices = */ 32,
21858 /* .MaxTessGenLevel = */ 64,
21859 /* .MaxViewports = */ 16,
21860 /* .MaxVertexAtomicCounters = */ 0,
21861 /* .MaxTessControlAtomicCounters = */ 0,
21862 /* .MaxTessEvaluationAtomicCounters = */ 0,
21863 /* .MaxGeometryAtomicCounters = */ 0,
21864 /* .MaxFragmentAtomicCounters = */ 8,
21865 /* .MaxCombinedAtomicCounters = */ 8,
21866 /* .MaxAtomicCounterBindings = */ 1,
21867 /* .MaxVertexAtomicCounterBuffers = */ 0,
21868 /* .MaxTessControlAtomicCounterBuffers = */ 0,
21869 /* .MaxTessEvaluationAtomicCounterBuffers = */ 0,
21870 /* .MaxGeometryAtomicCounterBuffers = */ 0,
21871 /* .MaxFragmentAtomicCounterBuffers = */ 1,
21872 /* .MaxCombinedAtomicCounterBuffers = */ 1,
21873 /* .MaxAtomicCounterBufferSize = */ 16384,
21874 /* .MaxTransformFeedbackBuffers = */ 4,
21875 /* .MaxTransformFeedbackInterleavedComponents = */ 64,
21876 /* .MaxCullDistances = */ 8,
21877 /* .MaxCombinedClipAndCullDistances = */ 8,
21878 /* .MaxSamples = */ 4,
21879 /* .maxMeshOutputVerticesNV = */ 256,
21880 /* .maxMeshOutputPrimitivesNV = */ 512,
21881 /* .maxMeshWorkGroupSizeX_NV = */ 32,
21882 /* .maxMeshWorkGroupSizeY_NV = */ 1,
21883 /* .maxMeshWorkGroupSizeZ_NV = */ 1,
21884 /* .maxTaskWorkGroupSizeX_NV = */ 32,
21885 /* .maxTaskWorkGroupSizeY_NV = */ 1,
21886 /* .maxTaskWorkGroupSizeZ_NV = */ 1,
21887 /* .maxMeshViewCountNV = */ 4,
21888 /* .maxDualSourceDrawBuffersEXT = */ 1,
21889
21890 /* .limits = */ {
21891 /* .nonInductiveForLoops = */ 1,
21892 /* .whileLoops = */ 1,
21893 /* .doWhileLoops = */ 1,
21894 /* .generalUniformIndexing = */ 1,
21895 /* .generalAttributeMatrixVectorIndexing = */ 1,
21896 /* .generalVaryingIndexing = */ 1,
21897 /* .generalSamplerIndexing = */ 1,
21898 /* .generalVariableIndexing = */ 1,
21899 /* .generalConstantMatrixVectorIndexing = */ 1,
21900 } };
21901 glslang_target_client_version_t client_version = (app->configuration.halfPrecision) ? GLSLANG_TARGET_VULKAN_1_1 : GLSLANG_TARGET_VULKAN_1_0;
21902 glslang_target_language_version_t target_language_version = (app->configuration.halfPrecision) ? GLSLANG_TARGET_SPV_1_3 : GLSLANG_TARGET_SPV_1_0;
21903 const glslang_input_t input =
21904 {
21905 GLSLANG_SOURCE_GLSL,
21906 GLSLANG_STAGE_COMPUTE,
21907 GLSLANG_CLIENT_VULKAN,
21908 client_version,
21909 GLSLANG_TARGET_SPV,
21910 target_language_version,
21911 code0,
21912 450,
21913 GLSLANG_NO_PROFILE,
21914 1,
21915 0,
21916 GLSLANG_MSG_DEFAULT_BIT,
21917 &default_resource,
21918 };
21919 //printf("%s\n", code0);
21920 glslang_shader_t* shader = glslang_shader_create(&input);
21921 const char* err;
21922 if (!glslang_shader_preprocess(shader, &input))
21923 {
21924 err = glslang_shader_get_info_log(shader);
21925 printf("%s\n", code0);
21926 printf("%s\nVkFFT shader type: %" PRIu64 "\n", err, type);
21927 glslang_shader_delete(shader);
21928 free(code0);
21929 code0 = 0;
21930 deleteVkFFT(app);
21932
21933 }
21934
21935 if (!glslang_shader_parse(shader, &input))
21936 {
21937 err = glslang_shader_get_info_log(shader);
21938 printf("%s\n", code0);
21939 printf("%s\nVkFFT shader type: %" PRIu64 "\n", err, type);
21940 glslang_shader_delete(shader);
21941 free(code0);
21942 code0 = 0;
21943 deleteVkFFT(app);
21945
21946 }
21947 glslang_program_t* program = glslang_program_create();
21948 glslang_program_add_shader(program, shader);
21949 if (!glslang_program_link(program, GLSLANG_MSG_SPV_RULES_BIT | GLSLANG_MSG_VULKAN_RULES_BIT))
21950 {
21951 err = glslang_program_get_info_log(program);
21952 printf("%s\n", code0);
21953 printf("%s\nVkFFT shader type: %" PRIu64 "\n", err, type);
21954 glslang_shader_delete(shader);
21955 glslang_program_delete(program);
21956 free(code0);
21957 code0 = 0;
21958 deleteVkFFT(app);
21960
21961 }
21962
21963 //TODO: fix compilation errors
21964 //glslang_program_SPIRV_generate(program, input.stage);
21965
21966 //TODO: fix compilation errors
21967// if (glslang_program_SPIRV_get_messages(program))
21968// {
21969// printf("%s", glslang_program_SPIRV_get_messages(program));
21970// glslang_shader_delete(shader);
21971// glslang_program_delete(program);
21972// free(code0);
21973// code0 = 0;
21974// deleteVkFFT(app);
21975// return VKFFT_ERROR_FAILED_SPIRV_GENERATE;
21976// }
21977
21978 glslang_shader_delete(shader);
21979 VkPipelineShaderStageCreateInfo pipelineShaderStageCreateInfo = { VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO };
21980 VkComputePipelineCreateInfo computePipelineCreateInfo = { VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO };
21981 pipelineShaderStageCreateInfo.stage = VK_SHADER_STAGE_COMPUTE_BIT;
21982 VkShaderModuleCreateInfo createInfo = { VK_STRUCTURE_TYPE_SHADER_MODULE_CREATE_INFO };
21983 //TODO: fix compilation errors
21984// createInfo.pCode = glslang_program_SPIRV_get_ptr(program);
21985// createInfo.codeSize = glslang_program_SPIRV_get_size(program) * sizeof(uint32_t);
21986 res = vkCreateShaderModule(app->configuration.device[0], &createInfo, 0, &pipelineShaderStageCreateInfo.module);
21987 if (res != VK_SUCCESS) {
21988 glslang_program_delete(program);
21989 free(code0);
21990 code0 = 0;
21991 deleteVkFFT(app);
21993 }
21994 pipelineShaderStageCreateInfo.pName = "main";
21995 pipelineShaderStageCreateInfo.pSpecializationInfo = 0;// &specializationInfo;
21996 computePipelineCreateInfo.stage = pipelineShaderStageCreateInfo;
21997 computePipelineCreateInfo.layout = axis->pipelineLayout;
21998 res = vkCreateComputePipelines(app->configuration.device[0], VK_NULL_HANDLE, 1, &computePipelineCreateInfo, 0, &axis->pipeline);
21999 if (res != VK_SUCCESS) {
22000 deleteVkFFT(app);
22002 }
22003 vkDestroyShaderModule(app->configuration.device[0], pipelineShaderStageCreateInfo.module, 0);
22004 glslang_program_delete(program);
22005#elif(VKFFT_BACKEND==1)
22006 nvrtcProgram prog;
22007 nvrtcResult result = nvrtcCreateProgram(&prog, // prog
22008 code0, // buffer
22009 "VkFFT.cu", // name
22010 0, // numHeaders
22011 0, // headers
22012 0); // includeNames
22013 //free(includeNames);
22014 //free(headers);
22015 if (result != NVRTC_SUCCESS) {
22016 printf("nvrtcCreateProgram error: %s\n", nvrtcGetErrorString(result));
22017 free(code0);
22018 code0 = 0;
22019 deleteVkFFT(app);
22021 }
22022 //const char opts[20] = "--fmad=false";
22023 //result = nvrtcAddNameExpression(prog, "&consts");
22024 //if (result != NVRTC_SUCCESS) printf("1.5 error: %s\n", nvrtcGetErrorString(result));
22025 result = nvrtcCompileProgram(prog, // prog
22026 0, // numOptions
22027 0); // options
22028 if (result != NVRTC_SUCCESS) {
22029 printf("nvrtcCompileProgram error: %s\n", nvrtcGetErrorString(result));
22030 char* log = (char*)malloc(sizeof(char) * 1000000);
22031 if (!log) {
22032 free(code0);
22033 code0 = 0;
22034 deleteVkFFT(app);
22036 }
22037 else {
22038 nvrtcGetProgramLog(prog, log);
22039 printf("%s\n", log);
22040 free(log);
22041 log = 0;
22042 printf("%s\n", code0);
22043 free(code0);
22044 code0 = 0;
22045 deleteVkFFT(app);
22047 }
22048 }
22049 size_t ptxSize;
22050 result = nvrtcGetPTXSize(prog, &ptxSize);
22051 if (result != NVRTC_SUCCESS) {
22052 printf("nvrtcGetPTXSize error: %s\n", nvrtcGetErrorString(result));
22053 free(code0);
22054 code0 = 0;
22055 deleteVkFFT(app);
22057 }
22058 char* ptx = (char*)malloc(ptxSize);
22059 if (!ptx) {
22060 free(code0);
22061 code0 = 0;
22062 deleteVkFFT(app);
22064 }
22065 result = nvrtcGetPTX(prog, ptx);
22066 if (result != NVRTC_SUCCESS) {
22067 printf("nvrtcGetPTX error: %s\n", nvrtcGetErrorString(result));
22068 free(ptx);
22069 ptx = 0;
22070 free(code0);
22071 code0 = 0;
22072 deleteVkFFT(app);
22074 }
22075 result = nvrtcDestroyProgram(&prog);
22076 if (result != NVRTC_SUCCESS) {
22077 printf("nvrtcDestroyProgram error: %s\n", nvrtcGetErrorString(result));
22078 free(ptx);
22079 ptx = 0;
22080 free(code0);
22081 code0 = 0;
22082 deleteVkFFT(app);
22084 }
22085
22086 CUresult result2 = cuModuleLoadDataEx(&axis->VkFFTModule, ptx, 0, 0, 0);
22087
22088 if (result2 != CUDA_SUCCESS) {
22089 printf("cuModuleLoadDataEx error: %d\n", result2);
22090 free(ptx);
22091 ptx = 0;
22092 free(code0);
22093 code0 = 0;
22094 deleteVkFFT(app);
22096 }
22097 result2 = cuModuleGetFunction(&axis->VkFFTKernel, axis->VkFFTModule, "VkFFT_main_R2C");
22098 if (result2 != CUDA_SUCCESS) {
22099 printf("cuModuleGetFunction error: %d\n", result2);
22100 free(ptx);
22101 ptx = 0;
22102 free(code0);
22103 code0 = 0;
22104 deleteVkFFT(app);
22106 }
22107 if (axis->specializationConstants.usedSharedMemory > app->configuration.sharedMemorySizeStatic) {
22108 result2 = cuFuncSetAttribute(axis->VkFFTKernel, CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, (int)axis->specializationConstants.usedSharedMemory);
22109 if (result2 != CUDA_SUCCESS) {
22110 printf("cuFuncSetAttribute error: %d\n", result2);
22111 free(ptx);
22112 ptx = 0;
22113 free(code0);
22114 code0 = 0;
22115 deleteVkFFT(app);
22117 }
22118 }
22120 result2 = cuModuleGetGlobal(&axis->consts_addr, &size, axis->VkFFTModule, "consts");
22121 if (result2 != CUDA_SUCCESS) {
22122 printf("cuModuleGetGlobal error: %d\n", result2);
22123 free(ptx);
22124 ptx = 0;
22125 free(code0);
22126 code0 = 0;
22127 deleteVkFFT(app);
22129 }
22130 free(ptx);
22131 ptx = 0;
22132#elif(VKFFT_BACKEND==2)
22133 hiprtcProgram prog;
22134 /*char* includeNames = (char*)malloc(sizeof(char)*100);
22135 char* headers = (char*)malloc(sizeof(char) * 100);
22136 sprintf(headers, "C://Program Files//NVIDIA GPU Computing Toolkit//CUDA//v11.1//include//cuComplex.h");
22137 sprintf(includeNames, "cuComplex.h");*/
22138 enum hiprtcResult result = hiprtcCreateProgram(&prog, // prog
22139 code0, // buffer
22140 "VkFFT.hip", // name
22141 0, // numHeaders
22142 0, // headers
22143 0); // includeNames
22144 if (result != HIPRTC_SUCCESS) {
22145 printf("hiprtcCreateProgram error: %s\n", hiprtcGetErrorString(result));
22146 free(code0);
22147 code0 = 0;
22148 deleteVkFFT(app);
22150 }
22151
22152 result = hiprtcAddNameExpression(prog, "&consts");
22153 if (result != HIPRTC_SUCCESS) {
22154 printf("hiprtcAddNameExpression error: %s\n", hiprtcGetErrorString(result));
22155 free(code0);
22156 code0 = 0;
22157 deleteVkFFT(app);
22159 }
22160
22161 result = hiprtcCompileProgram(prog, // prog
22162 0, // numOptions
22163 0); // options
22164 if (result != HIPRTC_SUCCESS) {
22165 printf("hiprtcCompileProgram error: %s\n", hiprtcGetErrorString(result));
22166 char* log = (char*)malloc(sizeof(char) * 100000);
22167 if (!log) {
22168 free(code0);
22169 code0 = 0;
22170 deleteVkFFT(app);
22172 }
22173 else {
22174 hiprtcGetProgramLog(prog, log);
22175 printf("%s\n", log);
22176 free(log);
22177 log = 0;
22178 printf("%s\n", code0);
22179 free(code0);
22180 code0 = 0;
22181 deleteVkFFT(app);
22183 }
22184 }
22185 size_t codeSize;
22186 result = hiprtcGetCodeSize(prog, &codeSize);
22187 if (result != HIPRTC_SUCCESS) {
22188 printf("hiprtcGetCodeSize error: %s\n", hiprtcGetErrorString(result));
22189 free(code0);
22190 code0 = 0;
22191 deleteVkFFT(app);
22193 }
22194 char* code = (char*)malloc(codeSize);
22195 if (!code) {
22196 free(code0);
22197 code0 = 0;
22198 deleteVkFFT(app);
22200 }
22201 result = hiprtcGetCode(prog, code);
22202 if (result != HIPRTC_SUCCESS) {
22203 printf("hiprtcGetCode error: %s\n", hiprtcGetErrorString(result));
22204 free(code);
22205 code = 0;
22206 free(code0);
22207 code0 = 0;
22208 deleteVkFFT(app);
22210 }
22211 //printf("%s\n", code);
22212 // Destroy the program.
22213 result = hiprtcDestroyProgram(&prog);
22214 if (result != HIPRTC_SUCCESS) {
22215 printf("hiprtcDestroyProgram error: %s\n", hiprtcGetErrorString(result));
22216 free(code);
22217 code = 0;
22218 free(code0);
22219 code0 = 0;
22220 deleteVkFFT(app);
22222 }
22223 hipError_t result2 = hipModuleLoadDataEx(&axis->VkFFTModule, code, 0, 0, 0);
22224
22225 if (result2 != hipSuccess) {
22226 printf("hipModuleLoadDataEx error: %d\n", result2);
22227 free(code);
22228 code = 0;
22229 free(code0);
22230 code0 = 0;
22231 deleteVkFFT(app);
22233 }
22234 result2 = hipModuleGetFunction(&axis->VkFFTKernel, axis->VkFFTModule, "VkFFT_main_R2C");
22235 if (result2 != hipSuccess) {
22236 printf("hipModuleGetFunction error: %d\n", result2);
22237 free(code);
22238 code = 0;
22239 free(code0);
22240 code0 = 0;
22241 deleteVkFFT(app);
22243 }
22244 if (axis->specializationConstants.usedSharedMemory > app->configuration.sharedMemorySizeStatic) {
22245 result2 = hipFuncSetAttribute(axis->VkFFTKernel, hipFuncAttributeMaxDynamicSharedMemorySize, (int)axis->specializationConstants.usedSharedMemory);
22246 //result2 = hipFuncSetCacheConfig(axis->VkFFTKernel, hipFuncCachePreferShared);
22247 if (result2 != hipSuccess) {
22248 printf("hipFuncSetAttribute error: %d\n", result2);
22249 free(code);
22250 code = 0;
22251 free(code0);
22252 code0 = 0;
22253 deleteVkFFT(app);
22255 }
22256 }
22258 result2 = hipModuleGetGlobal(&axis->consts_addr, &size, axis->VkFFTModule, "consts");
22259 if (result2 != hipSuccess) {
22260 printf("hipModuleGetGlobal error: %d\n", result2);
22261 free(code);
22262 code = 0;
22263 free(code0);
22264 code0 = 0;
22265 deleteVkFFT(app);
22267 }
22268
22269 free(code);
22270 code = 0;
22271#elif(VKFFT_BACKEND==3)
22272 size_t codelen = strlen(code0);
22273 axis->program = clCreateProgramWithSource(app->configuration.context[0], 1, (const char**)&code0, &codelen, &res);
22274 if (res != CL_SUCCESS) {
22275 free(code0);
22276 code0 = 0;
22277 deleteVkFFT(app);
22279 }
22280 res = clBuildProgram(axis->program, 1, app->configuration.device, 0, 0, 0);
22281 if (res != CL_SUCCESS) {
22282 size_t log_size;
22283 clGetProgramBuildInfo(axis->program, app->configuration.device[0], CL_PROGRAM_BUILD_LOG, 0, 0, &log_size);
22284 char* log = (char*)malloc(log_size);
22285 if (!log) {
22286 free(code0);
22287 code0 = 0;
22288 deleteVkFFT(app);
22290 }
22291 else {
22292 clGetProgramBuildInfo(axis->program, app->configuration.device[0], CL_PROGRAM_BUILD_LOG, log_size, log, 0);
22293 printf("%s\n", log);
22294 free(log);
22295 log = 0;
22296 printf("%s\n", code0);
22297 free(code0);
22298 code0 = 0;
22299 deleteVkFFT(app);
22301 }
22302 }
22303 axis->kernel = clCreateKernel(axis->program, "VkFFT_main_R2C", &res);
22304 if (res != CL_SUCCESS) {
22305 free(code0);
22306 code0 = 0;
22307 deleteVkFFT(app);
22309 }
22310#endif
22311 if (!app->configuration.keepShaderCode) {
22312 free(code0);
22313 code0 = 0;
22314 axis->specializationConstants.code0 = 0;
22315 }
22316 }
22317 return resFFT;
22318}
22319static inline VkFFTResult VkFFTPlanAxis(VkFFTApplication* app, VkFFTPlan* FFTPlan, uint64_t axis_id, uint64_t axis_upload_id, uint64_t inverse, uint64_t reverseBluesteinMultiUpload) {
22320 //get radix stages
22321 VkFFTResult resFFT = VKFFT_SUCCESS;
22322#if(VKFFT_BACKEND==0)
22323 VkResult res = VK_SUCCESS;
22324#elif(VKFFT_BACKEND==1)
22325 cudaError_t res = cudaSuccess;
22326#elif(VKFFT_BACKEND==2)
22327 hipError_t res = hipSuccess;
22328#elif(VKFFT_BACKEND==3)
22329 cl_int res = CL_SUCCESS;
22330#endif
22331 VkFFTAxis* axis = (reverseBluesteinMultiUpload) ? &FFTPlan->inverseBluesteinAxes[axis_id][axis_upload_id] : &FFTPlan->axes[axis_id][axis_upload_id];
22332
22333 axis->specializationConstants.sourceFFTSize = app->configuration.size[axis_id];
22334 axis->specializationConstants.numBatches = app->configuration.numberBatches;
22335 if ((app->configuration.FFTdim == 1) && (FFTPlan->actualFFTSizePerAxis[axis_id][1] == 1) && ((app->configuration.numberBatches > 1) || (app->actualNumBatches > 1)) && (!app->configuration.performConvolution) && (app->configuration.coordinateFeatures == 1)) {
22336 if (app->configuration.numberBatches > 1) {
22339 }
22340 FFTPlan->actualFFTSizePerAxis[axis_id][1] = app->actualNumBatches;
22341 }
22342
22343 axis->specializationConstants.warpSize = app->configuration.warpSize;
22344 axis->specializationConstants.numSharedBanks = app->configuration.numSharedBanks;
22345 axis->specializationConstants.useUint64 = app->configuration.useUint64;
22346 axis->specializationConstants.numAxisUploads = FFTPlan->numAxisUploads[axis_id];
22347 uint64_t complexSize;
22349 complexSize = (2 * sizeof(double));
22350 else
22352 complexSize = (2 * sizeof(float));
22353 else
22354 complexSize = (2 * sizeof(float));
22355 axis->specializationConstants.complexSize = complexSize;
22356 axis->specializationConstants.supportAxis = 0;
22357 axis->specializationConstants.symmetricKernel = app->configuration.symmetricKernel;
22358 axis->specializationConstants.conjugateConvolution = app->configuration.conjugateConvolution;
22359 axis->specializationConstants.crossPowerSpectrumNormalization = app->configuration.crossPowerSpectrumNormalization;
22360
22361 uint64_t maxSequenceLengthSharedMemory = app->configuration.sharedMemorySize / complexSize;
22362 uint64_t maxSequenceLengthSharedMemoryPow2 = app->configuration.sharedMemorySizePow2 / complexSize;
22363 uint64_t maxSingleSizeStrided = (app->configuration.coalescedMemory > complexSize) ? app->configuration.sharedMemorySize / (app->configuration.coalescedMemory) : app->configuration.sharedMemorySize / complexSize;
22364 uint64_t maxSingleSizeStridedPow2 = (app->configuration.coalescedMemory > complexSize) ? app->configuration.sharedMemorySizePow2 / (app->configuration.coalescedMemory) : app->configuration.sharedMemorySizePow2 / complexSize;
22365
22366 axis->specializationConstants.stageStartSize = 1;
22367 for (uint64_t i = 0; i < axis_upload_id; i++)
22368 axis->specializationConstants.stageStartSize *= FFTPlan->axisSplit[axis_id][i];
22369
22370
22371 axis->specializationConstants.firstStageStartSize = FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] / FFTPlan->axisSplit[axis_id][FFTPlan->numAxisUploads[axis_id] - 1];
22372 axis->specializationConstants.dispatchZactualFFTSize = (axis_id < 2) ? FFTPlan->actualFFTSizePerAxis[axis_id][2] : FFTPlan->actualFFTSizePerAxis[axis_id][1];
22373 if (axis_id == 0) {
22374 //configure radix stages
22375 axis->specializationConstants.fft_dim_x = axis->specializationConstants.stageStartSize;
22376 }
22377 else {
22378 axis->specializationConstants.fft_dim_x = FFTPlan->actualFFTSizePerAxis[axis_id][0];
22379 }
22380 if (app->useBluesteinFFT[axis_id]) {
22381 axis->specializationConstants.useBluesteinFFT = 1;
22382 }
22383
22384 if (app->configuration.performDCT == 3) {
22385 axis->specializationConstants.actualInverse = inverse;
22386 axis->specializationConstants.inverse = !inverse;
22387 }
22388 else {
22389 if (app->configuration.performDCT == 4) {
22390 axis->specializationConstants.actualInverse = inverse;
22391 axis->specializationConstants.inverse = 1;
22392 }
22393 else {
22394 axis->specializationConstants.actualInverse = inverse;
22395 axis->specializationConstants.inverse = inverse;
22396 }
22397 }
22398 if (app->useBluesteinFFT[axis_id]) {
22399 axis->specializationConstants.actualInverse = inverse;
22400 axis->specializationConstants.inverse = reverseBluesteinMultiUpload;
22401 if (app->configuration.performDCT == 3) {
22402 axis->specializationConstants.inverseBluestein = !inverse;
22403 }
22404 else {
22405 if (app->configuration.performDCT == 4) {
22406 axis->specializationConstants.inverseBluestein = 1;
22407 }
22408 else {
22409 axis->specializationConstants.inverseBluestein = inverse;
22410 }
22411 }
22412 }
22413 axis->specializationConstants.reverseBluesteinMultiUpload = reverseBluesteinMultiUpload;
22414
22415 axis->specializationConstants.reorderFourStep = ((FFTPlan->numAxisUploads[axis_id] > 1) && (!app->useBluesteinFFT[axis_id])) ? app->configuration.reorderFourStep : 0;
22416
22417 if ((axis_id == 0) && ((FFTPlan->numAxisUploads[axis_id] == 1) || ((axis_upload_id == 0) && (!axis->specializationConstants.reorderFourStep)))) {
22418 maxSequenceLengthSharedMemory *= axis->specializationConstants.registerBoost;
22419 maxSequenceLengthSharedMemoryPow2 = (uint64_t)pow(2, (uint64_t)log2(maxSequenceLengthSharedMemory));
22420 }
22421 else {
22422 maxSingleSizeStrided *= axis->specializationConstants.registerBoost;
22423 maxSingleSizeStridedPow2 = (uint64_t)pow(2, (uint64_t)log2(maxSingleSizeStrided));
22424 }
22425
22426 axis->specializationConstants.performR2C = FFTPlan->actualPerformR2CPerAxis[axis_id];
22427 axis->specializationConstants.performR2CmultiUpload = FFTPlan->multiUploadR2C;
22428 if (app->configuration.performDCT == 3) {
22429 axis->specializationConstants.performDCT = 2;
22430 }
22431 else {
22432 axis->specializationConstants.performDCT = app->configuration.performDCT;
22433 }
22434 if ((axis->specializationConstants.performR2CmultiUpload) && (app->configuration.size[0] % 2 != 0)) return VKFFT_ERROR_UNSUPPORTED_FFT_LENGTH_R2C;
22435 axis->specializationConstants.mergeSequencesR2C = ((axis->specializationConstants.fftDim < maxSequenceLengthSharedMemory) && ((FFTPlan->actualFFTSizePerAxis[axis_id][1] % 2) == 0) && ((FFTPlan->actualPerformR2CPerAxis[axis_id]) || (((app->configuration.performDCT == 3) || (app->configuration.performDCT == 2) || (app->configuration.performDCT == 1) || ((app->configuration.performDCT == 4)&&((app->configuration.size[axis_id]%2) != 0))) && (axis_id == 0)))) ? (1 - app->configuration.disableMergeSequencesR2C) : 0;
22436 //uint64_t passID = FFTPlan->numAxisUploads[axis_id] - 1 - axis_upload_id;
22437 axis->specializationConstants.fft_dim_full = FFTPlan->actualFFTSizePerAxis[axis_id][axis_id];
22438 if ((FFTPlan->numAxisUploads[axis_id] > 1) && (axis->specializationConstants.reorderFourStep || app->useBluesteinFFT[axis_id]) && (!app->configuration.userTempBuffer) && (app->configuration.allocateTempBuffer == 0)) {
22440
22441#if(VKFFT_BACKEND==0)
22442 app->configuration.tempBuffer = (VkBuffer*)malloc(sizeof(VkBuffer));
22443 if (!app->configuration.tempBuffer) {
22444 deleteVkFFT(app);
22446 }
22447 resFFT = allocateFFTBuffer(app, app->configuration.tempBuffer, &app->configuration.tempBufferDeviceMemory, VK_BUFFER_USAGE_STORAGE_BUFFER_BIT | VK_BUFFER_USAGE_TRANSFER_SRC_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT, VK_MEMORY_HEAP_DEVICE_LOCAL_BIT, app->configuration.tempBufferSize[0]);
22448 if (resFFT != VKFFT_SUCCESS) {
22449 deleteVkFFT(app);
22450 return resFFT;
22451 }
22452#elif(VKFFT_BACKEND==1)
22453 app->configuration.tempBuffer = (void**)malloc(sizeof(void*));
22454 if (!app->configuration.tempBuffer) {
22455 deleteVkFFT(app);
22457 }
22458 res = cudaMalloc(app->configuration.tempBuffer, app->configuration.tempBufferSize[0]);
22459 if (res != cudaSuccess) {
22460 deleteVkFFT(app);
22462 }
22463#elif(VKFFT_BACKEND==2)
22464 app->configuration.tempBuffer = (void**)malloc(sizeof(void*));
22465 if (!app->configuration.tempBuffer) {
22466 deleteVkFFT(app);
22468 }
22469 res = hipMalloc(app->configuration.tempBuffer, app->configuration.tempBufferSize[0]);
22470 if (res != hipSuccess) {
22471 deleteVkFFT(app);
22473 }
22474#elif(VKFFT_BACKEND==3)
22475 app->configuration.tempBuffer = (cl_mem*)malloc(sizeof(cl_mem));
22476 if (!app->configuration.tempBuffer) {
22477 deleteVkFFT(app);
22479 }
22480 app->configuration.tempBuffer[0] = clCreateBuffer(app->configuration.context[0], CL_MEM_READ_WRITE, app->configuration.tempBufferSize[0], 0, &res);
22481 if (res != CL_SUCCESS) {
22482 deleteVkFFT(app);
22484 }
22485#endif
22486 }
22487 //allocate LUT
22488 if (app->configuration.useLUT) {
22489 double double_PI = 3.1415926535897932384626433832795;
22490 uint64_t dimMult = 1;
22491 uint64_t maxStageSum = 0;
22492 for (uint64_t i = 0; i < axis->specializationConstants.numStages; i++) {
22493 switch (axis->specializationConstants.stageRadix[i]) {
22494 case 2:
22495 maxStageSum += dimMult;
22496 break;
22497 case 3:
22498 maxStageSum += dimMult * 2;
22499 break;
22500 case 4:
22501 maxStageSum += dimMult * 2;
22502 break;
22503 case 5:
22504 maxStageSum += dimMult * 4;
22505 break;
22506 case 7:
22507 maxStageSum += dimMult * 6;
22508 break;
22509 case 8:
22510 maxStageSum += dimMult * 3;
22511 break;
22512 case 11:
22513 maxStageSum += dimMult * 10;
22514 break;
22515 case 13:
22516 maxStageSum += dimMult * 12;
22517 break;
22518 }
22519 dimMult *= axis->specializationConstants.stageRadix[i];
22520 }
22521 axis->specializationConstants.maxStageSumLUT = maxStageSum;
22522 dimMult = 1;
22524 if (axis_upload_id > 0) {
22525 if ((app->configuration.performDCT == 2) || (app->configuration.performDCT == 3)) {
22526 axis->specializationConstants.startDCT3LUT = (maxStageSum + axis->specializationConstants.stageStartSize * axis->specializationConstants.fftDim);
22527 axis->bufferLUTSize = (maxStageSum + axis->specializationConstants.stageStartSize * axis->specializationConstants.fftDim + (app->configuration.size[axis_id] / 2 + 2)) * 2 * sizeof(double);
22528 }
22529 else {
22530 if ((app->configuration.performDCT == 4) && (app->configuration.size[axis_id] % 2 == 0)) {
22531 axis->specializationConstants.startDCT3LUT = (maxStageSum + axis->specializationConstants.stageStartSize * axis->specializationConstants.fftDim);
22532 axis->specializationConstants.startDCT4LUT = (axis->specializationConstants.startDCT3LUT + (app->configuration.size[axis_id] / 4 + 2));
22533 axis->bufferLUTSize = (maxStageSum + axis->specializationConstants.stageStartSize * axis->specializationConstants.fftDim + (app->configuration.size[axis_id] / 4 + 2) + app->configuration.size[axis_id] / 2) * 2 * sizeof(double);
22534 }
22535 else
22536 axis->bufferLUTSize = (maxStageSum + axis->specializationConstants.stageStartSize * axis->specializationConstants.fftDim) * 2 * sizeof(double);
22537 }
22538 }
22539 else {
22540 if ((app->configuration.performDCT == 2) || (app->configuration.performDCT == 3)) {
22541 axis->specializationConstants.startDCT3LUT = (maxStageSum);
22542 axis->bufferLUTSize = (maxStageSum + (app->configuration.size[axis_id] / 2 + 2)) * 2 * sizeof(double);
22543 }
22544 else {
22545 if ((app->configuration.performDCT == 4) && (app->configuration.size[axis_id] % 2 == 0)) {
22546 axis->specializationConstants.startDCT3LUT = (maxStageSum);
22547 axis->specializationConstants.startDCT4LUT = (axis->specializationConstants.startDCT3LUT + (app->configuration.size[axis_id] / 4 + 2));
22548 axis->bufferLUTSize = (maxStageSum + (app->configuration.size[axis_id] / 4 + 2) + app->configuration.size[axis_id] / 2) * 2 * sizeof(double);
22549
22550 }
22551 else
22552 axis->bufferLUTSize = (maxStageSum) * 2 * sizeof(double);
22553 }
22554 }
22555 double* tempLUT = (double*)malloc(axis->bufferLUTSize);
22556 if (!tempLUT) {
22557 deleteVkFFT(app);
22559 }
22560 uint64_t localStageSize = 1;
22561 uint64_t localStageSum = 0;
22562 for (uint64_t i = 0; i < axis->specializationConstants.numStages; i++) {
22563 if ((axis->specializationConstants.stageRadix[i] & (axis->specializationConstants.stageRadix[i] - 1)) == 0) {
22564 for (uint64_t k = 0; k < log2(axis->specializationConstants.stageRadix[i]); k++) {
22565 for (uint64_t j = 0; j < localStageSize; j++) {
22566 tempLUT[2 * (j + localStageSum)] = cos(j * double_PI / localStageSize / pow(2, k));
22567 tempLUT[2 * (j + localStageSum) + 1] = sin(j * double_PI / localStageSize / pow(2, k));
22568 }
22569 localStageSum += localStageSize;
22570 }
22571 localStageSize *= axis->specializationConstants.stageRadix[i];
22572 }
22573 else {
22574 for (uint64_t k = (axis->specializationConstants.stageRadix[i] - 1); k > 0; k--) {
22575 for (uint64_t j = 0; j < localStageSize; j++) {
22576 tempLUT[2 * (j + localStageSum)] = cos(j * 2.0 * k / axis->specializationConstants.stageRadix[i] * double_PI / localStageSize);
22577 tempLUT[2 * (j + localStageSum) + 1] = sin(j * 2.0 * k / axis->specializationConstants.stageRadix[i] * double_PI / localStageSize);
22578 }
22579 localStageSum += localStageSize;
22580 }
22581 localStageSize *= axis->specializationConstants.stageRadix[i];
22582 }
22583 }
22584
22585 if (axis_upload_id > 0) {
22586 for (uint64_t i = 0; i < axis->specializationConstants.stageStartSize; i++) {
22587 for (uint64_t j = 0; j < axis->specializationConstants.fftDim; j++) {
22588 double angle = 2 * double_PI * ((i * j) / (double)(axis->specializationConstants.stageStartSize * axis->specializationConstants.fftDim));
22589 tempLUT[maxStageSum * 2 + 2 * (i + j * axis->specializationConstants.stageStartSize)] = cos(angle);
22590 tempLUT[maxStageSum * 2 + 2 * (i + j * axis->specializationConstants.stageStartSize) + 1] = sin(angle);
22591 }
22592 }
22593 }
22594 if ((app->configuration.performDCT == 2) || (app->configuration.performDCT == 3)) {
22595 for (uint64_t j = 0; j < app->configuration.size[axis_id] / 2 + 2; j++) {
22596 double angle = (double_PI / 2.0 / (double)(app->configuration.size[axis_id])) * j;
22597 tempLUT[2 * axis->specializationConstants.startDCT3LUT + 2 * j] = cos(angle);
22598 tempLUT[2 * axis->specializationConstants.startDCT3LUT + 2 * j + 1] = sin(angle);
22599 }
22600 }
22601 if ((app->configuration.performDCT == 4) && (app->configuration.size[axis_id] % 2 == 0)) {
22602 for (uint64_t j = 0; j < app->configuration.size[axis_id] / 4 + 2; j++) {
22603 double angle = (double_PI / 2.0 / (double)(app->configuration.size[axis_id] / 2)) * j;
22604 tempLUT[2 * axis->specializationConstants.startDCT3LUT + 2 * j] = cos(angle);
22605 tempLUT[2 * axis->specializationConstants.startDCT3LUT + 2 * j + 1] = sin(angle);
22606 }
22607 for (uint64_t j = 0; j < app->configuration.size[axis_id] / 2; j++) {
22608 double angle = (-double_PI / 8.0 / (double)(app->configuration.size[axis_id] / 2)) * (2 * j + 1);
22609 tempLUT[2 * axis->specializationConstants.startDCT4LUT + 2 * j] = cos(angle);
22610 tempLUT[2 * axis->specializationConstants.startDCT4LUT + 2 * j + 1] = sin(angle);
22611 }
22612 }
22613 axis->referenceLUT = 0;
22614 if (reverseBluesteinMultiUpload == 1) {
22615 axis->bufferLUT = FFTPlan->axes[axis_id][axis_upload_id].bufferLUT;
22616#if(VKFFT_BACKEND==0)
22617 axis->bufferLUTDeviceMemory = FFTPlan->axes[axis_id][axis_upload_id].bufferLUTDeviceMemory;
22618#endif
22619 axis->bufferLUTSize = FFTPlan->axes[axis_id][axis_upload_id].bufferLUTSize;
22620 axis->referenceLUT = 1;
22621 }
22622 else {
22623 if ((!inverse) && (!app->configuration.makeForwardPlanOnly)) {
22624 axis->bufferLUT = app->localFFTPlan_inverse->axes[axis_id][axis_upload_id].bufferLUT;
22625#if(VKFFT_BACKEND==0)
22626 axis->bufferLUTDeviceMemory = app->localFFTPlan_inverse->axes[axis_id][axis_upload_id].bufferLUTDeviceMemory;
22627#endif
22628 axis->bufferLUTSize = app->localFFTPlan_inverse->axes[axis_id][axis_upload_id].bufferLUTSize;
22629 axis->referenceLUT = 1;
22630 }
22631 else {
22632 if (((axis_id == 1) || (axis_id == 2)) && (!((!axis->specializationConstants.reorderFourStep) && (FFTPlan->numAxisUploads[axis_id] > 1))) && ((axis->specializationConstants.fft_dim_full == FFTPlan->axes[0][0].specializationConstants.fft_dim_full) && (FFTPlan->numAxisUploads[axis_id] == 1) && (axis->specializationConstants.fft_dim_full < maxSingleSizeStrided / axis->specializationConstants.registerBoost)) && ((!app->configuration.performDCT) || (app->configuration.size[axis_id] == app->configuration.size[0]))) {
22633 axis->bufferLUT = FFTPlan->axes[0][axis_upload_id].bufferLUT;
22634#if(VKFFT_BACKEND==0)
22635 axis->bufferLUTDeviceMemory = FFTPlan->axes[0][axis_upload_id].bufferLUTDeviceMemory;
22636#endif
22637 axis->bufferLUTSize = FFTPlan->axes[0][axis_upload_id].bufferLUTSize;
22638 axis->referenceLUT = 1;
22639 }
22640 else {
22641 if ((axis_id == 2) && (axis->specializationConstants.fft_dim_full == FFTPlan->axes[1][0].specializationConstants.fft_dim_full) && ((!app->configuration.performDCT) || (app->configuration.size[2] == app->configuration.size[1]))) {
22642 axis->bufferLUT = FFTPlan->axes[1][axis_upload_id].bufferLUT;
22643#if(VKFFT_BACKEND==0)
22644 axis->bufferLUTDeviceMemory = FFTPlan->axes[1][axis_upload_id].bufferLUTDeviceMemory;
22645#endif
22646 axis->bufferLUTSize = FFTPlan->axes[1][axis_upload_id].bufferLUTSize;
22647 axis->referenceLUT = 1;
22648 }
22649 else {
22650#if(VKFFT_BACKEND==0)
22651 resFFT = allocateFFTBuffer(app, &axis->bufferLUT, &axis->bufferLUTDeviceMemory, VK_BUFFER_USAGE_STORAGE_BUFFER_BIT | VK_BUFFER_USAGE_TRANSFER_SRC_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT, VK_MEMORY_HEAP_DEVICE_LOCAL_BIT, axis->bufferLUTSize);
22652 if (resFFT != VKFFT_SUCCESS) {
22653 deleteVkFFT(app);
22654 free(tempLUT);
22655 tempLUT = 0;
22656 return resFFT;
22657 }
22658 resFFT = transferDataFromCPU(app, tempLUT, &axis->bufferLUT, axis->bufferLUTSize);
22659 if (resFFT != VKFFT_SUCCESS) {
22660 deleteVkFFT(app);
22661 free(tempLUT);
22662 tempLUT = 0;
22663 return resFFT;
22664 }
22665#elif(VKFFT_BACKEND==1)
22666 res = cudaMalloc((void**)&axis->bufferLUT, axis->bufferLUTSize);
22667 if (res != cudaSuccess) {
22668 deleteVkFFT(app);
22669 free(tempLUT);
22670 tempLUT = 0;
22672 }
22673 res = cudaMemcpy(axis->bufferLUT, tempLUT, axis->bufferLUTSize, cudaMemcpyHostToDevice);
22674 if (res != cudaSuccess) {
22675 deleteVkFFT(app);
22676 free(tempLUT);
22677 tempLUT = 0;
22679 }
22680#elif(VKFFT_BACKEND==2)
22681 res = hipMalloc((void**)&axis->bufferLUT, axis->bufferLUTSize);
22682 if (res != hipSuccess) {
22683 deleteVkFFT(app);
22684 free(tempLUT);
22685 tempLUT = 0;
22687 }
22688 res = hipMemcpy(axis->bufferLUT, tempLUT, axis->bufferLUTSize, hipMemcpyHostToDevice);
22689 if (res != hipSuccess) {
22690 deleteVkFFT(app);
22691 free(tempLUT);
22692 tempLUT = 0;
22694 }
22695#elif(VKFFT_BACKEND==3)
22696 axis->bufferLUT = clCreateBuffer(app->configuration.context[0], CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, axis->bufferLUTSize, tempLUT, &res);
22697 if (res != CL_SUCCESS) {
22698 deleteVkFFT(app);
22699 free(tempLUT);
22700 tempLUT = 0;
22702 }
22703#endif
22704 }
22705 }
22706 }
22707 }
22708 free(tempLUT);
22709 tempLUT = 0;
22710 }
22711 else {
22712 if (axis_upload_id > 0) {
22713 if ((app->configuration.performDCT == 2) || (app->configuration.performDCT == 3)) {
22714 axis->specializationConstants.startDCT3LUT = (maxStageSum + axis->specializationConstants.stageStartSize * axis->specializationConstants.fftDim);
22715 axis->bufferLUTSize = (maxStageSum + axis->specializationConstants.stageStartSize * axis->specializationConstants.fftDim + (app->configuration.size[axis_id] / 2 + 2)) * 2 * sizeof(float);
22716 }
22717 else {
22718 if ((app->configuration.performDCT == 4) && (app->configuration.size[axis_id] % 2 == 0)) {
22719 axis->specializationConstants.startDCT3LUT = (maxStageSum + axis->specializationConstants.stageStartSize * axis->specializationConstants.fftDim);
22720 axis->specializationConstants.startDCT4LUT = (axis->specializationConstants.startDCT3LUT + (axis->specializationConstants.fftDim / 4 + 2));
22721 axis->bufferLUTSize = (maxStageSum + axis->specializationConstants.stageStartSize * axis->specializationConstants.fftDim + (app->configuration.size[axis_id] / 4 + 2) + app->configuration.size[axis_id] / 2) * 2 * sizeof(float);
22722 }
22723 else
22724 axis->bufferLUTSize = (maxStageSum + axis->specializationConstants.stageStartSize * axis->specializationConstants.fftDim) * 2 * sizeof(float);
22725 }
22726 }
22727 else {
22728 if ((app->configuration.performDCT == 2) || (app->configuration.performDCT == 3)) {
22729 axis->specializationConstants.startDCT3LUT = (maxStageSum);
22730 axis->bufferLUTSize = (maxStageSum + (app->configuration.size[axis_id] / 2 + 2)) * 2 * sizeof(float);
22731 }
22732 else {
22733 if ((app->configuration.performDCT == 4) && (app->configuration.size[axis_id] % 2 == 0)) {
22734 axis->specializationConstants.startDCT3LUT = (maxStageSum);
22735 axis->specializationConstants.startDCT4LUT = (axis->specializationConstants.startDCT3LUT + (app->configuration.size[axis_id] / 4 + 2));
22736 axis->bufferLUTSize = (maxStageSum + (app->configuration.size[axis_id] / 4 + 2) + app->configuration.size[axis_id] / 2) * 2 * sizeof(float);
22737 }
22738 else
22739 axis->bufferLUTSize = (maxStageSum) * 2 * sizeof(float);
22740 }
22741 }
22742 float* tempLUT = (float*)malloc(axis->bufferLUTSize);
22743 if (!tempLUT) {
22744 deleteVkFFT(app);
22746 }
22747 uint64_t localStageSize = 1;
22748 uint64_t localStageSum = 0;
22749 for (uint64_t i = 0; i < axis->specializationConstants.numStages; i++) {
22750 if ((axis->specializationConstants.stageRadix[i] & (axis->specializationConstants.stageRadix[i] - 1)) == 0) {
22751 for (uint64_t k = 0; k < log2(axis->specializationConstants.stageRadix[i]); k++) {
22752 for (uint64_t j = 0; j < localStageSize; j++) {
22753 tempLUT[2 * (j + localStageSum)] = (float)cos(j * double_PI / localStageSize / pow(2, k));
22754 tempLUT[2 * (j + localStageSum) + 1] = (float)sin(j * double_PI / localStageSize / pow(2, k));
22755 }
22756 localStageSum += localStageSize;
22757 }
22758 localStageSize *= axis->specializationConstants.stageRadix[i];
22759 }
22760 else {
22761 for (uint64_t k = (axis->specializationConstants.stageRadix[i] - 1); k > 0; k--) {
22762 for (uint64_t j = 0; j < localStageSize; j++) {
22763 tempLUT[2 * (j + localStageSum)] = (float)cos(j * 2.0 * k / axis->specializationConstants.stageRadix[i] * double_PI / localStageSize);
22764 tempLUT[2 * (j + localStageSum) + 1] = (float)sin(j * 2.0 * k / axis->specializationConstants.stageRadix[i] * double_PI / localStageSize);
22765 }
22766 localStageSum += localStageSize;
22767 }
22768 localStageSize *= axis->specializationConstants.stageRadix[i];
22769 }
22770 }
22771
22772 if (axis_upload_id > 0) {
22773 for (uint64_t i = 0; i < axis->specializationConstants.stageStartSize; i++) {
22774 for (uint64_t j = 0; j < axis->specializationConstants.fftDim; j++) {
22775 double angle = 2 * double_PI * ((i * j) / (double)(axis->specializationConstants.stageStartSize * axis->specializationConstants.fftDim));
22776 tempLUT[maxStageSum * 2 + 2 * (i + j * axis->specializationConstants.stageStartSize)] = (float)cos(angle);
22777 tempLUT[maxStageSum * 2 + 2 * (i + j * axis->specializationConstants.stageStartSize) + 1] = (float)sin(angle);
22778 }
22779 }
22780 }
22781 if ((app->configuration.performDCT == 2) || (app->configuration.performDCT == 3)) {
22782 for (uint64_t j = 0; j < app->configuration.size[axis_id] / 2 + 2; j++) {
22783 double angle = (double_PI / 2.0 / (double)(app->configuration.size[axis_id])) * j;
22784 tempLUT[2 * axis->specializationConstants.startDCT3LUT + 2 * j] = (float)cos(angle);
22785 tempLUT[2 * axis->specializationConstants.startDCT3LUT + 2 * j + 1] = (float)sin(angle);
22786 }
22787 }
22788 if ((app->configuration.performDCT == 4) && (app->configuration.size[axis_id] % 2 == 0)) {
22789 for (uint64_t j = 0; j < app->configuration.size[axis_id] / 4 + 2; j++) {
22790 double angle = (double_PI / 2.0 / (double)(app->configuration.size[axis_id] / 2)) * j;
22791 tempLUT[2 * axis->specializationConstants.startDCT3LUT + 2 * j] = (float)cos(angle);
22792 tempLUT[2 * axis->specializationConstants.startDCT3LUT + 2 * j + 1] = (float)sin(angle);
22793 }
22794 for (uint64_t j = 0; j < app->configuration.size[axis_id] / 2; j++) {
22795 double angle = (-double_PI / 8.0 / (double)(app->configuration.size[axis_id] / 2)) * (2 * j + 1);
22796 tempLUT[2 * axis->specializationConstants.startDCT4LUT + 2 * j] = (float)cos(angle);
22797 tempLUT[2 * axis->specializationConstants.startDCT4LUT + 2 * j + 1] = (float)sin(angle);
22798 }
22799 }
22800 axis->referenceLUT = 0;
22801 if (reverseBluesteinMultiUpload == 1) {
22802 axis->bufferLUT = FFTPlan->axes[axis_id][axis_upload_id].bufferLUT;
22803#if(VKFFT_BACKEND==0)
22804 axis->bufferLUTDeviceMemory = FFTPlan->axes[axis_id][axis_upload_id].bufferLUTDeviceMemory;
22805#endif
22806 axis->bufferLUTSize = FFTPlan->axes[axis_id][axis_upload_id].bufferLUTSize;
22807 axis->referenceLUT = 1;
22808 }
22809 else {
22810 if ((!inverse) && (!app->configuration.makeForwardPlanOnly)) {
22811 axis->bufferLUT = app->localFFTPlan_inverse->axes[axis_id][axis_upload_id].bufferLUT;
22812#if(VKFFT_BACKEND==0)
22813 axis->bufferLUTDeviceMemory = app->localFFTPlan_inverse->axes[axis_id][axis_upload_id].bufferLUTDeviceMemory;
22814#endif
22815 axis->bufferLUTSize = app->localFFTPlan_inverse->axes[axis_id][axis_upload_id].bufferLUTSize;
22816 axis->referenceLUT = 1;
22817 }
22818 else {
22819 if (((axis_id == 1) || (axis_id == 2)) && (!((!axis->specializationConstants.reorderFourStep) && (FFTPlan->numAxisUploads[axis_id] > 1))) && ((axis->specializationConstants.fft_dim_full == FFTPlan->axes[0][0].specializationConstants.fft_dim_full) && (FFTPlan->numAxisUploads[axis_id] == 1) && (axis->specializationConstants.fft_dim_full < maxSingleSizeStrided / axis->specializationConstants.registerBoost)) && ((!app->configuration.performDCT) || (app->configuration.size[axis_id] == app->configuration.size[0]))) {
22820 axis->bufferLUT = FFTPlan->axes[0][axis_upload_id].bufferLUT;
22821#if(VKFFT_BACKEND==0)
22822 axis->bufferLUTDeviceMemory = FFTPlan->axes[0][axis_upload_id].bufferLUTDeviceMemory;
22823#endif
22824 axis->bufferLUTSize = FFTPlan->axes[0][axis_upload_id].bufferLUTSize;
22825 axis->referenceLUT = 1;
22826 }
22827 else {
22828 if ((axis_id == 2) && (axis->specializationConstants.fft_dim_full == FFTPlan->axes[1][0].specializationConstants.fft_dim_full) && ((!app->configuration.performDCT) || (app->configuration.size[2] == app->configuration.size[1]))) {
22829 axis->bufferLUT = FFTPlan->axes[1][axis_upload_id].bufferLUT;
22830#if(VKFFT_BACKEND==0)
22831 axis->bufferLUTDeviceMemory = FFTPlan->axes[1][axis_upload_id].bufferLUTDeviceMemory;
22832#endif
22833 axis->bufferLUTSize = FFTPlan->axes[1][axis_upload_id].bufferLUTSize;
22834 axis->referenceLUT = 1;
22835 }
22836 else {
22837#if(VKFFT_BACKEND==0)
22838 resFFT = allocateFFTBuffer(app, &axis->bufferLUT, &axis->bufferLUTDeviceMemory, VK_BUFFER_USAGE_STORAGE_BUFFER_BIT | VK_BUFFER_USAGE_TRANSFER_SRC_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT, VK_MEMORY_HEAP_DEVICE_LOCAL_BIT, axis->bufferLUTSize);
22839 if (resFFT != VKFFT_SUCCESS) {
22840 deleteVkFFT(app);
22841 free(tempLUT);
22842 tempLUT = 0;
22843 return resFFT;
22844 }
22845 resFFT = transferDataFromCPU(app, tempLUT, &axis->bufferLUT, axis->bufferLUTSize);
22846 if (resFFT != VKFFT_SUCCESS) {
22847 deleteVkFFT(app);
22848 free(tempLUT);
22849 tempLUT = 0;
22850 return resFFT;
22851 }
22852#elif(VKFFT_BACKEND==1)
22853 res = cudaMalloc((void**)&axis->bufferLUT, axis->bufferLUTSize);
22854 if (res != cudaSuccess) {
22855 deleteVkFFT(app);
22856 free(tempLUT);
22857 tempLUT = 0;
22859 }
22860 res = cudaMemcpy(axis->bufferLUT, tempLUT, axis->bufferLUTSize, cudaMemcpyHostToDevice);
22861 if (res != cudaSuccess) {
22862 deleteVkFFT(app);
22863 free(tempLUT);
22864 tempLUT = 0;
22866 }
22867#elif(VKFFT_BACKEND==2)
22868 res = hipMalloc((void**)&axis->bufferLUT, axis->bufferLUTSize);
22869 if (res != hipSuccess) {
22870 deleteVkFFT(app);
22871 free(tempLUT);
22872 tempLUT = 0;
22874 }
22875 res = hipMemcpy(axis->bufferLUT, tempLUT, axis->bufferLUTSize, hipMemcpyHostToDevice);
22876 if (res != hipSuccess) {
22877 deleteVkFFT(app);
22878 free(tempLUT);
22879 tempLUT = 0;
22881 }
22882#elif(VKFFT_BACKEND==3)
22883 axis->bufferLUT = clCreateBuffer(app->configuration.context[0], CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, axis->bufferLUTSize, tempLUT, &res);
22884 if (res != CL_SUCCESS) {
22885 deleteVkFFT(app);
22886 free(tempLUT);
22887 tempLUT = 0;
22889 }
22890#endif
22891 }
22892 }
22893 }
22894 }
22895 free(tempLUT);
22896 tempLUT = 0;
22897 }
22898 }
22899
22900 //configure strides
22901
22902 uint64_t* axisStride = axis->specializationConstants.inputStride;
22903 uint64_t* usedStride = app->configuration.bufferStride;
22904 if ((!inverse) && (axis_id == app->firstAxis) && (axis_upload_id == FFTPlan->numAxisUploads[axis_id] - 1) && (app->configuration.isInputFormatted)) usedStride = app->configuration.inputBufferStride;
22905 if ((inverse) && (axis_id == app->lastAxis) && (((axis_upload_id == FFTPlan->numAxisUploads[axis_id] - 1) && (axis->specializationConstants.reorderFourStep)) || ((axis_upload_id == 0) && (!axis->specializationConstants.reorderFourStep))) && (app->configuration.isInputFormatted) && (!app->configuration.inverseReturnToInputBuffer)) usedStride = app->configuration.inputBufferStride;
22906
22907 axisStride[0] = 1;
22908
22909 if (axis_id == 0) {
22910 axisStride[1] = usedStride[0];
22911 axisStride[2] = usedStride[1];
22912 }
22913 if (axis_id == 1)
22914 {
22915 axisStride[1] = usedStride[0];
22916 axisStride[2] = usedStride[1];
22917 }
22918 if (axis_id == 2)
22919 {
22920 axisStride[1] = usedStride[1];
22921 axisStride[2] = usedStride[0];
22922 }
22923
22924 axisStride[3] = usedStride[2];
22925
22926 axisStride[4] = axisStride[3] * app->configuration.coordinateFeatures;
22927 if (app->useBluesteinFFT[axis_id] && (FFTPlan->numAxisUploads[axis_id] > 1) && (!((axis_upload_id == FFTPlan->numAxisUploads[axis_id] - 1) && (reverseBluesteinMultiUpload == 0)))) {
22928 axisStride[0] = 1;
22929
22930 if (axis_id == 0) {
22931 axisStride[1] = FFTPlan->actualFFTSizePerAxis[axis_id][0];
22932 axisStride[2] = FFTPlan->actualFFTSizePerAxis[axis_id][0] * FFTPlan->actualFFTSizePerAxis[axis_id][1];
22933 }
22934 if (axis_id == 1)
22935 {
22936 axisStride[1] = FFTPlan->actualFFTSizePerAxis[axis_id][0];
22937 axisStride[2] = FFTPlan->actualFFTSizePerAxis[axis_id][0] * FFTPlan->actualFFTSizePerAxis[axis_id][1];
22938 }
22939 if (axis_id == 2)
22940 {
22941 axisStride[1] = FFTPlan->actualFFTSizePerAxis[axis_id][0] * FFTPlan->actualFFTSizePerAxis[axis_id][1];
22942 axisStride[2] = FFTPlan->actualFFTSizePerAxis[axis_id][0];
22943 }
22944
22945 axisStride[3] = axisStride[2] * FFTPlan->actualFFTSizePerAxis[axis_id][2];
22946
22947 axisStride[4] = axisStride[3] * app->configuration.coordinateFeatures;
22948 }
22949 if ((!inverse) && (axis_id == 0) && (axis_upload_id == FFTPlan->numAxisUploads[axis_id] - 1) && (reverseBluesteinMultiUpload == 0) && (axis->specializationConstants.performR2C) && (!(app->configuration.isInputFormatted))) {
22950 axisStride[1] *= 2;
22951 axisStride[2] *= 2;
22952 axisStride[3] *= 2;
22953 axisStride[4] *= 2;
22954 }
22955 if ((FFTPlan->multiUploadR2C) && (!inverse) && (axis_id == 0) && (axis_upload_id == FFTPlan->numAxisUploads[axis_id] - 1) && (reverseBluesteinMultiUpload == 0)) {
22956 for (uint64_t i = 1; i < 5; i++) {
22957 axisStride[i] /= 2;
22958 }
22959 }
22960 axisStride = axis->specializationConstants.outputStride;
22961 usedStride = app->configuration.bufferStride;
22962 if ((!inverse) && (axis_id == app->lastAxis) && (axis_upload_id == 0) && (app->configuration.isOutputFormatted)) usedStride = app->configuration.outputBufferStride;
22963 if ((inverse) && (axis_id == app->firstAxis) && (((axis_upload_id == 0) && (axis->specializationConstants.reorderFourStep)) || ((axis_upload_id == FFTPlan->numAxisUploads[axis_id] - 1) && (!axis->specializationConstants.reorderFourStep))) && ((app->configuration.isOutputFormatted))) usedStride = app->configuration.outputBufferStride;
22964 if ((inverse) && (axis_id == app->firstAxis) && (((axis_upload_id == 0) && (app->configuration.isInputFormatted)) || ((axis_upload_id == FFTPlan->numAxisUploads[axis_id] - 1) && (!axis->specializationConstants.reorderFourStep))) && (app->configuration.inverseReturnToInputBuffer)) usedStride = app->configuration.inputBufferStride;
22965
22966 axisStride[0] = 1;
22967
22968 if (axis_id == 0) {
22969 axisStride[1] = usedStride[0];
22970 axisStride[2] = usedStride[1];
22971 }
22972 if (axis_id == 1)
22973 {
22974 axisStride[1] = usedStride[0];
22975 axisStride[2] = usedStride[1];
22976 }
22977 if (axis_id == 2)
22978 {
22979 axisStride[1] = usedStride[1];
22980 axisStride[2] = usedStride[0];
22981 }
22982
22983 axisStride[3] = usedStride[2];
22984
22985 axisStride[4] = axisStride[3] * app->configuration.coordinateFeatures;
22986 if (app->useBluesteinFFT[axis_id] && (FFTPlan->numAxisUploads[axis_id] > 1) && (!((axis_upload_id == FFTPlan->numAxisUploads[axis_id] - 1) && (reverseBluesteinMultiUpload == 1)))) {
22987 axisStride[0] = 1;
22988
22989 if (axis_id == 0) {
22990 axisStride[1] = FFTPlan->actualFFTSizePerAxis[axis_id][0];
22991 axisStride[2] = FFTPlan->actualFFTSizePerAxis[axis_id][0] * FFTPlan->actualFFTSizePerAxis[axis_id][1];
22992 }
22993 if (axis_id == 1)
22994 {
22995 axisStride[1] = FFTPlan->actualFFTSizePerAxis[axis_id][0];
22996 axisStride[2] = FFTPlan->actualFFTSizePerAxis[axis_id][0] * FFTPlan->actualFFTSizePerAxis[axis_id][1];
22997 }
22998 if (axis_id == 2)
22999 {
23000 axisStride[1] = FFTPlan->actualFFTSizePerAxis[axis_id][0] * FFTPlan->actualFFTSizePerAxis[axis_id][1];
23001 axisStride[2] = FFTPlan->actualFFTSizePerAxis[axis_id][0];
23002 }
23003
23004 axisStride[3] = axisStride[2] * FFTPlan->actualFFTSizePerAxis[axis_id][2];
23005
23006 axisStride[4] = axisStride[3] * app->configuration.coordinateFeatures;
23007 }
23008 if ((inverse) && (axis_id == 0) && (((!app->useBluesteinFFT[axis_id]) && (axis_upload_id == 0)) || ((app->useBluesteinFFT[axis_id]) && (axis_upload_id == FFTPlan->numAxisUploads[axis_id] - 1) && ((reverseBluesteinMultiUpload == 1) || (FFTPlan->numAxisUploads[axis_id] == 1)))) && (axis->specializationConstants.performR2C) && (!((app->configuration.isInputFormatted) && (app->configuration.inverseReturnToInputBuffer))) && (!app->configuration.isOutputFormatted)) {
23009 axisStride[1] *= 2;
23010 axisStride[2] *= 2;
23011 axisStride[3] *= 2;
23012 axisStride[4] *= 2;
23013 }
23014 if ((FFTPlan->multiUploadR2C) && (inverse) && (axis_id == 0) && (((!app->useBluesteinFFT[axis_id]) && (axis_upload_id == 0)) || ((app->useBluesteinFFT[axis_id]) && (axis_upload_id == FFTPlan->numAxisUploads[axis_id] - 1) && ((reverseBluesteinMultiUpload == 1) || (FFTPlan->numAxisUploads[axis_id] == 1))))) {
23015 for (uint64_t i = 1; i < 5; i++) {
23016 axisStride[i] /= 2;
23017 }
23018 }
23019
23020 /*axis->specializationConstants.inputStride[3] = (app->configuration.coordinateFeatures == 1) ? 0 : axis->specializationConstants.inputStride[3];
23021 axis->specializationConstants.outputStride[3] = (app->configuration.coordinateFeatures == 1) ? 0 : axis->specializationConstants.outputStride[3];
23022
23023 axis->specializationConstants.inputStride[4] = ((app->configuration.numberBatches == 1) && (app->configuration.numberKernels == 1)) ? 0 : axis->specializationConstants.inputStride[3] * app->configuration.coordinateFeatures;
23024 axis->specializationConstants.outputStride[4] = ((app->configuration.numberBatches == 1) && (app->configuration.numberKernels == 1)) ? 0 : axis->specializationConstants.outputStride[3] * app->configuration.coordinateFeatures;
23025 */
23026
23027
23028 uint64_t storageComplexSize;
23030 storageComplexSize = (2 * sizeof(double));
23031 else
23033 storageComplexSize = (2 * 2);
23034 else
23035 storageComplexSize = (2 * sizeof(float));
23036
23037 uint64_t initPageSize = -1;
23038 uint64_t locBufferNum = 1;
23039 uint64_t locBufferSize = -1;
23040 /*for (uint64_t i = 0; i < app->configuration.bufferNum; i++) {
23041 initPageSize += app->configuration.bufferSize[i];
23042 }*/
23043 /*if (app->configuration.performConvolution) {
23044 uint64_t initPageSizeKernel = 0;
23045 for (uint64_t i = 0; i < app->configuration.kernelNum; i++) {
23046 initPageSizeKernel += app->configuration.kernelSize[i];
23047 }
23048 if (initPageSizeKernel > initPageSize) initPageSize = initPageSizeKernel;
23049 }
23050 if (axis_id == 0) {
23051 if ((!((!axis->specializationConstants.reorderFourStep) && (axis_upload_id == 0))) && (axis->specializationConstants.inputStride[1] * storageComplexSize > app->configuration.devicePageSize * 1024) && (app->configuration.devicePageSize > 0)) {
23052 initPageSize = app->configuration.localPageSize * 1024;
23053 }
23054 }
23055 if (axis_id == 1) {
23056 if ((app->configuration.bufferStride[1] * storageComplexSize > app->configuration.devicePageSize * 1024) && (app->configuration.devicePageSize > 0)) {
23057 initPageSize = app->configuration.localPageSize * 1024;
23058 }
23059 }
23060 if (axis_id == 2) {
23061 if ((app->configuration.bufferStride[2] * storageComplexSize > app->configuration.devicePageSize * 1024) && (app->configuration.devicePageSize > 0)) {
23062 initPageSize = app->configuration.localPageSize * 1024;
23063 }
23064 }
23065 */
23066 if ((axis_upload_id == FFTPlan->numAxisUploads[axis_id] - 1) && (app->configuration.isInputFormatted) && (!axis->specializationConstants.reverseBluesteinMultiUpload) && (
23067 ((axis_id == app->firstAxis) && (!inverse))
23068 || ((axis_id == app->lastAxis) && (inverse) && (!((axis_id == 0) && (axis->specializationConstants.performR2CmultiUpload))) && (!app->configuration.performConvolution) && (!app->configuration.inverseReturnToInputBuffer)))
23069 ) {
23070 uint64_t totalSize = 0;
23071 uint64_t locPageSize = initPageSize;
23072 locBufferNum = app->configuration.inputBufferNum;
23073 if (app->configuration.inputBufferSize) {
23074 locBufferSize = (uint64_t)ceil(app->configuration.inputBufferSize[0] / (double)storageComplexSize);
23075 for (uint64_t i = 0; i < app->configuration.inputBufferNum; i++) {
23076 totalSize += app->configuration.inputBufferSize[i];
23077 if (app->configuration.inputBufferSize[i] < locPageSize) locPageSize = app->configuration.inputBufferSize[i];
23078 }
23079 }
23080 axis->specializationConstants.inputBufferBlockSize = (locBufferNum == 1) ? locBufferSize : (uint64_t)ceil(locPageSize / (double)storageComplexSize);
23081 axis->specializationConstants.inputBufferBlockNum = (locBufferNum == 1) ? 1 : (uint64_t)ceil(totalSize / (double)(axis->specializationConstants.inputBufferBlockSize * storageComplexSize));
23082 //if (axis->specializationConstants.inputBufferBlockNum == 1) axis->specializationConstants.inputBufferBlockSize = totalSize / storageComplexSize;
23083
23084 }
23085 else {
23086 if ((axis_upload_id == 0) && (app->configuration.numberKernels > 1) && (inverse) && (!app->configuration.performConvolution)) {
23087 uint64_t totalSize = 0;
23088 uint64_t locPageSize = initPageSize;
23089 locBufferNum = app->configuration.outputBufferNum;
23091 locBufferSize = (uint64_t)ceil(app->configuration.outputBufferSize[0] / (double)storageComplexSize);
23092 for (uint64_t i = 0; i < app->configuration.outputBufferNum; i++) {
23093 totalSize += app->configuration.outputBufferSize[i];
23094 if (app->configuration.outputBufferSize[i] < locPageSize) locPageSize = app->configuration.outputBufferSize[i];
23095 }
23096 }
23097 axis->specializationConstants.inputBufferBlockSize = (locBufferNum == 1) ? locBufferSize : (uint64_t)ceil(locPageSize / (double)storageComplexSize);
23098 axis->specializationConstants.inputBufferBlockNum = (locBufferNum == 1) ? 1 : (uint64_t)ceil(totalSize / (double)(axis->specializationConstants.inputBufferBlockSize * storageComplexSize));
23099 //if (axis->specializationConstants.inputBufferBlockNum == 1) axis->specializationConstants.outputBufferBlockSize = totalSize / storageComplexSize;
23100
23101 }
23102 else {
23103 uint64_t totalSize = 0;
23104 uint64_t locPageSize = initPageSize;
23105 if (((axis->specializationConstants.reorderFourStep == 1) || (app->useBluesteinFFT[axis_id])) && (FFTPlan->numAxisUploads[axis_id] > 1)) {
23106 if (((axis->specializationConstants.reorderFourStep == 1) && (axis_upload_id > 0)) || (app->useBluesteinFFT[axis_id] && (reverseBluesteinMultiUpload == 0) && (axis_upload_id == FFTPlan->numAxisUploads[axis_id] - 1))) {
23107 locBufferNum = app->configuration.bufferNum;
23108 if (app->configuration.bufferSize) {
23109 locBufferSize = (uint64_t)ceil(app->configuration.bufferSize[0] / (double)storageComplexSize);
23110 for (uint64_t i = 0; i < app->configuration.bufferNum; i++) {
23111 totalSize += app->configuration.bufferSize[i];
23112 if (app->configuration.bufferSize[i] < locPageSize) locPageSize = app->configuration.bufferSize[i];
23113
23114 }
23115 }
23116 }
23117 else {
23118 locBufferNum = app->configuration.tempBufferNum;
23119 if (app->configuration.tempBufferSize) {
23120 locBufferSize = (uint64_t)ceil(app->configuration.tempBufferSize[0] / (double)storageComplexSize);
23121 for (uint64_t i = 0; i < app->configuration.tempBufferNum; i++) {
23122 totalSize += app->configuration.tempBufferSize[i];
23123 if (app->configuration.tempBufferSize[i] < locPageSize) locPageSize = app->configuration.tempBufferSize[i];
23124
23125 }
23126 }
23127 }
23128 }
23129 else {
23130 locBufferNum = app->configuration.bufferNum;
23131 if (app->configuration.bufferSize) {
23132 locBufferSize = (uint64_t)ceil(app->configuration.bufferSize[0] / (double)storageComplexSize);
23133 for (uint64_t i = 0; i < app->configuration.bufferNum; i++) {
23134 totalSize += app->configuration.bufferSize[i];
23135 if (app->configuration.bufferSize[i] < locPageSize) locPageSize = app->configuration.bufferSize[i];
23136
23137 }
23138 }
23139 }
23140
23141 axis->specializationConstants.inputBufferBlockSize = (locBufferNum == 1) ? locBufferSize : (uint64_t)ceil(locPageSize / (double)storageComplexSize);
23142 axis->specializationConstants.inputBufferBlockNum = (locBufferNum == 1) ? 1 : (uint64_t)ceil(totalSize / (double)(axis->specializationConstants.inputBufferBlockSize * storageComplexSize));
23143 //if (axis->specializationConstants.inputBufferBlockNum == 1) axis->specializationConstants.inputBufferBlockSize = totalSize / storageComplexSize;
23144
23145 }
23146 }
23147 initPageSize = -1;
23148 locBufferNum = 1;
23149 locBufferSize = -1;
23150 if (((axis_upload_id == 0) && (!app->useBluesteinFFT[axis_id]) && (app->configuration.isOutputFormatted && (
23151 ((axis_id == app->firstAxis) && (inverse))
23152 || ((axis_id == app->lastAxis) && (!inverse) && (!app->configuration.performConvolution))
23153 || ((axis_id == app->firstAxis) && (app->configuration.performConvolution) && (app->configuration.FFTdim == 1)))
23154 )) ||
23155 ((axis_upload_id == FFTPlan->numAxisUploads[axis_id] - 1) && (app->useBluesteinFFT[axis_id]) && (axis->specializationConstants.reverseBluesteinMultiUpload || (FFTPlan->numAxisUploads[axis_id] == 1)) && (app->configuration.isOutputFormatted && (
23156 ((axis_id == app->firstAxis) && (inverse))
23157 || ((axis_id == app->lastAxis) && (!inverse) && (!app->configuration.performConvolution)))
23158 )) ||
23159 ((app->configuration.numberKernels > 1) && (
23160 (inverse)
23161 || (axis_id == app->lastAxis)))
23162 ) {
23163 uint64_t totalSize = 0;
23164 uint64_t locPageSize = initPageSize;
23165 locBufferNum = app->configuration.outputBufferNum;
23167 locBufferSize = (uint64_t)ceil(app->configuration.outputBufferSize[0] / (double)storageComplexSize);
23168 for (uint64_t i = 0; i < app->configuration.outputBufferNum; i++) {
23169 totalSize += app->configuration.outputBufferSize[i];
23170 if (app->configuration.outputBufferSize[i] < locPageSize) locPageSize = app->configuration.outputBufferSize[i];
23171 }
23172 }
23173 axis->specializationConstants.outputBufferBlockSize = (locBufferNum == 1) ? locBufferSize : (uint64_t)ceil(locPageSize / (double)storageComplexSize);
23174 axis->specializationConstants.outputBufferBlockNum = (locBufferNum == 1) ? 1 : (uint64_t)ceil(totalSize / (double)(axis->specializationConstants.outputBufferBlockSize * storageComplexSize));
23175 //if (axis->specializationConstants.outputBufferBlockNum == 1) axis->specializationConstants.outputBufferBlockSize = totalSize / storageComplexSize;
23176
23177 }
23178 else {
23179 uint64_t totalSize = 0;
23180 uint64_t locPageSize = initPageSize;
23181 if (((axis->specializationConstants.reorderFourStep == 1) || (app->useBluesteinFFT[axis_id])) && (FFTPlan->numAxisUploads[axis_id] > 1)) {
23182 if (((axis->specializationConstants.reorderFourStep == 1) && (axis_upload_id == 1)) || (app->useBluesteinFFT[axis_id] && (!((axis_upload_id == FFTPlan->numAxisUploads[axis_id] - 1) && (axis->specializationConstants.reverseBluesteinMultiUpload == 1))))) {
23183 locBufferNum = app->configuration.tempBufferNum;
23184 if (app->configuration.tempBufferSize) {
23185 locBufferSize = (uint64_t)ceil(app->configuration.tempBufferSize[0] / (double)storageComplexSize);
23186 for (uint64_t i = 0; i < app->configuration.tempBufferNum; i++) {
23187 totalSize += app->configuration.tempBufferSize[i];
23188 if (app->configuration.tempBufferSize[i] < locPageSize) locPageSize = app->configuration.tempBufferSize[i];
23189 }
23190 }
23191 }
23192 else {
23193 locBufferNum = app->configuration.bufferNum;
23194 if (app->configuration.bufferSize) {
23195 locBufferSize = (uint64_t)ceil(app->configuration.bufferSize[0] / (double)storageComplexSize);
23196 for (uint64_t i = 0; i < app->configuration.bufferNum; i++) {
23197 totalSize += app->configuration.bufferSize[i];
23198 if (app->configuration.bufferSize[i] < locPageSize) locPageSize = app->configuration.bufferSize[i];
23199 }
23200 }
23201 }
23202 }
23203 else {
23204 locBufferNum = app->configuration.bufferNum;
23205 if (app->configuration.bufferSize) {
23206 locBufferSize = (uint64_t)ceil(app->configuration.bufferSize[0] / (double)storageComplexSize);
23207 for (uint64_t i = 0; i < app->configuration.bufferNum; i++) {
23208 totalSize += app->configuration.bufferSize[i];
23209 if (app->configuration.bufferSize[i] < locPageSize) locPageSize = app->configuration.bufferSize[i];
23210 }
23211 }
23212 }
23213 axis->specializationConstants.outputBufferBlockSize = (locBufferNum == 1) ? locBufferSize : (uint64_t)ceil(locPageSize / (double)storageComplexSize);
23214 axis->specializationConstants.outputBufferBlockNum = (locBufferNum == 1) ? 1 : (uint64_t)ceil(totalSize / (double)(axis->specializationConstants.outputBufferBlockSize * storageComplexSize));
23215 //if (axis->specializationConstants.outputBufferBlockNum == 1) axis->specializationConstants.outputBufferBlockSize = totalSize / storageComplexSize;
23216
23217 }
23218 if (axis->specializationConstants.inputBufferBlockNum == 0) axis->specializationConstants.inputBufferBlockNum = 1;
23219 if (axis->specializationConstants.outputBufferBlockNum == 0) axis->specializationConstants.outputBufferBlockNum = 1;
23221 uint64_t totalSize = 0;
23222 uint64_t locPageSize = initPageSize;
23223 locBufferNum = app->configuration.kernelNum;
23224 if (app->configuration.kernelSize) {
23225 locBufferSize = (uint64_t)ceil(app->configuration.kernelSize[0] / (double)storageComplexSize);
23226 for (uint64_t i = 0; i < app->configuration.kernelNum; i++) {
23227 totalSize += app->configuration.kernelSize[i];
23228 if (app->configuration.kernelSize[i] < locPageSize) locPageSize = app->configuration.kernelSize[i];
23229 }
23230 }
23231 axis->specializationConstants.kernelBlockSize = (locBufferNum == 1) ? locBufferSize : (uint64_t)ceil(locPageSize / (double)storageComplexSize);
23232 axis->specializationConstants.kernelBlockNum = (locBufferNum == 1) ? 1 : (uint64_t)ceil(totalSize / (double)(axis->specializationConstants.kernelBlockSize * storageComplexSize));
23233 //if (axis->specializationConstants.kernelBlockNum == 1) axis->specializationConstants.inputBufferBlockSize = totalSize / storageComplexSize;
23234 if (axis->specializationConstants.kernelBlockNum == 0) axis->specializationConstants.kernelBlockNum = 1;
23235 }
23236 else {
23237 axis->specializationConstants.kernelBlockSize = 0;
23238 axis->specializationConstants.kernelBlockNum = 0;
23239 }
23240 axis->numBindings = 2;
23241 axis->specializationConstants.numBuffersBound[0] = axis->specializationConstants.inputBufferBlockNum;
23242 axis->specializationConstants.numBuffersBound[1] = axis->specializationConstants.outputBufferBlockNum;
23243 axis->specializationConstants.numBuffersBound[2] = 0;
23244 axis->specializationConstants.numBuffersBound[3] = 0;
23245#if(VKFFT_BACKEND==0)
23246 VkDescriptorPoolSize descriptorPoolSize = { VK_DESCRIPTOR_TYPE_STORAGE_BUFFER };
23247 descriptorPoolSize.descriptorCount = (uint32_t)(axis->specializationConstants.inputBufferBlockNum + axis->specializationConstants.outputBufferBlockNum);
23248#endif
23249 axis->specializationConstants.convolutionBindingID = -1;
23250 if ((axis_id == 0) && (axis_upload_id == 0) && (app->configuration.FFTdim == 1) && (app->configuration.performConvolution)) {
23251 axis->specializationConstants.convolutionBindingID = axis->numBindings;
23252 axis->specializationConstants.numBuffersBound[axis->numBindings] = axis->specializationConstants.kernelBlockNum;
23253#if(VKFFT_BACKEND==0)
23254 descriptorPoolSize.descriptorCount += (uint32_t)axis->specializationConstants.kernelBlockNum;
23255#endif
23256 axis->numBindings++;
23257 }
23258 if ((axis_id == 1) && (axis_upload_id == 0) && (app->configuration.FFTdim == 2) && (app->configuration.performConvolution)) {
23259 axis->specializationConstants.convolutionBindingID = axis->numBindings;
23260 axis->specializationConstants.numBuffersBound[axis->numBindings] = axis->specializationConstants.kernelBlockNum;
23261#if(VKFFT_BACKEND==0)
23262 descriptorPoolSize.descriptorCount += (uint32_t)axis->specializationConstants.kernelBlockNum;
23263#endif
23264 axis->numBindings++;
23265 }
23266 if ((axis_id == 2) && (axis_upload_id == 0) && (app->configuration.FFTdim == 3) && (app->configuration.performConvolution)) {
23267 axis->specializationConstants.convolutionBindingID = axis->numBindings;
23268 axis->specializationConstants.numBuffersBound[axis->numBindings] = axis->specializationConstants.kernelBlockNum;
23269#if(VKFFT_BACKEND==0)
23270 descriptorPoolSize.descriptorCount += (uint32_t)axis->specializationConstants.kernelBlockNum;
23271#endif
23272 axis->numBindings++;
23273 }
23274 if (app->configuration.useLUT) {
23275 axis->specializationConstants.LUTBindingID = axis->numBindings;
23276 axis->specializationConstants.numBuffersBound[axis->numBindings] = 1;
23277#if(VKFFT_BACKEND==0)
23278 descriptorPoolSize.descriptorCount++;
23279#endif
23280 axis->numBindings++;
23281 }
23282 if ((app->useBluesteinFFT[axis_id]) && (axis_upload_id == 0)) {
23283 if (axis->specializationConstants.inverseBluestein)
23284 axis->bufferBluesteinFFT = &app->bufferBluesteinIFFT[axis_id];
23285 else
23286 axis->bufferBluesteinFFT = &app->bufferBluesteinFFT[axis_id];
23287 axis->specializationConstants.BluesteinConvolutionBindingID = axis->numBindings;
23288 axis->specializationConstants.numBuffersBound[axis->numBindings] = 1;
23289#if(VKFFT_BACKEND==0)
23290 descriptorPoolSize.descriptorCount++;
23291#endif
23292 axis->numBindings++;
23293 }
23294 if ((app->useBluesteinFFT[axis_id]) && (axis_upload_id == (FFTPlan->numAxisUploads[axis_id] - 1))) {
23295 axis->bufferBluestein = &app->bufferBluestein[axis_id];
23296 axis->specializationConstants.BluesteinMultiplicationBindingID = axis->numBindings;
23297 axis->specializationConstants.numBuffersBound[axis->numBindings] = 1;
23298#if(VKFFT_BACKEND==0)
23299 descriptorPoolSize.descriptorCount++;
23300#endif
23301 axis->numBindings++;
23302 }
23303#if(VKFFT_BACKEND==0)
23304 VkDescriptorPoolCreateInfo descriptorPoolCreateInfo = { VK_STRUCTURE_TYPE_DESCRIPTOR_POOL_CREATE_INFO };
23305 descriptorPoolCreateInfo.poolSizeCount = 1;
23306 descriptorPoolCreateInfo.pPoolSizes = &descriptorPoolSize;
23307 descriptorPoolCreateInfo.maxSets = 1;
23308 res = vkCreateDescriptorPool(app->configuration.device[0], &descriptorPoolCreateInfo, 0, &axis->descriptorPool);
23309 if (res != VK_SUCCESS) {
23310 deleteVkFFT(app);
23312 }
23313 const VkDescriptorType descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER;
23314 VkDescriptorSetLayoutBinding* descriptorSetLayoutBindings;
23315 descriptorSetLayoutBindings = (VkDescriptorSetLayoutBinding*)malloc(axis->numBindings * sizeof(VkDescriptorSetLayoutBinding));
23316 if (!descriptorSetLayoutBindings) {
23317 deleteVkFFT(app);
23319 }
23320 for (uint64_t i = 0; i < axis->numBindings; ++i) {
23321 descriptorSetLayoutBindings[i].binding = (uint32_t)i;
23322 descriptorSetLayoutBindings[i].descriptorType = descriptorType;
23323 descriptorSetLayoutBindings[i].descriptorCount = (uint32_t)axis->specializationConstants.numBuffersBound[i];
23324 descriptorSetLayoutBindings[i].stageFlags = VK_SHADER_STAGE_COMPUTE_BIT;
23325 }
23326
23327 VkDescriptorSetLayoutCreateInfo descriptorSetLayoutCreateInfo = { VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_CREATE_INFO };
23328 descriptorSetLayoutCreateInfo.bindingCount = (uint32_t)axis->numBindings;
23329 descriptorSetLayoutCreateInfo.pBindings = descriptorSetLayoutBindings;
23330
23331 res = vkCreateDescriptorSetLayout(app->configuration.device[0], &descriptorSetLayoutCreateInfo, 0, &axis->descriptorSetLayout);
23332 if (res != VK_SUCCESS) {
23333 deleteVkFFT(app);
23335 }
23336 free(descriptorSetLayoutBindings);
23337 descriptorSetLayoutBindings = 0;
23338 VkDescriptorSetAllocateInfo descriptorSetAllocateInfo = { VK_STRUCTURE_TYPE_DESCRIPTOR_SET_ALLOCATE_INFO };
23339 descriptorSetAllocateInfo.descriptorPool = axis->descriptorPool;
23340 descriptorSetAllocateInfo.descriptorSetCount = 1;
23341 descriptorSetAllocateInfo.pSetLayouts = &axis->descriptorSetLayout;
23342 res = vkAllocateDescriptorSets(app->configuration.device[0], &descriptorSetAllocateInfo, &axis->descriptorSet);
23343 if (res != VK_SUCCESS) {
23344 deleteVkFFT(app);
23346 }
23347#endif
23348 resFFT = VkFFTCheckUpdateBufferSet(app, axis, 1, 0);
23349 if (resFFT != VKFFT_SUCCESS) {
23350 deleteVkFFT(app);
23351 return resFFT;
23352 }
23353 resFFT = VkFFTUpdateBufferSet(app, FFTPlan, axis, axis_id, axis_upload_id, inverse);
23354 if (resFFT != VKFFT_SUCCESS) {
23355 deleteVkFFT(app);
23356 return resFFT;
23357 }
23358 {
23359#if(VKFFT_BACKEND==0)
23360 VkPipelineLayoutCreateInfo pipelineLayoutCreateInfo = { VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO };
23361 pipelineLayoutCreateInfo.setLayoutCount = 1;
23362 pipelineLayoutCreateInfo.pSetLayouts = &axis->descriptorSetLayout;
23363
23364 VkPushConstantRange pushConstantRange = { VK_SHADER_STAGE_COMPUTE_BIT };
23365 pushConstantRange.offset = 0;
23366 pushConstantRange.size = (app->configuration.useUint64) ? sizeof(VkFFTPushConstantsLayoutUint64) : sizeof(VkFFTPushConstantsLayoutUint32);
23367 // Push constant ranges are part of the pipeline layout
23368 pipelineLayoutCreateInfo.pushConstantRangeCount = 1;
23369 pipelineLayoutCreateInfo.pPushConstantRanges = &pushConstantRange;
23370
23371 res = vkCreatePipelineLayout(app->configuration.device[0], &pipelineLayoutCreateInfo, 0, &axis->pipelineLayout);
23372 if (res != VK_SUCCESS) {
23373 deleteVkFFT(app);
23375 }
23376#endif
23377 uint64_t maxBatchCoalesced = app->configuration.coalescedMemory / complexSize;
23378 axis->groupedBatch = maxBatchCoalesced;
23379 /*if ((FFTPlan->actualFFTSizePerAxis[axis_id][0] < 4096) && (FFTPlan->actualFFTSizePerAxis[axis_id][1] < 512) && (FFTPlan->actualFFTSizePerAxis[axis_id][2] == 1)) {
23380 if (app->configuration.sharedMemorySize / axis->specializationConstants.fftDim >= app->configuration.coalescedMemory) {
23381 if (1024 / axis->specializationConstants.fftDim < maxSequenceLengthSharedMemory / axis->specializationConstants.fftDim) {
23382 if (1024 / axis->specializationConstants.fftDim > axis->groupedBatch)
23383 axis->groupedBatch = 1024 / axis->specializationConstants.fftDim;
23384 else
23385 axis->groupedBatch = maxSequenceLengthSharedMemory / axis->specializationConstants.fftDim;
23386 }
23387 }
23388 }
23389 else {
23390 axis->groupedBatch = (app->configuration.sharedMemorySize / axis->specializationConstants.fftDim >= app->configuration.coalescedMemory) ? maxSequenceLengthSharedMemory / axis->specializationConstants.fftDim : axis->groupedBatch;
23391 }*/
23392 //if (axis->groupedBatch * (uint64_t)ceil(axis->specializationConstants.fftDim / 8.0) < app->configuration.warpSize) axis->groupedBatch = app->configuration.warpSize / (uint64_t)ceil(axis->specializationConstants.fftDim / 8.0);
23393 //axis->groupedBatch = (app->configuration.sharedMemorySize / axis->specializationConstants.fftDim >= app->configuration.coalescedMemory) ? maxSequenceLengthSharedMemory / axis->specializationConstants.fftDim : axis->groupedBatch;
23394 if (((FFTPlan->numAxisUploads[axis_id] == 1) && (axis_id == 0)) || ((axis_id == 0) && (!axis->specializationConstants.reorderFourStep) && (axis_upload_id == 0))) {
23395 axis->groupedBatch = (maxSequenceLengthSharedMemoryPow2 / axis->specializationConstants.fftDim > axis->groupedBatch) ? maxSequenceLengthSharedMemoryPow2 / axis->specializationConstants.fftDim : axis->groupedBatch;
23396 }
23397 else {
23398 axis->groupedBatch = (maxSingleSizeStridedPow2 / axis->specializationConstants.fftDim > 1) ? maxSingleSizeStridedPow2 / axis->specializationConstants.fftDim * axis->groupedBatch : axis->groupedBatch;
23399 }
23400 //axis->groupedBatch = 8;
23401 //shared memory bank conflict resolve
23402//#if(VKFFT_BACKEND!=2)//for some reason, hip doesn't get performance increase from having variable shared memory strides.
23403 if ((FFTPlan->numAxisUploads[axis_id] == 2) && (axis_upload_id == 0) && (axis->specializationConstants.fftDim * maxBatchCoalesced <= maxSequenceLengthSharedMemory)) {
23404 axis->groupedBatch = (uint64_t)ceil(axis->groupedBatch / 2.0);
23405 }
23406 //#endif
23407 if ((FFTPlan->numAxisUploads[axis_id] == 3) && (axis_upload_id == 0) && (axis->specializationConstants.fftDim < maxSequenceLengthSharedMemory / (2 * complexSize))) {
23408 axis->groupedBatch = (uint64_t)ceil(axis->groupedBatch / 2.0);
23409 }
23410 if (axis->groupedBatch < maxBatchCoalesced) axis->groupedBatch = maxBatchCoalesced;
23411 axis->groupedBatch = (axis->groupedBatch / maxBatchCoalesced) * maxBatchCoalesced;
23412 //half bandwidth technique
23413 if (!((axis_id == 0) && (FFTPlan->numAxisUploads[axis_id] == 1)) && !((axis_id == 0) && (axis_upload_id == 0) && (!axis->specializationConstants.reorderFourStep)) && (axis->specializationConstants.fftDim > maxSingleSizeStrided)) {
23414 axis->groupedBatch = (uint64_t)ceil(axis->groupedBatch / 2.0);
23415 }
23416
23417 if ((app->configuration.halfThreads) && (axis->groupedBatch * axis->specializationConstants.fftDim * complexSize >= app->configuration.sharedMemorySize))
23418 axis->groupedBatch = (uint64_t)ceil(axis->groupedBatch / 2.0);
23419 if (axis->groupedBatch > app->configuration.warpSize) axis->groupedBatch = (axis->groupedBatch / app->configuration.warpSize) * app->configuration.warpSize;
23420 if (axis->groupedBatch > 2 * maxBatchCoalesced) axis->groupedBatch = (axis->groupedBatch / (2 * maxBatchCoalesced)) * (2 * maxBatchCoalesced);
23421 if (axis->groupedBatch > 4 * maxBatchCoalesced) axis->groupedBatch = (axis->groupedBatch / (4 * maxBatchCoalesced)) * (2 * maxBatchCoalesced);
23422 uint64_t maxThreadNum = maxSequenceLengthSharedMemory / (axis->specializationConstants.min_registers_per_thread * axis->specializationConstants.registerBoost);
23423 if (maxThreadNum > app->configuration.maxThreadsNum) maxThreadNum = app->configuration.maxThreadsNum;
23424 axis->specializationConstants.axisSwapped = 0;
23425 uint64_t r2cmult = (axis->specializationConstants.mergeSequencesR2C) ? 2 : 1;
23426 if (axis_id == 0) {
23427
23428 if (axis_upload_id == 0) {
23429 axis->axisBlock[0] = (axis->specializationConstants.fftDim / axis->specializationConstants.min_registers_per_thread / axis->specializationConstants.registerBoost > 1) ? axis->specializationConstants.fftDim / axis->specializationConstants.min_registers_per_thread / axis->specializationConstants.registerBoost : 1;
23430 if (axis->axisBlock[0] > maxThreadNum) axis->axisBlock[0] = maxThreadNum;
23431 if (axis->axisBlock[0] > app->configuration.maxComputeWorkGroupSize[0]) axis->axisBlock[0] = app->configuration.maxComputeWorkGroupSize[0];
23432 if (axis->specializationConstants.reorderFourStep && (FFTPlan->numAxisUploads[axis_id] > 1))
23433 axis->axisBlock[1] = axis->groupedBatch;
23434 else {
23435 //axis->axisBlock[1] = (axis->axisBlock[0] < app->configuration.warpSize) ? app->configuration.warpSize / axis->axisBlock[0] : 1;
23436 axis->axisBlock[1] = ((axis->axisBlock[0] < app->configuration.aimThreads) && ((axis->axisBlock[0] < 32) || ((axis->axisBlock[0] & (axis->axisBlock[0] - 1)) != 0))) ? app->configuration.aimThreads / axis->axisBlock[0] : 1;
23437 }
23438 uint64_t currentAxisBlock1 = axis->axisBlock[1];
23439 for (uint64_t i = currentAxisBlock1; i < 2 * currentAxisBlock1; i++) {
23440 if (((FFTPlan->numAxisUploads[0] > 1) && (((FFTPlan->actualFFTSizePerAxis[axis_id][0] / axis->specializationConstants.fftDim) % i) == 0)) || ((FFTPlan->numAxisUploads[0] == 1) && (((FFTPlan->actualFFTSizePerAxis[axis_id][1] / r2cmult) % i) == 0))) {
23441 if (i * axis->specializationConstants.fftDim * complexSize <= app->configuration.sharedMemorySize) axis->axisBlock[1] = i;
23442 i = 2 * currentAxisBlock1;
23443 }
23444 }
23445
23446 if ((FFTPlan->numAxisUploads[0] > 1) && ((uint64_t)ceil(FFTPlan->actualFFTSizePerAxis[axis_id][0] / axis->specializationConstants.fftDim) < axis->axisBlock[1])) axis->axisBlock[1] = (uint64_t)ceil(FFTPlan->actualFFTSizePerAxis[axis_id][0] / axis->specializationConstants.fftDim);
23447 if ((axis->specializationConstants.mergeSequencesR2C != 0) && (axis->specializationConstants.fftDim * axis->axisBlock[1] >= maxSequenceLengthSharedMemory)) {
23448 axis->specializationConstants.mergeSequencesR2C = 0;
23449 /*if ((!inverse) && (axis_id == 0) && (axis_upload_id == 0) && (!(app->configuration.isInputFormatted))) {
23450 axis->specializationConstants.inputStride[1] /= 2;
23451 axis->specializationConstants.inputStride[2] /= 2;
23452 axis->specializationConstants.inputStride[3] /= 2;
23453 axis->specializationConstants.inputStride[4] /= 2;
23454 }
23455 if ((inverse) && (axis_id == 0) && (axis_upload_id == 0) && (!((app->configuration.isInputFormatted) && (app->configuration.inverseReturnToInputBuffer))) && (!app->configuration.isOutputFormatted)) {
23456 axis->specializationConstants.outputStride[1] /= 2;
23457 axis->specializationConstants.outputStride[2] /= 2;
23458 axis->specializationConstants.outputStride[3] /= 2;
23459 axis->specializationConstants.outputStride[4] /= 2;
23460 }*/
23461 r2cmult = 1;
23462 }
23463 if ((FFTPlan->numAxisUploads[0] == 1) && ((uint64_t)ceil(FFTPlan->actualFFTSizePerAxis[axis_id][1] / (double)r2cmult) < axis->axisBlock[1])) axis->axisBlock[1] = (uint64_t)ceil(FFTPlan->actualFFTSizePerAxis[axis_id][1] / (double)r2cmult);
23464
23465 if (axis->axisBlock[1] > app->configuration.maxComputeWorkGroupSize[1]) axis->axisBlock[1] = app->configuration.maxComputeWorkGroupSize[1];
23466 //if (axis->axisBlock[0] * axis->axisBlock[1] > app->configuration.maxThreadsNum) axis->axisBlock[1] /= 2;
23467 if (axis->axisBlock[0] * axis->axisBlock[1] > maxThreadNum) {
23468 for (uint64_t i = 1; i <= axis->axisBlock[1]; i++) {
23469 if ((axis->axisBlock[1] / i) * axis->axisBlock[0] <= maxThreadNum)
23470 {
23471 axis->axisBlock[1] /= i;
23472 i = axis->axisBlock[1] + 1;
23473 }
23474
23475 }
23476 }
23477 while ((axis->axisBlock[1] * (axis->specializationConstants.fftDim / axis->specializationConstants.registerBoost)) > maxSequenceLengthSharedMemory) axis->axisBlock[1] /= 2;
23478 if (((axis->specializationConstants.fftDim % 2 == 0) || (axis->axisBlock[0] < app->configuration.numSharedBanks / 4)) && (!(((!axis->specializationConstants.reorderFourStep) || (axis->specializationConstants.useBluesteinFFT)) && (FFTPlan->numAxisUploads[0] > 1))) && (axis->axisBlock[1] > 1) && (axis->axisBlock[1] * axis->specializationConstants.fftDim < maxSequenceLengthSharedMemoryPow2) && (!((app->configuration.performZeropadding[0] || app->configuration.performZeropadding[1] || app->configuration.performZeropadding[2])))) {
23479#if (VKFFT_BACKEND==0)
23480 if (((axis->specializationConstants.fftDim & (axis->specializationConstants.fftDim - 1)) != 0)) {
23481 uint64_t temp = axis->axisBlock[1];
23482 axis->axisBlock[1] = axis->axisBlock[0];
23483 axis->axisBlock[0] = temp;
23484 axis->specializationConstants.axisSwapped = 1;
23485 }
23486#else
23487 uint64_t temp = axis->axisBlock[1];
23488 axis->axisBlock[1] = axis->axisBlock[0];
23489 axis->axisBlock[0] = temp;
23490 axis->specializationConstants.axisSwapped = 1;
23491#endif
23492 }
23493 axis->axisBlock[2] = 1;
23494 axis->axisBlock[3] = axis->specializationConstants.fftDim;
23495 }
23496 else {
23497 axis->axisBlock[1] = (axis->specializationConstants.fftDim / axis->specializationConstants.min_registers_per_thread / axis->specializationConstants.registerBoost > 1) ? axis->specializationConstants.fftDim / axis->specializationConstants.min_registers_per_thread / axis->specializationConstants.registerBoost : 1;
23498 uint64_t scale = app->configuration.aimThreads / axis->axisBlock[1] / axis->groupedBatch;
23499 if (scale > 1) axis->groupedBatch *= scale;
23500 axis->axisBlock[0] = (axis->specializationConstants.stageStartSize > axis->groupedBatch) ? axis->groupedBatch : axis->specializationConstants.stageStartSize;
23501 if (axis->axisBlock[0] > app->configuration.maxComputeWorkGroupSize[0]) axis->axisBlock[0] = app->configuration.maxComputeWorkGroupSize[0];
23502 if (axis->axisBlock[0] * axis->axisBlock[1] > maxThreadNum) {
23503 for (uint64_t i = 1; i <= axis->axisBlock[0]; i++) {
23504 if ((axis->axisBlock[0] / i) * axis->axisBlock[1] <= maxThreadNum)
23505 {
23506 axis->axisBlock[0] /= i;
23507 i = axis->axisBlock[0] + 1;
23508 }
23509
23510 }
23511 }
23512 axis->axisBlock[2] = 1;
23513 axis->axisBlock[3] = axis->specializationConstants.fftDim;
23514 }
23515
23516 }
23517 if (axis_id == 1) {
23518
23519 axis->axisBlock[1] = (axis->specializationConstants.fftDim / axis->specializationConstants.min_registers_per_thread / axis->specializationConstants.registerBoost > 1) ? axis->specializationConstants.fftDim / axis->specializationConstants.min_registers_per_thread / axis->specializationConstants.registerBoost : 1;
23520
23521 axis->axisBlock[0] = (FFTPlan->actualFFTSizePerAxis[axis_id][0] > axis->groupedBatch) ? axis->groupedBatch : FFTPlan->actualFFTSizePerAxis[axis_id][0];
23522 if (axis->axisBlock[0] > app->configuration.maxComputeWorkGroupSize[0]) axis->axisBlock[0] = app->configuration.maxComputeWorkGroupSize[0];
23523 if (axis->axisBlock[0] * axis->axisBlock[1] > maxThreadNum) {
23524 for (uint64_t i = 1; i <= axis->axisBlock[0]; i++) {
23525 if ((axis->axisBlock[0] / i) * axis->axisBlock[1] <= maxThreadNum)
23526 {
23527 axis->axisBlock[0] /= i;
23528 i = axis->axisBlock[0] + 1;
23529 }
23530
23531 }
23532 }
23533 axis->axisBlock[2] = 1;
23534 axis->axisBlock[3] = axis->specializationConstants.fftDim;
23535
23536 }
23537 if (axis_id == 2) {
23538 axis->axisBlock[1] = (axis->specializationConstants.fftDim / axis->specializationConstants.min_registers_per_thread / axis->specializationConstants.registerBoost > 1) ? axis->specializationConstants.fftDim / axis->specializationConstants.min_registers_per_thread / axis->specializationConstants.registerBoost : 1;
23539
23540 axis->axisBlock[0] = (FFTPlan->actualFFTSizePerAxis[axis_id][0] > axis->groupedBatch) ? axis->groupedBatch : FFTPlan->actualFFTSizePerAxis[axis_id][0];
23541
23542 if (axis->axisBlock[0] > app->configuration.maxComputeWorkGroupSize[0]) axis->axisBlock[0] = app->configuration.maxComputeWorkGroupSize[0];
23543 if (axis->axisBlock[0] * axis->axisBlock[1] > maxThreadNum) {
23544 for (uint64_t i = 1; i <= axis->axisBlock[0]; i++) {
23545 if ((axis->axisBlock[0] / i) * axis->axisBlock[1] <= maxThreadNum)
23546 {
23547 axis->axisBlock[0] /= i;
23548 i = axis->axisBlock[0] + 1;
23549 }
23550
23551 }
23552 }
23553 axis->axisBlock[2] = 1;
23554 axis->axisBlock[3] = axis->specializationConstants.fftDim;
23555 }
23556
23557
23558
23559 /*VkSpecializationMapEntry specializationMapEntries[36] = { {} };
23560 for (uint64_t i = 0; i < 36; i++) {
23561 specializationMapEntries[i].constantID = i + 1;
23562 specializationMapEntries[i].size = sizeof(uint64_t);
23563 specializationMapEntries[i].offset = i * sizeof(uint64_t);
23564 }
23565 VkSpecializationInfo specializationInfo = { 0 };
23566 specializationInfo.dataSize = 36 * sizeof(uint64_t);
23567 specializationInfo.mapEntryCount = 36;
23568 specializationInfo.pMapEntries = specializationMapEntries;*/
23569 axis->specializationConstants.localSize[0] = axis->axisBlock[0];
23570 axis->specializationConstants.localSize[1] = axis->axisBlock[1];
23571 axis->specializationConstants.localSize[2] = axis->axisBlock[2];
23572 //specializationInfo.pData = &axis->specializationConstants;
23573 //uint64_t registerBoost = (FFTPlan->numAxisUploads[axis_id] > 1) ? app->configuration.registerBoost4Step : app->configuration.registerBoost;
23574
23575 axis->specializationConstants.numCoordinates = (app->configuration.matrixConvolution > 1) ? 1 : app->configuration.coordinateFeatures;
23576 axis->specializationConstants.matrixConvolution = app->configuration.matrixConvolution;
23577 axis->specializationConstants.numKernels = app->configuration.numberKernels;
23578 axis->specializationConstants.sharedMemSize = app->configuration.sharedMemorySize;
23579 axis->specializationConstants.sharedMemSizePow2 = app->configuration.sharedMemorySizePow2;
23580 axis->specializationConstants.normalize = (reverseBluesteinMultiUpload) ? 1 : app->configuration.normalize;
23581 axis->specializationConstants.size[0] = FFTPlan->actualFFTSizePerAxis[axis_id][0];
23582 axis->specializationConstants.size[1] = FFTPlan->actualFFTSizePerAxis[axis_id][1];
23583 axis->specializationConstants.size[2] = FFTPlan->actualFFTSizePerAxis[axis_id][2];
23584 axis->specializationConstants.axis_id = axis_id;
23585 axis->specializationConstants.axis_upload_id = axis_upload_id;
23586
23587 for (uint64_t i = 0; i < 3; i++) {
23588 axis->specializationConstants.frequencyZeropadding = app->configuration.frequencyZeroPadding;
23589 axis->specializationConstants.performZeropaddingFull[i] = app->configuration.performZeropadding[i]; // don't read if input is zeropadded (0 - off, 1 - on)
23590 axis->specializationConstants.fft_zeropad_left_full[i] = app->configuration.fft_zeropad_left[i];
23591 axis->specializationConstants.fft_zeropad_right_full[i] = app->configuration.fft_zeropad_right[i];
23592 }
23593 if (axis->specializationConstants.useBluesteinFFT && (axis_upload_id == FFTPlan->numAxisUploads[axis_id] - 1) && ((reverseBluesteinMultiUpload == 0) || (FFTPlan->numAxisUploads[axis_id] == 1))) {
23594 axis->specializationConstants.zeropadBluestein[0] = 1;
23595 axis->specializationConstants.fft_zeropad_Bluestein_left_read[axis_id] = app->configuration.size[axis_id];
23596 if (FFTPlan->multiUploadR2C) axis->specializationConstants.fft_zeropad_Bluestein_left_read[axis_id] /= 2;
23597 if (app->configuration.performDCT == 1) axis->specializationConstants.fft_zeropad_Bluestein_left_read[axis_id] = 2 * axis->specializationConstants.fft_zeropad_Bluestein_left_read[axis_id]-2;
23598 if ((app->configuration.performDCT == 4) && (app->configuration.size[axis_id] % 2 == 0)) axis->specializationConstants.fft_zeropad_Bluestein_left_read[axis_id] /= 2;
23599 axis->specializationConstants.fft_zeropad_Bluestein_right_read[axis_id] = FFTPlan->actualFFTSizePerAxis[axis_id][axis_id];
23600 }
23601 if (axis->specializationConstants.useBluesteinFFT && (axis_upload_id == FFTPlan->numAxisUploads[axis_id] - 1) && ((reverseBluesteinMultiUpload == 1) || (FFTPlan->numAxisUploads[axis_id] == 1))) {
23602 axis->specializationConstants.zeropadBluestein[1] = 1;
23603 axis->specializationConstants.fft_zeropad_Bluestein_left_write[axis_id] = app->configuration.size[axis_id];
23604 if (FFTPlan->multiUploadR2C) axis->specializationConstants.fft_zeropad_Bluestein_left_write[axis_id] /= 2;
23605 if (app->configuration.performDCT == 1) axis->specializationConstants.fft_zeropad_Bluestein_left_write[axis_id] = 2 * axis->specializationConstants.fft_zeropad_Bluestein_left_write[axis_id] - 2;
23606 if ((app->configuration.performDCT == 4) && (app->configuration.size[axis_id] % 2 == 0)) axis->specializationConstants.fft_zeropad_Bluestein_left_write[axis_id] /= 2;
23607 axis->specializationConstants.fft_zeropad_Bluestein_right_write[axis_id] = FFTPlan->actualFFTSizePerAxis[axis_id][axis_id];
23608 }
23609 if ((inverse)) {
23610 if ((app->configuration.frequencyZeroPadding) && (((!axis->specializationConstants.reorderFourStep) && (axis_upload_id == 0)) || ((axis->specializationConstants.reorderFourStep) && (axis_upload_id == FFTPlan->numAxisUploads[axis_id] - 1)))) {
23611 axis->specializationConstants.zeropad[0] = app->configuration.performZeropadding[axis_id];
23612 axis->specializationConstants.fft_zeropad_left_read[axis_id] = app->configuration.fft_zeropad_left[axis_id];
23613 axis->specializationConstants.fft_zeropad_right_read[axis_id] = app->configuration.fft_zeropad_right[axis_id];
23614 }
23615 else
23616 axis->specializationConstants.zeropad[0] = 0;
23617 if ((!app->configuration.frequencyZeroPadding) && (((!axis->specializationConstants.reorderFourStep) && (axis_upload_id == FFTPlan->numAxisUploads[axis_id] - 1)) || ((axis->specializationConstants.reorderFourStep) && (axis_upload_id == 0)))) {
23618 axis->specializationConstants.zeropad[1] = app->configuration.performZeropadding[axis_id];
23619 axis->specializationConstants.fft_zeropad_left_write[axis_id] = app->configuration.fft_zeropad_left[axis_id];
23620 axis->specializationConstants.fft_zeropad_right_write[axis_id] = app->configuration.fft_zeropad_right[axis_id];
23621 }
23622 else
23623 axis->specializationConstants.zeropad[1] = 0;
23624 }
23625 else {
23626 if ((!app->configuration.frequencyZeroPadding) && (axis_upload_id == FFTPlan->numAxisUploads[axis_id] - 1)) {
23627 axis->specializationConstants.zeropad[0] = app->configuration.performZeropadding[axis_id];
23628 axis->specializationConstants.fft_zeropad_left_read[axis_id] = app->configuration.fft_zeropad_left[axis_id];
23629 axis->specializationConstants.fft_zeropad_right_read[axis_id] = app->configuration.fft_zeropad_right[axis_id];
23630 }
23631 else
23632 axis->specializationConstants.zeropad[0] = 0;
23633 if (((app->configuration.frequencyZeroPadding) && (axis_upload_id == 0)) || (((app->configuration.FFTdim - 1 == axis_id) && (axis_upload_id == 0) && (app->configuration.performConvolution)))) {
23634 axis->specializationConstants.zeropad[1] = app->configuration.performZeropadding[axis_id];
23635 axis->specializationConstants.fft_zeropad_left_write[axis_id] = app->configuration.fft_zeropad_left[axis_id];
23636 axis->specializationConstants.fft_zeropad_right_write[axis_id] = app->configuration.fft_zeropad_right[axis_id];
23637 }
23638 else
23639 axis->specializationConstants.zeropad[1] = 0;
23640 }
23641 if ((app->configuration.FFTdim - 1 == axis_id) && (axis_upload_id == 0) && (app->configuration.performConvolution)) {
23642 axis->specializationConstants.convolutionStep = 1;
23643 }
23644 else
23645 axis->specializationConstants.convolutionStep = 0;
23646 if (app->useBluesteinFFT[axis_id] && (axis_upload_id == 0))
23647 axis->specializationConstants.BluesteinConvolutionStep = 1;
23648 else
23649 axis->specializationConstants.BluesteinConvolutionStep = 0;
23650
23651 if (app->useBluesteinFFT[axis_id] && (axis_upload_id == FFTPlan->numAxisUploads[axis_id] - 1) && (reverseBluesteinMultiUpload == 0))
23652 axis->specializationConstants.BluesteinPreMultiplication = 1;
23653 else
23654 axis->specializationConstants.BluesteinPreMultiplication = 0;
23655 if (app->useBluesteinFFT[axis_id] && (axis_upload_id == FFTPlan->numAxisUploads[axis_id] - 1) && ((reverseBluesteinMultiUpload == 1) || (FFTPlan->numAxisUploads[axis_id] == 1)))
23656 axis->specializationConstants.BluesteinPostMultiplication = 1;
23657 else
23658 axis->specializationConstants.BluesteinPostMultiplication = 0;
23659
23660
23661 uint64_t tempSize[3] = { FFTPlan->actualFFTSizePerAxis[axis_id][0], FFTPlan->actualFFTSizePerAxis[axis_id][1], FFTPlan->actualFFTSizePerAxis[axis_id][2] };
23662
23663
23664 if (axis_id == 0) {
23665 if (axis_upload_id == 0)
23666 tempSize[0] = FFTPlan->actualFFTSizePerAxis[axis_id][0] / axis->specializationConstants.fftDim / axis->axisBlock[1];
23667 else
23668 tempSize[0] = FFTPlan->actualFFTSizePerAxis[axis_id][0] / axis->specializationConstants.fftDim / axis->axisBlock[0];
23669 if ((FFTPlan->actualPerformR2CPerAxis[axis_id] == 1) && (axis->specializationConstants.mergeSequencesR2C)) tempSize[1] = (uint64_t)ceil(tempSize[1] / 2.0);
23670 tempSize[2] *= app->configuration.numberKernels * app->configuration.numberBatches;
23671 if (!(axis->specializationConstants.convolutionStep && (app->configuration.matrixConvolution > 1))) tempSize[2] *= app->configuration.coordinateFeatures;
23672 //if (app->configuration.performZeropadding[1]) tempSize[1] = (uint64_t)ceil(tempSize[1] / 2.0);
23673 //if (app->configuration.performZeropadding[2]) tempSize[2] = (uint64_t)ceil(tempSize[2] / 2.0);
23674 if (tempSize[0] > app->configuration.maxComputeWorkGroupCount[0]) axis->specializationConstants.performWorkGroupShift[0] = 1;
23675 else axis->specializationConstants.performWorkGroupShift[0] = 0;
23676 if (tempSize[1] > app->configuration.maxComputeWorkGroupCount[1]) axis->specializationConstants.performWorkGroupShift[1] = 1;
23677 else axis->specializationConstants.performWorkGroupShift[1] = 0;
23678 if (tempSize[2] > app->configuration.maxComputeWorkGroupCount[2]) axis->specializationConstants.performWorkGroupShift[2] = 1;
23679 else axis->specializationConstants.performWorkGroupShift[2] = 0;
23680 }
23681 if (axis_id == 1) {
23682 tempSize[0] = (uint64_t)ceil(FFTPlan->actualFFTSizePerAxis[axis_id][0] / (double)axis->axisBlock[0] * FFTPlan->actualFFTSizePerAxis[axis_id][1] / (double)axis->specializationConstants.fftDim);
23683 tempSize[1] = 1;
23684 tempSize[2] = FFTPlan->actualFFTSizePerAxis[axis_id][2];
23685 tempSize[2] *= app->configuration.numberKernels * app->configuration.numberBatches;
23686 if (!(axis->specializationConstants.convolutionStep && (app->configuration.matrixConvolution > 1))) tempSize[2] *= app->configuration.coordinateFeatures;
23687 //if (app->configuration.actualPerformR2C == 1) tempSize[0] = (uint64_t)ceil(tempSize[0] / 2.0);
23688 //if (app->configuration.performZeropadding[2]) tempSize[2] = (uint64_t)ceil(tempSize[2] / 2.0);
23689
23690 if (tempSize[0] > app->configuration.maxComputeWorkGroupCount[0]) axis->specializationConstants.performWorkGroupShift[0] = 1;
23691 else axis->specializationConstants.performWorkGroupShift[0] = 0;
23692 if (tempSize[1] > app->configuration.maxComputeWorkGroupCount[1]) axis->specializationConstants.performWorkGroupShift[1] = 1;
23693 else axis->specializationConstants.performWorkGroupShift[1] = 0;
23694 if (tempSize[2] > app->configuration.maxComputeWorkGroupCount[2]) axis->specializationConstants.performWorkGroupShift[2] = 1;
23695 else axis->specializationConstants.performWorkGroupShift[2] = 0;
23696
23697 }
23698 if (axis_id == 2) {
23699 tempSize[0] = (uint64_t)ceil(FFTPlan->actualFFTSizePerAxis[axis_id][0] / (double)axis->axisBlock[0] * FFTPlan->actualFFTSizePerAxis[axis_id][2] / (double)axis->specializationConstants.fftDim);
23700 tempSize[1] = 1;
23701 tempSize[2] = FFTPlan->actualFFTSizePerAxis[axis_id][1];
23702 tempSize[2] *= app->configuration.numberKernels * app->configuration.numberBatches;
23703 if (!(axis->specializationConstants.convolutionStep && (app->configuration.matrixConvolution > 1))) tempSize[2] *= app->configuration.coordinateFeatures;
23704 //if (app->configuration.actualPerformR2C == 1) tempSize[0] = (uint64_t)ceil(tempSize[0] / 2.0);
23705
23706 if (tempSize[0] > app->configuration.maxComputeWorkGroupCount[0]) axis->specializationConstants.performWorkGroupShift[0] = 1;
23707 else axis->specializationConstants.performWorkGroupShift[0] = 0;
23708 if (tempSize[1] > app->configuration.maxComputeWorkGroupCount[1]) axis->specializationConstants.performWorkGroupShift[1] = 1;
23709 else axis->specializationConstants.performWorkGroupShift[1] = 0;
23710 if (tempSize[2] > app->configuration.maxComputeWorkGroupCount[2]) axis->specializationConstants.performWorkGroupShift[2] = 1;
23711 else axis->specializationConstants.performWorkGroupShift[2] = 0;
23712
23713 }
23714
23715 char floatTypeInputMemory[10];
23716 char floatTypeOutputMemory[10];
23717 char floatTypeKernelMemory[10];
23718 char floatType[10];
23719 axis->specializationConstants.unroll = 1;
23720 axis->specializationConstants.LUT = app->configuration.useLUT;
23721 if (app->configuration.doublePrecision) {
23722 sprintf(floatType, "double");
23723 sprintf(floatTypeInputMemory, "double");
23724 sprintf(floatTypeOutputMemory, "double");
23725 sprintf(floatTypeKernelMemory, "double");
23726 //axis->specializationConstants.unroll = 1;
23727 }
23728 else {
23729 //axis->specializationConstants.unroll = 0;
23730 if (app->configuration.halfPrecision) {
23731 sprintf(floatType, "float");
23733 //only out of place mode, input/output buffer must be different
23734 sprintf(floatTypeKernelMemory, "float");
23735 if ((axis_id == app->firstAxis) && (axis_upload_id == FFTPlan->numAxisUploads[axis_id] - 1) && (!axis->specializationConstants.actualInverse))
23736 sprintf(floatTypeInputMemory, "half");
23737 else
23738 sprintf(floatTypeInputMemory, "float");
23739 if ((axis_id == app->firstAxis) && (((!axis->specializationConstants.reorderFourStep) && (axis_upload_id == FFTPlan->numAxisUploads[axis_id] - 1)) || ((axis->specializationConstants.reorderFourStep) && (axis_upload_id == 0))) && (axis->specializationConstants.actualInverse))
23740 sprintf(floatTypeOutputMemory, "half");
23741 else
23742 sprintf(floatTypeOutputMemory, "float");
23743 }
23744 else {
23745 sprintf(floatTypeInputMemory, "half");
23746 sprintf(floatTypeOutputMemory, "half");
23747 sprintf(floatTypeKernelMemory, "half");
23748 }
23749
23750 }
23751 else {
23753 sprintf(floatType, "double");
23754 sprintf(floatTypeInputMemory, "float");
23755 sprintf(floatTypeOutputMemory, "float");
23756 sprintf(floatTypeKernelMemory, "float");
23757 }
23758 else {
23759 sprintf(floatType, "float");
23760 sprintf(floatTypeInputMemory, "float");
23761 sprintf(floatTypeOutputMemory, "float");
23762 sprintf(floatTypeKernelMemory, "float");
23763 }
23764 }
23765 }
23766 char uintType[20] = "";
23767 if (!app->configuration.useUint64) {
23768#if(VKFFT_BACKEND==0)
23769 sprintf(uintType, "uint");
23770#elif(VKFFT_BACKEND==1)
23771 sprintf(uintType, "unsigned int");
23772#elif(VKFFT_BACKEND==2)
23773 sprintf(uintType, "unsigned int");
23774#elif(VKFFT_BACKEND==3)
23775 sprintf(uintType, "unsigned int");
23776#endif
23777 }
23778 else {
23779#if(VKFFT_BACKEND==0)
23780 sprintf(uintType, "uint64_t");
23781#elif(VKFFT_BACKEND==1)
23782 sprintf(uintType, "unsigned long long");
23783#elif(VKFFT_BACKEND==2)
23784 sprintf(uintType, "unsigned long long");
23785#elif(VKFFT_BACKEND==3)
23786 sprintf(uintType, "unsigned long");
23787#endif
23788 }
23789
23790 //uint64_t LUT = app->configuration.useLUT;
23791 uint64_t type = 0;
23792 if ((axis_id == 0) && (axis_upload_id == 0)) type = 0;
23793 if (axis_id != 0) type = 1;
23794 if ((axis_id == 0) && (axis_upload_id > 0)) type = 2;
23795 //if ((axis->specializationConstants.fftDim == 8 * maxSequenceLengthSharedMemory) && (app->configuration.registerBoost >= 8)) axis->specializationConstants.registerBoost = 8;
23796 if ((axis_id == 0) && (!axis->specializationConstants.actualInverse) && (FFTPlan->actualPerformR2CPerAxis[axis_id])) type = 5;
23797 if ((axis_id == 0) && (axis->specializationConstants.actualInverse) && (FFTPlan->actualPerformR2CPerAxis[axis_id])) type = 6;
23798 if ((axis_id == 0) && (app->configuration.performDCT == 1)) type = 110;
23799 if ((axis_id != 0) && (app->configuration.performDCT == 1)) type = 111;
23800 if ((axis_id == 0) && (((app->configuration.performDCT == 2) && (!inverse)) || ((app->configuration.performDCT == 3) && (inverse)))) type = 120;
23801 if ((axis_id != 0) && (((app->configuration.performDCT == 2) && (!inverse)) || ((app->configuration.performDCT == 3) && (inverse)))) type = 121;
23802 if ((axis_id == 0) && (((app->configuration.performDCT == 2) && (inverse)) || ((app->configuration.performDCT == 3) && (!inverse)))) type = 130;
23803 if ((axis_id != 0) && (((app->configuration.performDCT == 2) && (inverse)) || ((app->configuration.performDCT == 3) && (!inverse)))) type = 131;
23804 if ((axis_id == 0) && (app->configuration.performDCT == 4) && ((app->configuration.size[axis_id] % 2) == 0)) type = 142;
23805 if ((axis_id == 0) && (app->configuration.performDCT == 4) && ((app->configuration.size[axis_id] % 2) == 1)) type = 144;
23806 if ((axis_id != 0) && (app->configuration.performDCT == 4) && ((app->configuration.size[axis_id] % 2) == 0)) type = 143;
23807 if ((axis_id != 0) && (app->configuration.performDCT == 4) && ((app->configuration.size[axis_id] % 2) == 1)) type = 145;
23808#if(VKFFT_BACKEND==0)
23809 axis->specializationConstants.cacheShuffle = ((FFTPlan->numAxisUploads[axis_id] > 1) && ((axis->specializationConstants.fftDim & (axis->specializationConstants.fftDim - 1)) == 0) && (!app->configuration.doublePrecision) && (!axis->specializationConstants.useBluesteinFFT) && (!app->configuration.doublePrecisionFloatMemory) && ((type == 0) || (type == 5) || (type == 6))) ? 1 : 0;
23810#elif(VKFFT_BACKEND==1)
23811 axis->specializationConstants.cacheShuffle = 0;
23812#elif(VKFFT_BACKEND==2)
23813 axis->specializationConstants.cacheShuffle = 0;
23814#elif(VKFFT_BACKEND==3)
23815 axis->specializationConstants.cacheShuffle = 0;
23816#endif
23817
23818 axis->specializationConstants.maxCodeLength = app->configuration.maxCodeLength;
23819 axis->specializationConstants.maxTempLength = app->configuration.maxTempLength;
23820 axis->specializationConstants.code0 = (char*)malloc(sizeof(char) * app->configuration.maxCodeLength);
23821 char* code0 = axis->specializationConstants.code0;
23822 if (!code0) {
23823 deleteVkFFT(app);
23825 }
23826 resFFT = shaderGenVkFFT(code0, &axis->specializationConstants, floatType, floatTypeInputMemory, floatTypeOutputMemory, floatTypeKernelMemory, uintType, type);
23827 freeShaderGenVkFFT(&axis->specializationConstants);
23828 if (resFFT != VKFFT_SUCCESS) {
23829 deleteVkFFT(app);
23830 return resFFT;
23831 }
23832#if(VKFFT_BACKEND==0)
23833 const glslang_resource_t default_resource = {
23834 /* .MaxLights = */ 32,
23835 /* .MaxClipPlanes = */ 6,
23836 /* .MaxTextureUnits = */ 32,
23837 /* .MaxTextureCoords = */ 32,
23838 /* .MaxVertexAttribs = */ 64,
23839 /* .MaxVertexUniformComponents = */ 4096,
23840 /* .MaxVaryingFloats = */ 64,
23841 /* .MaxVertexTextureImageUnits = */ 32,
23842 /* .MaxCombinedTextureImageUnits = */ 80,
23843 /* .MaxTextureImageUnits = */ 32,
23844 /* .MaxFragmentUniformComponents = */ 4096,
23845 /* .MaxDrawBuffers = */ 32,
23846 /* .MaxVertexUniformVectors = */ 128,
23847 /* .MaxVaryingVectors = */ 8,
23848 /* .MaxFragmentUniformVectors = */ 16,
23849 /* .MaxVertexOutputVectors = */ 16,
23850 /* .MaxFragmentInputVectors = */ 15,
23851 /* .MinProgramTexelOffset = */ -8,
23852 /* .MaxProgramTexelOffset = */ 7,
23853 /* .MaxClipDistances = */ 8,
23854 /* .MaxComputeWorkGroupCountX = */ 65535,
23855 /* .MaxComputeWorkGroupCountY = */ 65535,
23856 /* .MaxComputeWorkGroupCountZ = */ 65535,
23857 /* .MaxComputeWorkGroupSizeX = */ 1024,
23858 /* .MaxComputeWorkGroupSizeY = */ 1024,
23859 /* .MaxComputeWorkGroupSizeZ = */ 64,
23860 /* .MaxComputeUniformComponents = */ 1024,
23861 /* .MaxComputeTextureImageUnits = */ 16,
23862 /* .MaxComputeImageUniforms = */ 8,
23863 /* .MaxComputeAtomicCounters = */ 8,
23864 /* .MaxComputeAtomicCounterBuffers = */ 1,
23865 /* .MaxVaryingComponents = */ 60,
23866 /* .MaxVertexOutputComponents = */ 64,
23867 /* .MaxGeometryInputComponents = */ 64,
23868 /* .MaxGeometryOutputComponents = */ 128,
23869 /* .MaxFragmentInputComponents = */ 128,
23870 /* .MaxImageUnits = */ 8,
23871 /* .MaxCombinedImageUnitsAndFragmentOutputs = */ 8,
23872 /* .MaxCombinedShaderOutputResources = */ 8,
23873 /* .MaxImageSamples = */ 0,
23874 /* .MaxVertexImageUniforms = */ 0,
23875 /* .MaxTessControlImageUniforms = */ 0,
23876 /* .MaxTessEvaluationImageUniforms = */ 0,
23877 /* .MaxGeometryImageUniforms = */ 0,
23878 /* .MaxFragmentImageUniforms = */ 8,
23879 /* .MaxCombinedImageUniforms = */ 8,
23880 /* .MaxGeometryTextureImageUnits = */ 16,
23881 /* .MaxGeometryOutputVertices = */ 256,
23882 /* .MaxGeometryTotalOutputComponents = */ 1024,
23883 /* .MaxGeometryUniformComponents = */ 1024,
23884 /* .MaxGeometryVaryingComponents = */ 64,
23885 /* .MaxTessControlInputComponents = */ 128,
23886 /* .MaxTessControlOutputComponents = */ 128,
23887 /* .MaxTessControlTextureImageUnits = */ 16,
23888 /* .MaxTessControlUniformComponents = */ 1024,
23889 /* .MaxTessControlTotalOutputComponents = */ 4096,
23890 /* .MaxTessEvaluationInputComponents = */ 128,
23891 /* .MaxTessEvaluationOutputComponents = */ 128,
23892 /* .MaxTessEvaluationTextureImageUnits = */ 16,
23893 /* .MaxTessEvaluationUniformComponents = */ 1024,
23894 /* .MaxTessPatchComponents = */ 120,
23895 /* .MaxPatchVertices = */ 32,
23896 /* .MaxTessGenLevel = */ 64,
23897 /* .MaxViewports = */ 16,
23898 /* .MaxVertexAtomicCounters = */ 0,
23899 /* .MaxTessControlAtomicCounters = */ 0,
23900 /* .MaxTessEvaluationAtomicCounters = */ 0,
23901 /* .MaxGeometryAtomicCounters = */ 0,
23902 /* .MaxFragmentAtomicCounters = */ 8,
23903 /* .MaxCombinedAtomicCounters = */ 8,
23904 /* .MaxAtomicCounterBindings = */ 1,
23905 /* .MaxVertexAtomicCounterBuffers = */ 0,
23906 /* .MaxTessControlAtomicCounterBuffers = */ 0,
23907 /* .MaxTessEvaluationAtomicCounterBuffers = */ 0,
23908 /* .MaxGeometryAtomicCounterBuffers = */ 0,
23909 /* .MaxFragmentAtomicCounterBuffers = */ 1,
23910 /* .MaxCombinedAtomicCounterBuffers = */ 1,
23911 /* .MaxAtomicCounterBufferSize = */ 16384,
23912 /* .MaxTransformFeedbackBuffers = */ 4,
23913 /* .MaxTransformFeedbackInterleavedComponents = */ 64,
23914 /* .MaxCullDistances = */ 8,
23915 /* .MaxCombinedClipAndCullDistances = */ 8,
23916 /* .MaxSamples = */ 4,
23917 /* .maxMeshOutputVerticesNV = */ 256,
23918 /* .maxMeshOutputPrimitivesNV = */ 512,
23919 /* .maxMeshWorkGroupSizeX_NV = */ 32,
23920 /* .maxMeshWorkGroupSizeY_NV = */ 1,
23921 /* .maxMeshWorkGroupSizeZ_NV = */ 1,
23922 /* .maxTaskWorkGroupSizeX_NV = */ 32,
23923 /* .maxTaskWorkGroupSizeY_NV = */ 1,
23924 /* .maxTaskWorkGroupSizeZ_NV = */ 1,
23925 /* .maxMeshViewCountNV = */ 4,
23926 /* .maxDualSourceDrawBuffersEXT = */ 1,
23927
23928 /* .limits = */ {
23929 /* .nonInductiveForLoops = */ 1,
23930 /* .whileLoops = */ 1,
23931 /* .doWhileLoops = */ 1,
23932 /* .generalUniformIndexing = */ 1,
23933 /* .generalAttributeMatrixVectorIndexing = */ 1,
23934 /* .generalVaryingIndexing = */ 1,
23935 /* .generalSamplerIndexing = */ 1,
23936 /* .generalVariableIndexing = */ 1,
23937 /* .generalConstantMatrixVectorIndexing = */ 1,
23938 } };
23939 glslang_target_client_version_t client_version = (app->configuration.halfPrecision) ? GLSLANG_TARGET_VULKAN_1_1 : GLSLANG_TARGET_VULKAN_1_0;
23940 glslang_target_language_version_t target_language_version = (app->configuration.halfPrecision) ? GLSLANG_TARGET_SPV_1_3 : GLSLANG_TARGET_SPV_1_0;
23941 const glslang_input_t input =
23942 {
23943 GLSLANG_SOURCE_GLSL,
23944 GLSLANG_STAGE_COMPUTE,
23945 GLSLANG_CLIENT_VULKAN,
23946 client_version,
23947 GLSLANG_TARGET_SPV,
23948 target_language_version,
23949 code0,
23950 450,
23951 GLSLANG_NO_PROFILE,
23952 1,
23953 0,
23954 GLSLANG_MSG_DEFAULT_BIT,
23955 &default_resource,
23956 };
23957 //printf("%s\n", code0);
23958 glslang_shader_t* shader = glslang_shader_create(&input);
23959 const char* err;
23960 if (!glslang_shader_preprocess(shader, &input))
23961 {
23962 err = glslang_shader_get_info_log(shader);
23963 printf("%s\n", code0);
23964 printf("%s\nVkFFT shader type: %" PRIu64 "\n", err, type);
23965 glslang_shader_delete(shader);
23966 free(code0);
23967 code0 = 0;
23968 deleteVkFFT(app);
23970
23971 }
23972
23973 if (!glslang_shader_parse(shader, &input))
23974 {
23975 err = glslang_shader_get_info_log(shader);
23976 printf("%s\n", code0);
23977 printf("%s\nVkFFT shader type: %" PRIu64 "\n", err, type);
23978 glslang_shader_delete(shader);
23979 free(code0);
23980 code0 = 0;
23981 deleteVkFFT(app);
23983
23984 }
23985 glslang_program_t* program = glslang_program_create();
23986 glslang_program_add_shader(program, shader);
23987 if (!glslang_program_link(program, GLSLANG_MSG_SPV_RULES_BIT | GLSLANG_MSG_VULKAN_RULES_BIT))
23988 {
23989 err = glslang_program_get_info_log(program);
23990 printf("%s\n", code0);
23991 printf("%s\nVkFFT shader type: %" PRIu64 "\n", err, type);
23992 glslang_shader_delete(shader);
23993 glslang_program_delete(program);
23994 free(code0);
23995 code0 = 0;
23996 deleteVkFFT(app);
23998
23999 }
24000
24001 //TODO: fix compilation errors
24002 //glslang_program_SPIRV_generate(program, input.stage);
24003
24004 //TODO: fix compilation errors
24005// if (glslang_program_SPIRV_get_messages(program))
24006// {
24007// printf("%s", glslang_program_SPIRV_get_messages(program));
24008// glslang_shader_delete(shader);
24009// glslang_program_delete(program);
24010// free(code0);
24011// code0 = 0;
24012// deleteVkFFT(app);
24013// return VKFFT_ERROR_FAILED_SPIRV_GENERATE;
24014// }
24015
24016 glslang_shader_delete(shader);
24017 VkPipelineShaderStageCreateInfo pipelineShaderStageCreateInfo = { VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO };
24018 VkComputePipelineCreateInfo computePipelineCreateInfo = { VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO };
24019 pipelineShaderStageCreateInfo.stage = VK_SHADER_STAGE_COMPUTE_BIT;
24020 VkShaderModuleCreateInfo createInfo = { VK_STRUCTURE_TYPE_SHADER_MODULE_CREATE_INFO };
24021 //TODO: fix compilation errors
24022// createInfo.pCode = glslang_program_SPIRV_get_ptr(program);
24023// createInfo.codeSize = glslang_program_SPIRV_get_size(program) * sizeof(uint32_t);
24024 res = vkCreateShaderModule(app->configuration.device[0], &createInfo, 0, &pipelineShaderStageCreateInfo.module);
24025 if (res != VK_SUCCESS) {
24026 glslang_program_delete(program);
24027 free(code0);
24028 code0 = 0;
24029 deleteVkFFT(app);
24031 }
24032 pipelineShaderStageCreateInfo.pName = "main";
24033 pipelineShaderStageCreateInfo.pSpecializationInfo = 0;// &specializationInfo;
24034 computePipelineCreateInfo.stage = pipelineShaderStageCreateInfo;
24035 computePipelineCreateInfo.layout = axis->pipelineLayout;
24036 res = vkCreateComputePipelines(app->configuration.device[0], VK_NULL_HANDLE, 1, &computePipelineCreateInfo, 0, &axis->pipeline);
24037 if (res != VK_SUCCESS) {
24038 deleteVkFFT(app);
24040 }
24041 vkDestroyShaderModule(app->configuration.device[0], pipelineShaderStageCreateInfo.module, 0);
24042 glslang_program_delete(program);
24043#elif(VKFFT_BACKEND==1)
24044 nvrtcProgram prog;
24045 nvrtcResult result = nvrtcCreateProgram(&prog, // prog
24046 code0, // buffer
24047 "VkFFT.cu", // name
24048 0, // numHeaders
24049 0, // headers
24050 0); // includeNames
24051 //free(includeNames);
24052 //free(headers);
24053 if (result != NVRTC_SUCCESS) {
24054 printf("nvrtcCreateProgram error: %s\n", nvrtcGetErrorString(result));
24055 free(code0);
24056 code0 = 0;
24057 deleteVkFFT(app);
24059 }
24060 //const char opts[20] = "--fmad=false";
24061 //result = nvrtcAddNameExpression(prog, "&consts");
24062 //if (result != NVRTC_SUCCESS) printf("1.5 error: %s\n", nvrtcGetErrorString(result));
24063 result = nvrtcCompileProgram(prog, // prog
24064 0, // numOptions
24065 0); // options
24066 if (result != NVRTC_SUCCESS) {
24067 printf("nvrtcCompileProgram error: %s\n", nvrtcGetErrorString(result));
24068 char* log = (char*)malloc(sizeof(char) * 1000000);
24069 if (!log) {
24070 free(code0);
24071 code0 = 0;
24072 deleteVkFFT(app);
24074 }
24075 else {
24076 nvrtcGetProgramLog(prog, log);
24077 printf("%s\n", log);
24078 free(log);
24079 log = 0;
24080 printf("%s\n", code0);
24081 free(code0);
24082 code0 = 0;
24083 deleteVkFFT(app);
24085 }
24086 }
24087 size_t ptxSize;
24088 result = nvrtcGetPTXSize(prog, &ptxSize);
24089 if (result != NVRTC_SUCCESS) {
24090 printf("nvrtcGetPTXSize error: %s\n", nvrtcGetErrorString(result));
24091 free(code0);
24092 code0 = 0;
24093 deleteVkFFT(app);
24095 }
24096 char* ptx = (char*)malloc(ptxSize);
24097 if (!ptx) {
24098 free(code0);
24099 code0 = 0;
24100 deleteVkFFT(app);
24102 }
24103 result = nvrtcGetPTX(prog, ptx);
24104 if (result != NVRTC_SUCCESS) {
24105 printf("nvrtcGetPTX error: %s\n", nvrtcGetErrorString(result));
24106 free(ptx);
24107 ptx = 0;
24108 free(code0);
24109 code0 = 0;
24110 deleteVkFFT(app);
24112 }
24113 result = nvrtcDestroyProgram(&prog);
24114 if (result != NVRTC_SUCCESS) {
24115 printf("nvrtcDestroyProgram error: %s\n", nvrtcGetErrorString(result));
24116 free(ptx);
24117 ptx = 0;
24118 free(code0);
24119 code0 = 0;
24120 deleteVkFFT(app);
24122 }
24123
24124 CUresult result2 = cuModuleLoadDataEx(&axis->VkFFTModule, ptx, 0, 0, 0);
24125
24126 if (result2 != CUDA_SUCCESS) {
24127 printf("cuModuleLoadDataEx error: %d\n", result2);
24128 free(ptx);
24129 ptx = 0;
24130 free(code0);
24131 code0 = 0;
24132 deleteVkFFT(app);
24134 }
24135 result2 = cuModuleGetFunction(&axis->VkFFTKernel, axis->VkFFTModule, "VkFFT_main");
24136 if (result2 != CUDA_SUCCESS) {
24137 printf("cuModuleGetFunction error: %d\n", result2);
24138 free(ptx);
24139 ptx = 0;
24140 free(code0);
24141 code0 = 0;
24142 deleteVkFFT(app);
24144 }
24145 if (axis->specializationConstants.usedSharedMemory > app->configuration.sharedMemorySizeStatic) {
24146 result2 = cuFuncSetAttribute(axis->VkFFTKernel, CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, (int)axis->specializationConstants.usedSharedMemory);
24147 if (result2 != CUDA_SUCCESS) {
24148 printf("cuFuncSetAttribute error: %d\n", result2);
24149 free(ptx);
24150 ptx = 0;
24151 free(code0);
24152 code0 = 0;
24153 deleteVkFFT(app);
24155 }
24156 }
24158 result2 = cuModuleGetGlobal(&axis->consts_addr, &size, axis->VkFFTModule, "consts");
24159 if (result2 != CUDA_SUCCESS) {
24160 printf("cuModuleGetGlobal error: %d\n", result2);
24161 free(ptx);
24162 ptx = 0;
24163 free(code0);
24164 code0 = 0;
24165 deleteVkFFT(app);
24167 }
24168 free(ptx);
24169 ptx = 0;
24170#elif(VKFFT_BACKEND==2)
24171 hiprtcProgram prog;
24172 /*char* includeNames = (char*)malloc(sizeof(char)*100);
24173 char* headers = (char*)malloc(sizeof(char) * 100);
24174 sprintf(headers, "C://Program Files//NVIDIA GPU Computing Toolkit//CUDA//v11.1//include//cuComplex.h");
24175 sprintf(includeNames, "cuComplex.h");*/
24176 enum hiprtcResult result = hiprtcCreateProgram(&prog, // prog
24177 code0, // buffer
24178 "VkFFT.hip", // name
24179 0, // numHeaders
24180 0, // headers
24181 0); // includeNames
24182 if (result != HIPRTC_SUCCESS) {
24183 printf("hiprtcCreateProgram error: %s\n", hiprtcGetErrorString(result));
24184 free(code0);
24185 code0 = 0;
24186 deleteVkFFT(app);
24188 }
24189
24190 result = hiprtcAddNameExpression(prog, "&consts");
24191 if (result != HIPRTC_SUCCESS) {
24192 printf("hiprtcAddNameExpression error: %s\n", hiprtcGetErrorString(result));
24193 free(code0);
24194 code0 = 0;
24195 deleteVkFFT(app);
24197 }
24198
24199 result = hiprtcCompileProgram(prog, // prog
24200 0, // numOptions
24201 0); // options
24202 if (result != HIPRTC_SUCCESS) {
24203 printf("hiprtcCompileProgram error: %s\n", hiprtcGetErrorString(result));
24204 char* log = (char*)malloc(sizeof(char) * 100000);
24205 if (!log) {
24206 free(code0);
24207 code0 = 0;
24208 deleteVkFFT(app);
24210 }
24211 else {
24212 hiprtcGetProgramLog(prog, log);
24213 printf("%s\n", log);
24214 free(log);
24215 log = 0;
24216 printf("%s\n", code0);
24217 free(code0);
24218 code0 = 0;
24219 deleteVkFFT(app);
24221 }
24222 }
24223 size_t codeSize;
24224 result = hiprtcGetCodeSize(prog, &codeSize);
24225 if (result != HIPRTC_SUCCESS) {
24226 printf("hiprtcGetCodeSize error: %s\n", hiprtcGetErrorString(result));
24227 free(code0);
24228 code0 = 0;
24229 deleteVkFFT(app);
24231 }
24232 char* code = (char*)malloc(codeSize);
24233 if (!code) {
24234 free(code0);
24235 code0 = 0;
24236 deleteVkFFT(app);
24238 }
24239 result = hiprtcGetCode(prog, code);
24240 if (result != HIPRTC_SUCCESS) {
24241 printf("hiprtcGetCode error: %s\n", hiprtcGetErrorString(result));
24242 free(code);
24243 code = 0;
24244 free(code0);
24245 code0 = 0;
24246 deleteVkFFT(app);
24248 }
24249 //printf("%s\n", code);
24250 // Destroy the program.
24251 result = hiprtcDestroyProgram(&prog);
24252 if (result != HIPRTC_SUCCESS) {
24253 printf("hiprtcDestroyProgram error: %s\n", hiprtcGetErrorString(result));
24254 free(code);
24255 code = 0;
24256 free(code0);
24257 code0 = 0;
24258 deleteVkFFT(app);
24260 }
24261 hipError_t result2 = hipModuleLoadDataEx(&axis->VkFFTModule, code, 0, 0, 0);
24262
24263 if (result2 != hipSuccess) {
24264 printf("hipModuleLoadDataEx error: %d\n", result2);
24265 free(code);
24266 code = 0;
24267 free(code0);
24268 code0 = 0;
24269 deleteVkFFT(app);
24271 }
24272 result2 = hipModuleGetFunction(&axis->VkFFTKernel, axis->VkFFTModule, "VkFFT_main");
24273 if (result2 != hipSuccess) {
24274 printf("hipModuleGetFunction error: %d\n", result2);
24275 free(code);
24276 code = 0;
24277 free(code0);
24278 code0 = 0;
24279 deleteVkFFT(app);
24281 }
24282 if (axis->specializationConstants.usedSharedMemory > app->configuration.sharedMemorySizeStatic) {
24283 result2 = hipFuncSetAttribute(axis->VkFFTKernel, hipFuncAttributeMaxDynamicSharedMemorySize, (int)axis->specializationConstants.usedSharedMemory);
24284 //result2 = hipFuncSetCacheConfig(axis->VkFFTKernel, hipFuncCachePreferShared);
24285 if (result2 != hipSuccess) {
24286 printf("hipFuncSetAttribute error: %d\n", result2);
24287 free(code);
24288 code = 0;
24289 free(code0);
24290 code0 = 0;
24291 deleteVkFFT(app);
24293 }
24294 }
24296 result2 = hipModuleGetGlobal(&axis->consts_addr, &size, axis->VkFFTModule, "consts");
24297 if (result2 != hipSuccess) {
24298 printf("hipModuleGetGlobal error: %d\n", result2);
24299 free(code);
24300 code = 0;
24301 free(code0);
24302 code0 = 0;
24303 deleteVkFFT(app);
24305 }
24306
24307 free(code);
24308 code = 0;
24309#elif(VKFFT_BACKEND==3)
24310 size_t codelen = strlen(code0);
24311 axis->program = clCreateProgramWithSource(app->configuration.context[0], 1, (const char**)&code0, &codelen, &res);
24312 if (res != CL_SUCCESS) {
24313 free(code0);
24314 code0 = 0;
24315 deleteVkFFT(app);
24317 }
24318 res = clBuildProgram(axis->program, 1, app->configuration.device, 0, 0, 0);
24319 if (res != CL_SUCCESS) {
24320 size_t log_size;
24321 clGetProgramBuildInfo(axis->program, app->configuration.device[0], CL_PROGRAM_BUILD_LOG, 0, 0, &log_size);
24322 char* log = (char*)malloc(log_size);
24323 if (!log) {
24324 free(code0);
24325 code0 = 0;
24326 deleteVkFFT(app);
24328 }
24329 else {
24330 clGetProgramBuildInfo(axis->program, app->configuration.device[0], CL_PROGRAM_BUILD_LOG, log_size, log, 0);
24331 printf("%s\n", log);
24332 free(log);
24333 log = 0;
24334 printf("%s\n", code0);
24335 free(code0);
24336 code0 = 0;
24337 deleteVkFFT(app);
24339 }
24340 }
24341 axis->kernel = clCreateKernel(axis->program, "VkFFT_main", &res);
24342 if (res != CL_SUCCESS) {
24343 free(code0);
24344 code0 = 0;
24345 deleteVkFFT(app);
24347 }
24348#endif
24349 if (!app->configuration.keepShaderCode) {
24350 free(code0);
24351 code0 = 0;
24352 axis->specializationConstants.code0 = 0;
24353 }
24354 }
24355 if (axis->specializationConstants.axisSwapped) {//swap back for correct dispatch
24356 uint64_t temp = axis->axisBlock[1];
24357 axis->axisBlock[1] = axis->axisBlock[0];
24358 axis->axisBlock[0] = temp;
24359 axis->specializationConstants.axisSwapped = 0;
24360 }
24361 return resFFT;
24362}
24363static inline VkFFTResult initializeVkFFT(VkFFTApplication* app, VkFFTConfiguration inputLaunchConfiguration) {
24364 //app->configuration = {};// inputLaunchConfiguration;
24365 if (inputLaunchConfiguration.doublePrecision != 0) app->configuration.doublePrecision = inputLaunchConfiguration.doublePrecision;
24366 if (inputLaunchConfiguration.doublePrecisionFloatMemory != 0) app->configuration.doublePrecisionFloatMemory = inputLaunchConfiguration.doublePrecisionFloatMemory;
24367 if (inputLaunchConfiguration.halfPrecision != 0) app->configuration.halfPrecision = inputLaunchConfiguration.halfPrecision;
24368 if (inputLaunchConfiguration.halfPrecisionMemoryOnly != 0) app->configuration.halfPrecisionMemoryOnly = inputLaunchConfiguration.halfPrecisionMemoryOnly;
24369 //set device parameters
24370#if(VKFFT_BACKEND==0)
24371 if (!inputLaunchConfiguration.isCompilerInitialized) {
24373 int resGlslangInitialize = glslang_initialize_process();
24374 if (resGlslangInitialize) return VKFFT_ERROR_FAILED_TO_INITIALIZE;
24376 }
24377 }
24378 if (inputLaunchConfiguration.physicalDevice == 0) {
24379 deleteVkFFT(app);
24381 }
24382 app->configuration.physicalDevice = inputLaunchConfiguration.physicalDevice;
24383 if (inputLaunchConfiguration.device == 0) {
24384 deleteVkFFT(app);
24386 }
24387 app->configuration.device = inputLaunchConfiguration.device;
24388 if (inputLaunchConfiguration.queue == 0) {
24389 deleteVkFFT(app);
24391 }
24392 app->configuration.queue = inputLaunchConfiguration.queue;
24393 if (inputLaunchConfiguration.commandPool == 0) {
24394 deleteVkFFT(app);
24396 }
24397 app->configuration.commandPool = inputLaunchConfiguration.commandPool;
24398 if (inputLaunchConfiguration.fence == 0) {
24399 deleteVkFFT(app);
24401 }
24402 app->configuration.fence = inputLaunchConfiguration.fence;
24403
24404 VkPhysicalDeviceProperties physicalDeviceProperties = { 0 };
24405 vkGetPhysicalDeviceProperties(app->configuration.physicalDevice[0], &physicalDeviceProperties);
24406 app->configuration.maxThreadsNum = physicalDeviceProperties.limits.maxComputeWorkGroupInvocations;
24407 if (physicalDeviceProperties.vendorID == 0x8086) app->configuration.maxThreadsNum = 256; //Intel fix
24408 app->configuration.maxComputeWorkGroupCount[0] = physicalDeviceProperties.limits.maxComputeWorkGroupCount[0];
24409 app->configuration.maxComputeWorkGroupCount[1] = physicalDeviceProperties.limits.maxComputeWorkGroupCount[1];
24410 app->configuration.maxComputeWorkGroupCount[2] = physicalDeviceProperties.limits.maxComputeWorkGroupCount[2];
24411 app->configuration.maxComputeWorkGroupSize[0] = physicalDeviceProperties.limits.maxComputeWorkGroupSize[0];
24412 app->configuration.maxComputeWorkGroupSize[1] = physicalDeviceProperties.limits.maxComputeWorkGroupSize[1];
24413 app->configuration.maxComputeWorkGroupSize[2] = physicalDeviceProperties.limits.maxComputeWorkGroupSize[2];
24414 //if ((physicalDeviceProperties.vendorID == 0x8086) && (!app->configuration.doublePrecision) && (!app->configuration.doublePrecisionFloatMemory)) app->configuration.halfThreads = 1;
24415 app->configuration.sharedMemorySize = physicalDeviceProperties.limits.maxComputeSharedMemorySize;
24416 app->configuration.sharedMemorySizePow2 = (uint64_t)pow(2, (uint64_t)log2(physicalDeviceProperties.limits.maxComputeSharedMemorySize));
24417 switch (physicalDeviceProperties.vendorID) {
24418 case 0x10DE://NVIDIA
24419 app->configuration.coalescedMemory = (app->configuration.halfPrecision) ? 64 : 32;//the coalesced memory is equal to 32 bytes between L2 and VRAM.
24421 app->configuration.warpSize = 32;
24426 break;
24427 case 0x8086://INTEL
24428 case 0x13BE://ARM
24429 case 0x5143://Qualcomm
24431 app->configuration.useLUT = 1;
24432 app->configuration.warpSize = 32;
24434 app->configuration.registerBoost = (physicalDeviceProperties.limits.maxComputeSharedMemorySize >= 65536) ? 1 : 2;
24437 break;
24438 case 0x1002://AMD
24441 app->configuration.warpSize = 64;
24443 app->configuration.registerBoost = (physicalDeviceProperties.limits.maxComputeSharedMemorySize >= 65536) ? 2 : 4;
24446 break;
24447 default:
24450 app->configuration.warpSize = 32;
24455 break;
24456 }
24457#elif(VKFFT_BACKEND==1)
24458 CUresult res = CUDA_SUCCESS;
24459 cudaError_t res_t = cudaSuccess;
24460 if (inputLaunchConfiguration.device == 0) {
24461 deleteVkFFT(app);
24463 }
24464 app->configuration.device = inputLaunchConfiguration.device;
24465 if (inputLaunchConfiguration.num_streams != 0) app->configuration.num_streams = inputLaunchConfiguration.num_streams;
24466 if (inputLaunchConfiguration.stream != 0) app->configuration.stream = inputLaunchConfiguration.stream;
24467 app->configuration.streamID = 0;
24468 int value = 0;
24469 res = cuDeviceGetAttribute(&value, CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK, app->configuration.device[0]);
24470 if (res != CUDA_SUCCESS) {
24471 deleteVkFFT(app);
24473 }
24474 app->configuration.maxThreadsNum = value;
24475 res = cuDeviceGetAttribute(&value, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X, app->configuration.device[0]);
24476 if (res != CUDA_SUCCESS) {
24477 deleteVkFFT(app);
24479 }
24481 res = cuDeviceGetAttribute(&value, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y, app->configuration.device[0]);
24482 if (res != CUDA_SUCCESS) {
24483 deleteVkFFT(app);
24485 }
24487 res = cuDeviceGetAttribute(&value, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z, app->configuration.device[0]);
24488 if (res != CUDA_SUCCESS) {
24489 deleteVkFFT(app);
24491 }
24493 res = cuDeviceGetAttribute(&value, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X, app->configuration.device[0]);
24494 if (res != CUDA_SUCCESS) {
24495 deleteVkFFT(app);
24497 }
24498 app->configuration.maxComputeWorkGroupSize[0] = value;
24499 res = cuDeviceGetAttribute(&value, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y, app->configuration.device[0]);
24500 if (res != CUDA_SUCCESS) {
24501 deleteVkFFT(app);
24503 }
24504 app->configuration.maxComputeWorkGroupSize[1] = value;
24505 res = cuDeviceGetAttribute(&value, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z, app->configuration.device[0]);
24506 if (res != CUDA_SUCCESS) {
24507 deleteVkFFT(app);
24509 }
24510 app->configuration.maxComputeWorkGroupSize[2] = value;
24511 res = cuDeviceGetAttribute(&value, CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK, app->configuration.device[0]);
24512 if (res != CUDA_SUCCESS) {
24513 deleteVkFFT(app);
24515 }
24517 res = cuDeviceGetAttribute(&value, CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK_OPTIN, app->configuration.device[0]);
24518 if (res != CUDA_SUCCESS) {
24519 deleteVkFFT(app);
24521 }
24522 app->configuration.sharedMemorySize = value;// (value > 65536) ? 65536 : value;
24523 res = cuDeviceGetAttribute(&value, CU_DEVICE_ATTRIBUTE_WARP_SIZE, app->configuration.device[0]);
24524 if (res != CUDA_SUCCESS) {
24525 deleteVkFFT(app);
24527 }
24528 app->configuration.warpSize = value;
24529 app->configuration.sharedMemorySizePow2 = (uint64_t)pow(2, (uint64_t)log2(app->configuration.sharedMemorySize));
24530 if (app->configuration.num_streams > 1) {
24531 app->configuration.stream_event = (cudaEvent_t*)malloc(app->configuration.num_streams * sizeof(cudaEvent_t));
24532 if (!app->configuration.stream_event) {
24533 deleteVkFFT(app);
24535 }
24536 for (uint64_t i = 0; i < app->configuration.num_streams; i++) {
24537 res_t = cudaEventCreate(&app->configuration.stream_event[i]);
24538 if (res != CUDA_SUCCESS) {
24539 deleteVkFFT(app);
24541 }
24542 }
24543 }
24544
24545 app->configuration.coalescedMemory = (app->configuration.halfPrecision) ? 64 : 32;//the coalesced memory is equal to 32 bytes between L2 and VRAM.
24551#elif(VKFFT_BACKEND==2)
24552 hipError_t res = hipSuccess;
24553 if (inputLaunchConfiguration.device == 0) {
24554 deleteVkFFT(app);
24556 }
24557 app->configuration.device = inputLaunchConfiguration.device;
24558 if (inputLaunchConfiguration.num_streams != 0) app->configuration.num_streams = inputLaunchConfiguration.num_streams;
24559 if (inputLaunchConfiguration.stream != 0) app->configuration.stream = inputLaunchConfiguration.stream;
24560 app->configuration.streamID = 0;
24561 int value = 0;
24562 res = hipDeviceGetAttribute(&value, hipDeviceAttributeMaxThreadsPerBlock, app->configuration.device[0]);
24563 if (res != hipSuccess) {
24564 deleteVkFFT(app);
24566 }
24567 app->configuration.maxThreadsNum = value;
24568 res = hipDeviceGetAttribute(&value, hipDeviceAttributeMaxGridDimX, app->configuration.device[0]);
24569 if (res != hipSuccess) {
24570 deleteVkFFT(app);
24572 }
24574 res = hipDeviceGetAttribute(&value, hipDeviceAttributeMaxGridDimY, app->configuration.device[0]);
24575 if (res != hipSuccess) {
24576 deleteVkFFT(app);
24578 }
24580 res = hipDeviceGetAttribute(&value, hipDeviceAttributeMaxGridDimZ, app->configuration.device[0]);
24581 if (res != hipSuccess) {
24582 deleteVkFFT(app);
24584 }
24586 res = hipDeviceGetAttribute(&value, hipDeviceAttributeMaxBlockDimX, app->configuration.device[0]);
24587 if (res != hipSuccess) {
24588 deleteVkFFT(app);
24590 }
24591 app->configuration.maxComputeWorkGroupSize[0] = value;
24592 res = hipDeviceGetAttribute(&value, hipDeviceAttributeMaxBlockDimY, app->configuration.device[0]);
24593 if (res != hipSuccess) {
24594 deleteVkFFT(app);
24596 }
24597 app->configuration.maxComputeWorkGroupSize[1] = value;
24598 res = hipDeviceGetAttribute(&value, hipDeviceAttributeMaxBlockDimZ, app->configuration.device[0]);
24599 if (res != hipSuccess) {
24600 deleteVkFFT(app);
24602 }
24603 app->configuration.maxComputeWorkGroupSize[2] = value;
24604 res = hipDeviceGetAttribute(&value, hipDeviceAttributeMaxSharedMemoryPerBlock, app->configuration.device[0]);
24605 if (res != hipSuccess) {
24606 deleteVkFFT(app);
24608 }
24610 //hipDeviceGetAttribute(&value, hipDeviceAttributeMaxSharedMemoryPerBlockOptin, app->configuration.device[0]);
24611 app->configuration.sharedMemorySize = value;// (value > 65536) ? 65536 : value;
24612 res = hipDeviceGetAttribute(&value, hipDeviceAttributeWarpSize, app->configuration.device[0]);
24613 if (res != hipSuccess) {
24614 deleteVkFFT(app);
24616 }
24617 app->configuration.warpSize = value;
24618 app->configuration.sharedMemorySizePow2 = (uint64_t)pow(2, (uint64_t)log2(app->configuration.sharedMemorySize));
24619 if (app->configuration.num_streams > 1) {
24620 app->configuration.stream_event = (hipEvent_t*)malloc(app->configuration.num_streams * sizeof(hipEvent_t));
24621 if (!app->configuration.stream_event) {
24622 deleteVkFFT(app);
24624 }
24625 for (uint64_t i = 0; i < app->configuration.num_streams; i++) {
24626 res = hipEventCreate(&app->configuration.stream_event[i]);
24627 if (res != hipSuccess) {
24628 deleteVkFFT(app);
24630 }
24631 }
24632 }
24639#elif(VKFFT_BACKEND==3)
24640 cl_int res = 0;
24641 if (inputLaunchConfiguration.device == 0) {
24642 deleteVkFFT(app);
24644 }
24645 app->configuration.device = inputLaunchConfiguration.device;
24646 if (inputLaunchConfiguration.context == 0) {
24647 deleteVkFFT(app);
24649 }
24650 app->configuration.context = inputLaunchConfiguration.context;
24651 if (inputLaunchConfiguration.platform == 0) {
24652 deleteVkFFT(app);
24654 }
24655 app->configuration.platform = inputLaunchConfiguration.platform;
24656 cl_uint vendorID;
24657 size_t value_int64;
24658 cl_uint value_cl_uint;
24659 res = clGetDeviceInfo(app->configuration.device[0], CL_DEVICE_VENDOR_ID, sizeof(cl_int), &vendorID, 0);
24660 if (res != 0) {
24661 deleteVkFFT(app);
24663 }
24664 res = clGetDeviceInfo(app->configuration.device[0], CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(size_t), &value_int64, 0);
24665 if (res != 0) {
24666 deleteVkFFT(app);
24668 }
24669 app->configuration.maxThreadsNum = value_int64;
24670 res = clGetDeviceInfo(app->configuration.device[0], CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS, sizeof(cl_uint), &value_cl_uint, 0);
24671 if (res != 0) {
24672 deleteVkFFT(app);
24674 }
24675 size_t* dims = (size_t*)malloc(sizeof(size_t) * value_cl_uint);
24676 if (dims) {
24677 res = clGetDeviceInfo(app->configuration.device[0], CL_DEVICE_MAX_WORK_ITEM_SIZES, sizeof(size_t) * value_cl_uint, dims, 0);
24678 if (res != 0) {
24679 deleteVkFFT(app);
24681 }
24682 app->configuration.maxComputeWorkGroupSize[0] = dims[0];
24683 app->configuration.maxComputeWorkGroupSize[1] = dims[1];
24684 app->configuration.maxComputeWorkGroupSize[2] = dims[2];
24685 free(dims);
24686 dims = 0;
24687 }
24688 else {
24689 deleteVkFFT(app);
24691 }
24692 app->configuration.maxComputeWorkGroupCount[0] = UINT64_MAX;
24693 app->configuration.maxComputeWorkGroupCount[1] = UINT64_MAX;
24694 app->configuration.maxComputeWorkGroupCount[2] = UINT64_MAX;
24695 //if ((vendorID == 0x8086) && (!app->configuration.doublePrecision) && (!app->configuration.doublePrecisionFloatMemory)) app->configuration.halfThreads = 1;
24696 cl_ulong sharedMemorySize;
24697 res = clGetDeviceInfo(app->configuration.device[0], CL_DEVICE_LOCAL_MEM_SIZE, sizeof(cl_ulong), &sharedMemorySize, 0);
24698 if (res != 0) {
24699 deleteVkFFT(app);
24701 }
24702 app->configuration.sharedMemorySize = sharedMemorySize;
24703 app->configuration.sharedMemorySizePow2 = (uint64_t)pow(2, (uint64_t)log2(sharedMemorySize));
24704 switch (vendorID) {
24705 case 0x10DE://NVIDIA
24706 app->configuration.coalescedMemory = (app->configuration.halfPrecision) ? 64 : 32;//the coalesced memory is equal to 32 bytes between L2 and VRAM.
24708 app->configuration.warpSize = 32;
24713 app->configuration.sharedMemorySize -= 0x10;//reserved by system
24714 app->configuration.sharedMemorySizePow2 = (uint64_t)pow(2, (uint64_t)log2(app->configuration.sharedMemorySize));
24715 break;
24716 case 0x8086://INTEL
24718 app->configuration.useLUT = 1;
24719 app->configuration.warpSize = 32;
24721 app->configuration.registerBoost = (sharedMemorySize >= 65536) ? 1 : 2;
24724 break;
24725 case 0x1002://AMD
24728 app->configuration.warpSize = 64;
24730 app->configuration.registerBoost = (sharedMemorySize >= 65536) ? 2 : 4;
24733 break;
24734 default:
24737 app->configuration.warpSize = 32;
24742 break;
24743 }
24744#endif
24745 //set main parameters:
24746 if (inputLaunchConfiguration.FFTdim == 0) {
24747 deleteVkFFT(app);
24749 }
24750 app->configuration.FFTdim = inputLaunchConfiguration.FFTdim;
24751 if (inputLaunchConfiguration.size[0] == 0) {
24752 deleteVkFFT(app);
24754 }
24755
24756 app->configuration.size[0] = inputLaunchConfiguration.size[0];
24757
24758 if (inputLaunchConfiguration.bufferStride[0] == 0) {
24759 if (inputLaunchConfiguration.performR2C)
24760 app->configuration.bufferStride[0] = app->configuration.size[0] / 2 + 1;
24761 else
24762 app->configuration.bufferStride[0] = app->configuration.size[0];
24763 }
24764 else
24765 app->configuration.bufferStride[0] = inputLaunchConfiguration.bufferStride[0];
24766
24767 if (inputLaunchConfiguration.inputBufferStride[0] == 0) {
24768 if (inputLaunchConfiguration.performR2C)
24769 app->configuration.inputBufferStride[0] = app->configuration.size[0] + 2;
24770 else
24772 }
24773 else
24774 app->configuration.inputBufferStride[0] = inputLaunchConfiguration.inputBufferStride[0];
24775
24776 if (inputLaunchConfiguration.outputBufferStride[0] == 0) {
24777 if (inputLaunchConfiguration.performR2C)
24778 app->configuration.outputBufferStride[0] = app->configuration.size[0] + 2;
24779 else
24781 }
24782 else
24783 app->configuration.outputBufferStride[0] = inputLaunchConfiguration.outputBufferStride[0];
24784 for (uint64_t i = 1; i < 3; i++) {
24785 if (inputLaunchConfiguration.size[i] == 0)
24786 app->configuration.size[i] = 1;
24787 else
24788 app->configuration.size[i] = inputLaunchConfiguration.size[i];
24789
24790 if (inputLaunchConfiguration.bufferStride[i] == 0)
24792 else
24793 app->configuration.bufferStride[i] = inputLaunchConfiguration.bufferStride[i];
24794
24795 if (inputLaunchConfiguration.inputBufferStride[i] == 0)
24797 else
24798 app->configuration.inputBufferStride[i] = inputLaunchConfiguration.inputBufferStride[i];
24799
24800 if (inputLaunchConfiguration.outputBufferStride[i] == 0)
24802 else
24803 app->configuration.outputBufferStride[i] = inputLaunchConfiguration.outputBufferStride[i];
24804 }
24805
24806 app->configuration.isInputFormatted = inputLaunchConfiguration.isInputFormatted;
24807 app->configuration.isOutputFormatted = inputLaunchConfiguration.isOutputFormatted;
24808 app->configuration.performConvolution = inputLaunchConfiguration.performConvolution;
24809
24810 if (inputLaunchConfiguration.bufferNum == 0) app->configuration.bufferNum = 1;
24811 else app->configuration.bufferNum = inputLaunchConfiguration.bufferNum;
24812#if(VKFFT_BACKEND==0)
24813 if (inputLaunchConfiguration.bufferSize == 0) {
24814 deleteVkFFT(app);
24816 }
24817#endif
24818 app->configuration.bufferSize = inputLaunchConfiguration.bufferSize;
24819 if (app->configuration.bufferSize != 0) {
24820 for (uint64_t i = 0; i < app->configuration.bufferNum; i++) {
24821 if (app->configuration.bufferSize[i] == 0) {
24822 deleteVkFFT(app);
24824 }
24825 }
24826 }
24827 app->configuration.buffer = inputLaunchConfiguration.buffer;
24828
24829 if (inputLaunchConfiguration.userTempBuffer != 0) app->configuration.userTempBuffer = inputLaunchConfiguration.userTempBuffer;
24830
24831 if (app->configuration.userTempBuffer != 0) {
24832 if (inputLaunchConfiguration.tempBufferNum == 0) app->configuration.tempBufferNum = 1;
24833 else app->configuration.tempBufferNum = inputLaunchConfiguration.tempBufferNum;
24834#if(VKFFT_BACKEND==0)
24835 if (inputLaunchConfiguration.tempBufferSize == 0) {
24836 deleteVkFFT(app);
24838 }
24839#endif
24840 app->configuration.tempBufferSize = inputLaunchConfiguration.tempBufferSize;
24841 if (app->configuration.tempBufferSize != 0) {
24842 for (uint64_t i = 0; i < app->configuration.tempBufferNum; i++) {
24843 if (app->configuration.tempBufferSize[i] == 0) {
24844 deleteVkFFT(app);
24846 }
24847 }
24848 }
24849 app->configuration.tempBuffer = inputLaunchConfiguration.tempBuffer;
24850 }
24851 else {
24853 app->configuration.tempBufferSize = (uint64_t*)malloc(sizeof(uint64_t));
24854 if (!app->configuration.tempBufferSize) {
24855 deleteVkFFT(app);
24857 }
24858 app->configuration.tempBufferSize[0] = 0;
24859
24860 }
24861
24863 if (inputLaunchConfiguration.inputBufferNum == 0) app->configuration.inputBufferNum = 1;
24864 else app->configuration.inputBufferNum = inputLaunchConfiguration.inputBufferNum;
24865#if(VKFFT_BACKEND==0)
24866 if (inputLaunchConfiguration.inputBufferSize == 0) {
24867 deleteVkFFT(app);
24869 }
24870#endif
24871 app->configuration.inputBufferSize = inputLaunchConfiguration.inputBufferSize;
24872 if (app->configuration.inputBufferSize != 0) {
24873 for (uint64_t i = 0; i < app->configuration.inputBufferNum; i++) {
24874 if (app->configuration.inputBufferSize[i] == 0) {
24875 deleteVkFFT(app);
24877 }
24878 }
24879 }
24880 app->configuration.inputBuffer = inputLaunchConfiguration.inputBuffer;
24881 }
24882 else {
24884
24887 }
24889 if (inputLaunchConfiguration.outputBufferNum == 0) app->configuration.outputBufferNum = 1;
24890 else
24891 app->configuration.outputBufferNum = inputLaunchConfiguration.outputBufferNum;
24892#if(VKFFT_BACKEND==0)
24893 if (inputLaunchConfiguration.outputBufferSize == 0) {
24894 deleteVkFFT(app);
24896 }
24897#endif
24898 app->configuration.outputBufferSize = inputLaunchConfiguration.outputBufferSize;
24899 if (app->configuration.outputBufferSize != 0) {
24900 for (uint64_t i = 0; i < app->configuration.outputBufferNum; i++) {
24901 if (app->configuration.outputBufferSize[i] == 0) {
24902 deleteVkFFT(app);
24904 }
24905 }
24906 }
24907 app->configuration.outputBuffer = inputLaunchConfiguration.outputBuffer;
24908 }
24909 else {
24911
24914 }
24916 if (inputLaunchConfiguration.kernelNum == 0) app->configuration.kernelNum = 1;
24917 else app->configuration.kernelNum = inputLaunchConfiguration.kernelNum;
24918#if(VKFFT_BACKEND==0)
24919 if (inputLaunchConfiguration.kernelSize == 0) {
24920 deleteVkFFT(app);
24922 }
24923#endif
24924 app->configuration.kernelSize = inputLaunchConfiguration.kernelSize;
24925 if (app->configuration.kernelSize != 0) {
24926 for (uint64_t i = 0; i < app->configuration.kernelNum; i++) {
24927 if (app->configuration.kernelSize[i] == 0) {
24928 deleteVkFFT(app);
24930 }
24931 }
24932 }
24933 app->configuration.kernel = inputLaunchConfiguration.kernel;
24934 }
24935
24936 if (inputLaunchConfiguration.bufferOffset != 0) app->configuration.bufferOffset = inputLaunchConfiguration.bufferOffset;
24937 if (inputLaunchConfiguration.tempBufferOffset != 0) app->configuration.tempBufferOffset = inputLaunchConfiguration.tempBufferOffset;
24938 if (inputLaunchConfiguration.inputBufferOffset != 0) app->configuration.inputBufferOffset = inputLaunchConfiguration.inputBufferOffset;
24939 if (inputLaunchConfiguration.outputBufferOffset != 0) app->configuration.outputBufferOffset = inputLaunchConfiguration.outputBufferOffset;
24940 if (inputLaunchConfiguration.kernelOffset != 0) app->configuration.kernelOffset = inputLaunchConfiguration.kernelOffset;
24941
24942 //set optional parameters:
24943 uint64_t checkBufferSizeFor64BitAddressing = 0;
24944 for (uint64_t i = 0; i < app->configuration.bufferNum; i++) {
24945 if (app->configuration.bufferSize)
24946 checkBufferSizeFor64BitAddressing += app->configuration.bufferSize[i];
24947 else {
24948 checkBufferSizeFor64BitAddressing = app->configuration.size[0] * app->configuration.size[1] * app->configuration.size[2] * app->configuration.coordinateFeatures * app->configuration.numberBatches * app->configuration.numberKernels * 8;
24949 if (app->configuration.doublePrecision) checkBufferSizeFor64BitAddressing *= 2;
24950 }
24951 }
24952 if (checkBufferSizeFor64BitAddressing >= (uint64_t)pow((uint64_t)2, (uint64_t)34)) app->configuration.useUint64 = 1;
24953 checkBufferSizeFor64BitAddressing = 0;
24954 for (uint64_t i = 0; i < app->configuration.inputBufferNum; i++) {
24956 checkBufferSizeFor64BitAddressing += app->configuration.inputBufferSize[i];
24957 }
24958 if (checkBufferSizeFor64BitAddressing >= (uint64_t)pow((uint64_t)2, (uint64_t)34)) app->configuration.useUint64 = 1;
24959
24960 checkBufferSizeFor64BitAddressing = 0;
24961 for (uint64_t i = 0; i < app->configuration.outputBufferNum; i++) {
24963 checkBufferSizeFor64BitAddressing += app->configuration.outputBufferSize[i];
24964 }
24965 if (checkBufferSizeFor64BitAddressing >= (uint64_t)pow((uint64_t)2, (uint64_t)34)) app->configuration.useUint64 = 1;
24966
24967 checkBufferSizeFor64BitAddressing = 0;
24968 for (uint64_t i = 0; i < app->configuration.kernelNum; i++) {
24969 if (app->configuration.kernelSize)
24970 checkBufferSizeFor64BitAddressing += app->configuration.kernelSize[i];
24971 }
24972 if (checkBufferSizeFor64BitAddressing >= (uint64_t)pow((uint64_t)2, (uint64_t)34)) app->configuration.useUint64 = 1;
24973 if (inputLaunchConfiguration.useUint64 != 0) app->configuration.useUint64 = inputLaunchConfiguration.useUint64;
24974
24975 if (inputLaunchConfiguration.coalescedMemory != 0) app->configuration.coalescedMemory = inputLaunchConfiguration.coalescedMemory;
24976 app->configuration.aimThreads = 128;
24977 if (inputLaunchConfiguration.aimThreads != 0) app->configuration.aimThreads = inputLaunchConfiguration.aimThreads;
24978 app->configuration.numSharedBanks = 32;
24979 if (inputLaunchConfiguration.numSharedBanks != 0) app->configuration.numSharedBanks = inputLaunchConfiguration.numSharedBanks;
24980 if (inputLaunchConfiguration.inverseReturnToInputBuffer != 0) app->configuration.inverseReturnToInputBuffer = inputLaunchConfiguration.inverseReturnToInputBuffer;
24981
24982 if (inputLaunchConfiguration.useLUT != 0) app->configuration.useLUT = inputLaunchConfiguration.useLUT;
24983 if (inputLaunchConfiguration.fixMaxRadixBluestein != 0) app->configuration.fixMaxRadixBluestein = inputLaunchConfiguration.fixMaxRadixBluestein;
24984
24985 if (inputLaunchConfiguration.performR2C != 0) {
24986 app->configuration.performR2C = inputLaunchConfiguration.performR2C;
24987 }
24988 if (inputLaunchConfiguration.performDCT != 0) {
24989 app->configuration.performDCT = inputLaunchConfiguration.performDCT;
24990 }
24991 if (inputLaunchConfiguration.disableMergeSequencesR2C != 0) {
24992 app->configuration.disableMergeSequencesR2C = inputLaunchConfiguration.disableMergeSequencesR2C;
24993 }
24994
24995 app->configuration.normalize = 0;
24996 if (inputLaunchConfiguration.normalize != 0) app->configuration.normalize = inputLaunchConfiguration.normalize;
24997 if (inputLaunchConfiguration.makeForwardPlanOnly != 0) app->configuration.makeForwardPlanOnly = inputLaunchConfiguration.makeForwardPlanOnly;
24998 if (inputLaunchConfiguration.makeInversePlanOnly != 0) app->configuration.makeInversePlanOnly = inputLaunchConfiguration.makeInversePlanOnly;
24999
25001 if (inputLaunchConfiguration.disableReorderFourStep != 0) app->configuration.reorderFourStep = 0;
25002 if (inputLaunchConfiguration.frequencyZeroPadding != 0) app->configuration.frequencyZeroPadding = inputLaunchConfiguration.frequencyZeroPadding;
25003 for (uint64_t i = 0; i < app->configuration.FFTdim; i++) {
25004 if (inputLaunchConfiguration.performZeropadding[i] != 0) {
25005 app->configuration.performZeropadding[i] = inputLaunchConfiguration.performZeropadding[i];
25006 app->configuration.fft_zeropad_left[i] = inputLaunchConfiguration.fft_zeropad_left[i];
25007 app->configuration.fft_zeropad_right[i] = inputLaunchConfiguration.fft_zeropad_right[i];
25008 }
25009 }
25010 if (inputLaunchConfiguration.registerBoost != 0) app->configuration.registerBoost = inputLaunchConfiguration.registerBoost;
25011 if (inputLaunchConfiguration.registerBoostNonPow2 != 0) app->configuration.registerBoostNonPow2 = inputLaunchConfiguration.registerBoostNonPow2;
25012 if (inputLaunchConfiguration.registerBoost4Step != 0) app->configuration.registerBoost4Step = inputLaunchConfiguration.registerBoost4Step;
25013
25014 if (app->configuration.performR2C != 0) {
25018 }
25019
25022 if (inputLaunchConfiguration.coordinateFeatures != 0) app->configuration.coordinateFeatures = inputLaunchConfiguration.coordinateFeatures;
25023 if (inputLaunchConfiguration.numberBatches != 0) app->configuration.numberBatches = inputLaunchConfiguration.numberBatches;
25024
25027 if (inputLaunchConfiguration.kernelConvolution != 0) {
25028 app->configuration.kernelConvolution = inputLaunchConfiguration.kernelConvolution;
25033 }
25034
25036
25037 if (inputLaunchConfiguration.matrixConvolution != 0) app->configuration.matrixConvolution = inputLaunchConfiguration.matrixConvolution;
25038 if (inputLaunchConfiguration.numberKernels != 0) app->configuration.numberKernels = inputLaunchConfiguration.numberKernels;
25039
25040 if (inputLaunchConfiguration.symmetricKernel != 0) app->configuration.symmetricKernel = inputLaunchConfiguration.symmetricKernel;
25041 if (inputLaunchConfiguration.conjugateConvolution != 0) app->configuration.conjugateConvolution = inputLaunchConfiguration.conjugateConvolution;
25042 if (inputLaunchConfiguration.crossPowerSpectrumNormalization != 0) app->configuration.crossPowerSpectrumNormalization = inputLaunchConfiguration.crossPowerSpectrumNormalization;
25043
25049 }
25050 app->firstAxis = 0;
25051 app->lastAxis = app->configuration.FFTdim - 1;
25052 if (inputLaunchConfiguration.omitDimension[0] != 0) {
25053 app->configuration.omitDimension[0] = inputLaunchConfiguration.omitDimension[0];
25054 app->firstAxis++;
25056 deleteVkFFT(app);
25058 }
25059 if (app->configuration.performR2C) {
25061 deleteVkFFT(app);
25062 }
25063 }
25064 if (inputLaunchConfiguration.omitDimension[2] != 0) {
25065 app->configuration.omitDimension[2] = inputLaunchConfiguration.omitDimension[2];
25066 app->lastAxis--;
25068 deleteVkFFT(app);
25070 }
25071 }
25072 if (inputLaunchConfiguration.omitDimension[1] != 0) {
25073 app->configuration.omitDimension[1] = inputLaunchConfiguration.omitDimension[1];
25074 if (app->configuration.omitDimension[0] == 1) app->firstAxis++;
25075 if (app->configuration.omitDimension[2] == 1) app->lastAxis--;
25077 deleteVkFFT(app);
25079 }
25080 }
25081 if (app->firstAxis > app->lastAxis) {
25083 deleteVkFFT(app);
25084 }
25085 if (inputLaunchConfiguration.reorderFourStep != 0) app->configuration.reorderFourStep = inputLaunchConfiguration.reorderFourStep;
25086 app->configuration.maxCodeLength = 1000000;
25087 if (inputLaunchConfiguration.maxCodeLength != 0) app->configuration.maxCodeLength = inputLaunchConfiguration.maxCodeLength;
25088 app->configuration.maxTempLength = 5000;
25089 if (inputLaunchConfiguration.maxTempLength != 0) app->configuration.maxTempLength = inputLaunchConfiguration.maxTempLength;
25090
25091 if (inputLaunchConfiguration.halfThreads != 0) app->configuration.halfThreads = inputLaunchConfiguration.halfThreads;
25092 if (inputLaunchConfiguration.swapTo3Stage4Step != 0) app->configuration.swapTo3Stage4Step = inputLaunchConfiguration.swapTo3Stage4Step;
25094 if (inputLaunchConfiguration.performBandwidthBoost != 0) app->configuration.performBandwidthBoost = inputLaunchConfiguration.performBandwidthBoost;
25095 if (inputLaunchConfiguration.devicePageSize != 0) app->configuration.devicePageSize = inputLaunchConfiguration.devicePageSize;
25096 if (inputLaunchConfiguration.localPageSize != 0) app->configuration.localPageSize = inputLaunchConfiguration.localPageSize;
25097 if (inputLaunchConfiguration.keepShaderCode != 0) app->configuration.keepShaderCode = inputLaunchConfiguration.keepShaderCode;
25098 if (inputLaunchConfiguration.printMemoryLayout != 0) app->configuration.printMemoryLayout = inputLaunchConfiguration.printMemoryLayout;
25099 if (inputLaunchConfiguration.considerAllAxesStrided != 0) app->configuration.considerAllAxesStrided = inputLaunchConfiguration.considerAllAxesStrided;
25100 //temporary set:
25102#if(VKFFT_BACKEND==0)
25103 app->configuration.useUint64 = 0; //No physical addressing mode in Vulkan shaders. Use multiple-buffer support to achieve emulation of physical addressing.
25104#endif
25105 VkFFTResult resFFT = VKFFT_SUCCESS;
25106 uint64_t initSharedMemory = app->configuration.sharedMemorySize;
25108 app->localFFTPlan_inverse = (VkFFTPlan*)calloc(1, sizeof(VkFFTPlan));
25109 if (app->localFFTPlan_inverse) {
25110 for (uint64_t i = 0; i < app->configuration.FFTdim; i++) {
25111 app->configuration.sharedMemorySize = ((app->configuration.size[i] & (app->configuration.size[i] - 1)) == 0) ? app->configuration.sharedMemorySizePow2 : initSharedMemory;
25112 resFFT = VkFFTScheduler(app, app->localFFTPlan_inverse, i, 0);
25113 if (resFFT != VKFFT_SUCCESS) {
25114 deleteVkFFT(app);
25115 return resFFT;
25116 }
25117 if (app->useBluesteinFFT[i] && (app->localFFTPlan_inverse->numAxisUploads[i] > 1)) {
25118 for (uint64_t j = 0; j < app->localFFTPlan_inverse->numAxisUploads[i]; j++) {
25120 }
25121 }
25122 }
25123 for (uint64_t i = 0; i < app->configuration.FFTdim; i++) {
25124 app->configuration.sharedMemorySize = ((app->configuration.size[i] & (app->configuration.size[i] - 1)) == 0) ? app->configuration.sharedMemorySizePow2 : initSharedMemory;
25125 for (uint64_t j = 0; j < app->localFFTPlan_inverse->numAxisUploads[i]; j++) {
25126 resFFT = VkFFTPlanAxis(app, app->localFFTPlan_inverse, i, j, 1, 0);
25127 if (resFFT != VKFFT_SUCCESS) {
25128 deleteVkFFT(app);
25129 return resFFT;
25130 }
25131 }
25132 if (app->useBluesteinFFT[i] && (app->localFFTPlan_inverse->numAxisUploads[i] > 1)) {
25133 for (uint64_t j = 1; j < app->localFFTPlan_inverse->numAxisUploads[i]; j++) {
25134 resFFT = VkFFTPlanAxis(app, app->localFFTPlan_inverse, i, j, 1, 1);
25135 if (resFFT != VKFFT_SUCCESS) {
25136 deleteVkFFT(app);
25137 return resFFT;
25138 }
25139 }
25140 }
25141 if ((app->localFFTPlan_inverse->multiUploadR2C) && (i == 0)) {
25143 if (resFFT != VKFFT_SUCCESS) {
25144 deleteVkFFT(app);
25145 return resFFT;
25146 }
25147 }
25148 }
25149 }
25150 else {
25151 deleteVkFFT(app);
25153 }
25154 }
25156 app->localFFTPlan = (VkFFTPlan*)calloc(1, sizeof(VkFFTPlan));
25157 if (app->localFFTPlan) {
25158 for (uint64_t i = 0; i < app->configuration.FFTdim; i++) {
25159 app->configuration.sharedMemorySize = ((app->configuration.size[i] & (app->configuration.size[i] - 1)) == 0) ? app->configuration.sharedMemorySizePow2 : initSharedMemory;
25160 resFFT = VkFFTScheduler(app, app->localFFTPlan, i, 0);
25161 if (resFFT != VKFFT_SUCCESS) {
25162 deleteVkFFT(app);
25163 return resFFT;
25164 }
25165 if (app->useBluesteinFFT[i] && (app->localFFTPlan->numAxisUploads[i] > 1)) {
25166 for (uint64_t j = 0; j < app->localFFTPlan->numAxisUploads[i]; j++) {
25167 app->localFFTPlan->inverseBluesteinAxes[i][j] = app->localFFTPlan->axes[i][j];
25168 }
25169 }
25170 }
25171 for (uint64_t i = 0; i < app->configuration.FFTdim; i++) {
25172 app->configuration.sharedMemorySize = ((app->configuration.size[i] & (app->configuration.size[i] - 1)) == 0) ? app->configuration.sharedMemorySizePow2 : initSharedMemory;
25173 for (uint64_t j = 0; j < app->localFFTPlan->numAxisUploads[i]; j++) {
25174 resFFT = VkFFTPlanAxis(app, app->localFFTPlan, i, j, 0, 0);
25175 if (resFFT != VKFFT_SUCCESS) {
25176 deleteVkFFT(app);
25177 return resFFT;
25178 }
25179 }
25180 if (app->useBluesteinFFT[i] && (app->localFFTPlan->numAxisUploads[i] > 1)) {
25181 for (uint64_t j = 1; j < app->localFFTPlan->numAxisUploads[i]; j++) {
25182 resFFT = VkFFTPlanAxis(app, app->localFFTPlan, i, j, 0, 1);
25183 if (resFFT != VKFFT_SUCCESS) {
25184 deleteVkFFT(app);
25185 return resFFT;
25186 }
25187 }
25188 }
25189 if ((app->localFFTPlan->multiUploadR2C) && (i == 0)) {
25191 if (resFFT != VKFFT_SUCCESS) {
25192 deleteVkFFT(app);
25193 return resFFT;
25194 }
25195 }
25196 }
25197 }
25198 else {
25199 deleteVkFFT(app);
25201 }
25202 }
25203 for (uint64_t i = 0; i < app->configuration.FFTdim; i++) {
25204 if (app->useBluesteinFFT[i]) {
25206 resFFT = VkFFTGeneratePhaseVectors(app, app->localFFTPlan, i, 0);
25207 else
25208 resFFT = VkFFTGeneratePhaseVectors(app, app->localFFTPlan_inverse, i, 0);
25209 if (resFFT != VKFFT_SUCCESS) {
25210 deleteVkFFT(app);
25211 return resFFT;
25212 }
25213 }
25214 }
25215#if(VKFFT_BACKEND==0)
25217 glslang_finalize_process();
25219 }
25220#endif
25221 return resFFT;
25222}
25223static inline VkFFTResult dispatchEnhanced(VkFFTApplication* app, VkFFTAxis* axis, uint64_t* dispatchBlock) {
25224 VkFFTResult resFFT = VKFFT_SUCCESS;
25225 uint64_t maxBlockSize[3] = { (uint64_t)pow(2,(uint64_t)log2(app->configuration.maxComputeWorkGroupCount[0])),(uint64_t)pow(2,(uint64_t)log2(app->configuration.maxComputeWorkGroupCount[1])),(uint64_t)pow(2,(uint64_t)log2(app->configuration.maxComputeWorkGroupCount[2])) };
25226 uint64_t blockNumber[3] = { (uint64_t)ceil(dispatchBlock[0] / (double)maxBlockSize[0]),(uint64_t)ceil(dispatchBlock[1] / (double)maxBlockSize[1]),(uint64_t)ceil(dispatchBlock[2] / (double)maxBlockSize[2]) };
25227 if (blockNumber[0] == 0) blockNumber[0] = 1;
25228 if (blockNumber[1] == 0) blockNumber[1] = 1;
25229 if (blockNumber[2] == 0) blockNumber[2] = 1;
25230 if ((blockNumber[0] > 1) && (blockNumber[0] * maxBlockSize[0] != dispatchBlock[0])) {
25231 for (uint64_t i = app->configuration.maxComputeWorkGroupCount[0]; i > 0; i--) {
25232 if (dispatchBlock[0] % i == 0) {
25233 maxBlockSize[0] = i;
25234 blockNumber[0] = dispatchBlock[0] / i;
25235 i = 1;
25236 }
25237 }
25238 }
25239 if ((blockNumber[1] > 1) && (blockNumber[1] * maxBlockSize[1] != dispatchBlock[1])) {
25240 for (uint64_t i = app->configuration.maxComputeWorkGroupCount[1]; i > 0; i--) {
25241 if (dispatchBlock[1] % i == 0) {
25242 maxBlockSize[1] = i;
25243 blockNumber[1] = dispatchBlock[1] / i;
25244 i = 1;
25245 }
25246 }
25247 }
25248 if ((blockNumber[2] > 1) && (blockNumber[2] * maxBlockSize[2] != dispatchBlock[2])) {
25249 for (uint64_t i = app->configuration.maxComputeWorkGroupCount[2]; i > 0; i--) {
25250 if (dispatchBlock[2] % i == 0) {
25251 maxBlockSize[2] = i;
25252 blockNumber[2] = dispatchBlock[2] / i;
25253 i = 1;
25254 }
25255 }
25256 }
25257 //printf("%" PRIu64 " %" PRIu64 " %" PRIu64 "\n", dispatchBlock[0], dispatchBlock[1], dispatchBlock[2]);
25258 //printf("%" PRIu64 " %" PRIu64 " %" PRIu64 "\n", blockNumber[0], blockNumber[1], blockNumber[2]);
25259 for (uint64_t i = 0; i < 3; i++)
25260 if (blockNumber[i] == 1) maxBlockSize[i] = dispatchBlock[i];
25261 for (uint64_t i = 0; i < blockNumber[0]; i++) {
25262 for (uint64_t j = 0; j < blockNumber[1]; j++) {
25263 for (uint64_t k = 0; k < blockNumber[2]; k++) {
25264 if (axis->pushConstants.workGroupShift[0] != i * maxBlockSize[0]) {
25265 axis->pushConstants.workGroupShift[0] = i * maxBlockSize[0];
25266 axis->updatePushConstants = 1;
25267 }
25268 if (axis->pushConstants.workGroupShift[1] != j * maxBlockSize[1]) {
25269 axis->pushConstants.workGroupShift[1] = j * maxBlockSize[1];
25270 axis->updatePushConstants = 1;
25271 }
25272 if (axis->pushConstants.workGroupShift[2] != k * maxBlockSize[2]) {
25273 axis->pushConstants.workGroupShift[2] = k * maxBlockSize[2];
25274 axis->updatePushConstants = 1;
25275 }
25276#if(VKFFT_BACKEND==0)
25277 size_t sizePushConsts = (app->configuration.useUint64) ? sizeof(VkFFTPushConstantsLayoutUint64) : sizeof(VkFFTPushConstantsLayoutUint32);
25278 if (app->configuration.useUint64) {
25279 vkCmdPushConstants(app->configuration.commandBuffer[0], axis->pipelineLayout, VK_SHADER_STAGE_COMPUTE_BIT, 0, (uint32_t)sizePushConsts, &axis->pushConstants);
25280 }
25281 else {
25282 axis->pushConstantsUint32.workGroupShift[0] = (uint32_t)axis->pushConstants.workGroupShift[0];
25283 axis->pushConstantsUint32.workGroupShift[1] = (uint32_t)axis->pushConstants.workGroupShift[1];
25284 axis->pushConstantsUint32.workGroupShift[2] = (uint32_t)axis->pushConstants.workGroupShift[2];
25285 vkCmdPushConstants(app->configuration.commandBuffer[0], axis->pipelineLayout, VK_SHADER_STAGE_COMPUTE_BIT, 0, (uint32_t)sizePushConsts, &axis->pushConstantsUint32);
25286 }
25287 vkCmdDispatch(app->configuration.commandBuffer[0], (uint32_t)maxBlockSize[0], (uint32_t)maxBlockSize[1], (uint32_t)maxBlockSize[2]);
25288#elif(VKFFT_BACKEND==1)
25289 void* args[6];
25290 CUresult result = CUDA_SUCCESS;
25291 args[0] = axis->inputBuffer;
25292 args[1] = axis->outputBuffer;
25293 uint64_t args_id = 2;
25294 if (axis->specializationConstants.convolutionStep) {
25295 args[args_id] = app->configuration.kernel;
25296 args_id++;
25297 }
25298 if (axis->specializationConstants.LUT) {
25299 args[args_id] = &axis->bufferLUT;
25300 args_id++;
25301 }
25302 if (axis->specializationConstants.useBluesteinFFT && axis->specializationConstants.BluesteinConvolutionStep) {
25303 if (axis->specializationConstants.inverseBluestein)
25304 args[args_id] = &app->bufferBluesteinIFFT[axis->specializationConstants.axis_id];
25305 else
25306 args[args_id] = &app->bufferBluesteinFFT[axis->specializationConstants.axis_id];
25307 args_id++;
25308 }
25309 if (axis->specializationConstants.useBluesteinFFT && (axis->specializationConstants.BluesteinPreMultiplication || axis->specializationConstants.BluesteinPostMultiplication)) {
25310 args[args_id] = &app->bufferBluestein[axis->specializationConstants.axis_id];
25311 args_id++;
25312 }
25313 //args[args_id] = &axis->pushConstants;
25314 if (axis->updatePushConstants) {
25315 axis->updatePushConstants = 0;
25316 size_t sizePushConsts = (app->configuration.useUint64) ? sizeof(VkFFTPushConstantsLayoutUint64) : sizeof(VkFFTPushConstantsLayoutUint32);
25317 if (app->configuration.useUint64) {
25318 result = cuMemcpyHtoD(axis->consts_addr, &axis->pushConstants, sizePushConsts);
25319 }
25320 else {
25321 axis->pushConstantsUint32.workGroupShift[0] = (uint32_t)axis->pushConstants.workGroupShift[0];
25322 axis->pushConstantsUint32.workGroupShift[1] = (uint32_t)axis->pushConstants.workGroupShift[1];
25323 axis->pushConstantsUint32.workGroupShift[2] = (uint32_t)axis->pushConstants.workGroupShift[2];
25324 result = cuMemcpyHtoD(axis->consts_addr, &axis->pushConstantsUint32, sizePushConsts);
25325 }
25326 if (result != CUDA_SUCCESS) {
25327 printf("cuMemcpyHtoD error: %d\n", result);
25329 }
25330 }
25331 if (app->configuration.num_streams >= 1) {
25332 result = cuLaunchKernel(axis->VkFFTKernel,
25333 (unsigned int)maxBlockSize[0], (unsigned int)maxBlockSize[1], (unsigned int)maxBlockSize[2], // grid dim
25334 (unsigned int)axis->specializationConstants.localSize[0], (unsigned int)axis->specializationConstants.localSize[1], (unsigned int)axis->specializationConstants.localSize[2], // block dim
25335 (unsigned int)axis->specializationConstants.usedSharedMemory, app->configuration.stream[app->configuration.streamID], // shared mem and stream
25336 args, 0);
25337 }
25338 else {
25339 result = cuLaunchKernel(axis->VkFFTKernel,
25340 (unsigned int)maxBlockSize[0], (unsigned int)maxBlockSize[1], (unsigned int)maxBlockSize[2], // grid dim
25341 (unsigned int)axis->specializationConstants.localSize[0], (unsigned int)axis->specializationConstants.localSize[1], (unsigned int)axis->specializationConstants.localSize[2], // block dim
25342 (unsigned int)axis->specializationConstants.usedSharedMemory, 0, // shared mem and stream
25343 args, 0);
25344 }
25345 if (result != CUDA_SUCCESS) {
25346 printf("cuLaunchKernel error: %d, %" PRIu64 " %" PRIu64 " %" PRIu64 " - %" PRIu64 " %" PRIu64 " %" PRIu64 "\n", result, maxBlockSize[0], maxBlockSize[1], maxBlockSize[2], axis->specializationConstants.localSize[0], axis->specializationConstants.localSize[1], axis->specializationConstants.localSize[2]);
25348 }
25349 if (app->configuration.num_streams > 1) {
25350 app->configuration.streamID = app->configuration.streamCounter % app->configuration.num_streams;
25351 if (app->configuration.streamCounter == 0) {
25352 cudaError_t res2 = cudaEventRecord(app->configuration.stream_event[app->configuration.streamID], app->configuration.stream[app->configuration.streamID]);
25353 if (res2 != cudaSuccess) return VKFFT_ERROR_FAILED_TO_EVENT_RECORD;
25354 }
25355 app->configuration.streamCounter++;
25356 }
25357#elif(VKFFT_BACKEND==2)
25358 hipError_t result = hipSuccess;
25359 void* args[6];
25360 args[0] = axis->inputBuffer;
25361 args[1] = axis->outputBuffer;
25362 uint64_t args_id = 2;
25363 if (axis->specializationConstants.convolutionStep) {
25364 args[args_id] = app->configuration.kernel;
25365 args_id++;
25366 }
25367 if (axis->specializationConstants.LUT) {
25368 args[args_id] = &axis->bufferLUT;
25369 args_id++;
25370 }
25371 if (axis->specializationConstants.useBluesteinFFT && axis->specializationConstants.BluesteinConvolutionStep) {
25372 if (axis->specializationConstants.inverseBluestein)
25373 args[args_id] = &app->bufferBluesteinIFFT[axis->specializationConstants.axis_id];
25374 else
25375 args[args_id] = &app->bufferBluesteinFFT[axis->specializationConstants.axis_id];
25376 args_id++;
25377 }
25378 if (axis->specializationConstants.useBluesteinFFT && (axis->specializationConstants.BluesteinPreMultiplication || axis->specializationConstants.BluesteinPostMultiplication)) {
25379 args[args_id] = &app->bufferBluestein[axis->specializationConstants.axis_id];
25380 args_id++;
25381 }
25382 //args[args_id] = &axis->pushConstants;
25383 if (axis->updatePushConstants) {
25384 axis->updatePushConstants = 0;
25385 size_t sizePushConsts = (app->configuration.useUint64) ? sizeof(VkFFTPushConstantsLayoutUint64) : sizeof(VkFFTPushConstantsLayoutUint32);
25386 if (app->configuration.useUint64) {
25387 result = hipMemcpyHtoD(axis->consts_addr, &axis->pushConstants, sizePushConsts);
25388 }
25389 else {
25390 axis->pushConstantsUint32.workGroupShift[0] = (uint32_t)axis->pushConstants.workGroupShift[0];
25391 axis->pushConstantsUint32.workGroupShift[1] = (uint32_t)axis->pushConstants.workGroupShift[1];
25392 axis->pushConstantsUint32.workGroupShift[2] = (uint32_t)axis->pushConstants.workGroupShift[2];
25393 result = hipMemcpyHtoD(axis->consts_addr, &axis->pushConstantsUint32, sizePushConsts);
25394 }
25395 if (result != hipSuccess) {
25396 printf("hipMemcpyHtoD error: %d\n", result);
25398 }
25399 }
25400 //printf("%" PRIu64 " %" PRIu64 " %" PRIu64 " %" PRIu64 " %" PRIu64 " %" PRIu64 "\n",maxBlockSize[0], maxBlockSize[1], maxBlockSize[2], axis->specializationConstants.localSize[0], axis->specializationConstants.localSize[1], axis->specializationConstants.localSize[2]);
25401 if (app->configuration.num_streams >= 1) {
25402 result = hipModuleLaunchKernel(axis->VkFFTKernel,
25403 (unsigned int)maxBlockSize[0], (unsigned int)maxBlockSize[1], (unsigned int)maxBlockSize[2], // grid dim
25404 (unsigned int)axis->specializationConstants.localSize[0], (unsigned int)axis->specializationConstants.localSize[1], (unsigned int)axis->specializationConstants.localSize[2], // block dim
25405 (unsigned int)axis->specializationConstants.usedSharedMemory, app->configuration.stream[app->configuration.streamID], // shared mem and stream
25406 args, 0);
25407 }
25408 else {
25409 result = hipModuleLaunchKernel(axis->VkFFTKernel,
25410 (unsigned int)maxBlockSize[0], (unsigned int)maxBlockSize[1], (unsigned int)maxBlockSize[2], // grid dim
25411 (unsigned int)axis->specializationConstants.localSize[0], (unsigned int)axis->specializationConstants.localSize[1], (unsigned int)axis->specializationConstants.localSize[2], // block dim
25412 (unsigned int)axis->specializationConstants.usedSharedMemory, 0, // shared mem and stream
25413 args, 0);
25414 }
25415 if (result != hipSuccess) {
25416 printf("hipModuleLaunchKernel error: %d, %" PRIu64 " %" PRIu64 " %" PRIu64 " - %" PRIu64 " %" PRIu64 " %" PRIu64 "\n", result, maxBlockSize[0], maxBlockSize[1], maxBlockSize[2], axis->specializationConstants.localSize[0], axis->specializationConstants.localSize[1], axis->specializationConstants.localSize[2]);
25418 }
25419 if (app->configuration.num_streams > 1) {
25420 app->configuration.streamID = app->configuration.streamCounter % app->configuration.num_streams;
25421 if (app->configuration.streamCounter == 0) {
25422 result = hipEventRecord(app->configuration.stream_event[app->configuration.streamID], app->configuration.stream[app->configuration.streamID]);
25423 if (result != hipSuccess) return VKFFT_ERROR_FAILED_TO_EVENT_RECORD;
25424 }
25425 app->configuration.streamCounter++;
25426 }
25427#elif(VKFFT_BACKEND==3)
25428 cl_int result = CL_SUCCESS;
25429 void* args[6];
25430 args[0] = axis->inputBuffer;
25431 result = clSetKernelArg(axis->kernel, 0, sizeof(cl_mem), args[0]);
25432 if (result != CL_SUCCESS) {
25434 }
25435 args[1] = axis->outputBuffer;
25436 result = clSetKernelArg(axis->kernel, 1, sizeof(cl_mem), args[1]);
25437 if (result != CL_SUCCESS) {
25439 }
25440 uint64_t args_id = 2;
25441 if (axis->specializationConstants.convolutionStep) {
25442 args[args_id] = app->configuration.kernel;
25443 result = clSetKernelArg(axis->kernel, (cl_uint)args_id, sizeof(cl_mem), args[args_id]);
25444 if (result != CL_SUCCESS) {
25446 }
25447 args_id++;
25448 }
25449 if (axis->specializationConstants.LUT) {
25450 args[args_id] = &axis->bufferLUT;
25451 result = clSetKernelArg(axis->kernel, (cl_uint)args_id, sizeof(cl_mem), args[args_id]);
25452 if (result != CL_SUCCESS) {
25454 }
25455 args_id++;
25456 }
25457 if (axis->specializationConstants.useBluesteinFFT && axis->specializationConstants.BluesteinConvolutionStep) {
25458 if (axis->specializationConstants.inverseBluestein)
25459 args[args_id] = &app->bufferBluesteinIFFT[axis->specializationConstants.axis_id];
25460 else
25461 args[args_id] = &app->bufferBluesteinFFT[axis->specializationConstants.axis_id];
25462 result = clSetKernelArg(axis->kernel, (cl_uint)args_id, sizeof(cl_mem), args[args_id]);
25463 if (result != CL_SUCCESS) {
25465 }
25466 args_id++;
25467 }
25468 if (axis->specializationConstants.useBluesteinFFT && (axis->specializationConstants.BluesteinPreMultiplication || axis->specializationConstants.BluesteinPostMultiplication)) {
25469 args[args_id] = &app->bufferBluestein[axis->specializationConstants.axis_id];
25470 result = clSetKernelArg(axis->kernel, (cl_uint)args_id, sizeof(cl_mem), args[args_id]);
25471 if (result != CL_SUCCESS) {
25473 }
25474 args_id++;
25475 }
25476
25477 size_t sizePushConsts = (app->configuration.useUint64) ? sizeof(VkFFTPushConstantsLayoutUint64) : sizeof(VkFFTPushConstantsLayoutUint32);
25478 if (app->configuration.useUint64) {
25479 result = clSetKernelArg(axis->kernel, (cl_uint)args_id, sizePushConsts, &axis->pushConstants);
25480 }
25481 else {
25482 axis->pushConstantsUint32.workGroupShift[0] = (uint32_t)axis->pushConstants.workGroupShift[0];
25483 axis->pushConstantsUint32.workGroupShift[1] = (uint32_t)axis->pushConstants.workGroupShift[1];
25484 axis->pushConstantsUint32.workGroupShift[2] = (uint32_t)axis->pushConstants.workGroupShift[2];
25485 result = clSetKernelArg(axis->kernel, (cl_uint)args_id, sizePushConsts, &axis->pushConstantsUint32);
25486 }
25487 if (result != CL_SUCCESS) {
25489 }
25490 args_id++;
25491 size_t local_work_size[3] = { (size_t)axis->specializationConstants.localSize[0], (size_t)axis->specializationConstants.localSize[1],(size_t)axis->specializationConstants.localSize[2] };
25492 size_t global_work_size[3] = { (size_t)maxBlockSize[0] * local_work_size[0] , (size_t)maxBlockSize[1] * local_work_size[1] ,(size_t)maxBlockSize[2] * local_work_size[2] };
25493 result = clEnqueueNDRangeKernel(app->configuration.commandQueue[0], axis->kernel, 3, 0, global_work_size, local_work_size, 0, 0, 0);
25494 //printf("%" PRIu64 " %" PRIu64 " %" PRIu64 " - %" PRIu64 " %" PRIu64 " %" PRIu64 "\n", maxBlockSize[0], maxBlockSize[1], maxBlockSize[2], axis->specializationConstants.localSize[0], axis->specializationConstants.localSize[1], axis->specializationConstants.localSize[2]);
25495
25496 if (result != CL_SUCCESS) {
25498 }
25499#endif
25500 }
25501 }
25502 }
25503 return resFFT;
25504}
25506#if(VKFFT_BACKEND==0)
25507 vkCmdPipelineBarrier(app->configuration.commandBuffer[0], VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, 0, 1, app->configuration.memory_barrier, 0, 0, 0, 0);
25508#elif(VKFFT_BACKEND==1)
25509 if (app->configuration.num_streams > 1) {
25510 cudaError_t res = cudaSuccess;
25511 for (uint64_t s = 0; s < app->configuration.num_streams; s++) {
25512 res = cudaEventSynchronize(app->configuration.stream_event[s]);
25513 if (res != cudaSuccess) return VKFFT_ERROR_FAILED_TO_SYNCHRONIZE;
25514 }
25515 app->configuration.streamCounter = 0;
25516 }
25517#elif(VKFFT_BACKEND==2)
25518 if (app->configuration.num_streams > 1) {
25519 hipError_t res = hipSuccess;
25520 for (uint64_t s = 0; s < app->configuration.num_streams; s++) {
25521 res = hipEventSynchronize(app->configuration.stream_event[s]);
25522 if (res != hipSuccess) return VKFFT_ERROR_FAILED_TO_SYNCHRONIZE;
25523 }
25524 app->configuration.streamCounter = 0;
25525 }
25526#elif(VKFFT_BACKEND==3)
25527#endif
25528 return VKFFT_SUCCESS;
25529}
25530static inline void printDebugInformation(VkFFTApplication* app, VkFFTAxis* axis) {
25531 if (app->configuration.keepShaderCode) printf("%s\n", axis->specializationConstants.code0);
25533 if ((axis->inputBuffer == app->configuration.inputBuffer) && (app->configuration.inputBuffer != app->configuration.buffer))
25534 printf("read: inputBuffer\n");
25535 if (axis->inputBuffer == app->configuration.buffer)
25536 printf("read: buffer\n");
25537 if (axis->inputBuffer == app->configuration.tempBuffer)
25538 printf("read: tempBuffer\n");
25539 if ((axis->inputBuffer == app->configuration.outputBuffer) && (app->configuration.outputBuffer != app->configuration.buffer))
25540 printf("read: outputBuffer\n");
25541 if ((axis->outputBuffer == app->configuration.inputBuffer) && (app->configuration.inputBuffer != app->configuration.buffer))
25542 printf("write: inputBuffer\n");
25543 if (axis->outputBuffer == app->configuration.buffer)
25544 printf("write: buffer\n");
25545 if (axis->outputBuffer == app->configuration.tempBuffer)
25546 printf("write: tempBuffer\n");
25547 if ((axis->outputBuffer == app->configuration.outputBuffer) && (app->configuration.outputBuffer != app->configuration.buffer))
25548 printf("write: outputBuffer\n");
25549 }
25550}
25551static inline VkFFTResult VkFFTAppend(VkFFTApplication* app, int inverse, VkFFTLaunchParams* launchParams) {
25552 VkFFTResult resFFT = VKFFT_SUCCESS;
25553#if(VKFFT_BACKEND==0)
25554 app->configuration.commandBuffer = launchParams->commandBuffer;
25555 VkMemoryBarrier memory_barrier = {
25556 VK_STRUCTURE_TYPE_MEMORY_BARRIER,
25557 0,
25558 VK_ACCESS_SHADER_WRITE_BIT,
25559 VK_ACCESS_SHADER_READ_BIT,
25560 };
25561 app->configuration.memory_barrier = &memory_barrier;
25562#elif(VKFFT_BACKEND==1)
25563 app->configuration.streamCounter = 0;
25564#elif(VKFFT_BACKEND==2)
25565 app->configuration.streamCounter = 0;
25566#elif(VKFFT_BACKEND==3)
25567 app->configuration.commandQueue = launchParams->commandQueue;
25568#endif
25569 uint64_t localSize0[3];
25572 if ((inverse != 1) && (!app->configuration.makeInversePlanOnly) && (!app->localFFTPlan)) return VKFFT_ERROR_PLAN_NOT_INITIALIZED;
25573 if ((inverse == 1) && (!app->configuration.makeForwardPlanOnly) && (!app->localFFTPlan_inverse)) return VKFFT_ERROR_PLAN_NOT_INITIALIZED;
25574 if (inverse == 1) {
25575 localSize0[0] = app->localFFTPlan_inverse->actualFFTSizePerAxis[0][0];
25576 localSize0[1] = app->localFFTPlan_inverse->actualFFTSizePerAxis[1][0];
25577 localSize0[2] = app->localFFTPlan_inverse->actualFFTSizePerAxis[2][0];
25578 }
25579 else {
25580 localSize0[0] = app->localFFTPlan->actualFFTSizePerAxis[0][0];
25581 localSize0[1] = app->localFFTPlan->actualFFTSizePerAxis[1][0];
25582 localSize0[2] = app->localFFTPlan->actualFFTSizePerAxis[2][0];
25583 }
25584 resFFT = VkFFTCheckUpdateBufferSet(app, 0, 0, launchParams);
25585 if (resFFT != VKFFT_SUCCESS) {
25586 return resFFT;
25587 }
25588 if (inverse != 1) {
25589 //FFT axis 0
25590 if (!app->configuration.omitDimension[0]) {
25591 for (int64_t l = (int64_t)app->localFFTPlan->numAxisUploads[0] - 1; l >= 0; l--) {
25592 VkFFTAxis* axis = &app->localFFTPlan->axes[0][l];
25593 resFFT = VkFFTUpdateBufferSet(app, app->localFFTPlan, axis, 0, l, 0);
25594 if (resFFT != VKFFT_SUCCESS) return resFFT;
25595 uint64_t maxCoordinate = ((app->configuration.matrixConvolution > 1) && (app->configuration.performConvolution) && (app->configuration.FFTdim == 1)) ? 1 : app->configuration.coordinateFeatures;
25596#if(VKFFT_BACKEND==0)
25597 vkCmdBindPipeline(app->configuration.commandBuffer[0], VK_PIPELINE_BIND_POINT_COMPUTE, axis->pipeline);
25598 vkCmdBindDescriptorSets(app->configuration.commandBuffer[0], VK_PIPELINE_BIND_POINT_COMPUTE, axis->pipelineLayout, 0, 1, &axis->descriptorSet, 0, 0);
25599#endif
25600 uint64_t dispatchBlock[3];
25601 if (l == 0) {
25602 if (app->localFFTPlan->numAxisUploads[0] > 2) {
25603 dispatchBlock[0] = (uint64_t)ceil((uint64_t)ceil(app->localFFTPlan->actualFFTSizePerAxis[0][0] / axis->specializationConstants.fftDim / (double)axis->axisBlock[1]) / (double)app->localFFTPlan->axisSplit[0][1]) * app->localFFTPlan->axisSplit[0][1];
25604 dispatchBlock[1] = app->localFFTPlan->actualFFTSizePerAxis[0][1];
25605 }
25606 else {
25607 if (app->localFFTPlan->numAxisUploads[0] > 1) {
25608 dispatchBlock[0] = (uint64_t)ceil((uint64_t)ceil(app->localFFTPlan->actualFFTSizePerAxis[0][0] / axis->specializationConstants.fftDim / (double)axis->axisBlock[1]));
25609 dispatchBlock[1] = app->localFFTPlan->actualFFTSizePerAxis[0][1];
25610 }
25611 else {
25612 dispatchBlock[0] = app->localFFTPlan->actualFFTSizePerAxis[0][0] / axis->specializationConstants.fftDim;
25613 dispatchBlock[1] = (uint64_t)ceil(app->localFFTPlan->actualFFTSizePerAxis[0][1] / (double)axis->axisBlock[1]);
25614 }
25615 }
25616 }
25617 else {
25618 dispatchBlock[0] = (uint64_t)ceil(app->localFFTPlan->actualFFTSizePerAxis[0][0] / axis->specializationConstants.fftDim / (double)axis->axisBlock[0]);
25619 dispatchBlock[1] = app->localFFTPlan->actualFFTSizePerAxis[0][1];
25620 }
25621 dispatchBlock[2] = app->localFFTPlan->actualFFTSizePerAxis[0][2] * maxCoordinate * app->configuration.numberBatches;
25622 if (axis->specializationConstants.mergeSequencesR2C == 1) dispatchBlock[1] = (uint64_t)ceil(dispatchBlock[1] / 2.0);
25623 //if (app->configuration.performZeropadding[1]) dispatchBlock[1] = (uint64_t)ceil(dispatchBlock[1] / 2.0);
25624 //if (app->configuration.performZeropadding[2]) dispatchBlock[2] = (uint64_t)ceil(dispatchBlock[2] / 2.0);
25625 resFFT = dispatchEnhanced(app, axis, dispatchBlock);
25626 if (resFFT != VKFFT_SUCCESS) return resFFT;
25627 printDebugInformation(app, axis);
25628 resFFT = VkFFTSync(app);
25629 if (resFFT != VKFFT_SUCCESS) return resFFT;
25630 }
25631 if (app->useBluesteinFFT[0] && (app->localFFTPlan->numAxisUploads[0] > 1)) {
25632 for (int64_t l = 1; l < (int64_t)app->localFFTPlan->numAxisUploads[0]; l++) {
25633 VkFFTAxis* axis = &app->localFFTPlan->inverseBluesteinAxes[0][l];
25634 resFFT = VkFFTUpdateBufferSet(app, app->localFFTPlan, axis, 0, l, 1);
25635 if (resFFT != VKFFT_SUCCESS) return resFFT;
25636 uint64_t maxCoordinate = ((app->configuration.matrixConvolution > 1) && (app->configuration.performConvolution) && (app->configuration.FFTdim == 1)) ? 1 : app->configuration.coordinateFeatures;
25637#if(VKFFT_BACKEND==0)
25638 vkCmdBindPipeline(app->configuration.commandBuffer[0], VK_PIPELINE_BIND_POINT_COMPUTE, axis->pipeline);
25639 vkCmdBindDescriptorSets(app->configuration.commandBuffer[0], VK_PIPELINE_BIND_POINT_COMPUTE, axis->pipelineLayout, 0, 1, &axis->descriptorSet, 0, 0);
25640#endif
25641 uint64_t dispatchBlock[3];
25642 if (l == 0) {
25643 if (app->localFFTPlan->numAxisUploads[0] > 2) {
25644 dispatchBlock[0] = (uint64_t)ceil((uint64_t)ceil(app->localFFTPlan->actualFFTSizePerAxis[0][0] / axis->specializationConstants.fftDim / (double)axis->axisBlock[1]) / (double)app->localFFTPlan->axisSplit[0][1]) * app->localFFTPlan->axisSplit[0][1];
25645 dispatchBlock[1] = app->localFFTPlan->actualFFTSizePerAxis[0][1];
25646 }
25647 else {
25648 if (app->localFFTPlan->numAxisUploads[0] > 1) {
25649 dispatchBlock[0] = (uint64_t)ceil((uint64_t)ceil(app->localFFTPlan->actualFFTSizePerAxis[0][0] / axis->specializationConstants.fftDim / (double)axis->axisBlock[1]));
25650 dispatchBlock[1] = app->localFFTPlan->actualFFTSizePerAxis[0][1];
25651 }
25652 else {
25653 dispatchBlock[0] = app->localFFTPlan->actualFFTSizePerAxis[0][0] / axis->specializationConstants.fftDim;
25654 dispatchBlock[1] = (uint64_t)ceil(app->localFFTPlan->actualFFTSizePerAxis[0][1] / (double)axis->axisBlock[1]);
25655 }
25656 }
25657 }
25658 else {
25659 dispatchBlock[0] = (uint64_t)ceil(app->localFFTPlan->actualFFTSizePerAxis[0][0] / axis->specializationConstants.fftDim / (double)axis->axisBlock[0]);
25660 dispatchBlock[1] = app->localFFTPlan->actualFFTSizePerAxis[0][1];
25661 }
25662 dispatchBlock[2] = app->localFFTPlan->actualFFTSizePerAxis[0][2] * maxCoordinate * app->configuration.numberBatches;
25663 if (axis->specializationConstants.mergeSequencesR2C == 1) dispatchBlock[1] = (uint64_t)ceil(dispatchBlock[1] / 2.0);
25664 //if (app->configuration.performZeropadding[1]) dispatchBlock[1] = (uint64_t)ceil(dispatchBlock[1] / 2.0);
25665 //if (app->configuration.performZeropadding[2]) dispatchBlock[2] = (uint64_t)ceil(dispatchBlock[2] / 2.0);
25666 resFFT = dispatchEnhanced(app, axis, dispatchBlock);
25667 if (resFFT != VKFFT_SUCCESS) return resFFT;
25668 printDebugInformation(app, axis);
25669 resFFT = VkFFTSync(app);
25670 if (resFFT != VKFFT_SUCCESS) return resFFT;
25671 }
25672 }
25673 if (app->localFFTPlan->multiUploadR2C) {
25675 resFFT = VkFFTUpdateBufferSetR2CMultiUploadDecomposition(app, app->localFFTPlan, axis, 0, 0, 0);
25676 if (resFFT != VKFFT_SUCCESS) return resFFT;
25677 uint64_t maxCoordinate = ((app->configuration.matrixConvolution > 1) && (app->configuration.performConvolution) && (app->configuration.FFTdim == 1)) ? 1 : app->configuration.coordinateFeatures;
25678
25679#if(VKFFT_BACKEND==0)
25680 vkCmdBindPipeline(app->configuration.commandBuffer[0], VK_PIPELINE_BIND_POINT_COMPUTE, axis->pipeline);
25681 vkCmdBindDescriptorSets(app->configuration.commandBuffer[0], VK_PIPELINE_BIND_POINT_COMPUTE, axis->pipelineLayout, 0, 1, &axis->descriptorSet, 0, 0);
25682#endif
25683 uint64_t dispatchBlock[3];
25684
25685 dispatchBlock[0] = (uint64_t)ceil(((app->configuration.size[0] / 2) * app->configuration.size[1] * app->configuration.size[2]) / (double)(2 * axis->axisBlock[0]));
25686 dispatchBlock[1] = 1;
25687 dispatchBlock[2] = maxCoordinate * app->configuration.numberBatches;
25688 resFFT = dispatchEnhanced(app, axis, dispatchBlock);
25689 if (resFFT != VKFFT_SUCCESS) return resFFT;
25690 printDebugInformation(app, axis);
25691 resFFT = VkFFTSync(app);
25692 if (resFFT != VKFFT_SUCCESS) return resFFT;
25693 //app->configuration.size[0] *= 2;
25694 }
25695 }
25696 if (app->configuration.FFTdim > 1) {
25697
25698 //FFT axis 1
25699 if (!app->configuration.omitDimension[1]) {
25700 if ((app->configuration.FFTdim == 2) && (app->configuration.performConvolution)) {
25701
25702 for (int64_t l = (int64_t)app->localFFTPlan->numAxisUploads[1] - 1; l >= 0; l--) {
25703 VkFFTAxis* axis = &app->localFFTPlan->axes[1][l];
25704 resFFT = VkFFTUpdateBufferSet(app, app->localFFTPlan, axis, 1, l, 0);
25705 if (resFFT != VKFFT_SUCCESS) return resFFT;
25706 uint64_t maxCoordinate = ((app->configuration.matrixConvolution > 1) && (l == 0)) ? 1 : app->configuration.coordinateFeatures;
25707
25708#if(VKFFT_BACKEND==0)
25709 vkCmdBindPipeline(app->configuration.commandBuffer[0], VK_PIPELINE_BIND_POINT_COMPUTE, axis->pipeline);
25710 vkCmdBindDescriptorSets(app->configuration.commandBuffer[0], VK_PIPELINE_BIND_POINT_COMPUTE, axis->pipelineLayout, 0, 1, &axis->descriptorSet, 0, 0);
25711#endif
25712 uint64_t dispatchBlock[3];
25713 dispatchBlock[0] = (uint64_t)ceil(localSize0[1] / (double)axis->axisBlock[0] * app->localFFTPlan->actualFFTSizePerAxis[1][1] / (double)axis->specializationConstants.fftDim);
25714 dispatchBlock[1] = 1;
25715 dispatchBlock[2] = app->localFFTPlan->actualFFTSizePerAxis[1][2] * maxCoordinate * app->configuration.numberBatches;
25716 //if (app->configuration.mergeSequencesR2C == 1) dispatchBlock[0] = (uint64_t)ceil(dispatchBlock[0] / 2.0);
25717 //if (app->configuration.performZeropadding[2]) dispatchBlock[2] = (uint64_t)ceil(dispatchBlock[2] / 2.0);
25718 resFFT = dispatchEnhanced(app, axis, dispatchBlock);
25719 if (resFFT != VKFFT_SUCCESS) return resFFT;
25720 printDebugInformation(app, axis);
25721 resFFT = VkFFTSync(app);
25722 if (resFFT != VKFFT_SUCCESS) return resFFT;
25723 }
25724 }
25725 else {
25726
25727 for (int64_t l = (int64_t)app->localFFTPlan->numAxisUploads[1] - 1; l >= 0; l--) {
25728 VkFFTAxis* axis = &app->localFFTPlan->axes[1][l];
25729 resFFT = VkFFTUpdateBufferSet(app, app->localFFTPlan, axis, 1, l, 0);
25730 if (resFFT != VKFFT_SUCCESS) return resFFT;
25731
25732#if(VKFFT_BACKEND==0)
25733 vkCmdBindPipeline(app->configuration.commandBuffer[0], VK_PIPELINE_BIND_POINT_COMPUTE, axis->pipeline);
25734 vkCmdBindDescriptorSets(app->configuration.commandBuffer[0], VK_PIPELINE_BIND_POINT_COMPUTE, axis->pipelineLayout, 0, 1, &axis->descriptorSet, 0, 0);
25735#endif
25736 uint64_t dispatchBlock[3];
25737
25738 dispatchBlock[0] = (uint64_t)ceil(localSize0[1] / (double)axis->axisBlock[0] * app->localFFTPlan->actualFFTSizePerAxis[1][1] / (double)axis->specializationConstants.fftDim);
25739 dispatchBlock[1] = 1;
25741 //if (app->configuration.mergeSequencesR2C == 1) dispatchBlock[0] = (uint64_t)ceil(dispatchBlock[0] / 2.0);
25742 //if (app->configuration.performZeropadding[2]) dispatchBlock[2] = (uint64_t)ceil(dispatchBlock[2] / 2.0);
25743 resFFT = dispatchEnhanced(app, axis, dispatchBlock);
25744 if (resFFT != VKFFT_SUCCESS) return resFFT;
25745 printDebugInformation(app, axis);
25746
25747 resFFT = VkFFTSync(app);
25748 if (resFFT != VKFFT_SUCCESS) return resFFT;
25749 }
25750 if (app->useBluesteinFFT[1] && (app->localFFTPlan->numAxisUploads[1] > 1)) {
25751 for (int64_t l = 1; l < (int64_t)app->localFFTPlan->numAxisUploads[1]; l++) {
25752 VkFFTAxis* axis = &app->localFFTPlan->inverseBluesteinAxes[1][l];
25753 resFFT = VkFFTUpdateBufferSet(app, app->localFFTPlan, axis, 1, l, 1);
25754 if (resFFT != VKFFT_SUCCESS) return resFFT;
25755#if(VKFFT_BACKEND==0)
25756 vkCmdBindPipeline(app->configuration.commandBuffer[0], VK_PIPELINE_BIND_POINT_COMPUTE, axis->pipeline);
25757 vkCmdBindDescriptorSets(app->configuration.commandBuffer[0], VK_PIPELINE_BIND_POINT_COMPUTE, axis->pipelineLayout, 0, 1, &axis->descriptorSet, 0, 0);
25758#endif
25759 uint64_t dispatchBlock[3];
25760 dispatchBlock[0] = (uint64_t)ceil(localSize0[1] / (double)axis->axisBlock[0] * app->localFFTPlan->actualFFTSizePerAxis[1][1] / (double)axis->specializationConstants.fftDim);
25761 dispatchBlock[1] = 1;
25763
25764 //if (app->configuration.performZeropadding[1]) dispatchBlock[1] = (uint64_t)ceil(dispatchBlock[1] / 2.0);
25765 //if (app->configuration.performZeropadding[2]) dispatchBlock[2] = (uint64_t)ceil(dispatchBlock[2] / 2.0);
25766 resFFT = dispatchEnhanced(app, axis, dispatchBlock);
25767 if (resFFT != VKFFT_SUCCESS) return resFFT;
25768 printDebugInformation(app, axis);
25769 resFFT = VkFFTSync(app);
25770 if (resFFT != VKFFT_SUCCESS) return resFFT;
25771 }
25772 }
25773 }
25774 }
25775 }
25776 //FFT axis 2
25777 if (app->configuration.FFTdim > 2) {
25778 if (!app->configuration.omitDimension[2]) {
25779 if ((app->configuration.FFTdim == 3) && (app->configuration.performConvolution)) {
25780
25781 for (int64_t l = (int64_t)app->localFFTPlan->numAxisUploads[2] - 1; l >= 0; l--) {
25782
25783 VkFFTAxis* axis = &app->localFFTPlan->axes[2][l];
25784 resFFT = VkFFTUpdateBufferSet(app, app->localFFTPlan, axis, 2, l, 0);
25785 if (resFFT != VKFFT_SUCCESS) return resFFT;
25786 uint64_t maxCoordinate = ((app->configuration.matrixConvolution > 1) && (l == 0)) ? 1 : app->configuration.coordinateFeatures;
25787#if(VKFFT_BACKEND==0)
25788 vkCmdBindPipeline(app->configuration.commandBuffer[0], VK_PIPELINE_BIND_POINT_COMPUTE, axis->pipeline);
25789 vkCmdBindDescriptorSets(app->configuration.commandBuffer[0], VK_PIPELINE_BIND_POINT_COMPUTE, axis->pipelineLayout, 0, 1, &axis->descriptorSet, 0, 0);
25790#endif
25791 uint64_t dispatchBlock[3];
25792 dispatchBlock[0] = (uint64_t)ceil(localSize0[2] / (double)axis->axisBlock[0] * app->localFFTPlan->actualFFTSizePerAxis[2][2] / (double)axis->specializationConstants.fftDim);
25793 dispatchBlock[1] = 1;
25794 dispatchBlock[2] = app->localFFTPlan->actualFFTSizePerAxis[2][1] * maxCoordinate * app->configuration.numberBatches;
25795 //if (app->configuration.mergeSequencesR2C == 1) dispatchBlock[0] = (uint64_t)ceil(dispatchBlock[0] / 2.0);
25796 resFFT = dispatchEnhanced(app, axis, dispatchBlock);
25797 if (resFFT != VKFFT_SUCCESS) return resFFT;
25798 printDebugInformation(app, axis);
25799 resFFT = VkFFTSync(app);
25800 if (resFFT != VKFFT_SUCCESS) return resFFT;
25801 }
25802 }
25803 else {
25804
25805 for (int64_t l = (int64_t)app->localFFTPlan->numAxisUploads[2] - 1; l >= 0; l--) {
25806 VkFFTAxis* axis = &app->localFFTPlan->axes[2][l];
25807 resFFT = VkFFTUpdateBufferSet(app, app->localFFTPlan, axis, 2, l, 0);
25808 if (resFFT != VKFFT_SUCCESS) return resFFT;
25809#if(VKFFT_BACKEND==0)
25810 vkCmdBindPipeline(app->configuration.commandBuffer[0], VK_PIPELINE_BIND_POINT_COMPUTE, axis->pipeline);
25811 vkCmdBindDescriptorSets(app->configuration.commandBuffer[0], VK_PIPELINE_BIND_POINT_COMPUTE, axis->pipelineLayout, 0, 1, &axis->descriptorSet, 0, 0);
25812#endif
25813 uint64_t dispatchBlock[3];
25814 dispatchBlock[0] = (uint64_t)ceil(localSize0[2] / (double)axis->axisBlock[0] * app->localFFTPlan->actualFFTSizePerAxis[2][2] / (double)axis->specializationConstants.fftDim);
25815 dispatchBlock[1] = 1;
25817 //if (app->configuration.mergeSequencesR2C == 1) dispatchBlock[0] = (uint64_t)ceil(dispatchBlock[0] / 2.0);
25818 resFFT = dispatchEnhanced(app, axis, dispatchBlock);
25819 if (resFFT != VKFFT_SUCCESS) return resFFT;
25820 printDebugInformation(app, axis);
25821 resFFT = VkFFTSync(app);
25822 if (resFFT != VKFFT_SUCCESS) return resFFT;
25823 }
25824 if (app->useBluesteinFFT[2] && (app->localFFTPlan->numAxisUploads[2] > 1)) {
25825 for (int64_t l = 1; l < (int64_t)app->localFFTPlan->numAxisUploads[2]; l++) {
25826 VkFFTAxis* axis = &app->localFFTPlan->inverseBluesteinAxes[2][l];
25827 resFFT = VkFFTUpdateBufferSet(app, app->localFFTPlan, axis, 2, l, 1);
25828 if (resFFT != VKFFT_SUCCESS) return resFFT;
25829#if(VKFFT_BACKEND==0)
25830 vkCmdBindPipeline(app->configuration.commandBuffer[0], VK_PIPELINE_BIND_POINT_COMPUTE, axis->pipeline);
25831 vkCmdBindDescriptorSets(app->configuration.commandBuffer[0], VK_PIPELINE_BIND_POINT_COMPUTE, axis->pipelineLayout, 0, 1, &axis->descriptorSet, 0, 0);
25832#endif
25833 uint64_t dispatchBlock[3];
25834 dispatchBlock[0] = (uint64_t)ceil(localSize0[2] / (double)axis->axisBlock[0] * app->localFFTPlan->actualFFTSizePerAxis[2][2] / (double)axis->specializationConstants.fftDim);
25835 dispatchBlock[1] = 1;
25837
25838 //if (app->configuration.performZeropadding[1]) dispatchBlock[1] = (uint64_t)ceil(dispatchBlock[1] / 2.0);
25839 //if (app->configuration.performZeropadding[2]) dispatchBlock[2] = (uint64_t)ceil(dispatchBlock[2] / 2.0);
25840 resFFT = dispatchEnhanced(app, axis, dispatchBlock);
25841 if (resFFT != VKFFT_SUCCESS) return resFFT;
25842 printDebugInformation(app, axis);
25843 resFFT = VkFFTSync(app);
25844 if (resFFT != VKFFT_SUCCESS) return resFFT;
25845 }
25846 }
25847 }
25848 }
25849 }
25850 }
25852 if (app->configuration.FFTdim > 2) {
25853
25854 //multiple upload ifft leftovers
25855 if (app->configuration.FFTdim == 3) {
25856
25857 for (int64_t l = (int64_t)1; l < (int64_t)app->localFFTPlan_inverse->numAxisUploads[2]; l++) {
25858 VkFFTAxis* axis = &app->localFFTPlan_inverse->axes[2][l];
25859 resFFT = VkFFTUpdateBufferSet(app, app->localFFTPlan_inverse, axis, 2, l, 1);
25860 if (resFFT != VKFFT_SUCCESS) return resFFT;
25861
25862#if(VKFFT_BACKEND==0)
25863 vkCmdBindPipeline(app->configuration.commandBuffer[0], VK_PIPELINE_BIND_POINT_COMPUTE, axis->pipeline);
25864 vkCmdBindDescriptorSets(app->configuration.commandBuffer[0], VK_PIPELINE_BIND_POINT_COMPUTE, axis->pipelineLayout, 0, 1, &axis->descriptorSet, 0, 0);
25865#endif
25866 uint64_t dispatchBlock[3];
25867 dispatchBlock[0] = (uint64_t)ceil(localSize0[2] / (double)axis->axisBlock[0] * app->localFFTPlan_inverse->actualFFTSizePerAxis[2][2] / (double)axis->specializationConstants.fftDim);
25868 dispatchBlock[1] = 1;
25870 //if (app->configuration.mergeSequencesR2C == 1) dispatchBlock[0] = (uint64_t)ceil(dispatchBlock[0] / 2.0);
25871 resFFT = dispatchEnhanced(app, axis, dispatchBlock);
25872 if (resFFT != VKFFT_SUCCESS) return resFFT;
25873 printDebugInformation(app, axis);
25874 resFFT = VkFFTSync(app);
25875 if (resFFT != VKFFT_SUCCESS) return resFFT;
25876 }
25877 }
25878
25879 for (int64_t l = 0; l < (int64_t)app->localFFTPlan_inverse->numAxisUploads[1]; l++) {
25880 VkFFTAxis* axis = &app->localFFTPlan_inverse->axes[1][l];
25881 resFFT = VkFFTUpdateBufferSet(app, app->localFFTPlan_inverse, axis, 1, l, 1);
25882 if (resFFT != VKFFT_SUCCESS) return resFFT;
25883
25884#if(VKFFT_BACKEND==0)
25885 vkCmdBindPipeline(app->configuration.commandBuffer[0], VK_PIPELINE_BIND_POINT_COMPUTE, axis->pipeline);
25886 vkCmdBindDescriptorSets(app->configuration.commandBuffer[0], VK_PIPELINE_BIND_POINT_COMPUTE, axis->pipelineLayout, 0, 1, &axis->descriptorSet, 0, 0);
25887#endif
25888 uint64_t dispatchBlock[3];
25889 dispatchBlock[0] = (uint64_t)ceil(localSize0[2] / (double)axis->axisBlock[0] * app->localFFTPlan_inverse->actualFFTSizePerAxis[1][1] / (double)axis->specializationConstants.fftDim);
25890 dispatchBlock[1] = 1;
25892 //if (app->configuration.mergeSequencesR2C == 1) dispatchBlock[0] = (uint64_t)ceil(dispatchBlock[0] / 2.0);
25893 //if (app->configuration.performZeropadding[2]) dispatchBlock[2] = (uint64_t)ceil(dispatchBlock[2] / 2.0);
25894 resFFT = dispatchEnhanced(app, axis, dispatchBlock);
25895 if (resFFT != VKFFT_SUCCESS) return resFFT;
25896 printDebugInformation(app, axis);
25897 resFFT = VkFFTSync(app);
25898 if (resFFT != VKFFT_SUCCESS) return resFFT;
25899 }
25900
25901 }
25902 if (app->configuration.FFTdim > 1) {
25903 if (app->configuration.FFTdim == 2) {
25904
25905 for (int64_t l = (int64_t)1; l < (int64_t)app->localFFTPlan_inverse->numAxisUploads[1]; l++) {
25906 VkFFTAxis* axis = &app->localFFTPlan_inverse->axes[1][l];
25907 resFFT = VkFFTUpdateBufferSet(app, app->localFFTPlan_inverse, axis, 1, l, 1);
25908 if (resFFT != VKFFT_SUCCESS) return resFFT;
25909
25910#if(VKFFT_BACKEND==0)
25911 vkCmdBindPipeline(app->configuration.commandBuffer[0], VK_PIPELINE_BIND_POINT_COMPUTE, axis->pipeline);
25912 vkCmdBindDescriptorSets(app->configuration.commandBuffer[0], VK_PIPELINE_BIND_POINT_COMPUTE, axis->pipelineLayout, 0, 1, &axis->descriptorSet, 0, 0);
25913#endif
25914 uint64_t dispatchBlock[3];
25915 dispatchBlock[0] = (uint64_t)ceil(localSize0[1] / (double)axis->axisBlock[0] * app->localFFTPlan_inverse->actualFFTSizePerAxis[1][1] / (double)axis->specializationConstants.fftDim);
25916 dispatchBlock[1] = 1;
25918 //if (app->configuration.mergeSequencesR2C == 1) dispatchBlock[0] = (uint64_t)ceil(dispatchBlock[0] / 2.0);
25919 //if (app->configuration.performZeropadding[2]) dispatchBlock[2] = (uint64_t)ceil(dispatchBlock[2] / 2.0);
25920 resFFT = dispatchEnhanced(app, axis, dispatchBlock);
25921 if (resFFT != VKFFT_SUCCESS) return resFFT;
25922 printDebugInformation(app, axis);
25923 resFFT = VkFFTSync(app);
25924 if (resFFT != VKFFT_SUCCESS) return resFFT;
25925 }
25926 }
25927 for (int64_t l = 0; l < (int64_t)app->localFFTPlan_inverse->numAxisUploads[0]; l++) {
25928 VkFFTAxis* axis = &app->localFFTPlan_inverse->axes[0][l];
25929 resFFT = VkFFTUpdateBufferSet(app, app->localFFTPlan_inverse, axis, 0, l, 1);
25930 if (resFFT != VKFFT_SUCCESS) return resFFT;
25931
25932#if(VKFFT_BACKEND==0)
25933 vkCmdBindPipeline(app->configuration.commandBuffer[0], VK_PIPELINE_BIND_POINT_COMPUTE, axis->pipeline);
25934 vkCmdBindDescriptorSets(app->configuration.commandBuffer[0], VK_PIPELINE_BIND_POINT_COMPUTE, axis->pipelineLayout, 0, 1, &axis->descriptorSet, 0, 0);
25935#endif
25936 uint64_t dispatchBlock[3];
25937 if (l == 0) {
25938 if (app->localFFTPlan_inverse->numAxisUploads[0] > 2) {
25939 dispatchBlock[0] = (uint64_t)ceil((uint64_t)ceil(app->localFFTPlan_inverse->actualFFTSizePerAxis[0][0] / axis->specializationConstants.fftDim / (double)axis->axisBlock[1]) / (double)app->localFFTPlan_inverse->axisSplit[0][1]) * app->localFFTPlan_inverse->axisSplit[0][1];
25940 dispatchBlock[1] = app->localFFTPlan_inverse->actualFFTSizePerAxis[0][1];
25941 }
25942 else {
25943 if (app->localFFTPlan_inverse->numAxisUploads[0] > 1) {
25944 dispatchBlock[0] = (uint64_t)ceil((uint64_t)ceil(app->localFFTPlan_inverse->actualFFTSizePerAxis[0][0] / axis->specializationConstants.fftDim / (double)axis->axisBlock[1]));
25945 dispatchBlock[1] = app->localFFTPlan_inverse->actualFFTSizePerAxis[0][1];
25946 }
25947 else {
25948 dispatchBlock[0] = app->localFFTPlan_inverse->actualFFTSizePerAxis[0][0] / axis->specializationConstants.fftDim;
25949 dispatchBlock[1] = (uint64_t)ceil(app->localFFTPlan_inverse->actualFFTSizePerAxis[0][1] / (double)axis->axisBlock[1]);
25950 }
25951 }
25952 }
25953 else {
25954 dispatchBlock[0] = (uint64_t)ceil(app->localFFTPlan_inverse->actualFFTSizePerAxis[0][0] / axis->specializationConstants.fftDim / (double)axis->axisBlock[0]);
25955 dispatchBlock[1] = app->localFFTPlan_inverse->actualFFTSizePerAxis[0][1];
25956 }
25958 if (axis->specializationConstants.mergeSequencesR2C == 1) dispatchBlock[1] = (uint64_t)ceil(dispatchBlock[1] / 2.0);
25959 //if (app->configuration.performZeropadding[1]) dispatchBlock[1] = (uint64_t)ceil(dispatchBlock[1] / 2.0);
25960 //if (app->configuration.performZeropadding[2]) dispatchBlock[2] = (uint64_t)ceil(dispatchBlock[2] / 2.0);
25961 resFFT = dispatchEnhanced(app, axis, dispatchBlock);
25962 if (resFFT != VKFFT_SUCCESS) return resFFT;
25963 printDebugInformation(app, axis);
25964 resFFT = VkFFTSync(app);
25965 if (resFFT != VKFFT_SUCCESS) return resFFT;
25966 }
25967 }
25968 if (app->configuration.FFTdim == 1) {
25969 for (int64_t l = (int64_t)1; l < (int64_t)app->localFFTPlan_inverse->numAxisUploads[0]; l++) {
25970 VkFFTAxis* axis = &app->localFFTPlan_inverse->axes[0][l];
25971 resFFT = VkFFTUpdateBufferSet(app, app->localFFTPlan_inverse, axis, 0, l, 1);
25972 if (resFFT != VKFFT_SUCCESS) return resFFT;
25973
25974#if(VKFFT_BACKEND==0)
25975 vkCmdBindPipeline(app->configuration.commandBuffer[0], VK_PIPELINE_BIND_POINT_COMPUTE, axis->pipeline);
25976 vkCmdBindDescriptorSets(app->configuration.commandBuffer[0], VK_PIPELINE_BIND_POINT_COMPUTE, axis->pipelineLayout, 0, 1, &axis->descriptorSet, 0, 0);
25977#endif
25978 uint64_t dispatchBlock[3];
25979 dispatchBlock[0] = (uint64_t)ceil(app->localFFTPlan_inverse->actualFFTSizePerAxis[0][0] / (double)axis->axisBlock[0] * app->localFFTPlan_inverse->actualFFTSizePerAxis[0][1] / (double)axis->specializationConstants.fftDim);
25980 dispatchBlock[1] = 1;
25982 //if (app->configuration.mergeSequencesR2C == 1) dispatchBlock[0] = (uint64_t)ceil(dispatchBlock[0] / 2.0);
25983 //if (app->configuration.performZeropadding[2]) dispatchBlock[2] = (uint64_t)ceil(dispatchBlock[2] / 2.0);
25984 resFFT = dispatchEnhanced(app, axis, dispatchBlock);
25985 if (resFFT != VKFFT_SUCCESS) return resFFT;
25986 printDebugInformation(app, axis);
25987 resFFT = VkFFTSync(app);
25988 if (resFFT != VKFFT_SUCCESS) return resFFT;
25989 }
25990 }
25991 }
25992
25993 if (inverse == 1) {
25994 //we start from axis 2 and go back to axis 0
25995 //FFT axis 2
25996 if (app->configuration.FFTdim > 2) {
25997 if (!app->configuration.omitDimension[2]) {
25998 for (int64_t l = (int64_t)app->localFFTPlan_inverse->numAxisUploads[2] - 1; l >= 0; l--) {
25999 if ((!app->configuration.reorderFourStep) && (!app->useBluesteinFFT[2])) l = app->localFFTPlan_inverse->numAxisUploads[2] - 1 - l;
26000 VkFFTAxis* axis = &app->localFFTPlan_inverse->axes[2][l];
26001 resFFT = VkFFTUpdateBufferSet(app, app->localFFTPlan_inverse, axis, 2, l, 1);
26002 if (resFFT != VKFFT_SUCCESS) return resFFT;
26003
26004#if(VKFFT_BACKEND==0)
26005 vkCmdBindPipeline(app->configuration.commandBuffer[0], VK_PIPELINE_BIND_POINT_COMPUTE, axis->pipeline);
26006 vkCmdBindDescriptorSets(app->configuration.commandBuffer[0], VK_PIPELINE_BIND_POINT_COMPUTE, axis->pipelineLayout, 0, 1, &axis->descriptorSet, 0, 0);
26007#endif
26008 uint64_t dispatchBlock[3];
26009 dispatchBlock[0] = (uint64_t)ceil(localSize0[2] / (double)axis->axisBlock[0] * app->localFFTPlan_inverse->actualFFTSizePerAxis[2][2] / (double)axis->specializationConstants.fftDim);
26010 dispatchBlock[1] = 1;
26012 //if (app->configuration.performZeropaddingInverse[0]) dispatchBlock[0] = (uint64_t)ceil(dispatchBlock[0] / 2.0);
26013 //if (app->configuration.performZeropaddingInverse[1]) dispatchBlock[1] = (uint64_t)ceil(dispatchBlock[1] / 2.0);
26014
26015 //if (app->configuration.mergeSequencesR2C == 1) dispatchBlock[0] = (uint64_t)ceil(dispatchBlock[0] / 2.0);
26016 resFFT = dispatchEnhanced(app, axis, dispatchBlock);
26017 if (resFFT != VKFFT_SUCCESS) return resFFT;
26018 printDebugInformation(app, axis);
26019 resFFT = VkFFTSync(app);
26020 if (resFFT != VKFFT_SUCCESS) return resFFT;
26021 if ((!app->configuration.reorderFourStep) && (!app->useBluesteinFFT[2])) l = app->localFFTPlan_inverse->numAxisUploads[2] - 1 - l;
26022 }
26023 }
26024 }
26025 if (app->configuration.FFTdim > 1) {
26026
26027 //FFT axis 1
26028 if (!app->configuration.omitDimension[1]) {
26029 for (int64_t l = (int64_t)app->localFFTPlan_inverse->numAxisUploads[1] - 1; l >= 0; l--) {
26030 if ((!app->configuration.reorderFourStep) && (!app->useBluesteinFFT[1])) l = app->localFFTPlan_inverse->numAxisUploads[1] - 1 - l;
26031 VkFFTAxis* axis = &app->localFFTPlan_inverse->axes[1][l];
26032 resFFT = VkFFTUpdateBufferSet(app, app->localFFTPlan_inverse, axis, 1, l, 1);
26033 if (resFFT != VKFFT_SUCCESS) return resFFT;
26034
26035#if(VKFFT_BACKEND==0)
26036 vkCmdBindPipeline(app->configuration.commandBuffer[0], VK_PIPELINE_BIND_POINT_COMPUTE, axis->pipeline);
26037 vkCmdBindDescriptorSets(app->configuration.commandBuffer[0], VK_PIPELINE_BIND_POINT_COMPUTE, axis->pipelineLayout, 0, 1, &axis->descriptorSet, 0, 0);
26038#endif
26039 uint64_t dispatchBlock[3];
26040 dispatchBlock[0] = (uint64_t)ceil(localSize0[1] / (double)axis->axisBlock[0] * app->localFFTPlan_inverse->actualFFTSizePerAxis[1][1] / (double)axis->specializationConstants.fftDim);
26041 dispatchBlock[1] = 1;
26043 //if (app->configuration.mergeSequencesR2C == 1) dispatchBlock[0] = (uint64_t)ceil(dispatchBlock[0] / 2.0);
26044 //if (app->configuration.performZeropadding[2]) dispatchBlock[2] = (uint64_t)ceil(dispatchBlock[2] / 2.0);
26045 //if (app->configuration.performZeropaddingInverse[0]) dispatchBlock[0] = (uint64_t)ceil(dispatchBlock[0] / 2.0);
26046
26047 resFFT = dispatchEnhanced(app, axis, dispatchBlock);
26048 if (resFFT != VKFFT_SUCCESS) return resFFT;
26049 printDebugInformation(app, axis);
26050 if ((!app->configuration.reorderFourStep) && (!app->useBluesteinFFT[1])) l = app->localFFTPlan_inverse->numAxisUploads[1] - 1 - l;
26051 resFFT = VkFFTSync(app);
26052 if (resFFT != VKFFT_SUCCESS) return resFFT;
26053 }
26054 }
26055 }
26056 if (!app->configuration.omitDimension[0]) {
26058 //app->configuration.size[0] /= 2;
26061 if (resFFT != VKFFT_SUCCESS) return resFFT;
26062
26063#if(VKFFT_BACKEND==0)
26064 vkCmdBindPipeline(app->configuration.commandBuffer[0], VK_PIPELINE_BIND_POINT_COMPUTE, axis->pipeline);
26065 vkCmdBindDescriptorSets(app->configuration.commandBuffer[0], VK_PIPELINE_BIND_POINT_COMPUTE, axis->pipelineLayout, 0, 1, &axis->descriptorSet, 0, 0);
26066#endif
26067 uint64_t dispatchBlock[3];
26068
26069 dispatchBlock[0] = (uint64_t)ceil(((app->configuration.size[0] / 2) * app->configuration.size[1] * app->configuration.size[2]) / (double)(2 * axis->axisBlock[0]));
26070 dispatchBlock[1] = 1;
26071 dispatchBlock[2] = app->configuration.coordinateFeatures * app->configuration.numberBatches;
26072 resFFT = dispatchEnhanced(app, axis, dispatchBlock);
26073 if (resFFT != VKFFT_SUCCESS) return resFFT;
26074 printDebugInformation(app, axis);
26075
26076 resFFT = VkFFTSync(app);
26077 if (resFFT != VKFFT_SUCCESS) return resFFT;
26078 }
26079 //FFT axis 0
26080 for (int64_t l = (int64_t)app->localFFTPlan_inverse->numAxisUploads[0] - 1; l >= 0; l--) {
26081 if ((!app->configuration.reorderFourStep) && (!app->useBluesteinFFT[0])) l = app->localFFTPlan_inverse->numAxisUploads[0] - 1 - l;
26082 VkFFTAxis* axis = &app->localFFTPlan_inverse->axes[0][l];
26083 resFFT = VkFFTUpdateBufferSet(app, app->localFFTPlan_inverse, axis, 0, l, 1);
26084 if (resFFT != VKFFT_SUCCESS) return resFFT;
26085#if(VKFFT_BACKEND==0)
26086 vkCmdBindPipeline(app->configuration.commandBuffer[0], VK_PIPELINE_BIND_POINT_COMPUTE, axis->pipeline);
26087 vkCmdBindDescriptorSets(app->configuration.commandBuffer[0], VK_PIPELINE_BIND_POINT_COMPUTE, axis->pipelineLayout, 0, 1, &axis->descriptorSet, 0, 0);
26088#endif
26089 uint64_t dispatchBlock[3];
26090 if (l == 0) {
26091 if (app->localFFTPlan_inverse->numAxisUploads[0] > 2) {
26092 dispatchBlock[0] = (uint64_t)ceil((uint64_t)ceil(app->localFFTPlan_inverse->actualFFTSizePerAxis[0][0] / axis->specializationConstants.fftDim / (double)axis->axisBlock[1]) / (double)app->localFFTPlan_inverse->axisSplit[0][1]) * app->localFFTPlan_inverse->axisSplit[0][1];
26093 dispatchBlock[1] = app->localFFTPlan_inverse->actualFFTSizePerAxis[0][1];
26094 }
26095 else {
26096 if (app->localFFTPlan_inverse->numAxisUploads[0] > 1) {
26097 dispatchBlock[0] = (uint64_t)ceil((uint64_t)ceil(app->localFFTPlan_inverse->actualFFTSizePerAxis[0][0] / axis->specializationConstants.fftDim / (double)axis->axisBlock[1]));
26098 dispatchBlock[1] = app->localFFTPlan_inverse->actualFFTSizePerAxis[0][1];
26099 }
26100 else {
26101 dispatchBlock[0] = app->localFFTPlan_inverse->actualFFTSizePerAxis[0][0] / axis->specializationConstants.fftDim;
26102 dispatchBlock[1] = (uint64_t)ceil(app->localFFTPlan_inverse->actualFFTSizePerAxis[0][1] / (double)axis->axisBlock[1]);
26103 }
26104 }
26105 }
26106 else {
26107 dispatchBlock[0] = (uint64_t)ceil(app->localFFTPlan_inverse->actualFFTSizePerAxis[0][0] / axis->specializationConstants.fftDim / (double)axis->axisBlock[0]);
26108 dispatchBlock[1] = app->localFFTPlan_inverse->actualFFTSizePerAxis[0][1];
26109 }
26111 if (axis->specializationConstants.mergeSequencesR2C == 1) dispatchBlock[1] = (uint64_t)ceil(dispatchBlock[1] / 2.0);
26112 //if (app->configuration.performZeropadding[1]) dispatchBlock[1] = (uint64_t)ceil(dispatchBlock[1] / 2.0);
26113 //if (app->configuration.performZeropadding[2]) dispatchBlock[2] = (uint64_t)ceil(dispatchBlock[2] / 2.0);
26114 resFFT = dispatchEnhanced(app, axis, dispatchBlock);
26115 if (resFFT != VKFFT_SUCCESS) return resFFT;
26116 printDebugInformation(app, axis);
26117 if ((!app->configuration.reorderFourStep) && (!app->useBluesteinFFT[0])) l = app->localFFTPlan_inverse->numAxisUploads[0] - 1 - l;
26118 resFFT = VkFFTSync(app);
26119 if (resFFT != VKFFT_SUCCESS) return resFFT;
26120 }
26121 if (app->useBluesteinFFT[0] && (app->localFFTPlan_inverse->numAxisUploads[0] > 1)) {
26122 for (int64_t l = 1; l < (int64_t)app->localFFTPlan_inverse->numAxisUploads[0]; l++) {
26124 resFFT = VkFFTUpdateBufferSet(app, app->localFFTPlan_inverse, axis, 0, l, 1);
26125 if (resFFT != VKFFT_SUCCESS) return resFFT;
26126
26127#if(VKFFT_BACKEND==0)
26128 vkCmdBindPipeline(app->configuration.commandBuffer[0], VK_PIPELINE_BIND_POINT_COMPUTE, axis->pipeline);
26129 vkCmdBindDescriptorSets(app->configuration.commandBuffer[0], VK_PIPELINE_BIND_POINT_COMPUTE, axis->pipelineLayout, 0, 1, &axis->descriptorSet, 0, 0);
26130#endif
26131 uint64_t dispatchBlock[3];
26132 if (l == 0) {
26133 if (app->localFFTPlan_inverse->numAxisUploads[0] > 2) {
26134 dispatchBlock[0] = (uint64_t)ceil((uint64_t)ceil(app->localFFTPlan_inverse->actualFFTSizePerAxis[0][0] / axis->specializationConstants.fftDim / (double)axis->axisBlock[1]) / (double)app->localFFTPlan_inverse->axisSplit[0][1]) * app->localFFTPlan_inverse->axisSplit[0][1];
26135 dispatchBlock[1] = app->localFFTPlan_inverse->actualFFTSizePerAxis[0][1];
26136 }
26137 else {
26138 if (app->localFFTPlan_inverse->numAxisUploads[0] > 1) {
26139 dispatchBlock[0] = (uint64_t)ceil((uint64_t)ceil(app->localFFTPlan_inverse->actualFFTSizePerAxis[0][0] / axis->specializationConstants.fftDim / (double)axis->axisBlock[1]));
26140 dispatchBlock[1] = app->localFFTPlan_inverse->actualFFTSizePerAxis[0][1];
26141 }
26142 else {
26143 dispatchBlock[0] = app->localFFTPlan_inverse->actualFFTSizePerAxis[0][0] / axis->specializationConstants.fftDim;
26144 dispatchBlock[1] = (uint64_t)ceil(app->localFFTPlan_inverse->actualFFTSizePerAxis[0][1] / (double)axis->axisBlock[1]);
26145 }
26146 }
26147 }
26148 else {
26149 dispatchBlock[0] = (uint64_t)ceil(app->localFFTPlan_inverse->actualFFTSizePerAxis[0][0] / axis->specializationConstants.fftDim / (double)axis->axisBlock[0]);
26150 dispatchBlock[1] = app->localFFTPlan_inverse->actualFFTSizePerAxis[0][1];
26151 }
26153 if (axis->specializationConstants.mergeSequencesR2C == 1) dispatchBlock[1] = (uint64_t)ceil(dispatchBlock[1] / 2.0);
26154 //if (app->configuration.performZeropadding[1]) dispatchBlock[1] = (uint64_t)ceil(dispatchBlock[1] / 2.0);
26155 //if (app->configuration.performZeropadding[2]) dispatchBlock[2] = (uint64_t)ceil(dispatchBlock[2] / 2.0);
26156 resFFT = dispatchEnhanced(app, axis, dispatchBlock);
26157 if (resFFT != VKFFT_SUCCESS) return resFFT;
26158 printDebugInformation(app, axis);
26159 resFFT = VkFFTSync(app);
26160 if (resFFT != VKFFT_SUCCESS) return resFFT;
26161 }
26162 }
26163 }
26164 //if (app->localFFTPlan_inverse->multiUploadR2C) app->configuration.size[0] *= 2;
26165
26166 }
26167 return resFFT;
26168}
/**
 * Report the version number of this VkFFT header.
 *
 * The version is packed into a single integer as decimal digits in
 * X.XX.XX form (one leading field followed by two two-digit fields),
 * so the value 10214 reads as 1.02.14.
 *
 * Declared with an explicit (void) parameter list so that the
 * declaration is a real prototype in C; an empty "()" would leave the
 * parameters unspecified and let mistaken calls with arguments compile.
 *
 * @return The packed version number.
 */
static inline int VkFFTGetVersion(void) {
	return 10214; //X.XX.XX format
}
26172#endif
static VkFFTResult VkMovReal(VkFFTSpecializationConstantsLayout *sc, const char *out, const char *in)
Definition VkFFT_Base.h:119
static VkFFTResult VkMulComplexNumber(VkFFTSpecializationConstantsLayout *sc, const char *out, const char *in_1, const char *in_num)
Definition VkFFT_Base.h:245
static int VkFFTGetVersion()
static VkFFTResult VkMovComplex(VkFFTSpecializationConstantsLayout *sc, const char *out, const char *in)
Definition VkFFT_Base.h:111
static VkFFTResult appendPreparationBatchedKernelConvolution(VkFFTSpecializationConstantsLayout *sc, const char *floatType, const char *floatTypeMemory, const char *uintType, uint64_t dataType)
static VkFFTResult shaderGenVkFFT_R2C_decomposition(char *output, VkFFTSpecializationConstantsLayout *sc, const char *floatType, const char *floatTypeInputMemory, const char *floatTypeOutputMemory, const char *floatTypeKernelMemory, const char *uintType, uint64_t type)
static VkFFTResult transferDataFromCPU(VkFFTApplication *app, void *arr, VkBuffer *buffer, VkDeviceSize bufferSize)
static VkFFTResult appendReorder4StepRead(VkFFTSpecializationConstantsLayout *sc, const char *floatType, const char *uintType, uint64_t reorderType)
static VkFFTResult appendZeropadEndReadWriteStage(VkFFTSpecializationConstantsLayout *sc)
static VkFFTResult appendRadixShuffleNonStrided(VkFFTSpecializationConstantsLayout *sc, const char *floatType, const char *uintType, uint64_t stageSize, uint64_t stageSizeSum, double stageAngle, uint64_t stageRadix, uint64_t stageRadixNext)
static VkFFTResult appendLayoutVkFFT(VkFFTSpecializationConstantsLayout *sc)
Definition VkFFT_Base.h:420
static VkFFTResult VkSharedLoad(VkFFTSpecializationConstantsLayout *sc, const char *out, const char *id)
Definition VkFFT_Base.h:135
static VkFFTResult appendVersion(VkFFTSpecializationConstantsLayout *sc)
Definition VkFFT_Base.h:373
static VkFFTResult VkFFTSync(VkFFTApplication *app)
static VkFFTResult appendKernelLayoutVkFFT(VkFFTSpecializationConstantsLayout *sc, uint64_t id, const char *floatTypeMemory)
Definition VkFFT_Base.h:905
static VkFFTResult VkSubReal(VkFFTSpecializationConstantsLayout *sc, const char *out, const char *in_1, const char *in_2)
Definition VkFFT_Base.h:178
static VkFFTResult appendBoostThreadDataReorder(VkFFTSpecializationConstantsLayout *sc, const char *floatType, const char *uintType, uint64_t shuffleType, uint64_t start)
static VkFFTResult appendSharedMemoryVkFFT(VkFFTSpecializationConstantsLayout *sc, const char *floatType, const char *uintType, uint64_t sharedType)
static VkFFTResult dispatchEnhanced(VkFFTApplication *app, VkFFTAxis *axis, uint64_t *dispatchBlock)
static VkFFTResult VkFFTUpdateBufferSetR2CMultiUploadDecomposition(VkFFTApplication *app, VkFFTPlan *FFTPlan, VkFFTAxis *axis, uint64_t axis_id, uint64_t axis_upload_id, uint64_t inverse)
static VkFFTResult appendExtensions(VkFFTSpecializationConstantsLayout *sc, const char *floatType, const char *floatTypeInputMemory, const char *floatTypeOutputMemory, const char *floatTypeKernelMemory)
Definition VkFFT_Base.h:382
static VkFFTResult VkFFTGetRegistersPerThread(uint64_t *loc_multipliers, uint64_t *registers_per_thread_per_radix, uint64_t *registers_per_thread, uint64_t *min_registers_per_thread, uint64_t *isGoodSequence)
static VkFFTResult VkDivComplexNumber(VkFFTSpecializationConstantsLayout *sc, const char *out, const char *in_1, const char *in_num)
Definition VkFFT_Base.h:275
static VkFFTResult appendWriteDataVkFFT(VkFFTSpecializationConstantsLayout *sc, const char *floatType, const char *floatTypeMemory, const char *uintType, uint64_t writeType)
static VkFFTResult appendLicense(VkFFTSpecializationConstantsLayout *sc)
Definition VkFFT_Base.h:83
static void freeShaderGenVkFFT(VkFFTSpecializationConstantsLayout *sc)
static VkFFTResult initializeVkFFT(VkFFTApplication *app, VkFFTConfiguration inputLaunchConfiguration)
static VkFFTResult appendKernelConvolution(VkFFTSpecializationConstantsLayout *sc, const char *floatType, const char *floatTypeMemory, const char *uintType, uint64_t dataType)
static VkFFTResult appendConstantsVkFFT(VkFFTSpecializationConstantsLayout *sc, const char *floatType, const char *uintType)
Definition VkFFT_Base.h:540
static VkFFTResult appendPushConstantsVkFFT(VkFFTSpecializationConstantsLayout *sc, const char *floatType, const char *uintType)
Definition VkFFT_Base.h:476
static VkFFTResult VkSubComplex(VkFFTSpecializationConstantsLayout *sc, const char *out, const char *in_1, const char *in_2)
Definition VkFFT_Base.h:169
static VkFFTResult appendSetSMToZero(VkFFTSpecializationConstantsLayout *sc, const char *floatType, const char *floatTypeMemory, const char *uintType, uint64_t readType)
static VkFFTResult appendCoordinateRegisterStore(VkFFTSpecializationConstantsLayout *sc, uint64_t readType)
static VkFFTResult VkFFTUpdateBufferSet(VkFFTApplication *app, VkFFTPlan *FFTPlan, VkFFTAxis *axis, uint64_t axis_id, uint64_t axis_upload_id, uint64_t inverse)
static VkFFTResult appendSinCos20(VkFFTSpecializationConstantsLayout *sc, const char *floatType, const char *uintType)
Definition VkFFT_Base.h:559
static VkFFTResult VkSharedStore(VkFFTSpecializationConstantsLayout *sc, const char *id, const char *in)
Definition VkFFT_Base.h:127
static VkFFTResult appendRadixStageNonStrided(VkFFTSpecializationConstantsLayout *sc, const char *floatType, const char *uintType, uint64_t stageSize, uint64_t stageSizeSum, double stageAngle, uint64_t stageRadix)
static void deleteAxis(VkFFTApplication *app, VkFFTAxis *axis)
static VkFFTResult VkFMAComplex(VkFFTSpecializationConstantsLayout *sc, const char *out, const char *in_1, const char *in_num, const char *in_2)
Definition VkFFT_Base.h:186
static VkFFTResult appendBluesteinLayoutVkFFT(VkFFTSpecializationConstantsLayout *sc, uint64_t id, const char *floatType)
static VkFFTResult appendOutputLayoutVkFFT(VkFFTSpecializationConstantsLayout *sc, uint64_t id, const char *floatTypeMemory, uint64_t outputType)
Definition VkFFT_Base.h:792
static VkFFTResult VkShuffleComplexInv(VkFFTSpecializationConstantsLayout *sc, const char *out, const char *in_1, const char *in_2, const char *temp)
Definition VkFFT_Base.h:315
static VkFFTResult setWriteFromRegisters(VkFFTSpecializationConstantsLayout *sc, uint64_t writeType)
static VkFFTResult VkAppendLineFromInput(VkFFTSpecializationConstantsLayout *sc, const char *in)
Definition VkFFT_Base.h:77
static VkFFTResult findMemoryType(VkFFTApplication *app, uint64_t memoryTypeBits, uint64_t memorySize, VkMemoryPropertyFlags properties, uint32_t *memoryTypeIndex)
static VkFFTResult VkMulComplexConj(VkFFTSpecializationConstantsLayout *sc, const char *out, const char *in_1, const char *in_2, const char *temp)
Definition VkFFT_Base.h:224
static VkFFTResult appendInitialization(VkFFTSpecializationConstantsLayout *sc, const char *floatType, const char *uintType, uint64_t initType)
static VkFFTResult VkShuffleComplex(VkFFTSpecializationConstantsLayout *sc, const char *out, const char *in_1, const char *in_2, const char *temp)
Definition VkFFT_Base.h:294
static VkFFTResult appendZeropadStartReadWriteStage(VkFFTSpecializationConstantsLayout *sc, uint64_t readStage)
static VkFFTResult shaderGenVkFFT(char *output, VkFFTSpecializationConstantsLayout *sc, const char *floatType, const char *floatTypeInputMemory, const char *floatTypeOutputMemory, const char *floatTypeKernelMemory, const char *uintType, uint64_t type)
static VkFFTResult VkFFTPlanAxis(VkFFTApplication *app, VkFFTPlan *FFTPlan, uint64_t axis_id, uint64_t axis_upload_id, uint64_t inverse, uint64_t reverseBluesteinMultiUpload)
static VkFFTResult VkFFTScheduler(VkFFTApplication *app, VkFFTPlan *FFTPlan, uint64_t axis_id, uint64_t supportAxis)
static VkFFTResult appendRegisterBoostShuffle(VkFFTSpecializationConstantsLayout *sc, const char *floatType, uint64_t stageSize, uint64_t stageRadixPrev, uint64_t stageRadix, double stageAngle)
static VkFFTResult VkFFTAppend(VkFFTApplication *app, int inverse, VkFFTLaunchParams *launchParams)
static VkFFTResult appendLUTLayoutVkFFT(VkFFTSpecializationConstantsLayout *sc, uint64_t id, const char *floatType)
Definition VkFFT_Base.h:979
static VkFFTResult VkAddComplexInv(VkFFTSpecializationConstantsLayout *sc, const char *out, const char *in_1, const char *in_2)
Definition VkFFT_Base.h:160
static VkFFTResult appendReorder4StepWrite(VkFFTSpecializationConstantsLayout *sc, const char *floatType, const char *uintType, uint64_t reorderType)
static void deleteVkFFT(VkFFTApplication *app)
static VkFFTResult appendReadDataVkFFT(VkFFTSpecializationConstantsLayout *sc, const char *floatType, const char *floatTypeMemory, const char *uintType, uint64_t readType)
static VkFFTResult setReadToRegisters(VkFFTSpecializationConstantsLayout *sc, uint64_t readType)
static VkFFTResult appendZeropadEnd(VkFFTSpecializationConstantsLayout *sc)
static VkFFTResult appendBarrierVkFFT(VkFFTSpecializationConstantsLayout *sc, uint64_t numTab)
Definition VkFFT_Base.h:452
static VkFFTResult appendPushConstant(VkFFTSpecializationConstantsLayout *sc, const char *type, const char *name)
Definition VkFFT_Base.h:445
static VkFFTResult appendInputLayoutVkFFT(VkFFTSpecializationConstantsLayout *sc, uint64_t id, const char *floatTypeMemory, uint64_t inputType)
Definition VkFFT_Base.h:679
static VkFFTResult VkMulComplex(VkFFTSpecializationConstantsLayout *sc, const char *out, const char *in_1, const char *in_2, const char *temp)
Definition VkFFT_Base.h:203
static VkFFTResult VkModReal(VkFFTSpecializationConstantsLayout *sc, const char *out, const char *in_1, const char *in_num)
Definition VkFFT_Base.h:336
static VkFFTResult appendRadixStage(VkFFTSpecializationConstantsLayout *sc, const char *floatType, const char *uintType, uint64_t stageSize, uint64_t stageSizeSum, double stageAngle, uint64_t stageRadix, uint64_t shuffleType)
static VkFFTResult appendRadixShuffleStrided(VkFFTSpecializationConstantsLayout *sc, const char *floatType, const char *uintType, uint64_t stageSize, uint64_t stageSizeSum, double stageAngle, uint64_t stageRadix, uint64_t stageRadixNext)
static VkFFTResult appendConversion(VkFFTSpecializationConstantsLayout *sc, const char *floatType, const char *floatTypeDifferent)
Definition VkFFT_Base.h:634
static VkFFTResult VkFFTPlanR2CMultiUploadDecomposition(VkFFTApplication *app, VkFFTPlan *FFTPlan, uint64_t inverse)
static VkFFTResult VkDivReal(VkFFTSpecializationConstantsLayout *sc, const char *out, const char *in_1, const char *in_num)
Definition VkFFT_Base.h:344
static VkFFTResult appendRadixShuffle(VkFFTSpecializationConstantsLayout *sc, const char *floatType, const char *uintType, uint64_t stageSize, uint64_t stageSizeSum, double stageAngle, uint64_t stageRadix, uint64_t stageRadixNext, uint64_t shuffleType)
static VkFFTResult appendConstant(VkFFTSpecializationConstantsLayout *sc, const char *type, const char *name, const char *defaultVal, const char *LFending)
Definition VkFFT_Base.h:432
static VkFFTResult appendZeropadStart(VkFFTSpecializationConstantsLayout *sc)
static VkFFTResult VkFMAReal(VkFFTSpecializationConstantsLayout *sc, const char *out, const char *in_1, const char *in_num, const char *in_2)
Definition VkFFT_Base.h:195
static VkFFTResult appendBluesteinMultiplication(VkFFTSpecializationConstantsLayout *sc, const char *floatType, const char *uintType, uint64_t strideType, uint64_t pre_or_post_multiplication)
static VkFFTResult indexOutputVkFFT(VkFFTSpecializationConstantsLayout *sc, const char *uintType, uint64_t outputType, const char *index_x, const char *index_y, const char *coordinate, const char *batchID)
static VkFFTResult indexInputVkFFT(VkFFTSpecializationConstantsLayout *sc, const char *uintType, uint64_t inputType, const char *index_x, const char *index_y, const char *coordinate, const char *batchID)
static VkFFTResult allocateFFTBuffer(VkFFTApplication *app, VkBuffer *buffer, VkDeviceMemory *deviceMemory, VkBufferUsageFlags usageFlags, VkMemoryPropertyFlags propertyFlags, VkDeviceSize size)
static VkFFTResult VkPermute(VkFFTSpecializationConstantsLayout *sc, const uint64_t *permute, const uint64_t num_elem, const uint64_t type, char **regIDs)
Definition VkFFT_Base.h:352
static VkFFTResult inlineRadixKernelVkFFT(VkFFTSpecializationConstantsLayout *sc, const char *floatType, const char *uintType, uint64_t radix, uint64_t stageSize, double stageAngle, char **regID)
static VkFFTResult VkAddComplex(VkFFTSpecializationConstantsLayout *sc, const char *out, const char *in_1, const char *in_2)
Definition VkFFT_Base.h:151
static VkFFTResult appendCoordinateRegisterPull(VkFFTSpecializationConstantsLayout *sc, uint64_t readType)
static VkFFTResult VkFFTCheckUpdateBufferSet(VkFFTApplication *app, VkFFTAxis *axis, uint64_t planStage, VkFFTLaunchParams *launchParams)
static VkFFTResult VkAppendLine(VkFFTSpecializationConstantsLayout *sc)
Definition VkFFT_Base.h:70
static VkFFTResult appendBluesteinConvolution(VkFFTSpecializationConstantsLayout *sc, const char *floatType, const char *floatTypeMemory, const char *uintType, uint64_t dataType)
static VkFFTResult VkAddReal(VkFFTSpecializationConstantsLayout *sc, const char *out, const char *in_1, const char *in_2)
Definition VkFFT_Base.h:143
static VkFFTResult VkMulComplexNumberImag(VkFFTSpecializationConstantsLayout *sc, const char *out, const char *in_1, const char *in_num, const char *temp)
Definition VkFFT_Base.h:254
static VkFFTResult VkMulReal(VkFFTSpecializationConstantsLayout *sc, const char *out, const char *in_1, const char *in_2)
Definition VkFFT_Base.h:285
static void printDebugInformation(VkFFTApplication *app, VkFFTAxis *axis)
static VkFFTResult VkFFTGeneratePhaseVectors(VkFFTApplication *app, VkFFTPlan *FFTPlan, uint64_t axis_id, uint64_t supportAxis)
static VkFFTResult appendRadixStageStrided(VkFFTSpecializationConstantsLayout *sc, const char *floatType, const char *uintType, uint64_t stageSize, uint64_t stageSizeSum, double stageAngle, uint64_t stageRadix)
VkFFTResult
Definition VkFFT_Defs.h:232
@ VKFFT_ERROR_EMPTY_inputBuffer
Definition VkFFT_Defs.h:255
@ VKFFT_ERROR_FAILED_TO_CREATE_PROGRAM
Definition VkFFT_Defs.h:294
@ VKFFT_ERROR_FAILED_TO_RELEASE_COMMAND_QUEUE
Definition VkFFT_Defs.h:313
@ VKFFT_ERROR_FAILED_TO_GET_FUNCTION
Definition VkFFT_Defs.h:300
@ VKFFT_ERROR_INVALID_FENCE
Definition VkFFT_Defs.h:243
@ VKFFT_ERROR_UNSUPPORTED_FFT_LENGTH
Definition VkFFT_Defs.h:261
@ VKFFT_ERROR_FAILED_TO_ALLOCATE_MEMORY
Definition VkFFT_Defs.h:289
@ VKFFT_ERROR_INVALID_PLATFORM
Definition VkFFT_Defs.h:247
@ VKFFT_ERROR_FAILED_TO_RESET_FENCES
Definition VkFFT_Defs.h:272
@ VKFFT_ERROR_UNSUPPORTED_RADIX
Definition VkFFT_Defs.h:260
@ VKFFT_ERROR_FAILED_TO_COPY
Definition VkFFT_Defs.h:293
@ VKFFT_ERROR_FAILED_SHADER_PREPROCESS
Definition VkFFT_Defs.h:277
@ VKFFT_ERROR_EMPTY_FFTdim
Definition VkFFT_Defs.h:248
@ VKFFT_ERROR_EMPTY_kernel
Definition VkFFT_Defs.h:259
@ VKFFT_ERROR_INSUFFICIENT_CODE_BUFFER
Definition VkFFT_Defs.h:235
@ VKFFT_ERROR_EMPTY_size
Definition VkFFT_Defs.h:249
@ VKFFT_ERROR_FAILED_TO_SET_KERNEL_ARG
Definition VkFFT_Defs.h:311
@ VKFFT_ERROR_FAILED_TO_GET_CODE_SIZE
Definition VkFFT_Defs.h:296
@ VKFFT_ERROR_EMPTY_inputBufferSize
Definition VkFFT_Defs.h:254
@ VKFFT_ERROR_EMPTY_tempBuffer
Definition VkFFT_Defs.h:253
@ VKFFT_ERROR_FAILED_TO_CREATE_SHADER_MODULE
Definition VkFFT_Defs.h:281
@ VKFFT_ERROR_FAILED_TO_ADD_NAME_EXPRESSION
Definition VkFFT_Defs.h:305
@ VKFFT_ERROR_EMPTY_outputBuffer
Definition VkFFT_Defs.h:257
@ VKFFT_ERROR_FAILED_TO_BEGIN_COMMAND_BUFFER
Definition VkFFT_Defs.h:268
@ VKFFT_ERROR_FAILED_TO_END_COMMAND_BUFFER
Definition VkFFT_Defs.h:269
@ VKFFT_SUCCESS
Definition VkFFT_Defs.h:233
@ VKFFT_ERROR_INSUFFICIENT_TEMP_BUFFER
Definition VkFFT_Defs.h:236
@ VKFFT_ERROR_FAILED_TO_CREATE_COMMAND_QUEUE
Definition VkFFT_Defs.h:312
@ VKFFT_ERROR_FAILED_TO_MODULE_GET_GLOBAL
Definition VkFFT_Defs.h:302
@ VKFFT_ERROR_FAILED_TO_DESTROY_PROGRAM
Definition VkFFT_Defs.h:298
@ VKFFT_ERROR_FAILED_TO_BIND_BUFFER_MEMORY
Definition VkFFT_Defs.h:290
@ VKFFT_ERROR_FAILED_TO_GET_CODE
Definition VkFFT_Defs.h:297
@ VKFFT_ERROR_PLAN_NOT_INITIALIZED
Definition VkFFT_Defs.h:237
@ VKFFT_ERROR_INVALID_COMMAND_POOL
Definition VkFFT_Defs.h:242
@ VKFFT_ERROR_FAILED_TO_WAIT_FOR_FENCES
Definition VkFFT_Defs.h:271
@ VKFFT_ERROR_FAILED_TO_CREATE_DESCRIPTOR_POOL
Definition VkFFT_Defs.h:273
@ VKFFT_ERROR_EMPTY_bufferSize
Definition VkFFT_Defs.h:250
@ VKFFT_ERROR_EMPTY_tempBufferSize
Definition VkFFT_Defs.h:252
@ VKFFT_ERROR_FAILED_TO_CREATE_BUFFER
Definition VkFFT_Defs.h:288
@ VKFFT_ERROR_INVALID_CONTEXT
Definition VkFFT_Defs.h:246
@ VKFFT_ERROR_FAILED_TO_INITIALIZE
Definition VkFFT_Defs.h:306
@ VKFFT_ERROR_FAILED_TO_LAUNCH_KERNEL
Definition VkFFT_Defs.h:303
@ VKFFT_ERROR_ONLY_INVERSE_FFT_INITIALIZED
Definition VkFFT_Defs.h:245
@ VKFFT_ERROR_EMPTY_buffer
Definition VkFFT_Defs.h:251
@ VKFFT_ERROR_INVALID_DEVICE
Definition VkFFT_Defs.h:240
@ VKFFT_ERROR_FAILED_TO_ALLOCATE_DESCRIPTOR_SETS
Definition VkFFT_Defs.h:275
@ VKFFT_ERROR_FAILED_TO_GET_ATTRIBUTE
Definition VkFFT_Defs.h:315
@ VKFFT_ERROR_FAILED_TO_CREATE_EVENT
Definition VkFFT_Defs.h:316
@ VKFFT_ERROR_FAILED_TO_SYNCHRONIZE
Definition VkFFT_Defs.h:292
@ VKFFT_ERROR_FAILED_TO_CREATE_PIPELINE_LAYOUT
Definition VkFFT_Defs.h:276
@ VKFFT_ERROR_UNSUPPORTED_FFT_OMIT
Definition VkFFT_Defs.h:264
@ VKFFT_ERROR_FAILED_TO_CREATE_PIPELINE
Definition VkFFT_Defs.h:310
@ VKFFT_ERROR_FAILED_TO_LOAD_MODULE
Definition VkFFT_Defs.h:299
@ VKFFT_ERROR_FAILED_TO_CREATE_DESCRIPTOR_SET_LAYOUT
Definition VkFFT_Defs.h:274
@ VKFFT_ERROR_UNSUPPORTED_FFT_LENGTH_DCT
Definition VkFFT_Defs.h:263
@ VKFFT_ERROR_FAILED_TO_ALLOCATE
Definition VkFFT_Defs.h:265
@ VKFFT_ERROR_EMPTY_kernelSize
Definition VkFFT_Defs.h:258
@ VKFFT_ERROR_FAILED_TO_MAP_MEMORY
Definition VkFFT_Defs.h:266
@ VKFFT_ERROR_UNSUPPORTED_FFT_LENGTH_R2C
Definition VkFFT_Defs.h:262
@ VKFFT_ERROR_FAILED_TO_SUBMIT_QUEUE
Definition VkFFT_Defs.h:270
@ VKFFT_ERROR_FAILED_TO_FIND_MEMORY
Definition VkFFT_Defs.h:291
@ VKFFT_ERROR_FAILED_SHADER_LINK
Definition VkFFT_Defs.h:279
@ VKFFT_ERROR_INVALID_QUEUE
Definition VkFFT_Defs.h:241
@ VKFFT_ERROR_MALLOC_FAILED
Definition VkFFT_Defs.h:234
@ VKFFT_ERROR_FAILED_TO_ALLOCATE_COMMAND_BUFFERS
Definition VkFFT_Defs.h:267
@ VKFFT_ERROR_ONLY_FORWARD_FFT_INITIALIZED
Definition VkFFT_Defs.h:244
@ VKFFT_ERROR_FAILED_TO_EVENT_RECORD
Definition VkFFT_Defs.h:304
@ VKFFT_ERROR_FAILED_SHADER_PARSE
Definition VkFFT_Defs.h:278
@ VKFFT_ERROR_FAILED_TO_COMPILE_PROGRAM
Definition VkFFT_Defs.h:295
@ VKFFT_ERROR_FAILED_TO_SET_DYNAMIC_SHARED_MEMORY
Definition VkFFT_Defs.h:301
@ VKFFT_ERROR_INVALID_PHYSICAL_DEVICE
Definition VkFFT_Defs.h:239
@ VKFFT_ERROR_NULL_TEMP_PASSED
Definition VkFFT_Defs.h:238
@ VKFFT_ERROR_EMPTY_outputBufferSize
Definition VkFFT_Defs.h:256
VkFFTConfiguration configuration
Definition VkFFT_Defs.h:526
VkBuffer bufferBluesteinFFT[3]
Definition VkFFT_Defs.h:540
uint64_t lastAxis
Definition VkFFT_Defs.h:532
uint64_t useBluesteinFFT[3]
Definition VkFFT_Defs.h:534
VkBuffer bufferBluestein[3]
Definition VkFFT_Defs.h:539
VkBuffer bufferBluesteinIFFT[3]
Definition VkFFT_Defs.h:541
uint64_t actualNumBatches
Definition VkFFT_Defs.h:530
VkDeviceMemory bufferBluesteinIFFTDeviceMemory[3]
Definition VkFFT_Defs.h:538
VkDeviceMemory bufferBluesteinFFTDeviceMemory[3]
Definition VkFFT_Defs.h:537
uint64_t bufferBluesteinSize[3]
Definition VkFFT_Defs.h:555
VkFFTPlan * localFFTPlan_inverse
Definition VkFFT_Defs.h:528
VkFFTPlan * localFFTPlan
Definition VkFFT_Defs.h:527
VkDeviceMemory bufferBluesteinDeviceMemory[3]
Definition VkFFT_Defs.h:536
uint64_t firstAxis
Definition VkFFT_Defs.h:531
VkDeviceMemory bufferLUTDeviceMemory
Definition VkFFT_Defs.h:477
uint64_t bufferLUTSize
Definition VkFFT_Defs.h:510
VkBuffer bufferLUT
Definition VkFFT_Defs.h:478
VkFFTSpecializationConstantsLayout specializationConstants
Definition VkFFT_Defs.h:465
uint64_t inputBufferOffset
Definition VkFFT_Defs.h:106
uint64_t printMemoryLayout
Definition VkFFT_Defs.h:143
uint64_t numberBatches
Definition VkFFT_Defs.h:115
uint64_t outputBufferOffset
Definition VkFFT_Defs.h:107
uint64_t disableMergeSequencesR2C
Definition VkFFT_Defs.h:128
uint64_t registerBoost4Step
Definition VkFFT_Defs.h:164
uint64_t sharedMemorySize
Definition VkFFT_Defs.h:176
uint64_t makeForwardPlanOnly
Definition VkFFT_Defs.h:132
uint64_t isCompilerInitialized
Definition VkFFT_Defs.h:46
VkBuffer * outputBuffer
Definition VkFFT_Defs.h:83
VkBuffer * inputBuffer
Definition VkFFT_Defs.h:82
uint64_t devicePageSize
Definition VkFFT_Defs.h:168
VkBuffer * kernel
Definition VkFFT_Defs.h:84
uint64_t fixMaxRadixBluestein
Definition VkFFT_Defs.h:118
uint64_t coordinateFeatures
Definition VkFFT_Defs.h:155
uint64_t isOutputFormatted
Definition VkFFT_Defs.h:137
VkMemoryBarrier * memory_barrier
Definition VkFFT_Defs.h:187
uint64_t maxComputeWorkGroupSize[3]
Definition VkFFT_Defs.h:173
uint64_t * bufferSize
Definition VkFFT_Defs.h:73
uint64_t doublePrecisionFloatMemory
Definition VkFFT_Defs.h:124
uint64_t makeInversePlanOnly
Definition VkFFT_Defs.h:133
uint64_t inputBufferNum
Definition VkFFT_Defs.h:68
uint64_t * inputBufferSize
Definition VkFFT_Defs.h:75
uint64_t localPageSize
Definition VkFFT_Defs.h:169
VkCommandPool * commandPool
Definition VkFFT_Defs.h:44
uint64_t numberKernels
Definition VkFFT_Defs.h:158
uint64_t halfPrecisionMemoryOnly
Definition VkFFT_Defs.h:123
uint64_t keepShaderCode
Definition VkFFT_Defs.h:142
uint64_t symmetricKernel
Definition VkFFT_Defs.h:157
VkPhysicalDevice * physicalDevice
Definition VkFFT_Defs.h:41
uint64_t swapTo3Stage4Step
Definition VkFFT_Defs.h:167
uint64_t isInputFormatted
Definition VkFFT_Defs.h:136
uint64_t matrixConvolution
Definition VkFFT_Defs.h:156
uint64_t coalescedMemory
Definition VkFFT_Defs.h:111
uint64_t size[3]
Definition VkFFT_Defs.h:38
uint64_t outputBufferNum
Definition VkFFT_Defs.h:69
uint64_t performConvolution
Definition VkFFT_Defs.h:152
uint64_t frequencyZeroPadding
Definition VkFFT_Defs.h:149
uint64_t registerBoostNonPow2
Definition VkFFT_Defs.h:163
uint64_t tempBufferNum
Definition VkFFT_Defs.h:67
uint64_t tempBufferOffset
Definition VkFFT_Defs.h:105
uint64_t considerAllAxesStrided
Definition VkFFT_Defs.h:141
uint64_t doublePrecision
Definition VkFFT_Defs.h:121
uint64_t bufferStride[3]
Definition VkFFT_Defs.h:135
uint64_t omitDimension[3]
Definition VkFFT_Defs.h:117
uint64_t sharedMemorySizePow2
Definition VkFFT_Defs.h:177
uint64_t performBandwidthBoost
Definition VkFFT_Defs.h:119
uint64_t fft_zeropad_left[3]
Definition VkFFT_Defs.h:147
uint64_t registerBoost
Definition VkFFT_Defs.h:162
uint64_t halfPrecision
Definition VkFFT_Defs.h:122
VkDeviceMemory tempBufferDeviceMemory
Definition VkFFT_Defs.h:185
uint64_t numSharedBanks
Definition VkFFT_Defs.h:113
uint64_t * tempBufferSize
Definition VkFFT_Defs.h:74
VkBuffer * tempBuffer
Definition VkFFT_Defs.h:81
VkCommandBuffer * commandBuffer
Definition VkFFT_Defs.h:186
uint64_t allocateTempBuffer
Definition VkFFT_Defs.h:180
uint64_t performZeropadding[3]
Definition VkFFT_Defs.h:146
uint64_t bufferOffset
Definition VkFFT_Defs.h:104
uint64_t sharedMemorySizeStatic
Definition VkFFT_Defs.h:175
uint64_t kernelOffset
Definition VkFFT_Defs.h:108
uint64_t inverseReturnToInputBuffer
Definition VkFFT_Defs.h:114
VkBuffer * buffer
Definition VkFFT_Defs.h:80
uint64_t maxThreadsNum
Definition VkFFT_Defs.h:174
uint64_t userTempBuffer
Definition VkFFT_Defs.h:64
uint64_t disableReorderFourStep
Definition VkFFT_Defs.h:130
uint64_t inputBufferStride[3]
Definition VkFFT_Defs.h:138
uint64_t kernelConvolution
Definition VkFFT_Defs.h:159
uint64_t crossPowerSpectrumNormalization
Definition VkFFT_Defs.h:154
uint64_t outputBufferStride[3]
Definition VkFFT_Defs.h:139
uint64_t * outputBufferSize
Definition VkFFT_Defs.h:76
uint64_t * kernelSize
Definition VkFFT_Defs.h:77
uint64_t conjugateConvolution
Definition VkFFT_Defs.h:153
uint64_t maxComputeWorkGroupCount[3]
Definition VkFFT_Defs.h:172
uint64_t fft_zeropad_right[3]
Definition VkFFT_Defs.h:148
uint64_t reorderFourStep
Definition VkFFT_Defs.h:181
VkDevice * device
Definition VkFFT_Defs.h:42
VkBuffer * outputBuffer
Definition VkFFT_Defs.h:208
VkBuffer * inputBuffer
Definition VkFFT_Defs.h:207
VkBuffer * kernel
Definition VkFFT_Defs.h:209
VkCommandBuffer * commandBuffer
Definition VkFFT_Defs.h:203
VkBuffer * buffer
Definition VkFFT_Defs.h:205
VkBuffer * tempBuffer
Definition VkFFT_Defs.h:206
VkFFTAxis axes[3][4]
Definition VkFFT_Defs.h:518
uint64_t numAxisUploads[3]
Definition VkFFT_Defs.h:516
VkFFTAxis R2Cdecomposition
Definition VkFFT_Defs.h:522
uint64_t axisSplit[3][4]
Definition VkFFT_Defs.h:517
VkFFTAxis inverseBluesteinAxes[3][4]
Definition VkFFT_Defs.h:523
uint64_t actualFFTSizePerAxis[3][3]
Definition VkFFT_Defs.h:515
uint64_t multiUploadR2C
Definition VkFFT_Defs.h:520
uint64_t actualPerformR2CPerAxis[3]
Definition VkFFT_Defs.h:521