1297    char LFending[4] = 
"";
 
 1298    if (!strcmp(floatType, 
"float")) sprintf(LFending, 
"f");
 
 1299#if(VKFFT_BACKEND==0) 
 1300    if (!strcmp(floatType, 
"float")) sprintf(vecType, 
"vec2");
 
 1301    if (!strcmp(floatType, 
"double")) sprintf(vecType, 
"dvec2");
 
 1302    char cosDef[20] = 
"cos";
 
 1303    char sinDef[20] = 
"sin";
 
 1304    if (!strcmp(floatType, 
"double")) sprintf(LFending, 
"LF");
 
 1305#elif(VKFFT_BACKEND==1) 
 1306    if (!strcmp(floatType, 
"float")) sprintf(vecType, 
"float2");
 
 1307    if (!strcmp(floatType, 
"double")) sprintf(vecType, 
"double2");
 
 1308    char cosDef[20] = 
"__cosf";
 
 1309    char sinDef[20] = 
"__sinf";
 
 1310    if (!strcmp(floatType, 
"double")) sprintf(LFending, 
"l");
 
 1311#elif(VKFFT_BACKEND==2) 
 1312    if (!strcmp(floatType, 
"float")) sprintf(vecType, 
"float2");
 
 1313    if (!strcmp(floatType, 
"double")) sprintf(vecType, 
"double2");
 
 1314    char cosDef[20] = 
"__cosf";
 
 1315    char sinDef[20] = 
"__sinf";
 
 1316    if (!strcmp(floatType, 
"double")) sprintf(LFending, 
"l");
 
 1317#elif(VKFFT_BACKEND==3) 
 1318    if (!strcmp(floatType, 
"float")) sprintf(vecType, 
"float2");
 
 1319    if (!strcmp(floatType, 
"double")) sprintf(vecType, 
"double2");
 
 1320    char cosDef[20] = 
"native_cos";
 
 1321    char sinDef[20] = 
"native_sin";
 
 1324    char* temp = sc->
temp;
 
 1330    char convolutionInverse[30] = 
"";
 
 1331    if (sc->
convolutionStep) sprintf(convolutionInverse, 
", %s inverse", uintType);
 
 1347            sc->
tempLen = sprintf(sc->
tempStr, 
"  %s = twiddleLUT[LUTId];\n", w);
 
 1357            if (!strcmp(floatType, 
"float")) {
 
 1358                sc->
tempLen = sprintf(sc->
tempStr, 
"  %s.x = %s(angle);\n", w, cosDef);
 
 1361                sc->
tempLen = sprintf(sc->
tempStr, 
"  %s.y = %s(angle);\n", w, sinDef);
 
 1365            if (!strcmp(floatType, 
"double")) {
 
 1366                sc->
tempLen = sprintf(sc->
tempStr, 
"  %s = sincos_20(angle);\n", w);
 
 1395        for (uint64_t i = 0; i < 2; i++) {
 
 1396            tf[i] = (
char*)malloc(
sizeof(
char) * 50);
 
 1398                for (uint64_t j = 0; j < i; j++) {
 
 1406        sprintf(tf[0], 
"-0.5%s", LFending);
 
 1407        sprintf(tf[1], 
"-0.8660254037844386467637231707529%s", LFending);
 
 1417            sc->
tempLen = sprintf(sc->
tempStr, 
"  %s = twiddleLUT[LUTId];\n", w);
 
 1427            if (!strcmp(floatType, 
"float")) {
 
 1428                sc->
tempLen = sprintf(sc->
tempStr, 
"  %s.x = %s(angle*%.17f%s);\n", w, cosDef, 4.0 / 3.0, LFending);
 
 1431                sc->
tempLen = sprintf(sc->
tempStr, 
"  %s.y = %s(angle*%.17f%s);\n", w, sinDef, 4.0 / 3.0, LFending);
 
 1436            if (!strcmp(floatType, 
"double")) {
 
 1437                sc->
tempLen = sprintf(sc->
tempStr, 
"  %s = sincos_20(angle*%.17f%s);\n", w, 4.0 / 3.0, LFending);
 
 1447            sc->
tempLen = sprintf(sc->
tempStr, 
"  %s = twiddleLUT[LUTId+%" PRIu64 
"];\n", w, stageSize);
 
 1457            if (!strcmp(floatType, 
"float")) {
 
 1458                sc->
tempLen = sprintf(sc->
tempStr, 
"  %s.x = %s(angle*%.17f%s);\n", w, cosDef, 2.0 / 3.0, LFending);
 
 1461                sc->
tempLen = sprintf(sc->
tempStr, 
"  %s.y = %s(angle*%.17f%s);\n", w, sinDef, 2.0 / 3.0, LFending);
 
 1466            if (!strcmp(floatType, 
"double")) {
 
 1467                sc->
tempLen = sprintf(sc->
tempStr, 
"  %s=sincos_20(angle*%.17f%s);\n", w, 2.0 / 3.0, LFending);
 
 1523        for (uint64_t i = 0; i < 2; i++) {
 
 1542            sc->
tempLen = sprintf(sc->
tempStr, 
"  %s = twiddleLUT[LUTId];\n", w);
 
 1552            if (!strcmp(floatType, 
"float")) {
 
 1553                sc->
tempLen = sprintf(sc->
tempStr, 
"  %s.x = %s(angle);\n", w, cosDef);
 
 1556                sc->
tempLen = sprintf(sc->
tempStr, 
"  %s.y = %s(angle);\n", w, sinDef);
 
 1560            if (!strcmp(floatType, 
"double")) {
 
 1561                sc->
tempLen = sprintf(sc->
tempStr, 
"  %s = sincos_20(angle);\n", w);
 
 1589            sc->
tempLen = sprintf(sc->
tempStr, 
"  %s=twiddleLUT[LUTId+%" PRIu64 
"];\n", w, stageSize);
 
 1599            if (!strcmp(floatType, 
"float")) {
 
 1600                sc->
tempLen = sprintf(sc->
tempStr, 
"  %s.x = %s(0.5%s*angle);\n", w, cosDef, LFending);
 
 1603                sc->
tempLen = sprintf(sc->
tempStr, 
"  %s.y = %s(0.5%s*angle);\n", w, sinDef, LFending);
 
 1607            if (!strcmp(floatType, 
"double")) {
 
 1608                sc->
tempLen = sprintf(sc->
tempStr, 
"  %s=normalize(%s + %s(1.0, 0.0));\n", w, w, vecType);
 
 1624        if (stageAngle < 0) {
 
 1681        for (uint64_t i = 0; i < 5; i++) {
 
 1682            tf[i] = (
char*)malloc(
sizeof(
char) * 50);
 
 1684                for (uint64_t j = 0; j < i; j++) {
 
 1691        sprintf(tf[0], 
"-0.5%s", LFending);
 
 1692        sprintf(tf[1], 
"1.538841768587626701285145288018455%s", LFending);
 
 1693        sprintf(tf[2], 
"-0.363271264002680442947733378740309%s", LFending);
 
 1694        sprintf(tf[3], 
"-0.809016994374947424102293417182819%s", LFending);
 
 1695        sprintf(tf[4], 
"-0.587785252292473129168705954639073%s", LFending);
 
 1706        for (uint64_t i = radix - 1; i > 0; i--) {
 
 1707            if (i == radix - 1) {
 
 1709                    sc->
tempLen = sprintf(sc->
tempStr, 
"  %s = twiddleLUT[LUTId];\n", w);
 
 1719                    if (!strcmp(floatType, 
"float")) {
 
 1720                        sc->
tempLen = sprintf(sc->
tempStr, 
"  %s.x = %s(angle*%.17f%s);\n", w, cosDef, 2.0 * i / radix, LFending);
 
 1723                        sc->
tempLen = sprintf(sc->
tempStr, 
"  %s.y = %s(angle*%.17f%s);\n", w, sinDef, 2.0 * i / radix, LFending);
 
 1728                    if (!strcmp(floatType, 
"double")) {
 
 1729                        sc->
tempLen = sprintf(sc->
tempStr, 
"  %s = sincos_20(angle*%.17f%s);\n", w, 2.0 * i / radix, LFending);
 
 1737                    sc->
tempLen = sprintf(sc->
tempStr, 
"  %s = twiddleLUT[LUTId+%" PRIu64 
"];\n", w, (radix - 1 - i) * stageSize);
 
 1747                    if (!strcmp(floatType, 
"float")) {
 
 1748                        sc->
tempLen = sprintf(sc->
tempStr, 
"  %s.x = %s(angle*%.17f%s);\n", w, cosDef, 2.0 * i / radix, LFending);
 
 1751                        sc->
tempLen = sprintf(sc->
tempStr, 
"  %s.y = %s(angle*%.17f%s);\n", w, sinDef, 2.0 * i / radix, LFending);
 
 1756                    if (!strcmp(floatType, 
"double")) {
 
 1757                        sc->
tempLen = sprintf(sc->
tempStr, 
"  %s = sincos_20(angle*%.17f%s);\n", w, 2.0 * i / radix, LFending);
 
 1870        for (uint64_t i = 0; i < 5; i++) {
 
 1887        for (uint64_t i = 0; i < 8; i++) {
 
 1888            tf[i] = (
char*)malloc(
sizeof(
char) * 50);
 
 1890                for (uint64_t j = 0; j < i; j++) {
 
 1897        sprintf(tf[0], 
"-1.16666666666666651863693004997913%s", LFending);
 
 1898        sprintf(tf[1], 
"0.79015646852540022404554065360571%s", LFending);
 
 1899        sprintf(tf[2], 
"0.05585426728964774240049351305970%s", LFending);
 
 1900        sprintf(tf[3], 
"0.73430220123575240531721419756650%s", LFending);
 
 1901        if (stageAngle < 0) {
 
 1902            sprintf(tf[4], 
"0.44095855184409837868031445395900%s", LFending);
 
 1903            sprintf(tf[5], 
"0.34087293062393136944265847887436%s", LFending);
 
 1904            sprintf(tf[6], 
"-0.53396936033772524066165487965918%s", LFending);
 
 1905            sprintf(tf[7], 
"0.87484229096165666561546458979137%s", LFending);
 
 1908            sprintf(tf[4], 
"-0.44095855184409837868031445395900%s", LFending);
 
 1909            sprintf(tf[5], 
"-0.34087293062393136944265847887436%s", LFending);
 
 1910            sprintf(tf[6], 
"0.53396936033772524066165487965918%s", LFending);
 
 1911            sprintf(tf[7], 
"-0.87484229096165666561546458979137%s", LFending);
 
 1920        for (uint64_t i = radix - 1; i > 0; i--) {
 
 1921            if (i == radix - 1) {
 
 1923                    sc->
tempLen = sprintf(sc->
tempStr, 
"  %s = twiddleLUT[LUTId];\n", w);
 
 1933                    if (!strcmp(floatType, 
"float")) {
 
 1934                        sc->
tempLen = sprintf(sc->
tempStr, 
"  %s.x = %s(angle*%.17f%s);\n", w, cosDef, 2.0 * i / radix, LFending);
 
 1937                        sc->
tempLen = sprintf(sc->
tempStr, 
"  %s.y = %s(angle*%.17f%s);\n", w, sinDef, 2.0 * i / radix, LFending);
 
 1942                    if (!strcmp(floatType, 
"double")) {
 
 1943                        sc->
tempLen = sprintf(sc->
tempStr, 
"  %s = sincos_20(angle*%.17f%s);\n", w, 2.0 * i / radix, LFending);
 
 1951                    sc->
tempLen = sprintf(sc->
tempStr, 
"  %s = twiddleLUT[LUTId+%" PRIu64 
"];\n\n", w, (radix - 1 - i) * stageSize);
 
 1961                    if (!strcmp(floatType, 
"float")) {
 
 1962                        sc->
tempLen = sprintf(sc->
tempStr, 
"  %s.x = %s(angle*%.17f%s);\n", w, cosDef, 2.0 * i / radix, LFending);
 
 1965                        sc->
tempLen = sprintf(sc->
tempStr, 
"  %s.y = %s(angle*%.17f%s);\n", w, sinDef, 2.0 * i / radix, LFending);
 
 1970                    if (!strcmp(floatType, 
"double")) {
 
 1971                        sc->
tempLen = sprintf(sc->
tempStr, 
"  %s = sincos_20(angle*%.17f%s);\n", w, 2.0 * i / radix, LFending);
 
 2141        for (uint64_t i = 0; i < 8; i++) {
 
 2161            sc->
tempLen = sprintf(sc->
tempStr, 
"  %s = twiddleLUT[LUTId];\n", w);
 
 2171            if (!strcmp(floatType, 
"float")) {
 
 2172                sc->
tempLen = sprintf(sc->
tempStr, 
"  %s.x = %s(angle);\n", w, cosDef);
 
 2175                sc->
tempLen = sprintf(sc->
tempStr, 
"  %s.y = %s(angle);\n", w, sinDef);
 
 2179            if (!strcmp(floatType, 
"double")) {
 
 2180                sc->
tempLen = sprintf(sc->
tempStr, 
"  %s = sincos_20(angle);\n", w);
 
 2185        for (uint64_t i = 0; i < 4; i++) {
 
 2199            sc->
tempLen = sprintf(sc->
tempStr, 
"  %s=twiddleLUT[LUTId+%" PRIu64 
"];\n\n", w, stageSize);
 
 2209            if (!strcmp(floatType, 
"float")) {
 
 2210                sc->
tempLen = sprintf(sc->
tempStr, 
"  %s.x = %s(0.5%s*angle);\n", w, cosDef, LFending);
 
 2213                sc->
tempLen = sprintf(sc->
tempStr, 
"  %s.y = %s(0.5%s*angle);\n", w, sinDef, LFending);
 
 2217            if (!strcmp(floatType, 
"double")) {
 
 2218                sc->
tempLen = sprintf(sc->
tempStr, 
"  %s=normalize(%s + %s(1.0, 0.0));\n", w, w, vecType);
 
 2223        for (uint64_t i = 0; i < 2; i++) {
 
 2236        if (stageAngle < 0) {
 
 2255        for (uint64_t i = 4; i < 6; i++) {
 
 2270            sc->
tempLen = sprintf(sc->
tempStr, 
"  %s=twiddleLUT[LUTId+%" PRIu64 
"];\n\n", w, 2 * stageSize);
 
 2280            if (!strcmp(floatType, 
"float")) {
 
 2281                sc->
tempLen = sprintf(sc->
tempStr, 
"  %s.x = %s(0.25%s*angle);\n", w, cosDef, LFending);
 
 2284                sc->
tempLen = sprintf(sc->
tempStr, 
"  %s.y = %s(0.25%s*angle);\n", w, sinDef, LFending);
 
 2289            if (!strcmp(floatType, 
"double")) {
 
 2290                sc->
tempLen = sprintf(sc->
tempStr, 
"  %s=normalize(%s + %s(1.0, 0.0));\n", w, w, vecType);
 
 2306        if (stageAngle < 0) {
 
 2335        if (stageAngle < 0) {
 
 2336            sc->
tempLen = sprintf(sc->
tempStr, 
"  %s.x = %s.x * loc_SQRT1_2 + %s.y * loc_SQRT1_2;\n", iw, w, w);
 
 2339            sc->
tempLen = sprintf(sc->
tempStr, 
"  %s.y = %s.y * loc_SQRT1_2 - %s.x * loc_SQRT1_2;\n\n", iw, w, w);
 
 2344            sc->
tempLen = sprintf(sc->
tempStr, 
"  %s.x = %s.x * loc_SQRT1_2 - %s.y * loc_SQRT1_2;\n", iw, w, w);
 
 2347            sc->
tempLen = sprintf(sc->
tempStr, 
"  %s.y = %s.y * loc_SQRT1_2 + %s.x * loc_SQRT1_2;\n\n", iw, w, w);
 
 2362        if (stageAngle < 0) {
 
 2420        for (uint64_t i = 0; i < 20; i++) {
 
 2421            tf[i] = (
char*)malloc(
sizeof(
char) * 50);
 
 2423                for (uint64_t j = 0; j < i; j++) {
 
 2432        sprintf(tf[0], 
"-1.100000000000000%s", LFending);
 
 2434        sprintf(tf[2], 
"0.253097611605959%s", LFending);
 
 2435        sprintf(tf[3], 
"-1.288200610773679%s", LFending);
 
 2436        sprintf(tf[4], 
"0.304632239669212%s", LFending);
 
 2437        sprintf(tf[5], 
"-0.391339615511917%s", LFending);
 
 2438        sprintf(tf[6], 
"-2.871022253392850%s", LFending);
 
 2439        sprintf(tf[7], 
"1.374907986616384%s", LFending);
 
 2440        sprintf(tf[8], 
"0.817178135341212%s", LFending);
 
 2441        sprintf(tf[9], 
"1.800746506445679%s", LFending);
 
 2442        sprintf(tf[10], 
"-0.859492973614498%s", LFending);
 
 2444        if (stageAngle < 0) {
 
 2445            sprintf(tf[1], 
"0.331662479035540%s", LFending);
 
 2446            sprintf(tf[11], 
"-2.373470454748280%s", LFending);
 
 2447            sprintf(tf[12], 
"-0.024836393087493%s", LFending);
 
 2448            sprintf(tf[13], 
"0.474017017512829%s", LFending);
 
 2449            sprintf(tf[14], 
"0.742183927770612%s", LFending);
 
 2450            sprintf(tf[15], 
"1.406473309094609%s", LFending);
 
 2451            sprintf(tf[16], 
"-1.191364552195948%s", LFending);
 
 2452            sprintf(tf[17], 
"0.708088885039503%s", LFending);
 
 2453            sprintf(tf[18], 
"0.258908260614168%s", LFending);
 
 2454            sprintf(tf[19], 
"-0.049929922194110%s", LFending);
 
 2457            sprintf(tf[1], 
"-0.331662479035540%s", LFending);
 
 2458            sprintf(tf[11], 
"2.373470454748280%s", LFending);
 
 2459            sprintf(tf[12], 
"0.024836393087493%s", LFending);
 
 2460            sprintf(tf[13], 
"-0.474017017512829%s", LFending);
 
 2461            sprintf(tf[14], 
"-0.742183927770612%s", LFending);
 
 2462            sprintf(tf[15], 
"-1.406473309094609%s", LFending);
 
 2463            sprintf(tf[16], 
"1.191364552195948%s", LFending);
 
 2464            sprintf(tf[17], 
"-0.708088885039503%s", LFending);
 
 2465            sprintf(tf[18], 
"-0.258908260614168%s", LFending);
 
 2466            sprintf(tf[19], 
"0.049929922194110%s", LFending);
 
 2468        for (uint64_t i = radix - 1; i > 0; i--) {
 
 2469            if (i == radix - 1) {
 
 2471                    sc->
tempLen = sprintf(sc->
tempStr, 
"  %s = twiddleLUT[LUTId];\n", w);
 
 2481                    if (!strcmp(floatType, 
"float")) {
 
 2482                        sc->
tempLen = sprintf(sc->
tempStr, 
"  %s.x = %s(angle*%.17f%s);\n", w, cosDef, 2.0 * i / radix, LFending);
 
 2485                        sc->
tempLen = sprintf(sc->
tempStr, 
"  %s.y = %s(angle*%.17f%s);\n", w, sinDef, 2.0 * i / radix, LFending);
 
 2490                    if (!strcmp(floatType, 
"double")) {
 
 2491                        sc->
tempLen = sprintf(sc->
tempStr, 
"  %s = sincos_20(angle*%.17f%s);\n", w, 2.0 * i / radix, LFending);
 
 2499                    sc->
tempLen = sprintf(sc->
tempStr, 
"  %s = twiddleLUT[LUTId+%" PRIu64 
"];\n\n", w, (radix - 1 - i) * stageSize);
 
 2509                    if (!strcmp(floatType, 
"float")) {
 
 2510                        sc->
tempLen = sprintf(sc->
tempStr, 
"  %s.x = %s(angle*%.17f%s);\n", w, cosDef, 2.0 * i / radix, LFending);
 
 2513                        sc->
tempLen = sprintf(sc->
tempStr, 
"  %s.y = %s(angle*%.17f%s);\n", w, sinDef, 2.0 * i / radix, LFending);
 
 2518                    if (!strcmp(floatType, 
"double")) {
 
 2519                        sc->
tempLen = sprintf(sc->
tempStr, 
"  %s = sincos_20(angle*%.17f%s);\n", w, 2.0 * i / radix, LFending);
 
 2530        uint64_t permute[11] = { 0,1,9,4,3,5,10,2,7,8,6 };
 
 2533        for (uint64_t i = 0; i < 5; i++) {
 
 2541        for (uint64_t i = 0; i < 4; i++) {
 
 2549        for (uint64_t i = 0; i < 4; i++) {
 
 2562        for (uint64_t k = 0; k < 2; k++) {
 
 2630            res = 
VkAddComplex(sc, regID[k * 4 + 3], sc->
locID[k * 4 + 3], regID[k * 4 + 5]);
 
 2632            res = 
VkAddComplex(sc, regID[k * 4 + 4], sc->
locID[k * 4 + 4], regID[k * 4 + 5]);
 
 2635            res = 
VkAddComplex(sc, regID[k * 4 + 5], sc->
locID[k * 4 + 5], regID[k * 4 + 6]);
 
 2637            res = 
VkAddComplex(sc, regID[k * 4 + 6], sc->
locID[k * 4 + 6], regID[k * 4 + 6]);
 
 2646        for (uint64_t i = 0; i < 4; i++) {
 
 2654        for (uint64_t i = 0; i < 4; i++) {
 
 2660        for (uint64_t i = 0; i < 5; i++) {
 
 2666        uint64_t permute2[11] = { 0,10,1,8,7,9,4,2,3,6,5 };
 
 2667        res = 
VkPermute(sc, permute2, 11, 1, regID);
 
 2670        for (uint64_t i = 0; i < 20; i++) {
 
 2682        for (uint64_t i = 0; i < 20; i++) {
 
 2683            tf[i] = (
char*)malloc(
sizeof(
char) * 50);
 
 2685                for (uint64_t j = 0; j < i; j++) {
 
 2694        sprintf(tf[0], 
"-1.083333333333333%s", LFending);
 
 2695        sprintf(tf[1], 
"-0.300462606288666%s", LFending);
 
 2696        sprintf(tf[5], 
"1.007074065727533%s", LFending);
 
 2697        sprintf(tf[6], 
"0.731245990975348%s", LFending);
 
 2698        sprintf(tf[7], 
"-0.579440018900960%s", LFending);
 
 2699        sprintf(tf[8], 
"0.531932498429674%s", LFending);
 
 2700        sprintf(tf[9], 
"-0.508814921720398%s", LFending);
 
 2701        sprintf(tf[10], 
"-0.007705858903092%s", LFending);
 
 2703        if (stageAngle < 0) {
 
 2704            sprintf(tf[2], 
"-0.749279330626139%s", LFending);
 
 2705            sprintf(tf[3], 
"0.401002128321867%s", LFending);
 
 2706            sprintf(tf[4], 
"0.174138601152136%s", LFending);
 
 2707            sprintf(tf[11], 
"-2.511393318389568%s", LFending);
 
 2708            sprintf(tf[12], 
"-1.823546408682421%s", LFending);
 
 2709            sprintf(tf[13], 
"1.444979909023996%s", LFending);
 
 2710            sprintf(tf[14], 
"-1.344056915177370%s", LFending);
 
 2711            sprintf(tf[15], 
"-0.975932420775946%s", LFending);
 
 2712            sprintf(tf[16], 
"0.773329778651105%s", LFending);
 
 2713            sprintf(tf[17], 
"1.927725116783469%s", LFending);
 
 2714            sprintf(tf[18], 
"1.399739414729183%s", LFending);
 
 2715            sprintf(tf[19], 
"-1.109154843837551%s", LFending);
 
 2718            sprintf(tf[2], 
"0.749279330626139%s", LFending);
 
 2719            sprintf(tf[3], 
"-0.401002128321867%s", LFending);
 
 2720            sprintf(tf[4], 
"-0.174138601152136%s", LFending);
 
 2721            sprintf(tf[11], 
"2.511393318389568%s", LFending);
 
 2722            sprintf(tf[12], 
"1.823546408682421%s", LFending);
 
 2723            sprintf(tf[13], 
"-1.444979909023996%s", LFending);
 
 2724            sprintf(tf[14], 
"1.344056915177370%s", LFending);
 
 2725            sprintf(tf[15], 
"0.975932420775946%s", LFending);
 
 2726            sprintf(tf[16], 
"-0.773329778651105%s", LFending);
 
 2727            sprintf(tf[17], 
"-1.927725116783469%s", LFending);
 
 2728            sprintf(tf[18], 
"-1.399739414729183%s", LFending);
 
 2729            sprintf(tf[19], 
"1.109154843837551%s", LFending);
 
 2731        for (uint64_t i = radix - 1; i > 0; i--) {
 
 2732            if (i == radix - 1) {
 
 2734                    sc->
tempLen = sprintf(sc->
tempStr, 
"  %s = twiddleLUT[LUTId];\n", w);
 
 2744                    if (!strcmp(floatType, 
"float")) {
 
 2745                        sc->
tempLen = sprintf(sc->
tempStr, 
"  %s.x = %s(angle*%.17f%s);\n", w, cosDef, 2.0 * i / radix, LFending);
 
 2748                        sc->
tempLen = sprintf(sc->
tempStr, 
"  %s.y = %s(angle*%.17f%s);\n", w, sinDef, 2.0 * i / radix, LFending);
 
 2753                    if (!strcmp(floatType, 
"double")) {
 
 2754                        sc->
tempLen = sprintf(sc->
tempStr, 
"  %s = sincos_20(angle*%.17f%s);\n", w, 2.0 * i / radix, LFending);
 
 2762                    sc->
tempLen = sprintf(sc->
tempStr, 
"  %s = twiddleLUT[LUTId+%" PRIu64 
"];\n\n", w, (radix - 1 - i) * stageSize);
 
 2772                    if (!strcmp(floatType, 
"float")) {
 
 2773                        sc->
tempLen = sprintf(sc->
tempStr, 
"  %s.x = %s(angle*%.17f%s);\n", w, cosDef, 2.0 * i / radix, LFending);
 
 2776                        sc->
tempLen = sprintf(sc->
tempStr, 
"  %s.y = %s(angle*%.17f%s);\n", w, sinDef, 2.0 * i / radix, LFending);
 
 2781                    if (!strcmp(floatType, 
"double")) {
 
 2782                        sc->
tempLen = sprintf(sc->
tempStr, 
"  %s = sincos_20(angle*%.17f%s);\n", w, 2.0 * i / radix, LFending);
 
 2794        uint64_t permute[13] = { 0,1,3,9,5,2,6,12,10,4,8,11,7 };
 
 2797        for (uint64_t i = 0; i < 6; i++) {
 
 2803        for (uint64_t i = 0; i < 3; i++) {
 
 2809        for (uint64_t i = 0; i < 4; i++) {
 
 2812            res = 
VkSubComplex(sc, sc->
locID[i * 2 + 5], regID[i * 3 + 1], regID[i * 3 + 3]);
 
 2816            res = 
VkSubComplex(sc, sc->
locID[i * 2 + 6], regID[i * 3 + 2], regID[i * 3 + 3]);
 
 2826        for (uint64_t k = 0; k < 3; k++) {
 
 2846            res = 
VkAddComplex(sc, regID[k * 2 + 3], sc->
locID[k * 2 + 3], regID[k * 2 + 4]);
 
 2848            res = 
VkAddComplex(sc, regID[k * 2 + 4], sc->
locID[k * 2 + 4], regID[k * 2 + 4]);
 
 2907        for (uint64_t i = 0; i < 4; i++) {
 
 2917        for (uint64_t i = 0; i < 3; i++) {
 
 2925        for (uint64_t i = 0; i < 6; i++) {
 
 2931        uint64_t permute2[13] = { 0,12,1,10,5,3,2,8,9,11,4,7,6 };
 
 2932        res = 
VkPermute(sc, permute2, 13, 1, regID);
 
 2935        for (uint64_t i = 0; i < 20; i++) {
 
 
 3855    double double_PI = 3.1415926535897932384626433832795;
 
 3857    char inputsStruct[20] = 
"";
 
 3858    char LFending[4] = 
"";
 
 3859    if (!strcmp(floatType, 
"float")) sprintf(LFending, 
"f");
 
 3860#if(VKFFT_BACKEND==0) 
 3862        sprintf(inputsStruct, 
"inputs");
 
 3864        sprintf(inputsStruct, 
".inputs");
 
 3865    if (!strcmp(floatType, 
"float")) sprintf(vecType, 
"vec2");
 
 3866    if (!strcmp(floatType, 
"double")) sprintf(vecType, 
"dvec2");
 
 3867    if (!strcmp(floatType, 
"double")) sprintf(LFending, 
"LF");
 
 3868    char cosDef[20] = 
"cos";
 
 3869    char sinDef[20] = 
"sin";
 
 3870#elif(VKFFT_BACKEND==1) 
 3871    if (!strcmp(floatType, 
"float")) sprintf(vecType, 
"float2");
 
 3872    if (!strcmp(floatType, 
"double")) sprintf(vecType, 
"double2");
 
 3873    if (!strcmp(floatType, 
"double")) sprintf(LFending, 
"l");
 
 3874    sprintf(inputsStruct, 
"inputs");
 
 3875    char cosDef[20] = 
"__cosf";
 
 3876    char sinDef[20] = 
"__sinf";
 
 3877#elif(VKFFT_BACKEND==2) 
 3878    if (!strcmp(floatType, 
"float")) sprintf(vecType, 
"float2");
 
 3879    if (!strcmp(floatType, 
"double")) sprintf(vecType, 
"double2");
 
 3880    if (!strcmp(floatType, 
"double")) sprintf(LFending, 
"l");
 
 3881    sprintf(inputsStruct, 
"inputs");
 
 3882    char cosDef[20] = 
"__cosf";
 
 3883    char sinDef[20] = 
"__sinf";
 
 3884#elif(VKFFT_BACKEND==3) 
 3885    if (!strcmp(floatType, 
"float")) sprintf(vecType, 
"float2");
 
 3886    if (!strcmp(floatType, 
"double")) sprintf(vecType, 
"double2");
 
 3887    sprintf(inputsStruct, 
"inputs");
 
 3888    char cosDef[20] = 
"native_cos";
 
 3889    char sinDef[20] = 
"native_sin";
 
 3891    char convTypeLeft[20] = 
"";
 
 3892    char convTypeRight[20] = 
"";
 
 3893    if ((!strcmp(floatType, 
"float")) && (strcmp(floatTypeMemory, 
"float"))) {
 
 3894        if ((readType == 5) || (readType == 110) || (readType == 111) || (readType == 120) || (readType == 121) || (readType == 130) || (readType == 131) || (readType == 140) || (readType == 141) || (readType == 142) || (readType == 143) || (readType == 144) || (readType == 145)) {
 
 3895#if(VKFFT_BACKEND==0) 
 3896            sprintf(convTypeLeft, 
"float(");
 
 3897            sprintf(convTypeRight, 
")");
 
 3898#elif(VKFFT_BACKEND==1) 
 3899            sprintf(convTypeLeft, 
"(float)");
 
 3901#elif(VKFFT_BACKEND==2) 
 3902            sprintf(convTypeLeft, 
"(float)");
 
 3904#elif(VKFFT_BACKEND==3) 
 3905            sprintf(convTypeLeft, 
"(float)");
 
 3910#if(VKFFT_BACKEND==0) 
 3911            sprintf(convTypeLeft, 
"vec2(");
 
 3912            sprintf(convTypeRight, 
")");
 
 3913#elif(VKFFT_BACKEND==1) 
 3914            sprintf(convTypeLeft, 
"conv_float2(");
 
 3915            sprintf(convTypeRight, 
")");
 
 3916#elif(VKFFT_BACKEND==2) 
 3917            sprintf(convTypeLeft, 
"conv_float2(");
 
 3918            sprintf(convTypeRight, 
")");
 
 3919#elif(VKFFT_BACKEND==3) 
 3920            sprintf(convTypeLeft, 
"conv_float2(");
 
 3921            sprintf(convTypeRight, 
")");
 
 3925    if ((!strcmp(floatType, 
"double")) && (strcmp(floatTypeMemory, 
"double"))) {
 
 3926        if ((readType == 5) || (readType == 110) || (readType == 111) || (readType == 120) || (readType == 121) || (readType == 130) || (readType == 131) || (readType == 140) || (readType == 141) || (readType == 142) || (readType == 143) || (readType == 144) || (readType == 145)) {
 
 3927#if(VKFFT_BACKEND==0) 
 3928            sprintf(convTypeLeft, 
"double(");
 
 3929            sprintf(convTypeRight, 
")");
 
 3930#elif(VKFFT_BACKEND==1) 
 3931            sprintf(convTypeLeft, 
"(double)");
 
 3933#elif(VKFFT_BACKEND==2) 
 3934            sprintf(convTypeLeft, 
"(double)");
 
 3936#elif(VKFFT_BACKEND==3) 
 3937            sprintf(convTypeLeft, 
"(double)");
 
 3942#if(VKFFT_BACKEND==0) 
 3943            sprintf(convTypeLeft, 
"dvec2(");
 
 3944            sprintf(convTypeRight, 
")");
 
 3945#elif(VKFFT_BACKEND==1) 
 3946            sprintf(convTypeLeft, 
"conv_double2(");
 
 3947            sprintf(convTypeRight, 
")");
 
 3948#elif(VKFFT_BACKEND==2) 
 3949            sprintf(convTypeLeft, 
"conv_double2(");
 
 3950            sprintf(convTypeRight, 
")");
 
 3951#elif(VKFFT_BACKEND==3) 
 3952            sprintf(convTypeLeft, 
"conv_double2(");
 
 3953            sprintf(convTypeRight, 
")");
 
 3957    char index_x[2000] = 
"";
 
 3958    char index_y[2000] = 
"";
 
 3959    char requestCoordinate[100] = 
"";
 
 3962            sprintf(requestCoordinate, 
"coordinate");
 
 3965    char requestBatch[100] = 
"";
 
 3968            sprintf(requestBatch, 
"0");
 
 3976        char shiftX[500] = 
"";
 
 3978            sprintf(shiftX, 
" + consts.workGroupShiftX ");
 
 3979        char shiftY[500] = 
"";
 
 3988        char shiftY2[100] = 
"";
 
 3990            sprintf(shiftY, 
" + consts.workGroupShiftY ");
 
 3998                sprintf(sc->
disableThreadsStart, 
"       if(%s * %" PRIu64 
" + (((%s%s) %% %" PRIu64 
") * %" PRIu64 
" + ((%s%s) / %" PRIu64 
") * %" PRIu64 
") < %" PRIu64 
") {\n", sc->
gl_LocalInvocationID_x, sc->
firstStageStartSize, sc->
gl_WorkGroupID_x, shiftX, sc->
firstStageStartSize / sc->
fftDim, sc->
fftDim, sc->
gl_WorkGroupID_x, shiftX, sc->
firstStageStartSize / sc->
fftDim, sc->
localSize[0] * sc->
firstStageStartSize, sc->
fft_dim_full);
 
 4005                sprintf(sc->
disableThreadsStart, 
"       if(%s * %" PRIu64 
" + (((%s%s) %% %" PRIu64 
") * %" PRIu64 
" + ((%s%s) / %" PRIu64 
") * %" PRIu64 
") < %" PRIu64 
") {\n", sc->
gl_LocalInvocationID_y, sc->
firstStageStartSize, sc->
gl_WorkGroupID_x, shiftX, sc->
firstStageStartSize / sc->
fftDim, sc->
fftDim, sc->
gl_WorkGroupID_x, shiftX, sc->
firstStageStartSize / sc->
fftDim, sc->
localSize[1] * sc->
firstStageStartSize, sc->
fft_dim_full);
 
 4076                                sc->
tempLen = sprintf(sc->
tempStr, 
"      sdata[(combinedID %% %" PRIu64 
") * sharedStride + (combinedID / %" PRIu64 
")] = %s%s[%s]%s;\n", sc->
fftDim, sc->
fftDim, convTypeLeft, inputsStruct, sc->
inoutID, convTypeRight);
 
 4078                                sc->
tempLen = sprintf(sc->
tempStr, 
"      sdata[(combinedID %% %" PRIu64 
") * sharedStride + (combinedID / %" PRIu64 
")] = %sinputBlocks[%s / %" PRIu64 
"]%s[%s %% %" PRIu64 
"]%s;\n", sc->
fftDim, sc->
fftDim, convTypeLeft, sc->
inoutID, sc->
inputBufferBlockSize, inputsStruct, sc->
inoutID, sc->
inputBufferBlockSize, convTypeRight);
 
 4082                                sc->
tempLen = sprintf(sc->
tempStr, 
"      sdata[(combinedID %% %" PRIu64 
") + (combinedID / %" PRIu64 
") * sharedStride] = %s%s[%s]%s;\n", sc->
fftDim, sc->
fftDim, convTypeLeft, inputsStruct, sc->
inoutID, convTypeRight);
 
 4084                                sc->
tempLen = sprintf(sc->
tempStr, 
"      sdata[(combinedID %% %" PRIu64 
") + (combinedID / %" PRIu64 
") * sharedStride] = %sinputBlocks[%s / %" PRIu64 
"]%s[%s %% %" PRIu64 
"]%s;\n", sc->
fftDim, sc->
fftDim, convTypeLeft, sc->
inoutID, sc->
inputBufferBlockSize, inputsStruct, sc->
inoutID, sc->
inputBufferBlockSize, convTypeRight);
 
 4102                                sc->
tempLen = sprintf(sc->
tempStr, 
"          sdata[(combinedID %% %" PRIu64 
") * sharedStride + (combinedID / %" PRIu64 
")].x = 0;\n", sc->
fftDim, sc->
fftDim);
 
 4105                                sc->
tempLen = sprintf(sc->
tempStr, 
"          sdata[(combinedID %% %" PRIu64 
") * sharedStride + (combinedID / %" PRIu64 
")].y = 0;\n", sc->
fftDim, sc->
fftDim);
 
 4110                                sc->
tempLen = sprintf(sc->
tempStr, 
"          sdata[(combinedID %% %" PRIu64 
") + (combinedID / %" PRIu64 
") * sharedStride].x = 0;\n", sc->
fftDim, sc->
fftDim);
 
 4113                                sc->
tempLen = sprintf(sc->
tempStr, 
"          sdata[(combinedID %% %" PRIu64 
") + (combinedID / %" PRIu64 
") * sharedStride].y = 0;\n", sc->
fftDim, sc->
fftDim);
 
 4133                                sc->
tempLen = sprintf(sc->
tempStr, 
"          sdata[(combinedID %% %" PRIu64 
") * sharedStride + (combinedID / %" PRIu64 
")].x = 0;\n", sc->
fftDim, sc->
fftDim);
 
 4136                                sc->
tempLen = sprintf(sc->
tempStr, 
"          sdata[(combinedID %% %" PRIu64 
") * sharedStride + (combinedID / %" PRIu64 
")].y = 0;\n", sc->
fftDim, sc->
fftDim);
 
 4141                                sc->
tempLen = sprintf(sc->
tempStr, 
"          sdata[(combinedID %% %" PRIu64 
") + (combinedID / %" PRIu64 
") * sharedStride].x = 0;\n", sc->
fftDim, sc->
fftDim);
 
 4144                                sc->
tempLen = sprintf(sc->
tempStr, 
"          sdata[(combinedID %% %" PRIu64 
") + (combinedID / %" PRIu64 
") * sharedStride].y = 0;\n", sc->
fftDim, sc->
fftDim);
 
 4189                        sc->
tempLen = sprintf(sc->
tempStr, 
"      inoutID = (combinedID %% %" PRIu64 
") + (combinedID / %" PRIu64 
") * %" PRIu64 
" + (((%s%s) %% %" PRIu64 
") * %" PRIu64 
" + ((%s%s) / %" PRIu64 
") * %" PRIu64 
");\n", sc->
fftDim, sc->
fftDim, sc->
firstStageStartSize, sc->
gl_WorkGroupID_x, shiftX, sc->
firstStageStartSize / sc->
fftDim, sc->
fftDim, sc->
gl_WorkGroupID_x, shiftX, sc->
firstStageStartSize / sc->
fftDim, sc->
localSize[0] * sc->
firstStageStartSize);
 
 4194                        sc->
tempLen = sprintf(sc->
tempStr, 
"      inoutID = %s+%" PRIu64 
"+%s * %" PRIu64 
" + (((%s%s) %% %" PRIu64 
") * %" PRIu64 
" + ((%s%s) / %" PRIu64 
") * %" PRIu64 
");\n", sc->
gl_LocalInvocationID_x, (i + k * sc->
min_registers_per_thread) * sc->
localSize[0], sc->
gl_LocalInvocationID_y, sc->
firstStageStartSize, sc->
gl_WorkGroupID_x, shiftX, sc->
firstStageStartSize / sc->
fftDim, sc->
fftDim, sc->
gl_WorkGroupID_x, shiftX, sc->
firstStageStartSize / sc->
fftDim, sc->
localSize[1] * sc->
firstStageStartSize);
 
 4225                                sc->
tempLen = sprintf(sc->
tempStr, 
"      sdata[(combinedID / %" PRIu64 
") + sharedStride*(combinedID %% %" PRIu64 
")] = %s%s[inoutID]%s;\n", sc->
fftDim, sc->
fftDim, convTypeLeft, inputsStruct, convTypeRight);
 
 4253                                sc->
tempLen = sprintf(sc->
tempStr, 
"          sdata[(combinedID / %" PRIu64 
") + sharedStride*(combinedID %% %" PRIu64 
")].x = 0;\n", sc->
fftDim, sc->
fftDim);
 
 4256                                sc->
tempLen = sprintf(sc->
tempStr, 
"          sdata[(combinedID / %" PRIu64 
") + sharedStride*(combinedID %% %" PRIu64 
")].y = 0;\n", sc->
fftDim, sc->
fftDim);
 
 4283        char shiftX[500] = 
"";
 
 4287        sprintf(sc->
disableThreadsStart, 
"       if (((%s%s) / %" PRIu64 
") %% (%" PRIu64 
")+((%s%s) / %" PRIu64 
") * (%" PRIu64 
") < %" PRIu64 
") {\n", sc->
gl_GlobalInvocationID_x, shiftX, sc->
fft_dim_x, sc->
stageStartSize, sc->
gl_GlobalInvocationID_x, shiftX, sc->
fft_dim_x * sc->
stageStartSize, sc->
fftDim * sc->
stageStartSize, sc->
size[sc->
axis_id]);
 
 4293                sc->
tempLen = sprintf(sc->
tempStr, 
"      inoutID = (%" PRIu64 
" * (%s + %" PRIu64 
") + ((%s%s) / %" PRIu64 
") %% (%" PRIu64 
")+((%s%s) / %" PRIu64 
") * (%" PRIu64 
"));\n", sc->
stageStartSize, sc->
gl_LocalInvocationID_y, (i + k * sc->
min_registers_per_thread) * sc->
localSize[1], sc->
gl_GlobalInvocationID_x, shiftX, sc->
fft_dim_x, sc->
stageStartSize, sc->
gl_GlobalInvocationID_x, shiftX, sc->
fft_dim_x * sc->
stageStartSize, sc->
fftDim * sc->
stageStartSize);
 
 4329                        sc->
tempLen = sprintf(sc->
tempStr, 
"          sdata[%s*(%s+%" PRIu64 
")+%s]=%sinputBlocks[%s / %" PRIu64 
"]%s[%s %% %" PRIu64 
"]%s;\n", sc->
sharedStride, sc->
gl_LocalInvocationID_y, (i + k * sc->
min_registers_per_thread) * sc->
localSize[1], sc->
gl_LocalInvocationID_x, convTypeLeft, sc->
inoutID, sc->
inputBufferBlockSize, inputsStruct, sc->
inoutID, sc->
inputBufferBlockSize, convTypeRight);
 
 4386        char shiftX[500] = 
"";
 
 4397                sc->
tempLen = sprintf(sc->
tempStr, 
"      inoutID = (%s%s) %% (%" PRIu64 
") + %" PRIu64 
" * (%s + %" PRIu64 
") + ((%s%s) / %" PRIu64 
") * (%" PRIu64 
");\n", sc->
gl_GlobalInvocationID_x, shiftX, sc->
stageStartSize, sc->
stageStartSize, sc->
gl_LocalInvocationID_y, (i + k * sc->
min_registers_per_thread) * sc->
localSize[1], sc->
gl_GlobalInvocationID_x, shiftX, sc->
stageStartSize, sc->
stageStartSize * sc->
fftDim);
 
 4432                        sc->
tempLen = sprintf(sc->
tempStr, 
"          sdata[%s*(%s+%" PRIu64 
")+%s]=%sinputBlocks[%s / %" PRIu64 
"]%s[%s %% %" PRIu64 
"]%s;\n", sc->
sharedStride, sc->
gl_LocalInvocationID_y, (i + k * sc->
min_registers_per_thread) * sc->
localSize[1], sc->
gl_LocalInvocationID_x, convTypeLeft, sc->
inoutID, sc->
inputBufferBlockSize, inputsStruct, sc->
inoutID, sc->
inputBufferBlockSize, convTypeRight);
 
 4489        char shiftX[500] = 
"";
 
 4491            sprintf(shiftX, 
" + consts.workGroupShiftX ");
 
 4492        char shiftY[500] = 
"";
 
 4494            sprintf(shiftY, 
" + consts.workGroupShiftY ");
 
 4514                        if ((uint64_t)ceil(sc->
size[1] / (
double)mult) % sc->
localSize[0] != 0) {
 
 4521                        if ((uint64_t)ceil(sc->
size[1] / (
double)mult) % sc->
localSize[1] != 0) {
 
 4558                                sc->
tempLen = sprintf(sc->
tempStr, 
"      %s.y = %sinputBlocks[(%s + %" PRIu64 
")/ %" PRIu64 
"]%s[(%s + %" PRIu64 
") %% %" PRIu64 
"]%s;\n", sc->
regIDs[i + k * sc->
registers_per_thread], convTypeLeft, sc->
inoutID, sc->
inputStride[1], sc->
inputBufferBlockSize, inputsStruct, sc->
inoutID, sc->
inputStride[1], sc->
inputBufferBlockSize, convTypeRight);
 
 4575                                sc->
tempLen = sprintf(sc->
tempStr, 
"      sdata[(combinedID %% %" PRIu64 
") * sharedStride + (combinedID / %" PRIu64 
")].x = %s%s[%s]%s;\n", sc->
fftDim, sc->
fftDim, convTypeLeft, inputsStruct, sc->
inoutID, convTypeRight);
 
 4577                                sc->
tempLen = sprintf(sc->
tempStr, 
"      sdata[(combinedID %% %" PRIu64 
") * sharedStride + (combinedID / %" PRIu64 
")].x = %sinputBlocks[%s / %" PRIu64 
"]%s[%s %% %" PRIu64 
"]%s;\n", sc->
fftDim, sc->
fftDim, convTypeLeft, sc->
inoutID, sc->
inputBufferBlockSize, inputsStruct, sc->
inoutID, sc->
inputBufferBlockSize, convTypeRight);
 
 4586                                    sc->
tempLen = sprintf(sc->
tempStr, 
"      sdata[(combinedID %% %" PRIu64 
")* sharedStride + (combinedID / %" PRIu64 
")].y = %s%s[inoutID]%s;\n", sc->
fftDim, sc->
fftDim, convTypeLeft, inputsStruct, convTypeRight);
 
 4594                                    sc->
tempLen = sprintf(sc->
tempStr, 
"      sdata[(combinedID %% %" PRIu64 
") * sharedStride+ (combinedID / %" PRIu64 
")].y = 0;\n", sc->
fftDim, sc->
fftDim);
 
 4596                                    sc->
tempLen = sprintf(sc->
tempStr, 
"      sdata[(combinedID %% %" PRIu64 
") * sharedStride + (combinedID / %" PRIu64 
")].y = 0;\n", sc->
fftDim, sc->
fftDim);
 
 4603                                sc->
tempLen = sprintf(sc->
tempStr, 
"      sdata[(combinedID %% %" PRIu64 
") + (combinedID / %" PRIu64 
") * sharedStride].x = %s%s[inoutID]%s;\n", sc->
fftDim, sc->
fftDim, convTypeLeft, inputsStruct, convTypeRight);
 
 4613                                    sc->
tempLen = sprintf(sc->
tempStr, 
"      sdata[(combinedID %% %" PRIu64 
") + (combinedID / %" PRIu64 
") * sharedStride].y = %s%s[inoutID]%s;\n", sc->
fftDim, sc->
fftDim, convTypeLeft, inputsStruct, convTypeRight);
 
 4621                                    sc->
tempLen = sprintf(sc->
tempStr, 
"      sdata[(combinedID %% %" PRIu64 
") + (combinedID / %" PRIu64 
") * sharedStride].y = 0;\n", sc->
fftDim, sc->
fftDim);
 
 4623                                    sc->
tempLen = sprintf(sc->
tempStr, 
"      sdata[(combinedID %% %" PRIu64 
") + (combinedID / %" PRIu64 
") * sharedStride].y = 0;\n", sc->
fftDim, sc->
fftDim);
 
 4643                                sc->
tempLen = sprintf(sc->
tempStr, 
"      sdata[(combinedID %% %" PRIu64 
") * sharedStride + (combinedID / %" PRIu64 
")].x = 0;\n", sc->
fftDim, sc->
fftDim);
 
 4646                                sc->
tempLen = sprintf(sc->
tempStr, 
"      sdata[(combinedID %% %" PRIu64 
") * sharedStride + (combinedID / %" PRIu64 
")].y = 0;\n", sc->
fftDim, sc->
fftDim);
 
 4651                                sc->
tempLen = sprintf(sc->
tempStr, 
"      sdata[(combinedID %% %" PRIu64 
") + (combinedID / %" PRIu64 
") * sharedStride].x = 0;\n", sc->
fftDim, sc->
fftDim);
 
 4654                                sc->
tempLen = sprintf(sc->
tempStr, 
"      sdata[(combinedID %% %" PRIu64 
") + (combinedID / %" PRIu64 
") * sharedStride].y = 0;\n", sc->
fftDim, sc->
fftDim);
 
 4676                                sc->
tempLen = sprintf(sc->
tempStr, 
"      sdata[(combinedID %% %" PRIu64 
") * sharedStride + (combinedID / %" PRIu64 
")].x = 0;\n", sc->
fftDim, sc->
fftDim);
 
 4679                                sc->
tempLen = sprintf(sc->
tempStr, 
"      sdata[(combinedID %% %" PRIu64 
") * sharedStride + (combinedID / %" PRIu64 
")].y = 0;\n", sc->
fftDim, sc->
fftDim);
 
 4684                                sc->
tempLen = sprintf(sc->
tempStr, 
"      sdata[(combinedID %% %" PRIu64 
") + (combinedID / %" PRIu64 
") * sharedStride].x = 0;\n", sc->
fftDim, sc->
fftDim);
 
 4687                                sc->
tempLen = sprintf(sc->
tempStr, 
"      sdata[(combinedID %% %" PRIu64 
") + (combinedID / %" PRIu64 
") * sharedStride].y = 0;\n", sc->
fftDim, sc->
fftDim);
 
 4699                        if ((uint64_t)ceil(sc->
size[1] / (
double)mult) % sc->
localSize[0] != 0) {
 
 4706                        if ((uint64_t)ceil(sc->
size[1] / (
double)mult) % sc->
localSize[1] != 0) {
 
 4722        char shiftX[500] = 
"";
 
 4724            sprintf(shiftX, 
" + consts.workGroupShiftX ");
 
 4725        char shiftY[500] = 
"";
 
 4728        char shiftY2[100] = 
"";
 
 4730            sprintf(shiftY, 
" + consts.workGroupShiftY ");
 
 4733                sprintf(sc->
disableThreadsStart, 
"       if(%s * %" PRIu64 
" + (((%s%s) %% %" PRIu64 
") * %" PRIu64 
" + ((%s%s) / %" PRIu64 
") * %" PRIu64 
") < %" PRIu64 
") {\n", sc->
gl_LocalInvocationID_x, sc->
firstStageStartSize, sc->
gl_WorkGroupID_x, shiftX, sc->
firstStageStartSize / sc->
fftDim, sc->
fftDim, sc->
gl_WorkGroupID_x, shiftX, sc->
firstStageStartSize / sc->
fftDim, sc->
localSize[0] * sc->
firstStageStartSize, sc->
fft_dim_full);
 
 4735                sprintf(sc->
disableThreadsStart, 
"       if(%s * %" PRIu64 
" + (((%s%s) %% %" PRIu64 
") * %" PRIu64 
" + ((%s%s) / %" PRIu64 
") * %" PRIu64 
") < %" PRIu64 
") {\n", sc->
gl_LocalInvocationID_y, sc->
firstStageStartSize, sc->
gl_WorkGroupID_x, shiftX, sc->
firstStageStartSize / sc->
fftDim, sc->
fftDim, sc->
gl_WorkGroupID_x, shiftX, sc->
firstStageStartSize / sc->
fftDim, sc->
localSize[1] * sc->
firstStageStartSize, sc->
fft_dim_full);
 
 4753                for (uint64_t i = 0; i < num_in; i++) {
 
 4764                        sc->
tempLen = sprintf(sc->
tempStr, 
"      inoutID = (combinedID %% %" PRIu64 
") + (combinedID / %" PRIu64 
") * %" PRIu64 
";\n", sc->
fftDim / 2 + 1, sc->
fftDim / 2 + 1, sc->
inputStride[1]);
 
 4817                                sc->
tempLen = sprintf(sc->
tempStr, 
"      sdata[(combinedID %% %" PRIu64 
") + (combinedID / %" PRIu64 
") * sharedStride] = %s%s[%s]%s;\n", mult * (sc->
fftDim / 2 + 1), mult * (sc->
fftDim / 2 + 1), convTypeLeft, inputsStruct, sc->
inoutID, convTypeRight);
 
 4819                                sc->
tempLen = sprintf(sc->
tempStr, 
"      sdata[(combinedID %% %" PRIu64 
") + (combinedID / %" PRIu64 
") * sharedStride] = %sinputBlocks[%s / %" PRIu64 
"]%s[%s %% %" PRIu64 
"]%s;\n", mult * (sc->
fftDim / 2 + 1), mult * (sc->
fftDim / 2 + 1), convTypeLeft, sc->
inoutID, sc->
inputBufferBlockSize, inputsStruct, sc->
inoutID, sc->
inputBufferBlockSize, convTypeRight);
 
 4825                                sc->
tempLen = sprintf(sc->
tempStr, 
"      sdata[(combinedID %% %" PRIu64 
") * sharedStride + (combinedID / %" PRIu64 
")] = %s%s[%s]%s;\n", mult * (sc->
fftDim / 2 + 1), mult * (sc->
fftDim / 2 + 1), convTypeLeft, inputsStruct, sc->
inoutID, convTypeRight);
 
 4827                                sc->
tempLen = sprintf(sc->
tempStr, 
"      sdata[(combinedID %% %" PRIu64 
") * sharedStride + (combinedID / %" PRIu64 
")] = %sinputBlocks[%s / %" PRIu64 
"]%s[%s %% %" PRIu64 
"]%s;\n", mult * (sc->
fftDim / 2 + 1), mult * (sc->
fftDim / 2 + 1), convTypeLeft, sc->
inoutID, sc->
inputBufferBlockSize, inputsStruct, sc->
inoutID, sc->
inputBufferBlockSize, convTypeRight);
 
 4846                                sc->
tempLen = sprintf(sc->
tempStr, 
"          sdata[(combinedID %% %" PRIu64 
") + (combinedID / %" PRIu64 
") * sharedStride].x = 0;\n", mult * (sc->
fftDim / 2 + 1), mult * (sc->
fftDim / 2 + 1));
 
 4849                                sc->
tempLen = sprintf(sc->
tempStr, 
"          sdata[(combinedID %% %" PRIu64 
") + (combinedID / %" PRIu64 
") * sharedStride].y = 0;\n", mult * (sc->
fftDim / 2 + 1), mult * (sc->
fftDim / 2 + 1));
 
 4854                                sc->
tempLen = sprintf(sc->
tempStr, 
"          sdata[(combinedID %% %" PRIu64 
") * sharedStride + (combinedID / %" PRIu64 
")].x = 0;\n", mult * (sc->
fftDim / 2 + 1), mult * (sc->
fftDim / 2 + 1));
 
 4857                                sc->
tempLen = sprintf(sc->
tempStr, 
"          sdata[(combinedID %% %" PRIu64 
") * sharedStride + (combinedID / %" PRIu64 
")].y = 0;\n", mult * (sc->
fftDim / 2 + 1), mult * (sc->
fftDim / 2 + 1));
 
 4910                                if (i >= (uint64_t)ceil((sc->
fftDim / 2 + 1) / (
double)sc->
localSize[1])) {
 
 4982                                if (i >= (uint64_t)ceil((sc->
fftDim / 2 + 1) / (
double)sc->
localSize[0])) {
 
 5056                                if (i >= (uint64_t)ceil((sc->
fftDim / 2 + 1) / (
double)sc->
localSize[1])) {
 
 5128                                if (i >= (uint64_t)ceil((sc->
fftDim / 2 + 1) / (
double)sc->
localSize[0])) {
 
 5207        char shiftX[500] = 
"";
 
 5209            sprintf(shiftX, 
" + consts.workGroupShiftX ");
 
 5210        char shiftY[500] = 
"";
 
 5212            sprintf(shiftY, 
" + consts.workGroupShiftY ");
 
 5225                for (uint64_t i = 0; i < num_in; i++) {
 
 5240                        if ((uint64_t)ceil(sc->
size[1] / (
double)mult) % sc->
localSize[0] != 0) {
 
 5257                        if ((uint64_t)ceil(sc->
size[1] / (
double)mult) % sc->
localSize[1] != 0) {
 
 5289                        sc->
tempLen = sprintf(sc->
tempStr, 
"      sdataID = (combinedID %% %" PRIu64 
") * sharedStride + (combinedID / %" PRIu64 
");\n", sc->
fftDim, sc->
fftDim);
 
 5294                            sc->
tempLen = sprintf(sc->
tempStr, 
"      sdata[sdataID].x = %s%s[%s]%s;\n", convTypeLeft, inputsStruct, sc->
inoutID, convTypeRight);
 
 5306                                sc->
tempLen = sprintf(sc->
tempStr, 
"      sdata[sdataID].y = %s%s[inoutID]%s;\n", convTypeLeft, inputsStruct, convTypeRight);
 
 5320                        sc->
tempLen = sprintf(sc->
tempStr, 
"      if (((combinedID %% %" PRIu64 
")>0)&&((combinedID %% %" PRIu64 
") < %" PRIu64 
")){\n", sc->
fftDim, sc->
fftDim, sc->
fftDim-1);
 
 5323                        sc->
tempLen = sprintf(sc->
tempStr, 
"      inoutID = (%" PRIu64 
" - combinedID %% %" PRIu64 
") * sharedStride + (combinedID / %" PRIu64 
");\n", 2*sc->
fftDim - 2, sc->
fftDim, sc->
fftDim);
 
 5326                        sc->
tempLen = sprintf(sc->
tempStr, 
"      sdata[inoutID] = sdata[sdataID];\n");
 
 5334                        sc->
tempLen = sprintf(sc->
tempStr, 
"      sdataID = (combinedID %% %" PRIu64 
")  + (combinedID / %" PRIu64 
") * sharedStride;\n", sc->
fftDim, sc->
fftDim);
 
 5338                            sc->
tempLen = sprintf(sc->
tempStr, 
"      sdata[sdataID].x = %s%s[inoutID]%s;\n", convTypeLeft, inputsStruct, convTypeRight);
 
 5348                                sc->
tempLen = sprintf(sc->
tempStr, 
"      sdata[sdataID].y = %s%s[inoutID]%s;\n", convTypeLeft, inputsStruct, convTypeRight);
 
 5362                        sc->
tempLen = sprintf(sc->
tempStr, 
"      if (((combinedID %% %" PRIu64 
")>0)&&((combinedID %% %" PRIu64 
") < %" PRIu64 
")){\n", sc->
fftDim, sc->
fftDim, sc->
fftDim - 1);
 
 5365                        sc->
tempLen = sprintf(sc->
tempStr, 
"      inoutID = (%" PRIu64 
" - combinedID %% %" PRIu64 
") + (combinedID / %" PRIu64 
") * sharedStride;\n", 2 * sc->
fftDim - 2, sc->
fftDim, sc->
fftDim);
 
 5368                        sc->
tempLen = sprintf(sc->
tempStr, 
"      sdata[inoutID] = sdata[sdataID];\n");
 
 5388                            sc->
tempLen = sprintf(sc->
tempStr, 
"      sdataID = (combinedID %% %" PRIu64 
") * sharedStride + (combinedID / %" PRIu64 
");\n", sc->
fftDim, sc->
fftDim);
 
 5393                            sc->
tempLen = sprintf(sc->
tempStr, 
"      sdataID = (combinedID %% %" PRIu64 
")  + (combinedID / %" PRIu64 
") * sharedStride;\n", sc->
fftDim, sc->
fftDim);
 
 5405                            sc->
tempLen = sprintf(sc->
tempStr, 
"      if (((combinedID %% %" PRIu64 
")>0)&&((combinedID %% %" PRIu64 
") < %" PRIu64 
")){\n", sc->
fftDim, sc->
fftDim, sc->
fftDim - 1);
 
 5408                            sc->
tempLen = sprintf(sc->
tempStr, 
"      inoutID = (%" PRIu64 
" - combinedID %% %" PRIu64 
") * sharedStride + (combinedID / %" PRIu64 
");\n", 2 * sc->
fftDim - 2, sc->
fftDim, sc->
fftDim);
 
 5411                            sc->
tempLen = sprintf(sc->
tempStr, 
"      sdata[inoutID] = sdata[sdataID];\n");
 
 5419                            sc->
tempLen = sprintf(sc->
tempStr, 
"      if (((combinedID %% %" PRIu64 
")>0)&&((combinedID %% %" PRIu64 
") < %" PRIu64 
")){\n", sc->
fftDim, sc->
fftDim, sc->
fftDim - 1);
 
 5422                            sc->
tempLen = sprintf(sc->
tempStr, 
"      inoutID = (%" PRIu64 
" - combinedID %% %" PRIu64 
") + (combinedID / %" PRIu64 
") * sharedStride;\n", 2 * sc->
fftDim - 2, sc->
fftDim, sc->
fftDim);
 
 5425                            sc->
tempLen = sprintf(sc->
tempStr, 
"      sdata[inoutID] = sdata[sdataID];\n");
 
 5442                        if ((uint64_t)ceil(sc->
size[1] / (
double)mult) % sc->
localSize[0] != 0) {
 
 5454                        if ((uint64_t)ceil(sc->
size[1] / (
double)mult) % sc->
localSize[1] != 0) {
 
 5472        char shiftX[500] = 
"";
 
 5474            sprintf(shiftX, 
" + consts.workGroupShiftX ");
 
 5475        char shiftX2[500] = 
"";
 
 5478        char shiftY[500] = 
"";
 
 5480            sprintf(shiftY, 
" + consts.workGroupShiftY ");
 
 5491            uint64_t num_in = (uint64_t)ceil((sc->
fftDim) / (
double)sc->
localSize[1]);
 
 5493                for (uint64_t i = 0; i < num_in; i++) {
 
 5499                    if ((uint64_t)ceil(sc->
size[0] / (
double)mult) % sc->
localSize[0] != 0) {
 
 5521                        sc->
tempLen = sprintf(sc->
tempStr, 
"      //sdataID = (((combinedID %% %" PRIu64 
") %% 2) * %" PRIu64 
" + (1-2*((combinedID %% %" PRIu64 
") %% 2)) * ((combinedID %% %" PRIu64 
")/2)) * sharedStride + (%s + ((%s + %" PRIu64 
") %% %" PRIu64 
") * %" PRIu64 
") / %" PRIu64 
";\n", sc->
fftDim, sc->
fftDim - 1, sc->
fftDim, sc->
fftDim, sc->
gl_LocalInvocationID_x, sc->
gl_LocalInvocationID_y, (i + k * sc->
min_registers_per_thread) * sc->
localSize[1], mult, sc->
localSize[0], mult);
 
 5539                    res = 
indexInputVkFFT(sc, uintType, readType, index_x, index_y, requestCoordinate, requestBatch);
 
 5552                        sc->
tempLen = sprintf(sc->
tempStr, 
"      %s.x = %s%s[%s]%s;\n", sc->
regIDs[0], convTypeLeft, inputsStruct, sc->
inoutID, convTypeRight);
 
 5583                    sc->
tempLen = sprintf(sc->
tempStr, 
"      if (((combinedID %% %" PRIu64 
")>0)&&((combinedID %% %" PRIu64 
") < %" PRIu64 
")){\n", sc->
fftDim, sc->
fftDim, sc->
fftDim - 1);
 
 5589                    sc->
tempLen = sprintf(sc->
tempStr, 
"      sdata[inoutID] = sdata[sdataID];\n");
 
 5627                        sc->
tempLen = sprintf(sc->
tempStr, 
"      if (((combinedID %% %" PRIu64 
")>0)&&((combinedID %% %" PRIu64 
") < %" PRIu64 
")){\n", sc->
fftDim, sc->
fftDim, sc->
fftDim - 1);
 
 5633                        sc->
tempLen = sprintf(sc->
tempStr, 
"      sdata[inoutID] = sdata[sdataID];\n");
 
 5653                    if ((uint64_t)ceil(sc->
size[0] / (
double)mult) % sc->
localSize[0] != 0) {
 
 5670        char shiftX[500] = 
"";
 
 5672            sprintf(shiftX, 
" + consts.workGroupShiftX ");
 
 5673        char shiftY[500] = 
"";
 
 5675            sprintf(shiftY, 
" + consts.workGroupShiftY ");
 
 5701                        if ((uint64_t)ceil(sc->
size[1] / (
double)mult) % sc->
localSize[0] != 0) {
 
 5713                        if ((uint64_t)ceil(sc->
size[1] / (
double)mult) % sc->
localSize[1] != 0) {
 
 5740                        sc->
tempLen = sprintf(sc->
tempStr, 
"      sdataID = (((combinedID %% %" PRIu64 
") %% 2) * %" PRIu64 
" + (1-2*((combinedID %% %" PRIu64 
") %% 2)) * ((combinedID %% %" PRIu64 
")/2)) * sharedStride + (combinedID / %" PRIu64 
");\n", sc->
fftDim, sc->
fftDim - 1, sc->
fftDim, sc->
fftDim, sc->
fftDim);
 
 5745                            sc->
tempLen = sprintf(sc->
tempStr, 
"      sdata[sdataID].x = %s%s[%s]%s;\n", convTypeLeft, inputsStruct, sc->
inoutID, convTypeRight);
 
 5757                                sc->
tempLen = sprintf(sc->
tempStr, 
"      sdata[sdataID].y = %s%s[inoutID]%s;\n", convTypeLeft, inputsStruct, convTypeRight);
 
 5773                        sc->
tempLen = sprintf(sc->
tempStr, 
"      sdataID = (((combinedID %% %" PRIu64 
") %% 2) * %" PRIu64 
" + (1-2*((combinedID %% %" PRIu64 
") %% 2)) * ((combinedID %% %" PRIu64 
")/2))  + (combinedID / %" PRIu64 
") * sharedStride;\n", sc->
fftDim, sc->
fftDim - 1, sc->
fftDim, sc->
fftDim, sc->
fftDim);
 
 5777                            sc->
tempLen = sprintf(sc->
tempStr, 
"      sdata[sdataID].x = %s%s[inoutID]%s;\n", convTypeLeft, inputsStruct, convTypeRight);
 
 5787                                sc->
tempLen = sprintf(sc->
tempStr, 
"      sdata[sdataID].y = %s%s[inoutID]%s;\n", convTypeLeft, inputsStruct, convTypeRight);
 
 5815                            sc->
tempLen = sprintf(sc->
tempStr, 
"      sdataID = (((combinedID %% %" PRIu64 
") %% 2) * %" PRIu64 
" + (1-2*((combinedID %% %" PRIu64 
") %% 2)) * ((combinedID %% %" PRIu64 
")/2)) * sharedStride + (combinedID / %" PRIu64 
");\n", sc->
fftDim, sc->
fftDim - 1, sc->
fftDim, sc->
fftDim, sc->
fftDim);
 
 5820                            sc->
tempLen = sprintf(sc->
tempStr, 
"      sdataID = (((combinedID %% %" PRIu64 
") %% 2) * %" PRIu64 
" + (1-2*((combinedID %% %" PRIu64 
") %% 2)) * ((combinedID %% %" PRIu64 
")/2))  + (combinedID / %" PRIu64 
") * sharedStride;\n", sc->
fftDim, sc->
fftDim - 1, sc->
fftDim, sc->
fftDim, sc->
fftDim);
 
 5836                        if ((uint64_t)ceil(sc->
size[1] / (
double)mult) % sc->
localSize[0] != 0) {
 
 5843                        if ((uint64_t)ceil(sc->
size[1] / (
double)mult) % sc->
localSize[1] != 0) {
 
 5860        char shiftX[500] = 
"";
 
 5862            sprintf(shiftX, 
" + consts.workGroupShiftX ");
 
 5863        char shiftX2[500] = 
"";
 
 5866        char shiftY[500] = 
"";
 
 5868            sprintf(shiftY, 
" + consts.workGroupShiftY ");
 
 5885                    if ((uint64_t)ceil(sc->
size[0] / (
double)mult) % sc->
localSize[0] != 0) {
 
 5902                        sc->
tempLen = sprintf(sc->
tempStr, 
"      sdataID = (((combinedID %% %" PRIu64 
") %% 2) * %" PRIu64 
" + (1-2*((combinedID %% %" PRIu64 
") %% 2)) * ((combinedID %% %" PRIu64 
")/2)) * sharedStride + (%s + ((%s + %" PRIu64 
") %% %" PRIu64 
") * %" PRIu64 
") / %" PRIu64 
";\n", sc->
fftDim, sc->
fftDim - 1, sc->
fftDim, sc->
fftDim, sc->
gl_LocalInvocationID_x, sc->
gl_LocalInvocationID_y, (i + k * sc->
min_registers_per_thread) * sc->
localSize[1], mult, sc->
localSize[0], mult);
 
 5920                    res = 
indexInputVkFFT(sc, uintType, readType, index_x, index_y, requestCoordinate, requestBatch);
 
 5933                        sc->
tempLen = sprintf(sc->
tempStr, 
"      %s.x = %s%s[%s]%s;\n", sc->
regIDs[0], convTypeLeft, inputsStruct, sc->
inoutID, convTypeRight);
 
 6005                    if ((uint64_t)ceil(sc->
size[0] / (
double)mult) % sc->
localSize[0] != 0) {
 
 6021        char shiftX[500] = 
"";
 
 6023            sprintf(shiftX, 
" + consts.workGroupShiftX ");
 
 6024        char shiftY[500] = 
"";
 
 6026            sprintf(shiftY, 
" + consts.workGroupShiftY ");
 
 6038                for (uint64_t i = 0; i < num_in; i++) {
 
 6095                        sc->
tempLen = sprintf(sc->
tempStr, 
"      mult.x = %s(%.17f%s * (combinedID %% %" PRIu64 
") );\n", cosDef, double_PI / 2 / sc->
fftDim, LFending, sc->
fftDim / 2 + 1);
 
 6098                        sc->
tempLen = sprintf(sc->
tempStr, 
"      mult.y = %s(%.17f%s * (combinedID %% %" PRIu64 
") );\n", sinDef, double_PI / 2 / sc->
fftDim, LFending, sc->
fftDim / 2 + 1);
 
 6108                        sc->
tempLen = sprintf(sc->
tempStr, 
"      %s.x = %s%s[%s]%s;\n", sc->
regIDs[0], convTypeLeft, inputsStruct, sc->
inoutID, convTypeRight);
 
 6120                            sc->
tempLen = sprintf(sc->
tempStr, 
"      %s.y = %s%s[inoutID]%s;\n", sc->
regIDs[0], convTypeLeft, inputsStruct, convTypeRight);
 
 6147                        sc->
tempLen = sprintf(sc->
tempStr, 
"      sdataID = (combinedID %% %" PRIu64 
") * sharedStride + (combinedID / %" PRIu64 
");\n", sc->
fftDim / 2 + 1, sc->
fftDim / 2 + 1);
 
 6152                        sc->
tempLen = sprintf(sc->
tempStr, 
"      sdataID = (combinedID %% %" PRIu64 
") + (combinedID / %" PRIu64 
") * sharedStride ;\n", sc->
fftDim / 2 + 1, sc->
fftDim / 2 + 1);
 
 6156                    sc->
tempLen = sprintf(sc->
tempStr, 
"          if (combinedID %% %" PRIu64 
" > 0){\n", sc->
fftDim / 2 + 1);
 
 6177                        sc->
tempLen = sprintf(sc->
tempStr, 
"      %s.x = %s%s[%s]%s;\n", sc->
regIDs[1], convTypeLeft, inputsStruct, sc->
inoutID, convTypeRight);
 
 6189                            sc->
tempLen = sprintf(sc->
tempStr, 
"      %s.y = %s%s[inoutID]%s;\n", sc->
regIDs[1], convTypeLeft, inputsStruct, convTypeRight);
 
 6221                        sc->
tempLen = sprintf(sc->
tempStr, 
"      sdataID = (%" PRIu64 
" - combinedID %% %" PRIu64 
") * sharedStride + (combinedID / %" PRIu64 
");\n", sc->
fftDim, sc->
fftDim / 2 + 1, sc->
fftDim / 2 + 1);
 
 6226                        sc->
tempLen = sprintf(sc->
tempStr, 
"      sdataID = (%" PRIu64 
" - combinedID %% %" PRIu64 
") + (combinedID / %" PRIu64 
") * sharedStride;\n", sc->
fftDim, sc->
fftDim / 2 + 1, sc->
fftDim / 2 + 1);
 
 6295        char shiftX[500] = 
"";
 
 6297            sprintf(shiftX, 
" + consts.workGroupShiftX ");
 
 6298        char shiftX2[500] = 
"";
 
 6301        char shiftY[500] = 
"";
 
 6303            sprintf(shiftY, 
" + consts.workGroupShiftY ");
 
 6305        uint64_t num_in = (uint64_t)ceil((sc->
fftDim / 2 + 1) / (
double)sc->
localSize[1]);
 
 6316                for (uint64_t i = 0; i < num_in; i++) {
 
 6322                    if ((uint64_t)ceil(sc->
size[0] / (
double)mult) % sc->
localSize[0] != 0) {
 
 6353                    res = 
indexInputVkFFT(sc, uintType, readType, index_x, index_y, requestCoordinate, requestBatch);
 
 6367                        sc->
tempLen = sprintf(sc->
tempStr, 
"      %s.x = %s%s[%s]%s;\n", sc->
regIDs[0], convTypeLeft, inputsStruct, sc->
inoutID, convTypeRight);
 
 6398                        sc->
tempLen = sprintf(sc->
tempStr, 
"      mult.x = %s(%.17f%s * (combinedID) );\n", cosDef, double_PI / 2 / sc->
fftDim, LFending);
 
 6401                        sc->
tempLen = sprintf(sc->
tempStr, 
"      mult.y = %s(%.17f%s * (combinedID) );\n", sinDef, double_PI / 2 / sc->
fftDim, LFending);
 
 6432                    res = 
indexInputVkFFT(sc, uintType, readType, index_x, index_y, requestCoordinate, requestBatch);
 
 6443                        sc->
tempLen = sprintf(sc->
tempStr, 
"      %s.x = %s%s[%s]%s;\n", sc->
regIDs[1], convTypeLeft, inputsStruct, sc->
inoutID, convTypeRight);
 
 6502                    if ((uint64_t)ceil(sc->
size[0] / (
double)mult) % sc->
localSize[0] != 0) {
 
 6524        char shiftX[500] = 
"";
 
 6526            sprintf(shiftX, 
" + consts.workGroupShiftX ");
 
 6527        char shiftY[500] = 
"";
 
 6536        char shiftY2[100] = 
"";
 
 6538            sprintf(shiftY, 
" + consts.workGroupShiftY ");
 
 6544                sprintf(sc->
disableThreadsStart, 
"       if(%s * %" PRIu64 
" + (((%s%s) %% %" PRIu64 
") * %" PRIu64 
" + ((%s%s) / %" PRIu64 
") * %" PRIu64 
") < %" PRIu64 
") {\n", sc->
gl_LocalInvocationID_x, sc->
firstStageStartSize, sc->
gl_WorkGroupID_x, shiftX, sc->
firstStageStartSize / sc->
fftDim, sc->
fftDim, sc->
gl_WorkGroupID_x, shiftX, sc->
firstStageStartSize / sc->
fftDim, sc->
localSize[0] * sc->
firstStageStartSize, sc->
fft_dim_full);
 
 6551                sprintf(sc->
disableThreadsStart, 
"       if(%s * %" PRIu64 
" + (((%s%s) %% %" PRIu64 
") * %" PRIu64 
" + ((%s%s) / %" PRIu64 
") * %" PRIu64 
") < %" PRIu64 
") {\n", sc->
gl_LocalInvocationID_y, sc->
firstStageStartSize, sc->
gl_WorkGroupID_x, shiftX, sc->
firstStageStartSize / sc->
fftDim, sc->
fftDim, sc->
gl_WorkGroupID_x, shiftX, sc->
firstStageStartSize / sc->
fftDim, sc->
localSize[1] * sc->
firstStageStartSize, sc->
fft_dim_full);
 
 6621                        sc->
tempLen = sprintf(sc->
tempStr, 
"      %s.x = %s%s[%s]%s;\n", sc->
regIDs[0], convTypeLeft, inputsStruct, sc->
inoutID, convTypeRight);
 
 6630                        sc->
tempLen = sprintf(sc->
tempStr, 
"      sdata[2*(combinedID %% %" PRIu64 
") * sharedStride + (combinedID / %" PRIu64 
")] = %s;\n", sc->
fftDim / 8, sc->
fftDim / 8, sc->
regIDs[1]);
 
 6633                        sc->
tempLen = sprintf(sc->
tempStr, 
"      sdata[(2*(combinedID %% %" PRIu64 
")+1) * sharedStride + (combinedID / %" PRIu64 
")] = %s;\n", sc->
fftDim / 8, sc->
fftDim / 8, sc->
regIDs[0]);
 
 6636                        sc->
tempLen = sprintf(sc->
tempStr, 
"      sdata[(%" PRIu64 
" - 2*(combinedID %% %" PRIu64 
")) * sharedStride + (combinedID / %" PRIu64 
")] = %s;\n", sc->
fftDim - 2, sc->
fftDim / 8, sc->
fftDim / 8, sc->
regIDs[1]);
 
 6639                        sc->
tempLen = sprintf(sc->
tempStr, 
"      sdata[(%" PRIu64 
" - 2*(combinedID %% %" PRIu64 
")) * sharedStride + (combinedID / %" PRIu64 
")] = %s;\n", sc->
fftDim - 1, sc->
fftDim / 8, sc->
fftDim / 8, sc->
regIDs[0]);
 
 6645                        sc->
tempLen = sprintf(sc->
tempStr, 
"      sdata[(%" PRIu64 
" - 2*(combinedID %% %" PRIu64 
")) * sharedStride + (combinedID / %" PRIu64 
")] = %s;\n", sc->
fftDim / 2 - 2, sc->
fftDim / 8, sc->
fftDim / 8, sc->
regIDs[1]);
 
 6648                        sc->
tempLen = sprintf(sc->
tempStr, 
"      sdata[(%" PRIu64 
" - 2*(combinedID %% %" PRIu64 
")) * sharedStride + (combinedID / %" PRIu64 
")] = %s;\n", sc->
fftDim / 2 - 1, sc->
fftDim / 8, sc->
fftDim / 8, sc->
regIDs[0]);
 
 6651                        sc->
tempLen = sprintf(sc->
tempStr, 
"      sdata[(%" PRIu64 
" + 2*(combinedID %% %" PRIu64 
")) * sharedStride + (combinedID / %" PRIu64 
")] = %s;\n", sc->
fftDim / 2, sc->
fftDim / 8, sc->
fftDim / 8, sc->
regIDs[1]);
 
 6654                        sc->
tempLen = sprintf(sc->
tempStr, 
"      sdata[(%" PRIu64 
" + 2*(combinedID %% %" PRIu64 
")) * sharedStride + (combinedID / %" PRIu64 
")] = %s;\n", sc->
fftDim / 2 + 1, sc->
fftDim / 8, sc->
fftDim / 8, sc->
regIDs[0]);
 
 6659                        sc->
tempLen = sprintf(sc->
tempStr, 
"      sdata[2*(combinedID %% %" PRIu64 
") + (combinedID / %" PRIu64 
") * sharedStride] = %s;\n", sc->
fftDim / 8, sc->
fftDim / 8, sc->
regIDs[1]);
 
 6662                        sc->
tempLen = sprintf(sc->
tempStr, 
"      sdata[(2*(combinedID %% %" PRIu64 
")+1) + (combinedID / %" PRIu64 
") * sharedStride] = %s;\n", sc->
fftDim / 8, sc->
fftDim / 8, sc->
regIDs[0]);
 
 6665                        sc->
tempLen = sprintf(sc->
tempStr, 
"      sdata[(%" PRIu64 
" - 2*(combinedID %% %" PRIu64 
")) + (combinedID / %" PRIu64 
") * sharedStride] = %s;\n", sc->
fftDim - 2, sc->
fftDim / 8, sc->
fftDim / 8, sc->
regIDs[1]);
 
 6668                        sc->
tempLen = sprintf(sc->
tempStr, 
"      sdata[(%" PRIu64 
" - 2*(combinedID %% %" PRIu64 
")) + (combinedID / %" PRIu64 
") * sharedStride] = %s;\n", sc->
fftDim - 1, sc->
fftDim / 8, sc->
fftDim / 8, sc->
regIDs[0]);
 
 6674                        sc->
tempLen = sprintf(sc->
tempStr, 
"      sdata[(%" PRIu64 
" - 2*(combinedID %% %" PRIu64 
")) + (combinedID / %" PRIu64 
") * sharedStride] = %s;\n", sc->
fftDim / 2 - 2, sc->
fftDim / 8, sc->
fftDim / 8, sc->
regIDs[1]);
 
 6677                        sc->
tempLen = sprintf(sc->
tempStr, 
"      sdata[(%" PRIu64 
" - 2*(combinedID %% %" PRIu64 
")) + (combinedID / %" PRIu64 
") * sharedStride] = %s;\n", sc->
fftDim / 2 - 1, sc->
fftDim / 8, sc->
fftDim / 8, sc->
regIDs[0]);
 
 6680                        sc->
tempLen = sprintf(sc->
tempStr, 
"      sdata[(%" PRIu64 
" + 2*(combinedID %% %" PRIu64 
")) + (combinedID / %" PRIu64 
") * sharedStride] = %s;\n", sc->
fftDim / 2, sc->
fftDim / 8, sc->
fftDim / 8, sc->
regIDs[1]);
 
 6683                        sc->
tempLen = sprintf(sc->
tempStr, 
"      sdata[(%" PRIu64 
" + 2*(combinedID %% %" PRIu64 
")) + (combinedID / %" PRIu64 
") * sharedStride] = %s;\n", sc->
fftDim / 2 + 1, sc->
fftDim / 8, sc->
fftDim / 8, sc->
regIDs[0]);
 
 6700                                sc->
tempLen = sprintf(sc->
tempStr, 
"          sdata[(combinedID %% %" PRIu64 
") * sharedStride + (combinedID / %" PRIu64 
")].x = 0;\n", sc->
fftDim, sc->
fftDim);
 
 6703                                sc->
tempLen = sprintf(sc->
tempStr, 
"          sdata[(combinedID %% %" PRIu64 
") * sharedStride + (combinedID / %" PRIu64 
")].y = 0;\n", sc->
fftDim, sc->
fftDim);
 
 6708                                sc->
tempLen = sprintf(sc->
tempStr, 
"          sdata[(combinedID %% %" PRIu64 
") + (combinedID / %" PRIu64 
") * sharedStride].x = 0;\n", sc->
fftDim, sc->
fftDim);
 
 6711                                sc->
tempLen = sprintf(sc->
tempStr, 
"          sdata[(combinedID %% %" PRIu64 
") + (combinedID / %" PRIu64 
") * sharedStride].y = 0;\n", sc->
fftDim, sc->
fftDim);
 
 6845        char shiftX[500] = 
"";
 
 6849            sprintf(sc->
disableThreadsStart, 
"       if (((%s%s) / %" PRIu64 
") %% (%" PRIu64 
")+((%s%s) / %" PRIu64 
") * (%" PRIu64 
") < %" PRIu64 
") {\n", sc->
gl_GlobalInvocationID_x, shiftX, sc->
fft_dim_x, sc->
stageStartSize, sc->
gl_GlobalInvocationID_x, shiftX, sc->
fft_dim_x * sc->
stageStartSize, sc->
fftDim * sc->
stageStartSize, sc->
size[sc->
axis_id]);
 
 6873                    sc->
tempLen = sprintf(sc->
tempStr, 
"      inoutID = (%" PRIu64 
" * (%s + %" PRIu64 
") + ((%s%s) / %" PRIu64 
") %% (%" PRIu64 
")+((%s%s) / %" PRIu64 
") * (%" PRIu64 
"));\n", sc->
stageStartSize, sc->
gl_LocalInvocationID_y, (i + k * sc->
min_registers_per_thread) * sc->
localSize[1], sc->
gl_GlobalInvocationID_x, shiftX, sc->
fft_dim_x, sc->
stageStartSize, sc->
gl_GlobalInvocationID_x, shiftX, sc->
fft_dim_x * sc->
stageStartSize, sc->
fftDim * sc->
stageStartSize);
 
 6896                    sc->
tempLen = sprintf(sc->
tempStr, 
"      %s.x = %s%s[%s]%s;\n", sc->
regIDs[0], convTypeLeft, inputsStruct, sc->
inoutID, convTypeRight);
 
 6967        char shiftX[500] = 
"";
 
 6969            sprintf(shiftX, 
" + consts.workGroupShiftX ");
 
 6970        char shiftY[500] = 
"";
 
 6972            sprintf(shiftY, 
" + consts.workGroupShiftY ");
 
 6981            uint64_t maxBluesteinCutOff = 1;
 
 7019                        sc->
tempLen = sprintf(sc->
tempStr, 
"      if(combinedID < %" PRIu64 
"){\n", maxBluesteinCutOff);
 
 7038#if(VKFFT_BACKEND!=3) 
 7040                        sc->
tempLen = sprintf(sc->
tempStr, 
"      %s.x = %s%s[%s]%s;\n", sc->
regIDs[0], convTypeLeft, inputsStruct, sc->
inoutID, convTypeRight);
 
 7046                    if (i < sc->min_registers_per_thread) {
 
 7063#if(VKFFT_BACKEND!=3) 
 7066                        sc->
tempLen = sprintf(sc->
tempStr, 
"      sdataID = ((combinedID %% %" PRIu64 
")/2) * sharedStride + (combinedID / %" PRIu64 
");\n", 2 * sc->
fftDim, 2 * sc->
fftDim);
 
 7070                        sc->
tempLen = sprintf(sc->
tempStr, 
"      sdataID = ((combinedID %% %" PRIu64 
")/2)  + (combinedID / %" PRIu64 
") * sharedStride;\n", 2 * sc->
fftDim, 2 * sc->
fftDim);
 
 7075                    sc->
tempLen = sprintf(sc->
tempStr, 
"      if (((combinedID %% %" PRIu64 
")%%2) == 0) {\n", 2 * sc->
fftDim);
 
 7103                            sc->
tempLen = sprintf(sc->
tempStr, 
"      sdataID = ((combinedID %% %" PRIu64 
")/2) * sharedStride + (combinedID / %" PRIu64 
");\n", 2 * sc->
fftDim, 2 * sc->
fftDim);
 
 7107                            sc->
tempLen = sprintf(sc->
tempStr, 
"      sdataID = ((combinedID %% %" PRIu64 
")/2)  + (combinedID / %" PRIu64 
") * sharedStride;\n", 2 * sc->
fftDim, 2 * sc->
fftDim);
 
 7111                        sc->
tempLen = sprintf(sc->
tempStr, 
"      if (((combinedID %% %" PRIu64 
")%%2) == 0) {\n", 2 * sc->
fftDim);
 
 7120                        sc->
tempLen = sprintf(sc->
tempStr, 
"      if (((combinedID %% %" PRIu64 
")%%2) == 1) {\n", 2 * sc->
fftDim);
 
 7154#if(VKFFT_BACKEND==3) 
 7186                        sc->
tempLen = sprintf(sc->
tempStr, 
"      if(combinedID < %" PRIu64 
"){\n", maxBluesteinCutOff);
 
 7207                        sc->
tempLen = sprintf(sc->
tempStr, 
"      sdataID = ((combinedID %% %" PRIu64 
")/2) * sharedStride + (combinedID / %" PRIu64 
");\n", 2 * sc->
fftDim, 2 * sc->
fftDim);
 
 7211                        sc->
tempLen = sprintf(sc->
tempStr, 
"      sdataID = ((combinedID %% %" PRIu64 
")/2)  + (combinedID / %" PRIu64 
") * sharedStride;\n", 2 * sc->
fftDim, 2 * sc->
fftDim);
 
 7215                    if (i < sc->min_registers_per_thread) {
 
 7216                        sc->
tempLen = sprintf(sc->
tempStr, 
"      if (((combinedID %% %" PRIu64 
")%%2) == 0) {\n", 2 * sc->
fftDim);
 
 7227                        sc->
tempLen = sprintf(sc->
tempStr, 
"      if (((combinedID %% %" PRIu64 
")%%2) == 0) {\n", 2 * sc->
fftDim);
 
 7246                            sc->
tempLen = sprintf(sc->
tempStr, 
"      sdataID = ((combinedID %% %" PRIu64 
")/2) * sharedStride + (combinedID / %" PRIu64 
");\n", 2 * sc->
fftDim, 2 * sc->
fftDim);
 
 7250                            sc->
tempLen = sprintf(sc->
tempStr, 
"      sdataID = ((combinedID %% %" PRIu64 
")/2)  + (combinedID / %" PRIu64 
") * sharedStride;\n", 2 * sc->
fftDim, 2 * sc->
fftDim);
 
 7254                        sc->
tempLen = sprintf(sc->
tempStr, 
"      if (((combinedID %% %" PRIu64 
")%%2) == 0) {\n", 2 * sc->
fftDim);
 
 7263                        sc->
tempLen = sprintf(sc->
tempStr, 
"      if (((combinedID %% %" PRIu64 
")%%2) == 1) {\n", 2 * sc->
fftDim);
 
 7330                        sc->
tempLen = sprintf(sc->
tempStr, 
"      if(combinedID < %" PRIu64 
"){\n", maxBluesteinCutOff);
 
 7351                        sc->
tempLen = sprintf(sc->
tempStr, 
"      sdataID = ((combinedID %% %" PRIu64 
")/2) * sharedStride + (combinedID / %" PRIu64 
");\n", 2 * sc->
fftDim, 2 * sc->
fftDim);
 
 7355                        sc->
tempLen = sprintf(sc->
tempStr, 
"      sdataID = ((combinedID %% %" PRIu64 
")/2)  + (combinedID / %" PRIu64 
") * sharedStride;\n", 2 * sc->
fftDim, 2 * sc->
fftDim);
 
 7359                    if (i < sc->min_registers_per_thread) {
 
 7360                        sc->
tempLen = sprintf(sc->
tempStr, 
"      if (((combinedID %% %" PRIu64 
")%%2) == 1) {\n", 2 * sc->
fftDim);
 
 7371                        sc->
tempLen = sprintf(sc->
tempStr, 
"      if (((combinedID %% %" PRIu64 
")%%2) == 1) {\n", 2 * sc->
fftDim);
 
 7390                            sc->
tempLen = sprintf(sc->
tempStr, 
"      sdataID = ((combinedID %% %" PRIu64 
")/2) * sharedStride + (combinedID / %" PRIu64 
");\n", 2 * sc->
fftDim, 2 * sc->
fftDim);
 
 7394                            sc->
tempLen = sprintf(sc->
tempStr, 
"      sdataID = ((combinedID %% %" PRIu64 
")/2)  + (combinedID / %" PRIu64 
") * sharedStride;\n", 2 * sc->
fftDim, 2 * sc->
fftDim);
 
 7398                        sc->
tempLen = sprintf(sc->
tempStr, 
"      if (((combinedID %% %" PRIu64 
")%%2) == 0) {\n", 2 * sc->
fftDim);
 
 7407                        sc->
tempLen = sprintf(sc->
tempStr, 
"      if (((combinedID %% %" PRIu64 
")%%2) == 1) {\n", 2 * sc->
fftDim);
 
 7463                        sc->
tempLen = sprintf(sc->
tempStr, 
"      if(combinedID < %" PRIu64 
"){\n", maxBluesteinCutOff);
 
 7468                        sc->
tempLen = sprintf(sc->
tempStr, 
"      sdataID = (combinedID %% %" PRIu64 
") * sharedStride + (combinedID / %" PRIu64 
");\n", sc->
fftDim, sc->
fftDim);
 
 7471                        sc->
tempLen = sprintf(sc->
tempStr, 
"      sdataID = (combinedID %% %" PRIu64 
")  + (combinedID / %" PRIu64 
") * sharedStride;\n", sc->
fftDim, sc->
fftDim);
 
 7479                        sc->
tempLen = sprintf(sc->
tempStr, 
"      %s.x = sdata[sdataID-sharedStride].y;\n", sc->
w);
 
 7482                        sc->
tempLen = sprintf(sc->
tempStr, 
"      %s.x = sdata[sdataID-1].y;\n", sc->
w);
 
 7486                    sc->
tempLen = sprintf(sc->
tempStr, 
"      %s.y = sdata[sdataID].x;\n", sc->
w);
 
 7502                        sc->
tempLen = sprintf(sc->
tempStr, 
"      sdataID = (%" PRIu64 
") * sharedStride + (combinedID / %" PRIu64 
");\n", sc->
fftDim - 1, sc->
fftDim);
 
 7505                        sc->
tempLen = sprintf(sc->
tempStr, 
"      sdataID = (%" PRIu64 
")  + (combinedID / %" PRIu64 
") * sharedStride;\n", sc->
fftDim - 1, sc->
fftDim);
 
 7542                        sc->
tempLen = sprintf(sc->
tempStr, 
"      if(combinedID < %" PRIu64 
"){\n", maxBluesteinCutOff);
 
 7547                        sc->
tempLen = sprintf(sc->
tempStr, 
"      sdataID = (combinedID %% %" PRIu64 
") * sharedStride + (combinedID / %" PRIu64 
");\n", sc->
fftDim, sc->
fftDim);
 
 7550                        sc->
tempLen = sprintf(sc->
tempStr, 
"      sdataID = (combinedID %% %" PRIu64 
") + (combinedID / %" PRIu64 
") * sharedStride;\n", sc->
fftDim, sc->
fftDim);
 
 7560#if(VKFFT_BACKEND!=3) 
 7562                        sc->
tempLen = sprintf(sc->
tempStr, 
"      sdataID = (%" PRIu64 
" - combinedID %% %" PRIu64 
") * sharedStride + (combinedID / %" PRIu64 
");\n", sc->
fftDim, sc->
fftDim, sc->
fftDim);
 
 7565                        sc->
tempLen = sprintf(sc->
tempStr, 
"      sdataID = (%" PRIu64 
" - combinedID %% %" PRIu64 
") + (combinedID / %" PRIu64 
") * sharedStride;\n", sc->
fftDim, sc->
fftDim, sc->
fftDim);
 
 7596#if(VKFFT_BACKEND==3) 
 7609                        sc->
tempLen = sprintf(sc->
tempStr, 
"      if(combinedID < %" PRIu64 
"){\n", maxBluesteinCutOff);
 
 7617                        sc->
tempLen = sprintf(sc->
tempStr, 
"      sdataID = (%" PRIu64 
" - combinedID %% %" PRIu64 
") * sharedStride + (combinedID / %" PRIu64 
");\n", sc->
fftDim, sc->
fftDim, sc->
fftDim);
 
 7620                        sc->
tempLen = sprintf(sc->
tempStr, 
"      sdataID = (%" PRIu64 
" - combinedID %% %" PRIu64 
") + (combinedID / %" PRIu64 
") * sharedStride;\n", sc->
fftDim, sc->
fftDim, sc->
fftDim);
 
 7650                for (uint64_t i = 0; i < num_in; i++) {
 
 7678                        sc->
tempLen = sprintf(sc->
tempStr, 
"      mult.x = %s(%.17f%s * (combinedID %% %" PRIu64 
") );\n", cosDef, double_PI / 2 / sc->
fftDim, LFending, sc->
fftDim / 2 + 1);
 
 7681                        sc->
tempLen = sprintf(sc->
tempStr, 
"      mult.y = %s(%.17f%s * (combinedID %% %" PRIu64 
") );\n", sinDef, double_PI / 2 / sc->
fftDim, LFending, sc->
fftDim / 2 + 1);
 
 7687                        sc->
tempLen = sprintf(sc->
tempStr, 
"      sdataID = (combinedID %% %" PRIu64 
") * sharedStride + (combinedID / %" PRIu64 
");\n", sc->
fftDim / 2 + 1, sc->
fftDim / 2 + 1);
 
 7692                        sc->
tempLen = sprintf(sc->
tempStr, 
"      sdataID = (combinedID %% %" PRIu64 
") + (combinedID / %" PRIu64 
") * sharedStride ;\n", sc->
fftDim / 2 + 1, sc->
fftDim / 2 + 1);
 
 7700                    sc->
tempLen = sprintf(sc->
tempStr, 
"          if (combinedID %% %" PRIu64 
" > 0){\n", sc->
fftDim / 2 + 1);
 
 7705                        sc->
tempLen = sprintf(sc->
tempStr, 
"      inoutID = (%" PRIu64 
" - combinedID %% %" PRIu64 
") * sharedStride + (combinedID / %" PRIu64 
");\n", sc->
fftDim, sc->
fftDim / 2 + 1, sc->
fftDim / 2 + 1);
 
 7710                        sc->
tempLen = sprintf(sc->
tempStr, 
"      inoutID = (%" PRIu64 
" - combinedID %% %" PRIu64 
") + (combinedID / %" PRIu64 
") * sharedStride ;\n", sc->
fftDim, sc->
fftDim / 2 + 1, sc->
fftDim / 2 + 1);
 
 7736                    sc->
tempLen = sprintf(sc->
tempStr, 
"          if (combinedID %% %" PRIu64 
" == 0){\n", sc->
fftDim / 2 + 1);
 
 7775        char shiftX[500] = 
"";
 
 7777            sprintf(shiftX, 
" + consts.workGroupShiftX ");
 
 7778        char shiftX2[500] = 
"";
 
 7781        char shiftY[500] = 
"";
 
 7783            sprintf(shiftY, 
" + consts.workGroupShiftY ");
 
 7816                    res = 
indexInputVkFFT(sc, uintType, readType, index_x, index_y, requestCoordinate, requestBatch);
 
 7829#if(VKFFT_BACKEND!=3) 
 7831                        sc->
tempLen = sprintf(sc->
tempStr, 
"      %s.x = %s%s[%s]%s;\n", sc->
regIDs[0], convTypeLeft, inputsStruct, sc->
inoutID, convTypeRight);
 
 7837                    if (i < sc->min_registers_per_thread) {
 
 7854#if(VKFFT_BACKEND!=3) 
 7855                    sc->
tempLen = sprintf(sc->
tempStr, 
"      sdataID = ((combinedID / %" PRIu64 
")/2) * sharedStride + (combinedID %% %" PRIu64 
");\n", sc->
localSize[0], sc->
localSize[0]);
 
 7882                        sc->
tempLen = sprintf(sc->
tempStr, 
"      sdataID = ((combinedID / %" PRIu64 
")/2) * sharedStride + (combinedID %% %" PRIu64 
");\n", sc->
localSize[0], sc->
localSize[0]);
 
 7916#if(VKFFT_BACKEND==3) 
 7941                    res = 
indexInputVkFFT(sc, uintType, readType, index_x, index_y, requestCoordinate, requestBatch);
 
 7954                    sc->
tempLen = sprintf(sc->
tempStr, 
"      sdataID = ((combinedID / %" PRIu64 
")/2) * sharedStride + (combinedID %% %" PRIu64 
");\n", sc->
localSize[0], sc->
localSize[0]);
 
 7958                    if (i < sc->min_registers_per_thread) {
 
 7987                        sc->
tempLen = sprintf(sc->
tempStr, 
"      sdataID = ((combinedID / %" PRIu64 
")/2) * sharedStride + (combinedID %% %" PRIu64 
");\n", sc->
localSize[0], sc->
localSize[0]);
 
 8047                    res = 
indexInputVkFFT(sc, uintType, readType, index_x, index_y, requestCoordinate, requestBatch);
 
 8060                    sc->
tempLen = sprintf(sc->
tempStr, 
"      sdataID = ((combinedID / %" PRIu64 
")/2) * sharedStride + (combinedID %% %" PRIu64 
");\n", sc->
localSize[0], sc->
localSize[0]);
 
 8064                    if (i < sc->min_registers_per_thread) {
 
 8093                        sc->
tempLen = sprintf(sc->
tempStr, 
"      sdataID = ((combinedID / %" PRIu64 
")/2) * sharedStride + (combinedID %% %" PRIu64 
");\n", sc->
localSize[0], sc->
localSize[0]);
 
 8146                    sc->
tempLen = sprintf(sc->
tempStr, 
"      sdataID = (combinedID / %" PRIu64 
") * sharedStride + (combinedID %% %" PRIu64 
");\n", sc->
localSize[0], sc->
localSize[0]);
 
 8153                    sc->
tempLen = sprintf(sc->
tempStr, 
"      %s.x = sdata[sdataID-sharedStride].y;\n", sc->
w);
 
 8157                    sc->
tempLen = sprintf(sc->
tempStr, 
"      %s.y = sdata[sdataID].x;\n", sc->
w);
 
 8172                    sc->
tempLen = sprintf(sc->
tempStr, 
"      sdataID = (%" PRIu64 
") * sharedStride + (combinedID %% %" PRIu64 
");\n", sc->
fftDim - 1, sc->
localSize[0]);
 
 8212                    sc->
tempLen = sprintf(sc->
tempStr, 
"      sdataID = (combinedID / %" PRIu64 
") * sharedStride + (combinedID %% %" PRIu64 
");\n", sc->
localSize[0], sc->
localSize[0]);
 
 8222#if(VKFFT_BACKEND!=3) 
 8223                    sc->
tempLen = sprintf(sc->
tempStr, 
"      sdataID = (%" PRIu64 
" - combinedID / %" PRIu64 
") * sharedStride + (combinedID %% %" PRIu64 
");\n", sc->
fftDim, sc->
localSize[0], sc->
localSize[0]);
 
 8251#if(VKFFT_BACKEND==3) 
 8271                    sc->
tempLen = sprintf(sc->
tempStr, 
"      sdataID = (%" PRIu64 
" - combinedID / %" PRIu64 
") * sharedStride + (combinedID %% %" PRIu64 
");\n", sc->
fftDim, sc->
localSize[0], sc->
localSize[0]);
 
 8295            uint64_t num_in = (uint64_t)ceil((sc->
fftDim / 2 + 1) / (
double)sc->
localSize[1]);
 
 8298                for (uint64_t i = 0; i < num_in; i++) {
 
 8318                        sc->
tempLen = sprintf(sc->
tempStr, 
"      mult.x = %s(%.17f%s * (combinedID / %" PRIu64 
") );\n", cosDef, double_PI / 2 / sc->
fftDim, LFending, sc->
localSize[0]);
 
 8321                        sc->
tempLen = sprintf(sc->
tempStr, 
"      mult.y = %s(%.17f%s * (combinedID / %" PRIu64 
") );\n", sinDef, double_PI / 2 / sc->
fftDim, LFending, sc->
localSize[0]);
 
 8326                    sc->
tempLen = sprintf(sc->
tempStr, 
"      sdataID = (combinedID / %" PRIu64 
") * sharedStride + (combinedID %% %" PRIu64 
");\n", sc->
localSize[0], sc->
localSize[0]);
 
 8338                    sc->
tempLen = sprintf(sc->
tempStr, 
"      inoutID = (%" PRIu64 
" - combinedID / %" PRIu64 
") * sharedStride + (combinedID %% %" PRIu64 
");\n", sc->
fftDim, sc->
localSize[0], sc->
localSize[0]);
 
 8391        char shiftX[500] = 
"";
 
 8393            sprintf(shiftX, 
" + consts.workGroupShiftX ");
 
 8394        char shiftY[500] = 
"";
 
 8396            sprintf(shiftY, 
" + consts.workGroupShiftY ");
 
 8422                        if ((uint64_t)ceil(sc->
size[1] / (
double)mult) % sc->
localSize[0] != 0) {
 
 8434                        if ((uint64_t)ceil(sc->
size[1] / (
double)mult) % sc->
localSize[1] != 0) {
 
 8461                        sc->
tempLen = sprintf(sc->
tempStr, 
"      sdataID = (combinedID %% %" PRIu64 
") * sharedStride + (combinedID / %" PRIu64 
");\n", sc->
fftDim, sc->
fftDim);
 
 8466                            sc->
tempLen = sprintf(sc->
tempStr, 
"      sdata[sdataID].x = %s%s[%s]%s;\n", convTypeLeft, inputsStruct, sc->
inoutID, convTypeRight);
 
 8478                                sc->
tempLen = sprintf(sc->
tempStr, 
"      sdata[sdataID].y = %s%s[inoutID]%s;\n", convTypeLeft, inputsStruct, convTypeRight);
 
 8494                        sc->
tempLen = sprintf(sc->
tempStr, 
"      sdataID = (combinedID %% %" PRIu64 
") + (combinedID / %" PRIu64 
") * sharedStride ;\n", sc->
fftDim, sc->
fftDim);
 
 8498                            sc->
tempLen = sprintf(sc->
tempStr, 
"      sdata[sdataID].x = %s%s[inoutID]%s;\n", convTypeLeft, inputsStruct, convTypeRight);
 
 8508                                sc->
tempLen = sprintf(sc->
tempStr, 
"      sdata[sdataID].y = %s%s[inoutID]%s;\n", convTypeLeft, inputsStruct, convTypeRight);
 
 8536                            sc->
tempLen = sprintf(sc->
tempStr, 
"      sdataID = (combinedID %% %" PRIu64 
") * sharedStride + (combinedID / %" PRIu64 
");\n", sc->
fftDim, sc->
fftDim);
 
 8541                            sc->
tempLen = sprintf(sc->
tempStr, 
"      sdataID = (combinedID %% %" PRIu64 
") + (combinedID / %" PRIu64 
") * sharedStride ;\n", sc->
fftDim, sc->
fftDim);
 
 8557                        if ((uint64_t)ceil(sc->
size[1] / (
double)mult) % sc->
localSize[0] != 0) {
 
 8564                        if ((uint64_t)ceil(sc->
size[1] / (
double)mult) % sc->
localSize[1] != 0) {
 
 8596                        sc->
tempLen = sprintf(sc->
tempStr, 
"      inoutID = %" PRIu64 
" + 4 * (combinedID %% %" PRIu64 
");\n", sc->
fftDim / 2, sc->
fftDim);
 
 8600                        sc->
tempLen = sprintf(sc->
tempStr, 
"      if (inoutID < %" PRIu64 
") sdataID = inoutID;\n", sc->
fftDim);
 
 8603                        sc->
tempLen = sprintf(sc->
tempStr, 
"      if ((inoutID < %" PRIu64 
")&&(inoutID >= %" PRIu64 
")) sdataID = %" PRIu64 
" - inoutID;\n", 2 * sc->
fftDim, sc->
fftDim, 2 * sc->
fftDim - 1);
 
 8606                        sc->
tempLen = sprintf(sc->
tempStr, 
"      if ((inoutID < %" PRIu64 
")&&(inoutID >= %" PRIu64 
")) sdataID = inoutID - %" PRIu64 
";\n", 3 * sc->
fftDim, 2 * sc->
fftDim, 2 * sc->
fftDim);
 
 8609                        sc->
tempLen = sprintf(sc->
tempStr, 
"      if ((inoutID < %" PRIu64 
")&&(inoutID >= %" PRIu64 
")) sdataID = %" PRIu64 
" - inoutID;\n", 4 * sc->
fftDim, 3 * sc->
fftDim, 4 * sc->
fftDim - 1);
 
 8612                        sc->
tempLen = sprintf(sc->
tempStr, 
"      if (inoutID >= %" PRIu64 
") sdataID = inoutID - %" PRIu64 
";\n", 4 * sc->
fftDim, 4 * sc->
fftDim);
 
 8621                        sc->
tempLen = sprintf(sc->
tempStr, 
"      if ((inoutID < %" PRIu64 
")&&(inoutID >= %" PRIu64 
")){ \n\ 
 8626                        sc->
tempLen = sprintf(sc->
tempStr, 
"      if ((inoutID < %" PRIu64 
")&&(inoutID >= %" PRIu64 
")){ \n\ 
 8655                        sc->
tempLen = sprintf(sc->
tempStr, 
"      inoutID = %" PRIu64 
" + 4 * combinedID;\n", sc->
fftDim / 2);
 
 8659                        sc->
tempLen = sprintf(sc->
tempStr, 
"      if (inoutID < %" PRIu64 
") sdataID = inoutID;\n", sc->
fftDim);
 
 8662                        sc->
tempLen = sprintf(sc->
tempStr, 
"      if ((inoutID < %" PRIu64 
")&&(inoutID >= %" PRIu64 
")) sdataID = %" PRIu64 
" - inoutID;\n", 2 * sc->
fftDim, sc->
fftDim, 2 * sc->
fftDim - 1);
 
 8665                        sc->
tempLen = sprintf(sc->
tempStr, 
"      if ((inoutID < %" PRIu64 
")&&(inoutID >= %" PRIu64 
")) sdataID = inoutID - %" PRIu64 
";\n", 3 * sc->
fftDim, 2 * sc->
fftDim, 2 * sc->
fftDim);
 
 8668                        sc->
tempLen = sprintf(sc->
tempStr, 
"      if ((inoutID < %" PRIu64 
")&&(inoutID >= %" PRIu64 
")) sdataID = %" PRIu64 
" - inoutID;\n", 4 * sc->
fftDim, 3 * sc->
fftDim, 4 * sc->
fftDim - 1);
 
 8671                        sc->
tempLen = sprintf(sc->
tempStr, 
"      if (inoutID >= %" PRIu64 
") sdataID = inoutID - %" PRIu64 
";\n", 4 * sc->
fftDim, 4 * sc->
fftDim);
 
 8680                        sc->
tempLen = sprintf(sc->
tempStr, 
"      if ((inoutID < %" PRIu64 
")&&(inoutID >= %" PRIu64 
")){ \n\ 
 8685                        sc->
tempLen = sprintf(sc->
tempStr, 
"      if ((inoutID < %" PRIu64 
")&&(inoutID >= %" PRIu64 
")){ \n\ 
 8709        char shiftX[500] = 
"";
 
 8711            sprintf(shiftX, 
" + consts.workGroupShiftX ");
 
 8712        char shiftX2[500] = 
"";
 
 8715        char shiftY[500] = 
"";
 
 8717            sprintf(shiftY, 
" + consts.workGroupShiftY ");
 
 8734                    if ((uint64_t)ceil(sc->
size[0] / (
double)mult) % sc->
localSize[0] != 0) {
 
 8766                    res = 
indexInputVkFFT(sc, uintType, readType, index_x, index_y, requestCoordinate, requestBatch);
 
 8779                        sc->
tempLen = sprintf(sc->
tempStr, 
"      sdata[sdataID].x = %s%s[%s]%s;\n", convTypeLeft, inputsStruct, sc->
inoutID, convTypeRight);
 
 8810                    if ((uint64_t)ceil(sc->
size[0] / (
double)mult) % sc->
localSize[0] != 0) {
 
 8840                    sc->
tempLen = sprintf(sc->
tempStr, 
"      inoutID = %" PRIu64 
" + 4 * combinedID;\n", sc->
fftDim / 2);
 
 8844                    sc->
tempLen = sprintf(sc->
tempStr, 
"      if (inoutID < %" PRIu64 
") sdataID = inoutID;\n", sc->
fftDim);
 
 8847                    sc->
tempLen = sprintf(sc->
tempStr, 
"      if ((inoutID < %" PRIu64 
")&&(inoutID >= %" PRIu64 
")) sdataID = %" PRIu64 
" - inoutID;\n", 2 * sc->
fftDim, sc->
fftDim, 2 * sc->
fftDim - 1);
 
 8850                    sc->
tempLen = sprintf(sc->
tempStr, 
"      if ((inoutID < %" PRIu64 
")&&(inoutID >= %" PRIu64 
")) sdataID = inoutID - %" PRIu64 
";\n", 3 * sc->
fftDim, 2 * sc->
fftDim, 2 * sc->
fftDim);
 
 8853                    sc->
tempLen = sprintf(sc->
tempStr, 
"      if ((inoutID < %" PRIu64 
")&&(inoutID >= %" PRIu64 
")) sdataID = %" PRIu64 
" - inoutID;\n", 4 * sc->
fftDim, 3 * sc->
fftDim, 4 * sc->
fftDim - 1);
 
 8856                    sc->
tempLen = sprintf(sc->
tempStr, 
"      if (inoutID >= %" PRIu64 
") sdataID = inoutID - %" PRIu64 
";\n", 4 * sc->
fftDim, 4 * sc->
fftDim);
 
 8865                    sc->
tempLen = sprintf(sc->
tempStr, 
"      if ((inoutID < %" PRIu64 
")&&(inoutID >= %" PRIu64 
")){ \n\ 
 8870                    sc->
tempLen = sprintf(sc->
tempStr, 
"      if ((inoutID < %" PRIu64 
")&&(inoutID >= %" PRIu64 
")){ \n\ 
 
11538    double double_PI = 3.1415926535897932384626433832795;
 
11540    char outputsStruct[20] = 
"";
 
11541    char LFending[4] = 
"";
 
11542    if (!strcmp(floatType, 
"float")) sprintf(LFending, 
"f");
 
11543#if(VKFFT_BACKEND==0) 
11544    if (!strcmp(floatType, 
"float")) sprintf(vecType, 
"vec2");
 
11545    if (!strcmp(floatType, 
"double")) sprintf(vecType, 
"dvec2");
 
11547        sprintf(outputsStruct, 
"outputs");
 
11549        sprintf(outputsStruct, 
".outputs");
 
11550    if (!strcmp(floatType, 
"double")) sprintf(LFending, 
"LF");
 
11551    char cosDef[20] = 
"cos";
 
11552    char sinDef[20] = 
"sin";
 
11553#elif(VKFFT_BACKEND==1) 
11554    if (!strcmp(floatType, 
"float")) sprintf(vecType, 
"float2");
 
11555    if (!strcmp(floatType, 
"double")) sprintf(vecType, 
"double2");
 
11556    sprintf(outputsStruct, 
"outputs");
 
11557    if (!strcmp(floatType, 
"double")) sprintf(LFending, 
"l");
 
11558    char cosDef[20] = 
"__cosf";
 
11559    char sinDef[20] = 
"__sinf";
 
11560#elif(VKFFT_BACKEND==2) 
11561    if (!strcmp(floatType, 
"float")) sprintf(vecType, 
"float2");
 
11562    if (!strcmp(floatType, 
"double")) sprintf(vecType, 
"double2");
 
11563    sprintf(outputsStruct, 
"outputs");
 
11564    if (!strcmp(floatType, 
"double")) sprintf(LFending, 
"l");
 
11565    char cosDef[20] = 
"__cosf";
 
11566    char sinDef[20] = 
"__sinf";
 
11567#elif(VKFFT_BACKEND==3) 
11568    if (!strcmp(floatType, 
"float")) sprintf(vecType, 
"float2");
 
11569    if (!strcmp(floatType, 
"double")) sprintf(vecType, 
"double2");
 
11570    sprintf(outputsStruct, 
"outputs");
 
11572    char cosDef[20] = 
"native_cos";
 
11573    char sinDef[20] = 
"native_sin";
 
11575    char convTypeLeft[20] = 
"";
 
11576    char convTypeRight[20] = 
"";
 
11577    if ((!strcmp(floatTypeMemory, 
"half")) && (strcmp(floatType, 
"half"))) {
 
11578        if ((writeType == 6) || (writeType == 110) || (writeType == 111) || (writeType == 120) || (writeType == 121) || (writeType == 130) || (writeType == 131) || (writeType == 140) || (writeType == 141) || (writeType == 142) || (writeType == 143) || (writeType == 144) || (writeType == 145)) {
 
11579            sprintf(convTypeLeft, 
"float16_t(");
 
11580            sprintf(convTypeRight, 
")");
 
11583            sprintf(convTypeLeft, 
"f16vec2(");
 
11584            sprintf(convTypeRight, 
")");
 
11587    if ((!strcmp(floatTypeMemory, 
"float")) && (strcmp(floatType, 
"float"))) {
 
11588        if ((writeType == 6) || (writeType == 110) || (writeType == 111) || (writeType == 120) || (writeType == 121) || (writeType == 130) || (writeType == 131) || (writeType == 140) || (writeType == 141) || (writeType == 142) || (writeType == 143) || (writeType == 144) || (writeType == 145)) {
 
11589#if(VKFFT_BACKEND==0) 
11590            sprintf(convTypeLeft, 
"float(");
 
11591            sprintf(convTypeRight, 
")");
 
11592#elif(VKFFT_BACKEND==1) 
11593            sprintf(convTypeLeft, 
"(float)");
 
11595#elif(VKFFT_BACKEND==2) 
11596            sprintf(convTypeLeft, 
"(float)");
 
11598#elif(VKFFT_BACKEND==3) 
11599            sprintf(convTypeLeft, 
"(float)");
 
11604#if(VKFFT_BACKEND==0) 
11605            sprintf(convTypeLeft, 
"vec2(");
 
11606            sprintf(convTypeRight, 
")");
 
11607#elif(VKFFT_BACKEND==1) 
11608            sprintf(convTypeLeft, 
"conv_float2(");
 
11609            sprintf(convTypeRight, 
")");
 
11610#elif(VKFFT_BACKEND==2) 
11611            sprintf(convTypeLeft, 
"conv_float2(");
 
11612            sprintf(convTypeRight, 
")");
 
11613#elif(VKFFT_BACKEND==3) 
11614            sprintf(convTypeLeft, 
"conv_float2(");
 
11615            sprintf(convTypeRight, 
")");
 
11619    if ((!strcmp(floatTypeMemory, 
"double")) && (strcmp(floatType, 
"double"))) {
 
11620        if ((writeType == 6) || (writeType == 110) || (writeType == 111) || (writeType == 120) || (writeType == 121) || (writeType == 130) || (writeType == 131) || (writeType == 140) || (writeType == 141) || (writeType == 142) || (writeType == 143) || (writeType == 144) || (writeType == 145)) {
 
11621#if(VKFFT_BACKEND==0) 
11622            sprintf(convTypeLeft, 
"double(");
 
11623            sprintf(convTypeRight, 
")");
 
11624#elif(VKFFT_BACKEND==1) 
11625            sprintf(convTypeLeft, 
"(double)");
 
11627#elif(VKFFT_BACKEND==2) 
11628            sprintf(convTypeLeft, 
"(double)");
 
11630#elif(VKFFT_BACKEND==3) 
11631            sprintf(convTypeLeft, 
"(double)");
 
11636#if(VKFFT_BACKEND==0) 
11637            sprintf(convTypeLeft, 
"dvec2(");
 
11638            sprintf(convTypeRight, 
")");
 
11639#elif(VKFFT_BACKEND==1) 
11640            sprintf(convTypeLeft, 
"conv_double2(");
 
11641            sprintf(convTypeRight, 
")");
 
11642#elif(VKFFT_BACKEND==2) 
11643            sprintf(convTypeLeft, 
"conv_double2(");
 
11644            sprintf(convTypeRight, 
")");
 
11645#elif(VKFFT_BACKEND==3) 
11646            sprintf(convTypeLeft, 
"conv_double2(");
 
11647            sprintf(convTypeRight, 
")");
 
11652    char index_x[2000] = 
"";
 
11653    char index_y[2000] = 
"";
 
11654    char requestCoordinate[100] = 
"";
 
11657            sprintf(requestCoordinate, 
"coordinate");
 
11660    char requestBatch[100] = 
"";
 
11663            sprintf(requestBatch, 
"batchID");
 
11666    switch (writeType) {
 
11675        char shiftX[500] = 
"";
 
11677            sprintf(shiftX, 
" + consts.workGroupShiftX ");
 
11678        char shiftY[500] = 
"";
 
11688        char shiftY2[100] = 
"";
 
11690            sprintf(shiftY, 
" + consts.workGroupShiftY ");
 
11699                    sc->
tempLen = sprintf(sc->
tempStr, 
"      if (((%s + %" PRIu64 
" * %s) %% %" PRIu64 
" + ((%s%s) / %" PRIu64 
")*%" PRIu64 
" < %" PRIu64 
")){\n", sc->
gl_LocalInvocationID_x, sc->
localSize[0], sc->
gl_LocalInvocationID_y, sc->
localSize[0], sc->
gl_WorkGroupID_x, shiftX, sc->
firstStageStartSize / sc->
fftDim, sc->
localSize[0], sc->
fft_dim_full / sc->
firstStageStartSize);
 
11709                    sc->
tempLen = sprintf(sc->
tempStr, 
"      if (((%s + %" PRIu64 
" * %s) %% %" PRIu64 
" + ((%s%s) / %" PRIu64 
")*%" PRIu64 
" < %" PRIu64 
")){\n", sc->
gl_LocalInvocationID_x, sc->
localSize[0], sc->
gl_LocalInvocationID_y, sc->
localSize[1], sc->
gl_WorkGroupID_x, shiftX, sc->
firstStageStartSize / sc->
fftDim, sc->
localSize[1], sc->
fft_dim_full / sc->
firstStageStartSize);
 
11783                                    sc->
tempLen = sprintf(sc->
tempStr, 
"      %s[%s] = %ssdata[(combinedID %% %" PRIu64 
") * sharedStride + (combinedID / %" PRIu64 
")]%s;\n", outputsStruct, sc->
inoutID, convTypeLeft, sc->
fftDim, sc->
fftDim, convTypeRight);
 
11785                                    sc->
tempLen = sprintf(sc->
tempStr, 
"      outputBlocks[%s / %" PRIu64 
"]%s[%s %% %" PRIu64 
"] = %ssdata[(combinedID %% %" PRIu64 
") * sharedStride + (combinedID / %" PRIu64 
")]%s;\n", sc->
inoutID, sc->
outputBufferBlockSize, outputsStruct, sc->
inoutID, sc->
outputBufferBlockSize, convTypeLeft, sc->
fftDim, sc->
fftDim, convTypeRight);
 
11791                                    sc->
tempLen = sprintf(sc->
tempStr, 
"      %s[%s] = %ssdata[(combinedID %% %" PRIu64 
") + (combinedID / %" PRIu64 
") * sharedStride]%s;\n", outputsStruct, sc->
inoutID, convTypeLeft, sc->
fftDim, sc->
fftDim, convTypeRight);
 
11793                                    sc->
tempLen = sprintf(sc->
tempStr, 
"      outputBlocks[%s / %" PRIu64 
"]%s[%s %% %" PRIu64 
"] = %ssdata[(combinedID %% %" PRIu64 
") + (combinedID / %" PRIu64 
") * sharedStride]%s;\n", sc->
inoutID, sc->
outputBufferBlockSize, outputsStruct, sc->
inoutID, sc->
outputBufferBlockSize, convTypeLeft, sc->
fftDim, sc->
fftDim, convTypeRight);
 
11837                            sc->
tempLen = sprintf(sc->
tempStr, 
"      inoutID = combinedID %% %" PRIu64 
" + ((%s%s) / %" PRIu64 
")*%" PRIu64 
" + ((combinedID/%" PRIu64 
") * %" PRIu64 
")+ ((%s%s) %% %" PRIu64 
") * %" PRIu64 
";\n", sc->
localSize[0], sc->
gl_WorkGroupID_x, shiftX, sc->
firstStageStartSize / sc->
fftDim, sc->
localSize[0], sc->
localSize[0], sc->
fft_dim_full / sc->
fftDim, sc->
gl_WorkGroupID_x, shiftX, sc->
firstStageStartSize / sc->
fftDim, sc->
fft_dim_full / sc->
firstStageStartSize);
 
11845                                sc->
tempLen = sprintf(sc->
tempStr, 
"      inoutID = combinedID %% %" PRIu64 
" + ((%s%s) / %" PRIu64 
")*%" PRIu64 
" + ((combinedID/%" PRIu64 
") * %" PRIu64 
")+ ((%s%s) %% %" PRIu64 
") * %" PRIu64 
";\n", sc->
localSize[1], sc->
gl_WorkGroupID_x, shiftX, sc->
firstStageStartSize / sc->
fftDim, sc->
localSize[1], sc->
localSize[1], sc->
fft_dim_full / sc->
fftDim, sc->
gl_WorkGroupID_x, shiftX, sc->
firstStageStartSize / sc->
fftDim, sc->
fft_dim_full / sc->
firstStageStartSize);
 
11877                                    sc->
tempLen = sprintf(sc->
tempStr, 
"          outputBlocks[%s / %" PRIu64 
"]%s[%s %% %" PRIu64 
"] = %ssdata[(combinedID %% %s)+(combinedID/%s)*sharedStride]%s;\n", sc->
inoutID, sc->
outputBufferBlockSize, outputsStruct, sc->
inoutID, sc->
outputBufferBlockSize, convTypeLeft, sc->
gl_WorkGroupSize_x, sc->
gl_WorkGroupSize_x, convTypeRight);
 
11885                                    sc->
tempLen = sprintf(sc->
tempStr, 
"          outputBlocks[%s / %" PRIu64 
"]%s[%s %% %" PRIu64 
"] = %ssdata[(combinedID %% %s)*sharedStride+combinedID/%s]%s;\n", sc->
inoutID, sc->
outputBufferBlockSize, outputsStruct, sc->
inoutID, sc->
outputBufferBlockSize, convTypeLeft, sc->
gl_WorkGroupSize_y, sc->
gl_WorkGroupSize_y, convTypeRight);
 
11975                                    sc->
tempLen = sprintf(sc->
tempStr, 
"      %s[%s] = %ssdata[(combinedID %% %" PRIu64 
") * sharedStride + (combinedID / %" PRIu64 
")]%s;\n", outputsStruct, sc->
inoutID, convTypeLeft, sc->
fftDim, sc->
fftDim, convTypeRight);
 
11977                                    sc->
tempLen = sprintf(sc->
tempStr, 
"      outputBlocks[%s / %" PRIu64 
"]%s[%s %% %" PRIu64 
"] = %ssdata[(combinedID %% %" PRIu64 
") * sharedStride + (combinedID / %" PRIu64 
")]%s;\n", sc->
inoutID, sc->
outputBufferBlockSize, outputsStruct, sc->
inoutID, sc->
outputBufferBlockSize, convTypeLeft, sc->
fftDim, sc->
fftDim, convTypeRight);
 
11983                                    sc->
tempLen = sprintf(sc->
tempStr, 
"      %s[%s] = %ssdata[(combinedID %% %" PRIu64 
") + (combinedID / %" PRIu64 
") * sharedStride]%s;\n", outputsStruct, sc->
inoutID, convTypeLeft, sc->
fftDim, sc->
fftDim, convTypeRight);
 
11985                                    sc->
tempLen = sprintf(sc->
tempStr, 
"      outputBlocks[%s / %" PRIu64 
"]%s[%s %% %" PRIu64 
"] = %ssdata[(combinedID %% %" PRIu64 
") + (combinedID / %" PRIu64 
") * sharedStride]%s;\n", sc->
inoutID, sc->
outputBufferBlockSize, outputsStruct, sc->
inoutID, sc->
outputBufferBlockSize, convTypeLeft, sc->
fftDim, sc->
fftDim, convTypeRight);
 
12033                            sc->
tempLen = sprintf(sc->
tempStr, 
"      inoutID = (combinedID %% %" PRIu64 
")+(combinedID / %" PRIu64 
") * %" PRIu64 
" + (((%s%s) %% %" PRIu64 
") * %" PRIu64 
" + ((%s%s) / %" PRIu64 
") * %" PRIu64 
");", sc->
fftDim, sc->
fftDim, sc->
firstStageStartSize, sc->
gl_WorkGroupID_x, shiftX, sc->
firstStageStartSize / sc->
fftDim, sc->
fftDim, sc->
gl_WorkGroupID_x, shiftX, sc->
firstStageStartSize / sc->
fftDim, sc->
localSize[0] * sc->
firstStageStartSize);
 
12038                            sc->
tempLen = sprintf(sc->
tempStr, 
"      inoutID = %s+%" PRIu64 
"+%s * %" PRIu64 
" + (((%s%s) %% %" PRIu64 
") * %" PRIu64 
" + ((%s%s) / %" PRIu64 
") * %" PRIu64 
");", sc->
gl_LocalInvocationID_x, (i + k * sc->
min_registers_per_thread) * sc->
localSize[0], sc->
gl_LocalInvocationID_y, sc->
firstStageStartSize, sc->
gl_WorkGroupID_x, shiftX, sc->
firstStageStartSize / sc->
fftDim, sc->
fftDim, sc->
gl_WorkGroupID_x, shiftX, sc->
firstStageStartSize / sc->
fftDim, sc->
localSize[1] * sc->
firstStageStartSize);
 
12108        char shiftX[500] = 
"";
 
12111        sc->
tempLen = sprintf(sc->
tempStr, 
"      if (((%s%s) / %" PRIu64 
") %% (%" PRIu64 
")+((%s%s) / %" PRIu64 
") * (%" PRIu64 
") < %" PRIu64 
") {\n", sc->
gl_GlobalInvocationID_x, shiftX, sc->
fft_dim_x, sc->
stageStartSize, sc->
gl_GlobalInvocationID_x, shiftX, sc->
fft_dim_x * sc->
stageStartSize, sc->
fftDim * sc->
stageStartSize, sc->
size[sc->
axis_id]);
 
12117                    sc->
tempLen = sprintf(sc->
tempStr, 
"      inoutID = (%s + %" PRIu64 
") * (%" PRIu64 
") + (((%s%s) / %" PRIu64 
") %% (%" PRIu64 
")) * (%" PRIu64 
") + ((%s%s) / %" PRIu64 
");\n", sc->
gl_LocalInvocationID_y, (i + k * sc->
min_registers_per_thread) * sc->
localSize[1], sc->
fft_dim_full / sc->
fftDim, sc->
gl_GlobalInvocationID_x, shiftX, sc->
fft_dim_x, sc->
firstStageStartSize / sc->
fftDim, sc->
fft_dim_full / sc->
firstStageStartSize, sc->
gl_GlobalInvocationID_x, shiftX, sc->
fft_dim_x * (sc->
firstStageStartSize / sc->
fftDim));
 
12149                            sc->
tempLen = sprintf(sc->
tempStr, 
"          outputBlocks[%s / %" PRIu64 
"]%s[%s %% %" PRIu64 
"] = %ssdata[%s*(%s+%" PRIu64 
") + %s]%s;\n", sc->
inoutID, sc->
outputBufferBlockSize, outputsStruct, sc->
inoutID, sc->
outputBufferBlockSize, convTypeLeft, sc->
sharedStride, sc->
gl_LocalInvocationID_y, (i + k * sc->
min_registers_per_thread) * sc->
localSize[1], sc->
gl_LocalInvocationID_x, convTypeRight);
 
12169                        sc->
tempLen = sprintf(sc->
tempStr, 
"      inoutID = (%s + %" PRIu64 
") * %" PRIu64 
" + ((%s%s) / %" PRIu64 
") %% (%" PRIu64 
")+((%s%s) / %" PRIu64 
") * (%" PRIu64 
");\n", sc->
gl_LocalInvocationID_y, (i + k * sc->
min_registers_per_thread) * sc->
localSize[1], sc->
stageStartSize, sc->
gl_GlobalInvocationID_x, shiftX, sc->
fft_dim_x, sc->
stageStartSize, sc->
gl_GlobalInvocationID_x, shiftX, sc->
fft_dim_x * sc->
stageStartSize, sc->
stageStartSize * sc->
fftDim);
 
12178                            sc->
tempLen = sprintf(sc->
tempStr, 
"      inoutID = (%s + %" PRIu64 
") * %" PRIu64 
" + ((%s%s) / %" PRIu64 
") %% (%" PRIu64 
")+((%s%s) / %" PRIu64 
") * (%" PRIu64 
");\n", sc->
gl_LocalInvocationID_y, (i + k * sc->
min_registers_per_thread) * sc->
localSize[1], sc->
stageStartSize, sc->
gl_GlobalInvocationID_x, shiftX, sc->
fft_dim_x, sc->
stageStartSize, sc->
gl_GlobalInvocationID_x, shiftX, sc->
fft_dim_x * sc->
stageStartSize, sc->
stageStartSize * sc->
fftDim);
 
12190                    sprintf(index_y, 
"%" PRIu64 
" * (%s + %" PRIu64 
") + ((%s%s) / %" PRIu64 
") %% (%" PRIu64 
")+((%s%s) / %" PRIu64 
") * (%" PRIu64 
")", sc->
stageStartSize, sc->
gl_LocalInvocationID_y, (i + k * sc->
min_registers_per_thread) * sc->
localSize[1], sc->
gl_GlobalInvocationID_x, shiftX, sc->
fft_dim_x, sc->
stageStartSize, sc->
gl_GlobalInvocationID_x, shiftX, sc->
fft_dim_x * sc->
stageStartSize, sc->
stageStartSize * sc->
fftDim);
 
12191                    res = 
indexOutputVkFFT(sc, uintType, writeType, index_x, index_y, requestCoordinate, requestBatch);
 
12211                            sc->
tempLen = sprintf(sc->
tempStr, 
"          outputBlocks[inoutID / %" PRIu64 
"]%s[inoutID %% %" PRIu64 
"] =  %ssdata[%s*(%s+%" PRIu64 
") + %s]%s;\n", sc->
outputBufferBlockSize, outputsStruct, sc->
outputBufferBlockSize, convTypeLeft, sc->
sharedStride, sc->
gl_LocalInvocationID_y, (i + k * sc->
min_registers_per_thread) * sc->
localSize[1], sc->
gl_LocalInvocationID_x, convTypeRight);
 
12244        char shiftX[500] = 
"";
 
12252                sc->
tempLen = sprintf(sc->
tempStr, 
"      inoutID = (%s%s) %% (%" PRIu64 
") + %" PRIu64 
" * (%s + %" PRIu64 
") + ((%s%s) / %" PRIu64 
") * (%" PRIu64 
");\n", sc->
gl_GlobalInvocationID_x, shiftX, sc->
stageStartSize, sc->
stageStartSize, sc->
gl_LocalInvocationID_y, (i + k * sc->
min_registers_per_thread) * sc->
localSize[1], sc->
gl_GlobalInvocationID_x, shiftX, sc->
stageStartSize, sc->
stageStartSize * sc->
fftDim);
 
12287                        sc->
tempLen = sprintf(sc->
tempStr, 
"          outputBlocks[inoutID / %" PRIu64 
"]%s[inoutID %% %" PRIu64 
"] = %ssdata[%s*(%s+%" PRIu64 
") + %s]%s;\n", sc->
outputBufferBlockSize, outputsStruct, sc->
outputBufferBlockSize, convTypeLeft, sc->
sharedStride, sc->
gl_LocalInvocationID_y, (i + k * sc->
min_registers_per_thread) * sc->
localSize[1], sc->
gl_LocalInvocationID_x, convTypeRight);
 
12319        char shiftX[500] = 
"";
 
12321            sprintf(shiftX, 
" + consts.workGroupShiftX ");
 
12322        char shiftY[500] = 
"";
 
12326            sprintf(shiftY, 
" + consts.workGroupShiftY ");
 
12342        sdata[%s + %" PRIu64 
"* sharedStride] = sdata[%s];\n\ 
12357        sdata[%s * sharedStride + %" PRIu64 
"] = sdata[%s * sharedStride];\n\ 
12371                    for (uint64_t i = 0; i < num_out; i++) {
 
12391                            if ((uint64_t)ceil(sc->
size[1] / (
double)mult) % sc->
localSize[0] != 0) {
 
12403                            if ((uint64_t)ceil(sc->
size[1] / (
double)mult) % sc->
localSize[1] != 0) {
 
12440                                    sc->
tempLen = sprintf(sc->
tempStr, 
"if ( (combinedID / %" PRIu64 
") %% 2 == 0){\n", sc->
fftDim / 2 + 1);
 
12443                                    sc->
tempLen = sprintf(sc->
tempStr, 
"      %s.x = 0.5%s*(sdata[(combinedID %% %" PRIu64 
")* sharedStride + (combinedID / %" PRIu64 
")].x+sdata[(%" PRIu64 
"-combinedID %% %" PRIu64 
")* sharedStride + (combinedID / %" PRIu64 
")].x);\n", sc->
regIDs[0], LFending, sc->
fftDim / 2 + 1, 2 * (sc->
fftDim / 2 + 1), sc->
fftDim, sc->
fftDim / 2 + 1, 2 * (sc->
fftDim / 2 + 1));
 
12446                                    sc->
tempLen = sprintf(sc->
tempStr, 
"      %s.y = 0.5%s*(sdata[(combinedID %% %" PRIu64 
")* sharedStride + (combinedID / %" PRIu64 
")].y-sdata[(%" PRIu64 
"-combinedID %% %" PRIu64 
")* sharedStride + (combinedID / %" PRIu64 
")].y);\n", sc->
regIDs[0], LFending, sc->
fftDim / 2 + 1, 2 * (sc->
fftDim / 2 + 1), sc->
fftDim, sc->
fftDim / 2 + 1, 2 * (sc->
fftDim / 2 + 1));
 
12452                                    sc->
tempLen = sprintf(sc->
tempStr, 
"      %s.x = 0.5%s*(sdata[(combinedID %% %" PRIu64 
")* sharedStride + (combinedID / %" PRIu64 
")].y+sdata[(%" PRIu64 
"-combinedID %% %" PRIu64 
")* sharedStride + (combinedID / %" PRIu64 
")].y);\n", sc->
regIDs[0], LFending, sc->
fftDim / 2 + 1, 2 * (sc->
fftDim / 2 + 1), sc->
fftDim, sc->
fftDim / 2 + 1, 2 * (sc->
fftDim / 2 + 1));
 
12455                                    sc->
tempLen = sprintf(sc->
tempStr, 
"      %s.y = 0.5%s*(-sdata[(combinedID %% %" PRIu64 
")* sharedStride + (combinedID / %" PRIu64 
")].x+sdata[(%" PRIu64 
"-combinedID %% %" PRIu64 
")* sharedStride + (combinedID / %" PRIu64 
")].x);\n", sc->
regIDs[0], LFending, sc->
fftDim / 2 + 1, 2 * (sc->
fftDim / 2 + 1), sc->
fftDim, sc->
fftDim / 2 + 1, 2 * (sc->
fftDim / 2 + 1));
 
12462                                        sc->
tempLen = sprintf(sc->
tempStr, 
"      %s[%s] = %s%s%s;\n", outputsStruct, sc->
inoutID, convTypeLeft, sc->
regIDs[0], convTypeRight);
 
12469                                    sc->
tempLen = sprintf(sc->
tempStr, 
"if ( (combinedID / %" PRIu64 
") %% 2 == 0){\n", sc->
fftDim / 2 + 1);
 
12472                                    sc->
tempLen = sprintf(sc->
tempStr, 
"      %s.x = 0.5%s*(sdata[(combinedID %% %" PRIu64 
") + (combinedID / %" PRIu64 
") * sharedStride].x+sdata[(%" PRIu64 
"-combinedID %% %" PRIu64 
") + (combinedID / %" PRIu64 
") * sharedStride].x);\n", sc->
regIDs[0], LFending, sc->
fftDim / 2 + 1, 2 * (sc->
fftDim / 2 + 1), sc->
fftDim, sc->
fftDim / 2 + 1, 2 * (sc->
fftDim / 2 + 1));
 
12475                                    sc->
tempLen = sprintf(sc->
tempStr, 
"      %s.y = 0.5%s*(sdata[(combinedID %% %" PRIu64 
") + (combinedID / %" PRIu64 
") * sharedStride].y-sdata[(%" PRIu64 
"-combinedID %% %" PRIu64 
") + (combinedID / %" PRIu64 
") * sharedStride].y);\n", sc->
regIDs[0], LFending, sc->
fftDim / 2 + 1, 2 * (sc->
fftDim / 2 + 1), sc->
fftDim, sc->
fftDim / 2 + 1, 2 * (sc->
fftDim / 2 + 1));
 
12481                                    sc->
tempLen = sprintf(sc->
tempStr, 
"      %s.x = 0.5%s*(sdata[(combinedID %% %" PRIu64 
") + (combinedID / %" PRIu64 
") * sharedStride].y+sdata[(%" PRIu64 
"-combinedID %% %" PRIu64 
") + (combinedID / %" PRIu64 
") * sharedStride].y);\n", sc->
regIDs[0], LFending, sc->
fftDim / 2 + 1, 2 * (sc->
fftDim / 2 + 1), sc->
fftDim, sc->
fftDim / 2 + 1, 2 * (sc->
fftDim / 2 + 1));
 
12484                                    sc->
tempLen = sprintf(sc->
tempStr, 
"      %s.y = 0.5%s*(-sdata[(combinedID %% %" PRIu64 
") + (combinedID / %" PRIu64 
") * sharedStride].x+sdata[(%" PRIu64 
"-combinedID %% %" PRIu64 
") + (combinedID / %" PRIu64 
") * sharedStride].x);\n", sc->
regIDs[0], LFending, sc->
fftDim / 2 + 1, 2 * (sc->
fftDim / 2 + 1), sc->
fftDim, sc->
fftDim / 2 + 1, 2 * (sc->
fftDim / 2 + 1));
 
12491                                        sc->
tempLen = sprintf(sc->
tempStr, 
"      %s[%s] = %s%s%s;\n", outputsStruct, sc->
inoutID, convTypeLeft, sc->
regIDs[0], convTypeRight);
 
12501                                        sc->
tempLen = sprintf(sc->
tempStr, 
"      %s[inoutID] = %ssdata[(combinedID %% %" PRIu64 
") + (combinedID / %" PRIu64 
") * sharedStride]%s;\n", outputsStruct, convTypeLeft, sc->
fftDim / 2 + 1, sc->
fftDim / 2 + 1, convTypeRight);
 
12509                                        sc->
tempLen = sprintf(sc->
tempStr, 
"      %s[inoutID] = %ssdata[(combinedID %% %" PRIu64 
") * sharedStride + (combinedID / %" PRIu64 
")]%s;\n", outputsStruct, convTypeLeft, sc->
fftDim / 2 + 1, sc->
fftDim / 2 + 1, convTypeRight);
 
12540                            if ((uint64_t)ceil(sc->
size[1] / (
double)mult) % sc->
localSize[0] != 0) {
 
12547                            if ((uint64_t)ceil(sc->
size[1] / (
double)mult) % sc->
localSize[1] != 0) {
 
12580        char shiftY[500] = 
"";
 
12582            sprintf(shiftY, 
" + consts.workGroupShiftY * %" PRIu64 
"", sc->
localSize[1]);
 
12613                            if ((uint64_t)ceil(sc->
size[1] / (
double)mult) % sc->
localSize[0] != 0) {
 
12620                            if ((uint64_t)ceil(sc->
size[1] / (
double)mult) % sc->
localSize[1] != 0) {
 
12649                                sc->
tempLen = sprintf(sc->
tempStr, 
"      %s[%s] = %s%s.x%s;\n", outputsStruct, sc->
inoutID, convTypeLeft, sc->
regIDs[i], convTypeRight);
 
12660                                    sc->
tempLen = sprintf(sc->
tempStr, 
"      %s[%s] = %s%s.y%s;\n", outputsStruct, sc->
inoutID, convTypeLeft, sc->
regIDs[i], convTypeRight);
 
12670                                    sc->
tempLen = sprintf(sc->
tempStr, 
"      %s[%s] = %ssdata[(combinedID %% %" PRIu64 
") * sharedStride + (combinedID / %" PRIu64 
")].x%s;\n", outputsStruct, sc->
inoutID, convTypeLeft, sc->
fftDim, sc->
fftDim, convTypeRight);
 
12672                                    sc->
tempLen = sprintf(sc->
tempStr, 
"      outputBlocks[%s / %" PRIu64 
"]%s[%s %% %" PRIu64 
"] = %ssdata[(combinedID %% %" PRIu64 
") * sharedStride + (combinedID / %" PRIu64 
")].x%s;\n", sc->
inoutID, sc->
outputBufferBlockSize, outputsStruct, sc->
inoutID, sc->
outputBufferBlockSize, convTypeLeft, sc->
fftDim, sc->
fftDim, convTypeRight);
 
12680                                        sc->
tempLen = sprintf(sc->
tempStr, 
"      %s[%s] = %ssdata[(combinedID %% %" PRIu64 
")* sharedStride + (combinedID / %" PRIu64 
")].y%s;\n", outputsStruct, sc->
inoutID, convTypeLeft, sc->
fftDim, sc->
fftDim, convTypeRight);
 
12682                                        sc->
tempLen = sprintf(sc->
tempStr, 
"      outputBlocks[%s / %" PRIu64 
"]%s[%s %% %" PRIu64 
"] = %ssdata[(combinedID %% %" PRIu64 
") * sharedStride+ (combinedID / %" PRIu64 
")].y%s;\n", sc->
inoutID, sc->
outputBufferBlockSize, outputsStruct, sc->
inoutID, sc->
outputBufferBlockSize, convTypeLeft, sc->
fftDim, sc->
fftDim, convTypeRight);
 
12689                                    sc->
tempLen = sprintf(sc->
tempStr, 
"      %s[%s] = %ssdata[(combinedID %% %" PRIu64 
") + (combinedID / %" PRIu64 
") * sharedStride].x%s;\n", outputsStruct, sc->
inoutID, convTypeLeft, sc->
fftDim, sc->
fftDim, convTypeRight);
 
12691                                    sc->
tempLen = sprintf(sc->
tempStr, 
"      outputBlocks[%s / %" PRIu64 
"]%s[%s %% %" PRIu64 
"] = %ssdata[(combinedID %% %" PRIu64 
") + (combinedID / %" PRIu64 
") * sharedStride].x%s;\n", sc->
inoutID, sc->
outputBufferBlockSize, outputsStruct, sc->
inoutID, sc->
outputBufferBlockSize, convTypeLeft, sc->
fftDim, sc->
fftDim, convTypeRight);
 
12699                                        sc->
tempLen = sprintf(sc->
tempStr, 
"      %s[%s] = %ssdata[(combinedID %% %" PRIu64 
") + (combinedID / %" PRIu64 
") * sharedStride].y%s;\n", outputsStruct, sc->
inoutID, convTypeLeft, sc->
fftDim, sc->
fftDim, convTypeRight);
 
12701                                        sc->
tempLen = sprintf(sc->
tempStr, 
"      outputBlocks[%s / %" PRIu64 
"]%s[%s %% %" PRIu64 
"] = %ssdata[(combinedID %% %" PRIu64 
") + (combinedID / %" PRIu64 
") * sharedStride].y%s;\n", sc->
inoutID, sc->
outputBufferBlockSize, outputsStruct, sc->
inoutID, sc->
outputBufferBlockSize, convTypeLeft, sc->
fftDim, sc->
fftDim, convTypeRight);
 
12720                            if ((uint64_t)ceil(sc->
size[1] / (
double)mult) % sc->
localSize[0] != 0) {
 
12727                            if ((uint64_t)ceil(sc->
size[1] / (
double)mult) % sc->
localSize[1] != 0) {
 
12751        char shiftX[500] = 
"";
 
12753            sprintf(shiftX, 
" + consts.workGroupShiftX ");
 
12754        char shiftY[500] = 
"";
 
12757        char shiftY2[500] = 
"";
 
12759            sprintf(shiftY2, 
" + consts.workGroupShiftY ");
 
12776        sdata[%s + %" PRIu64 
"* sharedStride] = sdata[%s];\n\ 
12791        sdata[%s * sharedStride + %" PRIu64 
"] = sdata[%s * sharedStride];\n\ 
12805                    for (uint64_t i = 0; i < num_out; i++) {
 
12864                                sc->
tempLen = sprintf(sc->
tempStr, 
"      %s = (sdata[(combinedID %% %" PRIu64 
")* sharedStride + (combinedID / %" PRIu64 
")]);\n", sc->
regIDs[0], sc->
fftDim, sc->
fftDim);
 
12868                                    sc->
tempLen = sprintf(sc->
tempStr, 
"      %s[%s] = %s(%s.x)%s;\n", outputsStruct, sc->
inoutID, convTypeLeft, sc->
regIDs[0], convTypeRight);
 
12877                                    sc->
tempLen = sprintf(sc->
tempStr, 
/* Both subscripts must address inoutID + outputStride[1]: the block index is that sum divided by the block size, the in-block offset is the same sum modulo the block size. Without the '+' the emitted line reads "(inoutID N)/ M" and the generated kernel does not parse. */
"      outputBlocks[(%s+%" PRIu64 
")/ %" PRIu64 
"]%s[(%s+%" PRIu64 
") %% %" PRIu64 
"] = %s(%s.y)%s;\n", sc->
inoutID, sc->
outputStride[1], sc->
outputBufferBlockSize, outputsStruct, sc->
inoutID, sc->
outputStride[1], sc->
outputBufferBlockSize, convTypeLeft, sc->
regIDs[0], convTypeRight);
 
12887                                sc->
tempLen = sprintf(sc->
tempStr, 
"      %s = (sdata[(combinedID %% %" PRIu64 
") + (combinedID / %" PRIu64 
") * sharedStride]);\n", sc->
regIDs[0],  sc->
fftDim, sc->
fftDim);
 
12891                                    sc->
tempLen = sprintf(sc->
tempStr, 
"      %s[%s] = %s(%s.x)%s;\n", outputsStruct, sc->
inoutID, convTypeLeft, sc->
regIDs[0], convTypeRight);
 
12899                                    sc->
tempLen = sprintf(sc->
tempStr, 
/* Both subscripts must address inoutID + outputStride[1]: the block index is that sum divided by the block size, the in-block offset is the same sum modulo the block size. Without the '+' the emitted line reads "(inoutID N)/ M" and the generated kernel does not parse. */
"      outputBlocks[(%s+%" PRIu64 
")/ %" PRIu64 
"]%s[(%s+%" PRIu64 
") %% %" PRIu64 
"] = %s(%s.y)%s;\n", sc->
inoutID, sc->
outputStride[1], sc->
outputBufferBlockSize, outputsStruct, sc->
inoutID, sc->
outputStride[1], sc->
outputBufferBlockSize, convTypeLeft, sc->
regIDs[0], convTypeRight);
 
12911                                sc->
tempLen = sprintf(sc->
tempStr, 
"      sdataID = (combinedID %% %" PRIu64 
") + (combinedID / %" PRIu64 
") * sharedStride;\n", sc->
fftDim, sc->
fftDim);
 
12915                                    sc->
tempLen = sprintf(sc->
tempStr, 
"      %s[inoutID] = %s(sdata[sdataID].x)%s;\n", outputsStruct, convTypeLeft, convTypeRight);
 
12927                                sc->
tempLen = sprintf(sc->
tempStr, 
"      sdataID = (combinedID %% %" PRIu64 
") * sharedStride + (combinedID / %" PRIu64 
");\n", sc->
fftDim, sc->
fftDim);
 
12931                                    sc->
tempLen = sprintf(sc->
tempStr, 
"      %s[inoutID] = %s(sdata[sdataID].x)%s;\n", outputsStruct, convTypeLeft, convTypeRight);
 
12994        char shiftX[500] = 
"";
 
12997        char shiftY[500] = 
"";
 
13000        char shiftY2[500] = 
"";
 
13002            sprintf(shiftY2, 
" + consts.workGroupShiftY ");
 
13018        sdata[%s + %" PRIu64 
"* sharedStride] = sdata[%s];\n\ 
13029                    uint64_t num_out = (uint64_t)ceil(mult * (sc->
fftDim) / (
double)sc->
localSize[1]);
 
13031                    for (uint64_t i = 0; i < num_out; i++) {
 
13065                            sc->
tempLen = sprintf(sc->
tempStr, 
"      %s.x = 0.5%s*(sdata[(combinedID / %" PRIu64 
")* sharedStride + (combinedID %% %" PRIu64 
")].x+sdata[(%" PRIu64 
"-combinedID / %" PRIu64 
")* sharedStride + (combinedID %% %" PRIu64 
")].x);\n", sc->
regIDs[0], LFending, sc->
localSize[0], sc->
localSize[0], sc->
fftDim, sc->
localSize[0], sc->
localSize[0]);
 
13068                            sc->
tempLen = sprintf(sc->
tempStr, 
"      %s.y = 0.5%s*(sdata[(combinedID / %" PRIu64 
")* sharedStride + (combinedID %% %" PRIu64 
")].y-sdata[(%" PRIu64 
"-combinedID / %" PRIu64 
")* sharedStride + (combinedID %% %" PRIu64 
")].y);\n", sc->
regIDs[0], LFending, sc->
localSize[0], sc->
localSize[0], sc->
fftDim, sc->
localSize[0], sc->
localSize[0]);
 
13072                                sc->
tempLen = sprintf(sc->
tempStr, 
"      %s[%s] = %s(%s.x*mult.x-%s.y*mult.y)%s;\n", outputsStruct, sc->
inoutID, convTypeLeft, sc->
regIDs[0], sc->
regIDs[0], convTypeRight);
 
13074                                sc->
tempLen = sprintf(sc->
tempStr, 
"      outputBlocks[%s / %" PRIu64 
"]%s[%s %% %" PRIu64 
"] = %s(%s.x*mult.x-%s.y*mult.y)%s;\n", sc->
inoutID, sc->
outputBufferBlockSize, outputsStruct, sc->
inoutID, sc->
outputBufferBlockSize, convTypeLeft, sc->
regIDs[0], sc->
regIDs[0], convTypeRight);
 
13079                            sc->
tempLen = sprintf(sc->
tempStr, 
"      %s.x = 0.5%s*(sdata[(combinedID / %" PRIu64 
")* sharedStride + (combinedID %% %" PRIu64 
")].y+sdata[(%" PRIu64 
"-combinedID / %" PRIu64 
")* sharedStride + (combinedID %% %" PRIu64 
")].y);\n", sc->
regIDs[1], LFending, sc->
localSize[0], sc->
localSize[0], sc->
fftDim, sc->
localSize[0], sc->
localSize[0]);
 
13082                            sc->
tempLen = sprintf(sc->
tempStr, 
"      %s.y = 0.5%s*(-sdata[(combinedID / %" PRIu64 
")* sharedStride + (combinedID %% %" PRIu64 
")].x+sdata[(%" PRIu64 
"-combinedID / %" PRIu64 
")* sharedStride + (combinedID %% %" PRIu64 
")].x);\n", sc->
regIDs[1], LFending, sc->
localSize[0], sc->
localSize[0], sc->
fftDim, sc->
localSize[0], sc->
localSize[0]);
 
13089                                sc->
tempLen = sprintf(sc->
tempStr, 
/* Both subscripts must address inoutID + outputStride[1]: the block index is that sum divided by the block size, the in-block offset is the same sum modulo the block size. Without the '+' the emitted line reads "(inoutID N)/ M" and the generated kernel does not parse. */
"      outputBlocks[(%s+%" PRIu64 
")/ %" PRIu64 
"]%s[(%s+%" PRIu64 
") %% %" PRIu64 
"] = %s(%s.x*mult.x-%s.y*mult.y)%s;\n", sc->
inoutID, sc->
outputStride[1], sc->
outputBufferBlockSize, outputsStruct, sc->
inoutID, sc->
outputStride[1], sc->
outputBufferBlockSize, convTypeLeft, sc->
regIDs[1], sc->
regIDs[1], convTypeRight);
 
13100                                sc->
tempLen = sprintf(sc->
tempStr, 
"      %s[%s] = -%s(%s.y*mult.x+%s.x*mult.y)%s;\n", outputsStruct, sc->
inoutID, convTypeLeft, sc->
regIDs[0], sc->
regIDs[0], convTypeRight);
 
13102                                sc->
tempLen = sprintf(sc->
tempStr, 
"      outputBlocks[%s / %" PRIu64 
"]%s[%s %% %" PRIu64 
"] = -%s(%s.y*mult.x+%s.x*mult.y)%s;\n", sc->
inoutID, sc->
outputBufferBlockSize, outputsStruct, sc->
inoutID, sc->
outputBufferBlockSize, convTypeLeft, sc->
regIDs[0], sc->
regIDs[0], convTypeRight);
 
13108                                sc->
tempLen = sprintf(sc->
tempStr, 
/* Both subscripts must address inoutID + outputStride[1]: the block index is that sum divided by the block size, the in-block offset is the same sum modulo the block size. Without the '+' the emitted line reads "(inoutID N)/ M" and the generated kernel does not parse. */
"      outputBlocks[(%s+%" PRIu64 
")/ %" PRIu64 
"]%s[(%s+%" PRIu64 
") %% %" PRIu64 
"] = -%s(%s.y*mult.x+%s.x*mult.y)%s;\n", sc->
inoutID, sc->
outputStride[1], sc->
outputBufferBlockSize, outputsStruct, sc->
inoutID, sc->
outputStride[1], sc->
outputBufferBlockSize, convTypeLeft, sc->
regIDs[1], sc->
regIDs[1], convTypeRight);
 
13116                            sc->
tempLen = sprintf(sc->
tempStr, 
"      sdataID = (combinedID %% %" PRIu64 
") + (combinedID / %" PRIu64 
") * sharedStride;\n", sc->
localSize[0], sc->
localSize[0]);
 
13120                                sc->
tempLen = sprintf(sc->
tempStr, 
"      %s[inoutID] = %s(sdata[sdataID].x)%s;\n", outputsStruct, convTypeLeft, convTypeRight);
 
13162        char shiftX[500] = 
"";
 
13164            sprintf(shiftX, 
" + consts.workGroupShiftX ");
 
13165        char shiftY[500] = 
"";
 
13168        char shiftY2[500] = 
"";
 
13170            sprintf(shiftY2, 
" + consts.workGroupShiftY ");
 
13186        sdata[%s + %" PRIu64 
"* sharedStride] = sdata[%s];\n\ 
13201        sdata[%s * sharedStride + %" PRIu64 
"] = sdata[%s * sharedStride];\n\ 
13215                    for (uint64_t i = 0; i < num_out; i++) {
 
13283                            sc->
tempLen = sprintf(sc->
tempStr, 
"      mult.x = 2*%s(%.17f%s * (combinedID %% %" PRIu64 
") );\n", cosDef, -double_PI / 2 / sc->
fftDim, LFending, sc->
fftDim / 2 + 1);
 
13286                            sc->
tempLen = sprintf(sc->
tempStr, 
"      mult.y = 2*%s(%.17f%s * (combinedID %% %" PRIu64 
") );\n", sinDef, -double_PI / 2 / sc->
fftDim, LFending, sc->
fftDim / 2 + 1);
 
13293                                sc->
tempLen = sprintf(sc->
tempStr, 
"      %s.x = 0.5%s*(sdata[(combinedID %% %" PRIu64 
")* sharedStride + (combinedID / %" PRIu64 
")].x+sdata[(%" PRIu64 
"-combinedID %% %" PRIu64 
")* sharedStride + (combinedID / %" PRIu64 
")].x);\n", sc->
regIDs[0], LFending, sc->
fftDim / 2 + 1, (sc->
fftDim / 2 + 1), sc->
fftDim, sc->
fftDim / 2 + 1, (sc->
fftDim / 2 + 1));
 
13296                                sc->
tempLen = sprintf(sc->
tempStr, 
"      %s.y = 0.5%s*(sdata[(combinedID %% %" PRIu64 
")* sharedStride + (combinedID / %" PRIu64 
")].y-sdata[(%" PRIu64 
"-combinedID %% %" PRIu64 
")* sharedStride + (combinedID / %" PRIu64 
")].y);\n", sc->
regIDs[0], LFending, sc->
fftDim / 2 + 1, (sc->
fftDim / 2 + 1), sc->
fftDim, sc->
fftDim / 2 + 1, (sc->
fftDim / 2 + 1));
 
13300                                    sc->
tempLen = sprintf(sc->
tempStr, 
"      %s[%s] = %s(%s.x*mult.x-%s.y*mult.y)%s;\n", outputsStruct, sc->
inoutID, convTypeLeft, sc->
regIDs[0], sc->
regIDs[0], convTypeRight);
 
13302                                    sc->
tempLen = sprintf(sc->
tempStr, 
"      outputBlocks[%s / %" PRIu64 
"]%s[%s %% %" PRIu64 
"] = %s(%s.x*mult.x-%s.y*mult.y)%s;\n", sc->
inoutID, sc->
outputBufferBlockSize, outputsStruct, sc->
inoutID, sc->
outputBufferBlockSize, convTypeLeft, sc->
regIDs[0], sc->
regIDs[0], convTypeRight);
 
13307                                sc->
tempLen = sprintf(sc->
tempStr, 
"      %s.x = 0.5%s*(sdata[(combinedID %% %" PRIu64 
")* sharedStride + (combinedID / %" PRIu64 
")].y+sdata[(%" PRIu64 
"-combinedID %% %" PRIu64 
")* sharedStride + (combinedID / %" PRIu64 
")].y);\n", sc->
regIDs[1], LFending, sc->
fftDim / 2 + 1, (sc->
fftDim / 2 + 1), sc->
fftDim, sc->
fftDim / 2 + 1, (sc->
fftDim / 2 + 1));
 
13310                                sc->
tempLen = sprintf(sc->
tempStr, 
"      %s.y = 0.5%s*(-sdata[(combinedID %% %" PRIu64 
")* sharedStride + (combinedID / %" PRIu64 
")].x+sdata[(%" PRIu64 
"-combinedID %% %" PRIu64 
")* sharedStride + (combinedID / %" PRIu64 
")].x);\n", sc->
regIDs[1], LFending, sc->
fftDim / 2 + 1, (sc->
fftDim / 2 + 1), sc->
fftDim, sc->
fftDim / 2 + 1, (sc->
fftDim / 2 + 1));
 
13317                                    sc->
tempLen = sprintf(sc->
tempStr, 
/* Both subscripts must address inoutID + outputStride[1]: the block index is that sum divided by the block size, the in-block offset is the same sum modulo the block size. Without the '+' the emitted line reads "(inoutID N)/ M" and the generated kernel does not parse. */
"      outputBlocks[(%s+%" PRIu64 
")/ %" PRIu64 
"]%s[(%s+%" PRIu64 
") %% %" PRIu64 
"] = %s(%s.x*mult.x-%s.y*mult.y)%s;\n", sc->
inoutID, sc->
outputStride[1], sc->
outputBufferBlockSize, outputsStruct, sc->
inoutID, sc->
outputStride[1], sc->
outputBufferBlockSize, convTypeLeft, sc->
regIDs[1], sc->
regIDs[1], convTypeRight);
 
13325                                sc->
tempLen = sprintf(sc->
tempStr, 
"          if(combinedID %% %" PRIu64 
" > 0){\n", sc->
fftDim / 2 + 1);
 
13344                                    sc->
tempLen = sprintf(sc->
tempStr, 
"      %s[%s] = -%s(%s.y*mult.x+%s.x*mult.y)%s;\n", outputsStruct, sc->
inoutID, convTypeLeft, sc->
regIDs[0], sc->
regIDs[0], convTypeRight);
 
13346                                    sc->
tempLen = sprintf(sc->
tempStr, 
"      outputBlocks[%s / %" PRIu64 
"]%s[%s %% %" PRIu64 
"] = -%s(%s.y*mult.x+%s.x*mult.y)%s;\n", sc->
inoutID, sc->
outputBufferBlockSize, outputsStruct, sc->
inoutID, sc->
outputBufferBlockSize, convTypeLeft, sc->
regIDs[0], sc->
regIDs[0], convTypeRight);
 
13352                                    sc->
tempLen = sprintf(sc->
tempStr, 
/* Both subscripts must address inoutID + outputStride[1]: the block index is that sum divided by the block size, the in-block offset is the same sum modulo the block size. Without the '+' the emitted line reads "(inoutID N)/ M" and the generated kernel does not parse. */
"      outputBlocks[(%s+%" PRIu64 
")/ %" PRIu64 
"]%s[(%s+%" PRIu64 
") %% %" PRIu64 
"] = -%s(%s.y*mult.x+%s.x*mult.y)%s;\n", sc->
inoutID, sc->
outputStride[1], sc->
outputBufferBlockSize, outputsStruct, sc->
inoutID, sc->
outputStride[1], sc->
outputBufferBlockSize, convTypeLeft, sc->
regIDs[1], sc->
regIDs[1], convTypeRight);
 
13365                                sc->
tempLen = sprintf(sc->
tempStr, 
"      %s.x = 0.5%s*(sdata[(combinedID %% %" PRIu64 
") + (combinedID / %" PRIu64 
") * sharedStride].x+sdata[(%" PRIu64 
"-combinedID %% %" PRIu64 
") + (combinedID / %" PRIu64 
") * sharedStride].x);\n", sc->
regIDs[0], LFending, sc->
fftDim / 2 + 1, (sc->
fftDim / 2 + 1), sc->
fftDim, sc->
fftDim / 2 + 1, (sc->
fftDim / 2 + 1));
 
13368                                sc->
tempLen = sprintf(sc->
tempStr, 
"      %s.y = 0.5%s*(sdata[(combinedID %% %" PRIu64 
") + (combinedID / %" PRIu64 
") * sharedStride].y-sdata[(%" PRIu64 
"-combinedID %% %" PRIu64 
") + (combinedID / %" PRIu64 
") * sharedStride].y);\n", sc->
regIDs[0], LFending, sc->
fftDim / 2 + 1, (sc->
fftDim / 2 + 1), sc->
fftDim, sc->
fftDim / 2 + 1, (sc->
fftDim / 2 + 1));
 
13372                                    sc->
tempLen = sprintf(sc->
tempStr, 
"      %s[%s] = %s(%s.x*mult.x-%s.y*mult.y)%s;\n", outputsStruct, sc->
inoutID, convTypeLeft, sc->
regIDs[0], sc->
regIDs[0], convTypeRight);
 
13374                                    sc->
tempLen = sprintf(sc->
tempStr, 
"      outputBlocks[%s / %" PRIu64 
"]%s[%s %% %" PRIu64 
"] = %s(%s.x*mult.x-%s.y*mult.y)%s;\n", sc->
inoutID, sc->
outputBufferBlockSize, outputsStruct, sc->
inoutID, sc->
outputBufferBlockSize, convTypeLeft, sc->
regIDs[0], sc->
regIDs[0], convTypeRight);
 
13377                                sc->
tempLen = sprintf(sc->
tempStr, 
"      %s.x = 0.5%s*(sdata[(combinedID %% %" PRIu64 
") + (combinedID / %" PRIu64 
") * sharedStride].y+sdata[(%" PRIu64 
"-combinedID %% %" PRIu64 
") + (combinedID / %" PRIu64 
") * sharedStride].y);\n", sc->
regIDs[1], LFending, sc->
fftDim / 2 + 1, (sc->
fftDim / 2 + 1), sc->
fftDim, sc->
fftDim / 2 + 1, (sc->
fftDim / 2 + 1));
 
13380                                sc->
tempLen = sprintf(sc->
tempStr, 
"      %s.y = 0.5%s*(-sdata[(combinedID %% %" PRIu64 
") + (combinedID / %" PRIu64 
") * sharedStride].x+sdata[(%" PRIu64 
"-combinedID %% %" PRIu64 
") + (combinedID / %" PRIu64 
") * sharedStride].x);\n", sc->
regIDs[1], LFending, sc->
fftDim / 2 + 1, (sc->
fftDim / 2 + 1), sc->
fftDim, sc->
fftDim / 2 + 1, (sc->
fftDim / 2 + 1));
 
13387                                    sc->
tempLen = sprintf(sc->
tempStr, 
/* Both subscripts must address inoutID + outputStride[1]: the block index is that sum divided by the block size, the in-block offset is the same sum modulo the block size. Without the '+' the emitted line reads "(inoutID N)/ M" and the generated kernel does not parse. */
"      outputBlocks[(%s+%" PRIu64 
")/ %" PRIu64 
"]%s[(%s+%" PRIu64 
") %% %" PRIu64 
"] = %s(%s.x*mult.x-%s.y*mult.y)%s;\n", sc->
inoutID, sc->
outputStride[1], sc->
outputBufferBlockSize, outputsStruct, sc->
inoutID, sc->
outputStride[1], sc->
outputBufferBlockSize, convTypeLeft, sc->
regIDs[1], sc->
regIDs[1], convTypeRight);
 
13395                                sc->
tempLen = sprintf(sc->
tempStr, 
"          if(combinedID %% %" PRIu64 
" > 0){\n", sc->
fftDim / 2 + 1);
 
13414                                    sc->
tempLen = sprintf(sc->
tempStr, 
"      %s[%s] = -%s(%s.y*mult.x+%s.x*mult.y)%s;\n", outputsStruct, sc->
inoutID, convTypeLeft, sc->
regIDs[0], sc->
regIDs[0], convTypeRight);
 
13416                                    sc->
tempLen = sprintf(sc->
tempStr, 
"      outputBlocks[%s / %" PRIu64 
"]%s[%s %% %" PRIu64 
"] = -%s(%s.y*mult.x+%s.x*mult.y)%s;\n", sc->
inoutID, sc->
outputBufferBlockSize, outputsStruct, sc->
inoutID, sc->
outputBufferBlockSize, convTypeLeft, sc->
regIDs[0], sc->
regIDs[0], convTypeRight);
 
13422                                    sc->
tempLen = sprintf(sc->
tempStr, 
/* Both subscripts must address inoutID + outputStride[1]: the block index is that sum divided by the block size, the in-block offset is the same sum modulo the block size. Without the '+' the emitted line reads "(inoutID N)/ M" and the generated kernel does not parse. */
"      outputBlocks[(%s+%" PRIu64 
")/ %" PRIu64 
"]%s[(%s+%" PRIu64 
") %% %" PRIu64 
"] = -%s(%s.y*mult.x+%s.x*mult.y)%s;\n", sc->
inoutID, sc->
outputStride[1], sc->
outputBufferBlockSize, outputsStruct, sc->
inoutID, sc->
outputStride[1], sc->
outputBufferBlockSize, convTypeLeft, sc->
regIDs[1], sc->
regIDs[1], convTypeRight);
 
13437                                sc->
tempLen = sprintf(sc->
tempStr, 
"      sdataID = (combinedID %% %" PRIu64 
") + (combinedID / %" PRIu64 
") * sharedStride;\n", sc->
fftDim / 2 + 1, sc->
fftDim / 2 + 1);
 
13441                                    sc->
tempLen = sprintf(sc->
tempStr, 
"      %s[inoutID] = %s(sdata[sdataID].x*mult.x - sdata[sdataID].y*mult.y)%s;\n", outputsStruct, convTypeLeft, convTypeRight);
 
13451                                sc->
tempLen = sprintf(sc->
tempStr, 
"          if(combinedID %% %" PRIu64 
" > 0){\n", sc->
fftDim / 2 + 1);
 
13470                                    sc->
tempLen = sprintf(sc->
tempStr, 
"      %s[inoutID] = -%s(sdata[sdataID].y*mult.x + sdata[sdataID].x*mult.y)%s;\n", outputsStruct, convTypeLeft, convTypeRight);
 
13485                                sc->
tempLen = sprintf(sc->
tempStr, 
"      sdataID = (combinedID %% %" PRIu64 
") * sharedStride + (combinedID / %" PRIu64 
");\n", sc->
fftDim / 2 + 1, sc->
fftDim / 2 + 1);
 
13489                                    sc->
tempLen = sprintf(sc->
tempStr, 
"      %s[inoutID] = %s(sdata[sdataID].x*mult.x -sdata[sdataID].y*mult.y)%s;\n", outputsStruct, convTypeLeft, convTypeRight);
 
13499                                sc->
tempLen = sprintf(sc->
tempStr, 
"          if(combinedID %% %" PRIu64 
" > 0){\n", sc->
fftDim / 2 + 1);
 
13518                                    sc->
tempLen = sprintf(sc->
tempStr, 
"      %s[inoutID] = -%s(sdata[sdataID].y*mult.x +sdata[sdataID].x*mult.y)%s;\n", outputsStruct, convTypeLeft, convTypeRight);
 
13583        char shiftX[500] = 
"";
 
13586        char shiftY[500] = 
"";
 
13589        char shiftY2[500] = 
"";
 
13591            sprintf(shiftY2, 
" + consts.workGroupShiftY ");
 
13606        sdata[%s + %" PRIu64 
"* sharedStride] = sdata[%s];\n\ 
13617                    uint64_t num_out = (uint64_t)ceil(mult * (sc->
fftDim / 2 + 1) / (
double)sc->
localSize[1]);
 
13619                    for (uint64_t i = 0; i < num_out; i++) {
 
13663                            sc->
tempLen = sprintf(sc->
tempStr, 
"      mult.x = 2*%s(%.17f%s * (combinedID / %" PRIu64 
") );\n", cosDef, -double_PI / 2 / sc->
fftDim, LFending, sc->
localSize[0]);
 
13666                            sc->
tempLen = sprintf(sc->
tempStr, 
"      mult.y = 2*%s(%.17f%s * (combinedID / %" PRIu64 
") );\n", sinDef, -double_PI / 2 / sc->
fftDim, LFending, sc->
localSize[0]);
 
13672                            sc->
tempLen = sprintf(sc->
tempStr, 
"      %s.x = 0.5%s*(sdata[(combinedID / %" PRIu64 
")* sharedStride + (combinedID %% %" PRIu64 
")].x+sdata[(%" PRIu64 
"-combinedID / %" PRIu64 
")* sharedStride + (combinedID %% %" PRIu64 
")].x);\n", sc->
regIDs[0], LFending, sc->
localSize[0], sc->
localSize[0], sc->
fftDim, sc->
localSize[0], sc->
localSize[0]);
 
13675                            sc->
tempLen = sprintf(sc->
tempStr, 
"      %s.y = 0.5%s*(sdata[(combinedID / %" PRIu64 
")* sharedStride + (combinedID %% %" PRIu64 
")].y-sdata[(%" PRIu64 
"-combinedID / %" PRIu64 
")* sharedStride + (combinedID %% %" PRIu64 
")].y);\n", sc->
regIDs[0], LFending, sc->
localSize[0], sc->
localSize[0], sc->
fftDim, sc->
localSize[0], sc->
localSize[0]);
 
13679                                sc->
tempLen = sprintf(sc->
tempStr, 
"      %s[%s] = %s(%s.x*mult.x-%s.y*mult.y)%s;\n", outputsStruct, sc->
inoutID, convTypeLeft, sc->
regIDs[0], sc->
regIDs[0], convTypeRight);
 
13681                                sc->
tempLen = sprintf(sc->
tempStr, 
"      outputBlocks[%s / %" PRIu64 
"]%s[%s %% %" PRIu64 
"] = %s(%s.x*mult.x-%s.y*mult.y)%s;\n", sc->
inoutID, sc->
outputBufferBlockSize, outputsStruct, sc->
inoutID, sc->
outputBufferBlockSize, convTypeLeft, sc->
regIDs[0], sc->
regIDs[0], convTypeRight);
 
13686                            sc->
tempLen = sprintf(sc->
tempStr, 
"      %s.x = 0.5%s*(sdata[(combinedID / %" PRIu64 
")* sharedStride + (combinedID %% %" PRIu64 
")].y+sdata[(%" PRIu64 
"-combinedID / %" PRIu64 
")* sharedStride + (combinedID %% %" PRIu64 
")].y);\n", sc->
regIDs[1], LFending, sc->
localSize[0], sc->
localSize[0], sc->
fftDim, sc->
localSize[0], sc->
localSize[0]);
 
13689                            sc->
tempLen = sprintf(sc->
tempStr, 
"      %s.y = 0.5%s*(-sdata[(combinedID / %" PRIu64 
")* sharedStride + (combinedID %% %" PRIu64 
")].x+sdata[(%" PRIu64 
"-combinedID / %" PRIu64 
")* sharedStride + (combinedID %% %" PRIu64 
")].x);\n", sc->
regIDs[1], LFending, sc->
localSize[0], sc->
localSize[0], sc->
fftDim, sc->
localSize[0], sc->
localSize[0]);
 
13696                                sc->
tempLen = sprintf(sc->
tempStr, 
"      outputBlocks[(%s %" PRIu64 
")/ %" PRIu64 
"]%s[(%s+%" PRIu64 
") %% %" PRIu64 
"] = %s(%s.x*mult.x-%s.y*mult.y)%s;\n", sc->
inoutID, sc->
outputStride[1], sc->
outputBufferBlockSize, outputsStruct, sc->
inoutID, sc->
outputStride[1], sc->
outputBufferBlockSize, convTypeLeft, sc->
regIDs[1], sc->
regIDs[1], convTypeRight);
 
13707                                sc->
tempLen = sprintf(sc->
tempStr, 
"      %s[%s] = -%s(%s.y*mult.x+%s.x*mult.y)%s;\n", outputsStruct, sc->
inoutID, convTypeLeft, sc->
regIDs[0], sc->
regIDs[0], convTypeRight);
 
13709                                sc->
tempLen = sprintf(sc->
tempStr, 
"      outputBlocks[%s / %" PRIu64 
"]%s[%s %% %" PRIu64 
"] = -%s(%s.y*mult.x+%s.x*mult.y)%s;\n", sc->
inoutID, sc->
outputBufferBlockSize, outputsStruct, sc->
inoutID, sc->
outputBufferBlockSize, convTypeLeft, sc->
regIDs[0], sc->
regIDs[0], convTypeRight);
 
13715                                sc->
tempLen = sprintf(sc->
tempStr, 
"      outputBlocks[(%s %" PRIu64 
")/ %" PRIu64 
"]%s[(%s+%" PRIu64 
") %% %" PRIu64 
"] = -%s(%s.y*mult.x+%s.x*mult.y)%s;\n", sc->
inoutID, sc->
outputStride[1], sc->
outputBufferBlockSize, outputsStruct, sc->
inoutID, sc->
outputStride[1], sc->
outputBufferBlockSize, convTypeLeft, sc->
regIDs[1], sc->
regIDs[1], convTypeRight);
 
13723                            sc->
tempLen = sprintf(sc->
tempStr, 
"      sdataID = (combinedID %% %" PRIu64 
") + (combinedID / %" PRIu64 
") * sharedStride;\n", sc->
localSize[0], sc->
localSize[0]);
 
13727                                sc->
tempLen = sprintf(sc->
tempStr, 
"      %s[inoutID] = %s(sdata[sdataID].x*mult.x -sdata[sdataID].y*mult.y)%s;\n", outputsStruct, convTypeLeft, convTypeRight);
 
13756                                sc->
tempLen = sprintf(sc->
tempStr, 
"      %s[inoutID] = -%s(sdata[sdataID].y*mult.x +sdata[sdataID].x*mult.y)%s;\n", outputsStruct, convTypeLeft, convTypeRight);
 
13800        char shiftX[500] = 
"";
 
13802            sprintf(shiftX, 
" + consts.workGroupShiftX ");
 
13803        char shiftY[500] = 
"";
 
13806        char shiftY2[500] = 
"";
 
13808            sprintf(shiftY2, 
" + consts.workGroupShiftY ");
 
13818                uint64_t maxBluesteinCutOff = 1;
 
13835                            sc->
tempLen = sprintf(sc->
tempStr, 
"      if(combinedID < %" PRIu64 
"){\n", maxBluesteinCutOff);
 
13881                                sc->
tempLen = sprintf(sc->
tempStr, 
"      sdataID = (((combinedID %% %" PRIu64 
") %% 2) * %" PRIu64 
" + (1-2*((combinedID %% %" PRIu64 
") %% 2)) * ((combinedID %% %" PRIu64 
")/2)) * sharedStride + (combinedID / %" PRIu64 
");\n", sc->
fftDim, sc->
fftDim - 1, sc->
fftDim, sc->
fftDim, sc->
fftDim);
 
13886                                sc->
tempLen = sprintf(sc->
tempStr, 
"      sdataID = (((combinedID %% %" PRIu64 
") %% 2) * %" PRIu64 
" + (1-2*((combinedID %% %" PRIu64 
") %% 2)) * ((combinedID %% %" PRIu64 
")/2)) + (combinedID / %" PRIu64 
")* sharedStride;\n", sc->
fftDim, sc->
fftDim - 1, sc->
fftDim, sc->
fftDim, sc->
fftDim);
 
13892                                sc->
tempLen = sprintf(sc->
tempStr, 
"      %s[%s] = %s(sdata[sdataID].x)%s;\n", outputsStruct, sc->
inoutID, convTypeLeft, convTypeRight);
 
13901                                sc->
tempLen = sprintf(sc->
tempStr, 
"      %s[%s] = %s(sdata[sdataID].y)%s;\n", outputsStruct, sc->
inoutID, convTypeLeft, convTypeRight);
 
13909                                sc->
tempLen = sprintf(sc->
tempStr, 
"      sdataID = (((combinedID %% %" PRIu64 
") %% 2) * %" PRIu64 
" + (1-2*((combinedID %% %" PRIu64 
") %% 2)) * ((combinedID %% %" PRIu64 
")/2)) + (combinedID / %" PRIu64 
") * sharedStride;\n", sc->
fftDim, sc->
fftDim - 1, sc->
fftDim, sc->
fftDim, sc->
fftDim);
 
13913                                    sc->
tempLen = sprintf(sc->
tempStr, 
"      %s[inoutID] = %s(sdata[sdataID].x)%s;\n", outputsStruct, convTypeLeft, convTypeRight);
 
13920                                sc->
tempLen = sprintf(sc->
tempStr, 
"      sdataID = (((combinedID %% %" PRIu64 
") %% 2) * %" PRIu64 
" + (1-2*((combinedID %% %" PRIu64 
") %% 2)) * ((combinedID %% %" PRIu64 
")/2)) * sharedStride + (combinedID / %" PRIu64 
");\n", sc->
fftDim, sc->
fftDim - 1, sc->
fftDim, sc->
fftDim, sc->
fftDim);
 
13924                                    sc->
tempLen = sprintf(sc->
tempStr, 
"      %s[inoutID] = %s(sdata[sdataID].x)%s;\n", outputsStruct, convTypeLeft, convTypeRight);
 
13976        char shiftX[500] = 
"";
 
13979        char shiftY[500] = 
"";
 
13982        char shiftY2[500] = 
"";
 
13984            sprintf(shiftY2, 
" + consts.workGroupShiftY ");
 
13999        sdata[%s + %" PRIu64 
"* sharedStride] = sdata[%s];\n\ 
14048                            sc->
tempLen = sprintf(sc->
tempStr, 
"      %s.x = 0.5%s*(sdata[(combinedID / %" PRIu64 
")* sharedStride + (combinedID %% %" PRIu64 
")].x+sdata[(%" PRIu64 
"-combinedID / %" PRIu64 
")* sharedStride + (combinedID %% %" PRIu64 
")].x);\n", sc->
regIDs[0], LFending, sc->
localSize[0], sc->
localSize[0], sc->
fftDim, sc->
localSize[0], sc->
localSize[0]);
 
14051                            sc->
tempLen = sprintf(sc->
tempStr, 
"      %s.y = 0.5%s*(sdata[(combinedID / %" PRIu64 
")* sharedStride + (combinedID %% %" PRIu64 
")].y-sdata[(%" PRIu64 
"-combinedID / %" PRIu64 
")* sharedStride + (combinedID %% %" PRIu64 
")].y);\n", sc->
regIDs[0], LFending, sc->
localSize[0], sc->
localSize[0], sc->
fftDim, sc->
localSize[0], sc->
localSize[0]);
 
14055                                sc->
tempLen = sprintf(sc->
tempStr, 
"      %s[%s] = %s(%s.x*mult.x-%s.y*mult.y)%s;\n", outputsStruct, sc->
inoutID, convTypeLeft, sc->
regIDs[0], sc->
regIDs[0], convTypeRight);
 
14057                                sc->
tempLen = sprintf(sc->
tempStr, 
"      outputBlocks[%s / %" PRIu64 
"]%s[%s %% %" PRIu64 
"] = %s(%s.x*mult.x-%s.y*mult.y)%s;\n", sc->
inoutID, sc->
outputBufferBlockSize, outputsStruct, sc->
inoutID, sc->
outputBufferBlockSize, convTypeLeft, sc->
regIDs[0], sc->
regIDs[0], convTypeRight);
 
14062                            sc->
tempLen = sprintf(sc->
tempStr, 
"      %s.x = 0.5%s*(sdata[(combinedID / %" PRIu64 
")* sharedStride + (combinedID %% %" PRIu64 
")].y+sdata[(%" PRIu64 
"-combinedID / %" PRIu64 
")* sharedStride + (combinedID %% %" PRIu64 
")].y);\n", sc->
regIDs[1], LFending, sc->
localSize[0], sc->
localSize[0], sc->
fftDim, sc->
localSize[0], sc->
localSize[0]);
 
14065                            sc->
tempLen = sprintf(sc->
tempStr, 
"      %s.y = 0.5%s*(-sdata[(combinedID / %" PRIu64 
")* sharedStride + (combinedID %% %" PRIu64 
")].x+sdata[(%" PRIu64 
"-combinedID / %" PRIu64 
")* sharedStride + (combinedID %% %" PRIu64 
")].x);\n", sc->
regIDs[1], LFending, sc->
localSize[0], sc->
localSize[0], sc->
fftDim, sc->
localSize[0], sc->
localSize[0]);
 
14072                                sc->
tempLen = sprintf(sc->
tempStr, 
"      outputBlocks[(%s %" PRIu64 
")/ %" PRIu64 
"]%s[(%s+%" PRIu64 
") %% %" PRIu64 
"] = %s(%s.x*mult.x-%s.y*mult.y)%s;\n", sc->
inoutID, sc->
outputStride[1], sc->
outputBufferBlockSize, outputsStruct, sc->
inoutID, sc->
outputStride[1], sc->
outputBufferBlockSize, convTypeLeft, sc->
regIDs[1], sc->
regIDs[1], convTypeRight);
 
14083                                sc->
tempLen = sprintf(sc->
tempStr, 
"      %s[%s] = -%s(%s.y*mult.x+%s.x*mult.y)%s;\n", outputsStruct, sc->
inoutID, convTypeLeft, sc->
regIDs[0], sc->
regIDs[0], convTypeRight);
 
14085                                sc->
tempLen = sprintf(sc->
tempStr, 
"      outputBlocks[%s / %" PRIu64 
"]%s[%s %% %" PRIu64 
"] = -%s(%s.y*mult.x+%s.x*mult.y)%s;\n", sc->
inoutID, sc->
outputBufferBlockSize, outputsStruct, sc->
inoutID, sc->
outputBufferBlockSize, convTypeLeft, sc->
regIDs[0], sc->
regIDs[0], convTypeRight);
 
14091                                sc->
tempLen = sprintf(sc->
tempStr, 
"      outputBlocks[(%s %" PRIu64 
")/ %" PRIu64 
"]%s[(%s+%" PRIu64 
") %% %" PRIu64 
"] = -%s(%s.y*mult.x+%s.x*mult.y)%s;\n", sc->
inoutID, sc->
outputStride[1], sc->
outputBufferBlockSize, outputsStruct, sc->
inoutID, sc->
outputStride[1], sc->
outputBufferBlockSize, convTypeLeft, sc->
regIDs[1], sc->
regIDs[1], convTypeRight);
 
14099                            sc->
tempLen = sprintf(sc->
tempStr, 
"      sdataID = (((combinedID / %" PRIu64 
") %% 2) * %" PRIu64 
" + (1-2*((combinedID / %" PRIu64 
") %% 2)) * ((combinedID / %" PRIu64 
")/2)) * sharedStride + (combinedID %% %" PRIu64 
");\n", sc->
localSize[0], sc->
fftDim - 1, sc->
localSize[0], sc->
localSize[0], sc->
localSize[0]);
 
14103                                sc->
tempLen = sprintf(sc->
tempStr, 
"      %s[inoutID] = %s(sdata[sdataID].x)%s;\n", outputsStruct, convTypeLeft, convTypeRight);
 
14149        char shiftX[500] = 
"";
 
14151            sprintf(shiftX, 
" + consts.workGroupShiftX ");
 
14152        char shiftY[500] = 
"";
 
14162        char shiftY2[100] = 
"";
 
14164            sprintf(shiftY, 
" + consts.workGroupShiftY ");
 
14173                    sc->
tempLen = sprintf(sc->
tempStr, 
"      if (((%s + %" PRIu64 
" * %s) %% %" PRIu64 
" + ((%s%s) / %" PRIu64 
")*%" PRIu64 
" < %" PRIu64 
")){\n", sc->
gl_LocalInvocationID_x, sc->
localSize[0], sc->
gl_LocalInvocationID_y, sc->
localSize[0], sc->
gl_WorkGroupID_x, shiftX, sc->
firstStageStartSize / sc->
fftDim, sc->
localSize[0], sc->
fft_dim_full / sc->
firstStageStartSize);
 
14179                sc->
tempLen = sprintf(sc->
tempStr, 
"      if (((%s + %" PRIu64 
" * %s) %% %" PRIu64 
" + ((%s%s) / %" PRIu64 
")*%" PRIu64 
" < %" PRIu64 
")){\n", sc->
gl_LocalInvocationID_x, sc->
localSize[0], sc->
gl_LocalInvocationID_y, sc->
localSize[1], sc->
gl_WorkGroupID_x, shiftX, sc->
firstStageStartSize / sc->
fftDim, sc->
localSize[1], sc->
fft_dim_full / sc->
firstStageStartSize);
 
14252                                sc->
tempLen = sprintf(sc->
tempStr, 
"      %s[%s] = %ssdata[(2*(combinedID %% %" PRIu64 
")+1) * sharedStride + (combinedID / %" PRIu64 
")].x/2%s;\n", outputsStruct, sc->
inoutID, convTypeLeft, sc->
fftDim / 8, sc->
fftDim / 8, convTypeRight);
 
14254                                sc->
tempLen = sprintf(sc->
tempStr, 
"      outputBlocks[%s / %" PRIu64 
"]%s[%s %% %" PRIu64 
"] = %ssdata[(2*(combinedID %% %" PRIu64 
")+1) * sharedStride + (combinedID / %" PRIu64 
")].x/2%s;\n", sc->
inoutID, sc->
outputBufferBlockSize, outputsStruct, sc->
inoutID, sc->
outputBufferBlockSize, convTypeLeft, sc->
fftDim / 8, sc->
fftDim / 8, convTypeRight);
 
14260                                sc->
tempLen = sprintf(sc->
tempStr, 
"      %s[%s] = %ssdata[2*(combinedID %% %" PRIu64 
")+1 + (combinedID / %" PRIu64 
") * sharedStride].x/2%s;\n", outputsStruct, sc->
inoutID, convTypeLeft, sc->
fftDim / 8, sc->
fftDim / 8, convTypeRight);
 
14262                                sc->
tempLen = sprintf(sc->
tempStr, 
"      outputBlocks[%s / %" PRIu64 
"]%s[%s %% %" PRIu64 
"] = %ssdata[2*(combinedID %% %" PRIu64 
")+1 + (combinedID / %" PRIu64 
") * sharedStride].x/2%s;\n", sc->
inoutID, sc->
outputBufferBlockSize, outputsStruct, sc->
inoutID, sc->
outputBufferBlockSize, convTypeLeft, sc->
fftDim / 8, sc->
fftDim / 8, convTypeRight);
 
14549        char shiftX[500] = 
"";
 
14553            sc->
tempLen = sprintf(sc->
tempStr, 
"      if (((%s%s) / %" PRIu64 
") %% (%" PRIu64 
")+((%s%s) / %" PRIu64 
") * (%" PRIu64 
") < %" PRIu64 
") {\n", sc->
gl_GlobalInvocationID_x, shiftX, sc->
fft_dim_x, sc->
stageStartSize, sc->
gl_GlobalInvocationID_x, shiftX, sc->
fft_dim_x * sc->
stageStartSize, sc->
fftDim * sc->
stageStartSize, sc->
size[sc->
axis_id]);
 
14564                    sc->
tempLen = sprintf(sc->
tempStr, 
"      inoutID = (%s + %" PRIu64 
") * (%" PRIu64 
") + (((%s%s) / %" PRIu64 
") %% (%" PRIu64 
")) * (%" PRIu64 
") + ((%s%s) / %" PRIu64 
");\n", sc->
gl_LocalInvocationID_y, (i + k * sc->
min_registers_per_thread) * sc->
localSize[1], sc->
fft_dim_full / sc->
fftDim, sc->
gl_GlobalInvocationID_x, shiftX, sc->
fft_dim_x, sc->
firstStageStartSize / sc->
fftDim, sc->
fft_dim_full / sc->
firstStageStartSize, sc->
gl_GlobalInvocationID_x, shiftX, sc->
fft_dim_x * (sc->
firstStageStartSize / sc->
fftDim));
 
14587                    sc->
tempLen = sprintf(sc->
tempStr, 
"          outputBlocks[%s / %" PRIu64 
"]%s[%s %% %" PRIu64 
"] = %ssdata[%s*(2*(%s+%" PRIu64 
")+1) + %s].x/2%s;\n", sc->
inoutID, sc->
outputBufferBlockSize, outputsStruct, sc->
inoutID, sc->
outputBufferBlockSize, convTypeLeft, sc->
sharedStride, sc->
gl_LocalInvocationID_y, (i + k * sc->
min_registers_per_thread) * sc->
localSize[1], sc->
gl_LocalInvocationID_x, convTypeRight);
 
14662        char shiftX[500] = 
"";
 
14664            sprintf(shiftX, 
" + consts.workGroupShiftX ");
 
14665        char shiftY[500] = 
"";
 
14668        char shiftY2[500] = 
"";
 
14670            sprintf(shiftY2, 
" + consts.workGroupShiftY ");
 
14679                uint64_t maxBluesteinCutOff = 1;
 
14696                            sc->
tempLen = sprintf(sc->
tempStr, 
"      if(combinedID < %" PRIu64 
"){\n", maxBluesteinCutOff);
 
14701                            sc->
tempLen = sprintf(sc->
tempStr, 
"      sdataID = (((combinedID %% %" PRIu64 
") %% 2) * %" PRIu64 
" + (1-2*((combinedID %% %" PRIu64 
") %% 2)) * ((combinedID %% %" PRIu64 
")/2)) * sharedStride + (combinedID / %" PRIu64 
");\n", sc->
fftDim, sc->
fftDim - 1, sc->
fftDim, sc->
fftDim, sc->
fftDim);
 
14706                            sc->
tempLen = sprintf(sc->
tempStr, 
"      sdataID = (((combinedID %% %" PRIu64 
") %% 2) * %" PRIu64 
" + (1-2*((combinedID %% %" PRIu64 
") %% 2)) * ((combinedID %% %" PRIu64 
")/2)) + (combinedID / %" PRIu64 
")* sharedStride;\n", sc->
fftDim, sc->
fftDim - 1, sc->
fftDim, sc->
fftDim, sc->
fftDim);
 
14760                        res = 
indexOutputVkFFT(sc, uintType, writeType, index_x, 0, requestCoordinate, requestBatch);
 
14772                            sc->
tempLen = sprintf(sc->
tempStr, 
"      mult.x = %s(%.17f%s * (2*(combinedID %% %" PRIu64 
")+1) );\n", cosDef, -double_PI / 8 / sc->
fftDim, LFending, sc->
fftDim);
 
14775                            sc->
tempLen = sprintf(sc->
tempStr, 
"      mult.y = %s(%.17f%s * (2*(combinedID %% %" PRIu64 
")+1) );\n", sinDef, -double_PI / 8 / sc->
fftDim, LFending, sc->
fftDim);
 
14795                        sc->
tempLen = sprintf(index_x, 
"%" PRIu64 
" - combinedID %% %" PRIu64 
" + ((combinedID/%" PRIu64 
") * %" PRIu64 
")", 2 * sc->
fftDim - 1, sc->
fftDim, sc->
fftDim, sc->
outputStride[1]);
 
14799                        res = 
indexOutputVkFFT(sc, uintType, writeType, index_x, 0, requestCoordinate, requestBatch);
 
14867        char shiftX[500] = 
"";
 
14869            sprintf(shiftX, 
" + consts.workGroupShiftX ");
 
14870        char shiftX2[500] = 
"";
 
14873        char shiftY[500] = 
"";
 
14876        char shiftY2[500] = 
"";
 
14878            sprintf(shiftY2, 
" + consts.workGroupShiftY ");
 
14901                        sc->
tempLen = sprintf(sc->
tempStr, 
"      sdataID = (((combinedID / %" PRIu64 
") %% 2) * %" PRIu64 
" + (1-2*((combinedID / %" PRIu64 
") %% 2)) * ((combinedID / %" PRIu64 
")/2)) * sharedStride + (combinedID %% %" PRIu64 
");\n", sc->
localSize[0], sc->
fftDim - 1, sc->
localSize[0], sc->
localSize[0], sc->
localSize[0]);
 
14946                        res = 
indexOutputVkFFT(sc, uintType, writeType, index_x, index_y, requestCoordinate, requestBatch);
 
14959                            sc->
tempLen = sprintf(sc->
tempStr, 
"      mult.x = %s(%.17f%s * (2*(combinedID / %" PRIu64 
")+1) );\n", cosDef, -double_PI / 8 / sc->
fftDim, LFending, sc->
localSize[0]);
 
14962                            sc->
tempLen = sprintf(sc->
tempStr, 
"      mult.y = %s(%.17f%s * (2*(combinedID / %" PRIu64 
")+1) );\n", sinDef, -double_PI / 8 / sc->
fftDim, LFending, sc->
localSize[0]);
 
14987                        res = 
indexOutputVkFFT(sc, uintType, writeType, index_x, index_y, requestCoordinate, requestBatch);
 
15043        char shiftX[500] = 
"";
 
15045            sprintf(shiftX, 
" + consts.workGroupShiftX ");
 
15046        char shiftY[500] = 
"";
 
15049        char shiftY2[500] = 
"";
 
15051            sprintf(shiftY2, 
" + consts.workGroupShiftY ");
 
15067        sdata[%s + %" PRIu64 
"* sharedStride] = sdata[%s];\n\ 
15082        sdata[%s * sharedStride + %" PRIu64 
"] = sdata[%s * sharedStride];\n\ 
15109                            if ((uint64_t)ceil(sc->
size[1] / (
double)mult) % sc->
localSize[0] != 0) {
 
15121                            if ((uint64_t)ceil(sc->
size[1] / (
double)mult) % sc->
localSize[1] != 0) {
 
15164                                    sc->
tempLen = sprintf(sc->
tempStr, 
"if ( (combinedID / %" PRIu64 
") %% 2 == 0){\n", sc->
fftDim);
 
15167                                    sc->
tempLen = sprintf(sc->
tempStr, 
"      %s.x = 0.5%s*(sdata[(2*sdataID+1) * sharedStride + (combinedID / %" PRIu64 
")].x+sdata[(%" PRIu64 
"- (2*sdataID+1)) * sharedStride + (combinedID / %" PRIu64 
")].x);\n", sc->
regIDs[0], LFending, mult*sc->
fftDim, sc->
fftDim, mult * sc->
fftDim);
 
15170                                    sc->
tempLen = sprintf(sc->
tempStr, 
"      %s.y = 0.5%s*(sdata[(2*sdataID+1) * sharedStride + (combinedID / %" PRIu64 
")].y-sdata[(%" PRIu64 
"- (2*sdataID+1)) * sharedStride + (combinedID / %" PRIu64 
")].y);\n", sc->
regIDs[0], LFending, mult * sc->
fftDim, sc->
fftDim, mult * sc->
fftDim);
 
15176                                    sc->
tempLen = sprintf(sc->
tempStr, 
"      %s.x = 0.5%s*(sdata[(2*sdataID+1) * sharedStride + (combinedID / %" PRIu64 
")].y+sdata[(%" PRIu64 
"- (2*sdataID+1)) * sharedStride + (combinedID / %" PRIu64 
")].y);\n", sc->
regIDs[0], LFending, mult * sc->
fftDim, sc->
fftDim, mult * sc->
fftDim);
 
15179                                    sc->
tempLen = sprintf(sc->
tempStr, 
"      %s.y = 0.5%s*(-sdata[(2*sdataID+1) * sharedStride + (combinedID / %" PRIu64 
")].x+sdata[(%" PRIu64 
"- (2*sdataID+1)) * sharedStride + (combinedID / %" PRIu64 
")].x);\n", sc->
regIDs[0], LFending, mult * sc->
fftDim, sc->
fftDim, mult * sc->
fftDim);
 
15187                                    sc->
tempLen = sprintf(sc->
tempStr, 
"if ( (combinedID / %" PRIu64 
") %% 2 == 0){\n", sc->
fftDim);
 
15190                                    sc->
tempLen = sprintf(sc->
tempStr, 
"      %s.x = 0.5%s*(sdata[(2*sdataID+1) + (combinedID / %" PRIu64 
") * sharedStride].x+sdata[(%" PRIu64 
"- (2*sdataID+1)) + (combinedID / %" PRIu64 
") * sharedStride].x);\n", sc->
regIDs[0], LFending, mult * sc->
fftDim, sc->
fftDim, mult * sc->
fftDim);
 
15193                                    sc->
tempLen = sprintf(sc->
tempStr, 
"      %s.y = 0.5%s*(sdata[(2*sdataID+1) + (combinedID / %" PRIu64 
") * sharedStride].y-sdata[(%" PRIu64 
"- (2*sdataID+1)) + (combinedID / %" PRIu64 
") * sharedStride].y);\n", sc->
regIDs[0], LFending, mult * sc->
fftDim, sc->
fftDim, mult * sc->
fftDim);
 
15199                                    sc->
tempLen = sprintf(sc->
tempStr, 
"      %s.x = 0.5%s*(sdata[(2*sdataID+1) + (combinedID / %" PRIu64 
") * sharedStride].y+sdata[(%" PRIu64 
"- (2*sdataID+1)) + (combinedID / %" PRIu64 
") * sharedStride].y);\n", sc->
regIDs[0], LFending, mult * sc->
fftDim, sc->
fftDim, mult * sc->
fftDim);
 
15202                                    sc->
tempLen = sprintf(sc->
tempStr, 
"      %s.y = 0.5%s*(-sdata[(2*sdataID+1) + (combinedID / %" PRIu64 
") * sharedStride].x+sdata[(%" PRIu64 
"- (2*sdataID+1)) + (combinedID / %" PRIu64 
") * sharedStride].x);\n", sc->
regIDs[0], LFending, mult * sc->
fftDim, sc->
fftDim, mult * sc->
fftDim);
 
15212                                    sc->
tempLen = sprintf(sc->
tempStr, 
"      %s = sdata[(2*sdataID+1) + (combinedID / %" PRIu64 
") * sharedStride];\n", sc->
regIDs[0], sc->
fftDim);
 
15214                                    sc->
tempLen = sprintf(sc->
tempStr, 
"      %s = sdata[(2*sdataID+1) * sharedStride + (combinedID / %" PRIu64 
")];\n", sc->
regIDs[0], sc->
fftDim);
 
15218                            sc->
tempLen = sprintf(sc->
tempStr, 
"      if ((((sdataID + 1)/2) %% 2) != 0) \n\ 
15224                            sc->
tempLen = sprintf(sc->
tempStr, 
"      if ((((sdataID)/2) %% 2) != 0) \n\ 
15235                            sc->
tempLen = sprintf(sc->
tempStr, 
"      if((sdataID < %" PRIu64 
")&&(sdataID >= %" PRIu64 
")){\n", sc->
fftDim/2, sc->
fftDim/4);
 
15240                                    sc->
tempLen = sprintf(sc->
tempStr, 
"if ( (combinedID / %" PRIu64 
") %% 2 == 0){\n", sc->
fftDim);
 
15243                                    sc->
tempLen = sprintf(sc->
tempStr, 
"      %s.x = 0.5%s*(sdata[(%" PRIu64 
" - 2*sdataID) * sharedStride + (combinedID / %" PRIu64 
")].x+sdata[(%" PRIu64 
" + 2*sdataID) * sharedStride + (combinedID / %" PRIu64 
")].x);\n", sc->
regIDs[0], LFending, 2 * (sc->
fftDim / 2), mult * sc->
fftDim, sc->
fftDim - 2 * (sc->
fftDim / 2), mult * sc->
fftDim);
 
15246                                    sc->
tempLen = sprintf(sc->
tempStr, 
"      %s.y = 0.5%s*(sdata[(%" PRIu64 
" - 2*sdataID) * sharedStride + (combinedID / %" PRIu64 
")].y-sdata[(%" PRIu64 
" + 2*sdataID) * sharedStride + (combinedID / %" PRIu64 
")].y);\n", sc->
regIDs[0], LFending, 2 * (sc->
fftDim / 2), mult * sc->
fftDim, sc->
fftDim - 2 * (sc->
fftDim / 2), mult * sc->
fftDim);
 
15252                                    sc->
tempLen = sprintf(sc->
tempStr, 
"      %s.x = 0.5%s*(sdata[(%" PRIu64 
" - 2*sdataID) * sharedStride + (combinedID / %" PRIu64 
")].y+sdata[(%" PRIu64 
" + 2*sdataID) * sharedStride + (combinedID / %" PRIu64 
")].y);\n", sc->
regIDs[0], LFending, 2 * (sc->
fftDim / 2), mult * sc->
fftDim, sc->
fftDim - 2 * (sc->
fftDim / 2), mult * sc->
fftDim);
 
15255                                    sc->
tempLen = sprintf(sc->
tempStr, 
"      %s.y = 0.5%s*(-sdata[(%" PRIu64 
" - 2*sdataID) * sharedStride + (combinedID / %" PRIu64 
")].x+sdata[(%" PRIu64 
" + 2*sdataID) * sharedStride + (combinedID / %" PRIu64 
")].x);\n", sc->
regIDs[0], LFending, 2 * (sc->
fftDim / 2), mult * sc->
fftDim, sc->
fftDim - 2 * (sc->
fftDim / 2), mult * sc->
fftDim);
 
15263                                    sc->
tempLen = sprintf(sc->
tempStr, 
"if ( (combinedID / %" PRIu64 
") %% 2 == 0){\n", sc->
fftDim);
 
15266                                    sc->
tempLen = sprintf(sc->
tempStr, 
"      %s.x = 0.5%s*(sdata[(%" PRIu64 
" - 2*sdataID) + (combinedID / %" PRIu64 
") * sharedStride].x+sdata[(%" PRIu64 
" + 2*sdataID) + (combinedID / %" PRIu64 
") * sharedStride].x);\n", sc->
regIDs[0], LFending, 2 * (sc->
fftDim / 2), mult * sc->
fftDim, sc->
fftDim - 2 * (sc->
fftDim / 2), mult * sc->
fftDim);
 
15269                                    sc->
tempLen = sprintf(sc->
tempStr, 
"      %s.y = 0.5%s*(sdata[(%" PRIu64 
" - 2*sdataID) + (combinedID / %" PRIu64 
") * sharedStride].y-sdata[(%" PRIu64 
" + 2*sdataID) + (combinedID / %" PRIu64 
") * sharedStride].y);\n", sc->
regIDs[0], LFending, 2 * (sc->
fftDim / 2), mult * sc->
fftDim, sc->
fftDim - 2 * (sc->
fftDim / 2), mult * sc->
fftDim);
 
15275                                    sc->
tempLen = sprintf(sc->
tempStr, 
"      %s.x = 0.5%s*(sdata[(%" PRIu64 
" - 2*sdataID) + (combinedID / %" PRIu64 
") * sharedStride].y+sdata[(%" PRIu64 
" + 2*sdataID) + (combinedID / %" PRIu64 
") * sharedStride].y);\n", sc->
regIDs[0], LFending, 2 * (sc->
fftDim / 2), mult * sc->
fftDim, sc->
fftDim - 2 * (sc->
fftDim / 2), mult * sc->
fftDim);
 
15278                                    sc->
tempLen = sprintf(sc->
tempStr, 
"      %s.y = 0.5%s*(-sdata[(%" PRIu64 
" - 2*sdataID) + (combinedID / %" PRIu64 
") * sharedStride].x+sdata[(%" PRIu64 
" + 2*sdataID) + (combinedID / %" PRIu64 
") * sharedStride].x);\n", sc->
regIDs[0], LFending, 2 * (sc->
fftDim / 2), mult * sc->
fftDim, sc->
fftDim - 2 * (sc->
fftDim / 2), mult * sc->
fftDim);
 
15288                                    sc->
tempLen = sprintf(sc->
tempStr, 
"      %s = sdata[(%" PRIu64 
" - 2*sdataID) + (combinedID / %" PRIu64 
") * sharedStride];\n", sc->
regIDs[0], 2 * (sc->
fftDim / 2), sc->
fftDim);
 
15290                                    sc->
tempLen = sprintf(sc->
tempStr, 
"      %s = sdata[(%" PRIu64 
" - 2*sdataID) * sharedStride + (combinedID / %" PRIu64 
")];\n", sc->
regIDs[0], 2 * (sc->
fftDim / 2), sc->
fftDim);
 
15294                            sc->
tempLen = sprintf(sc->
tempStr, 
"      if ((((sdataID+1)/2) %% 2) != 0) \n\ 
15300                            sc->
tempLen = sprintf(sc->
tempStr, 
"      if ((((sdataID)/2) %% 2) != 0) \n\ 
15311                            sc->
tempLen = sprintf(sc->
tempStr, 
"      if((sdataID < %" PRIu64 
")&&(sdataID >= %" PRIu64 
")){\n", 3 * sc->
fftDim / 4, sc->
fftDim / 2);
 
15316                                    sc->
tempLen = sprintf(sc->
tempStr, 
"if ( (combinedID / %" PRIu64 
") %% 2 == 0){\n", sc->
fftDim);
 
15319                                    sc->
tempLen = sprintf(sc->
tempStr, 
"      %s.x = 0.5%s*(sdata[(2*sdataID - %" PRIu64 
") * sharedStride + (combinedID / %" PRIu64 
")].x+sdata[(%" PRIu64 
" - 2*sdataID) * sharedStride + (combinedID / %" PRIu64 
")].x);\n", sc->
regIDs[0], LFending, 2 * (sc->
fftDim / 2), mult * sc->
fftDim, sc->
fftDim + 2 * (sc->
fftDim / 2), mult * sc->
fftDim);
 
15322                                    sc->
tempLen = sprintf(sc->
tempStr, 
"      %s.y = 0.5%s*(sdata[(2*sdataID - %" PRIu64 
") * sharedStride + (combinedID / %" PRIu64 
")].y-sdata[(%" PRIu64 
" - 2*sdataID) * sharedStride + (combinedID / %" PRIu64 
")].y);\n", sc->
regIDs[0], LFending, 2 * (sc->
fftDim / 2), mult * sc->
fftDim, sc->
fftDim + 2 * (sc->
fftDim / 2), mult * sc->
fftDim);
 
15328                                    sc->
tempLen = sprintf(sc->
tempStr, 
"      %s.x = 0.5%s*(sdata[(2*sdataID - %" PRIu64 
") * sharedStride + (combinedID / %" PRIu64 
")].y+sdata[(%" PRIu64 
" - 2*sdataID) * sharedStride + (combinedID / %" PRIu64 
")].y);\n", sc->
regIDs[0], LFending, 2 * (sc->
fftDim / 2), mult * sc->
fftDim, sc->
fftDim + 2 * (sc->
fftDim / 2), mult * sc->
fftDim);
 
15331                                    sc->
tempLen = sprintf(sc->
tempStr, 
"      %s.y = 0.5%s*(-sdata[(2*sdataID - %" PRIu64 
") * sharedStride + (combinedID / %" PRIu64 
")].x+sdata[(%" PRIu64 
" - 2*sdataID) * sharedStride + (combinedID / %" PRIu64 
")].x);\n", sc->
regIDs[0], LFending, 2 * (sc->
fftDim / 2), mult * sc->
fftDim, sc->
fftDim + 2 * (sc->
fftDim / 2), mult * sc->
fftDim);
 
15339                                    sc->
tempLen = sprintf(sc->
tempStr, 
"if ( (combinedID / %" PRIu64 
") %% 2 == 0){\n", sc->
fftDim);
 
15342                                    sc->
tempLen = sprintf(sc->
tempStr, 
"      %s.x = 0.5%s*(sdata[(2*sdataID - %" PRIu64 
") + (combinedID / %" PRIu64 
") * sharedStride].x+sdata[(%" PRIu64 
" - 2*sdataID) + (combinedID / %" PRIu64 
") * sharedStride].x);\n", sc->
regIDs[0], LFending, 2 * (sc->
fftDim / 2), mult * sc->
fftDim, sc->
fftDim + 2 * (sc->
fftDim / 2), mult * sc->
fftDim);
 
15345                                    sc->
tempLen = sprintf(sc->
tempStr, 
"      %s.y = 0.5%s*(sdata[(2*sdataID - %" PRIu64 
") + (combinedID / %" PRIu64 
") * sharedStride].y-sdata[(%" PRIu64 
" - 2*sdataID) + (combinedID / %" PRIu64 
") * sharedStride].y);\n", sc->
regIDs[0], LFending, 2 * (sc->
fftDim / 2), mult * sc->
fftDim, sc->
fftDim + 2 * (sc->
fftDim / 2), mult * sc->
fftDim);
 
15351                                    sc->
tempLen = sprintf(sc->
tempStr, 
"      %s.x = 0.5%s*(sdata[(2*sdataID - %" PRIu64 
") + (combinedID / %" PRIu64 
") * sharedStride].y+sdata[(%" PRIu64 
" - 2*sdataID) + (combinedID / %" PRIu64 
") * sharedStride].y);\n", sc->
regIDs[0], LFending, 2 * (sc->
fftDim / 2), mult * sc->
fftDim, sc->
fftDim + 2 * (sc->
fftDim / 2), mult * sc->
fftDim);
 
15354                                    sc->
tempLen = sprintf(sc->
tempStr, 
"      %s.y = 0.5%s*(-sdata[(2*sdataID - %" PRIu64 
") + (combinedID / %" PRIu64 
") * sharedStride].x+sdata[(%" PRIu64 
" - 2*sdataID) + (combinedID / %" PRIu64 
") * sharedStride].x);\n", sc->
regIDs[0], LFending, 2 * (sc->
fftDim / 2), mult * sc->
fftDim, sc->
fftDim + 2 * (sc->
fftDim / 2), mult * sc->
fftDim);
 
15364                                    sc->
tempLen = sprintf(sc->
tempStr, 
"      %s = sdata[(2*sdataID - %" PRIu64 
") + (combinedID / %" PRIu64 
") * sharedStride];\n", sc->
regIDs[0], 2 * (sc->
fftDim / 2), sc->
fftDim);
 
15366                                    sc->
tempLen = sprintf(sc->
tempStr, 
"      %s = sdata[(2*sdataID - %" PRIu64 
") * sharedStride + (combinedID / %" PRIu64 
")];\n", sc->
regIDs[0], 2 * (sc->
fftDim / 2), sc->
fftDim);
 
15370                            sc->
tempLen = sprintf(sc->
tempStr, 
"      if ((((sdataID+1)/2) %% 2) != 0) \n\ 
15376                            sc->
tempLen = sprintf(sc->
tempStr, 
"      if ((((sdataID)/2) %% 2) != 0) \n\ 
15392                                    sc->
tempLen = sprintf(sc->
tempStr, 
"if ( (combinedID / %" PRIu64 
") %% 2 == 0){\n", sc->
fftDim);
 
15395                                    sc->
tempLen = sprintf(sc->
tempStr, 
"      %s.x = 0.5%s*(sdata[(%" PRIu64 
" - 2*sdataID) * sharedStride + (combinedID / %" PRIu64 
")].x+sdata[(2*sdataID - %" PRIu64 
") * sharedStride + (combinedID / %" PRIu64 
")].x);\n", sc->
regIDs[0], LFending, 2 * sc->
fftDim - 1, mult * sc->
fftDim, sc->
fftDim - 1, mult * sc->
fftDim);
 
15398                                    sc->
tempLen = sprintf(sc->
tempStr, 
"      %s.y = 0.5%s*(sdata[(%" PRIu64 
" - 2*sdataID) * sharedStride + (combinedID / %" PRIu64 
")].y-sdata[(2*sdataID - %" PRIu64 
") * sharedStride + (combinedID / %" PRIu64 
")].y);\n", sc->
regIDs[0], LFending, 2 * sc->
fftDim - 1, mult * sc->
fftDim, sc->
fftDim - 1, mult * sc->
fftDim);
 
15404                                    sc->
tempLen = sprintf(sc->
tempStr, 
"      %s.x = 0.5%s*(sdata[(%" PRIu64 
" - 2*sdataID) * sharedStride + (combinedID / %" PRIu64 
")].y+sdata[(2*sdataID - %" PRIu64 
") * sharedStride + (combinedID / %" PRIu64 
")].y);\n", sc->
regIDs[0], LFending, 2 * sc->
fftDim - 1, mult * sc->
fftDim, sc->
fftDim - 1, mult * sc->
fftDim);
 
15407                                    sc->
tempLen = sprintf(sc->
tempStr, 
"      %s.y = 0.5%s*(-sdata[(%" PRIu64 
" - 2*sdataID) * sharedStride + (combinedID / %" PRIu64 
")].x+sdata[(2*sdataID - %" PRIu64 
") * sharedStride + (combinedID / %" PRIu64 
")].x);\n", sc->
regIDs[0], LFending, 2 * sc->
fftDim - 1, mult * sc->
fftDim, sc->
fftDim - 1, mult * sc->
fftDim);
 
15415                                    sc->
tempLen = sprintf(sc->
tempStr, 
"if ( (combinedID / %" PRIu64 
") %% 2 == 0){\n", sc->
fftDim);
 
15418                                    sc->
tempLen = sprintf(sc->
tempStr, 
"      %s.x = 0.5%s*(sdata[(%" PRIu64 
" - 2*sdataID) + (combinedID / %" PRIu64 
") * sharedStride].x+sdata[(2*sdataID - %" PRIu64 
") + (combinedID / %" PRIu64 
") * sharedStride].x);\n", sc->
regIDs[0], LFending, 2 * sc->
fftDim - 1, mult * sc->
fftDim, sc->
fftDim - 1, mult * sc->
fftDim);
 
15421                                    sc->
tempLen = sprintf(sc->
tempStr, 
"      %s.y = 0.5%s*(sdata[(%" PRIu64 
" - 2*sdataID) + (combinedID / %" PRIu64 
") * sharedStride].y-sdata[(2*sdataID - %" PRIu64 
") + (combinedID / %" PRIu64 
") * sharedStride].y);\n", sc->
regIDs[0], LFending, 2 * sc->
fftDim - 1, mult * sc->
fftDim, sc->
fftDim - 1, mult * sc->
fftDim);
 
15427                                    sc->
tempLen = sprintf(sc->
tempStr, 
"      %s.x = 0.5%s*(sdata[(%" PRIu64 
" - 2*sdataID) + (combinedID / %" PRIu64 
") * sharedStride].y+sdata[(2*sdataID - %" PRIu64 
") + (combinedID / %" PRIu64 
") * sharedStride].y);\n", sc->
regIDs[0], LFending, 2 * sc->
fftDim - 1, mult * sc->
fftDim, sc->
fftDim - 1, mult * sc->
fftDim);
 
15430                                    sc->
tempLen = sprintf(sc->
tempStr, 
"      %s.y = 0.5%s*(-sdata[(%" PRIu64 
" - 2*sdataID) + (combinedID / %" PRIu64 
") * sharedStride].x+sdata[(2*sdataID - %" PRIu64 
") + (combinedID / %" PRIu64 
") * sharedStride].x);\n", sc->
regIDs[0], LFending, 2 * sc->
fftDim - 1, mult * sc->
fftDim, sc->
fftDim - 1, mult * sc->
fftDim);
 
15440                                    sc->
tempLen = sprintf(sc->
tempStr, 
"      %s = sdata[(%" PRIu64 
" - 2*sdataID) + (combinedID / %" PRIu64 
") * sharedStride];\n", sc->
regIDs[0], 2 * sc->
fftDim - 1, sc->
fftDim);
 
15442                                    sc->
tempLen = sprintf(sc->
tempStr, 
"      %s = sdata[(%" PRIu64 
" - 2*sdataID) * sharedStride + (combinedID / %" PRIu64 
")];\n", sc->
regIDs[0], 2 * sc->
fftDim - 1, sc->
fftDim);
 
15446                            sc->
tempLen = sprintf(sc->
tempStr, 
"      if ((((sdataID+1)/2) %% 2) != 0) \n\ 
15452                            sc->
tempLen = sprintf(sc->
tempStr, 
"      if ((((sdataID)/2) %% 2) != 0) \n\ 
15461                            sc->
tempLen = sprintf(sc->
tempStr, 
"      %s.x *= 1.41421356237309504880%s;\n", sc->
regIDs[1], LFending);
 
15465                                sc->
tempLen = sprintf(sc->
tempStr, 
"      %s[inoutID] = %s%s.x%s;\n", outputsStruct, convTypeLeft, sc->
regIDs[1], convTypeRight);
 
15494                            if ((uint64_t)ceil(sc->
size[1] / (
double)mult) % sc->
localSize[0] != 0) {
 
15501                            if ((uint64_t)ceil(sc->
size[1] / (
double)mult) % sc->
localSize[1] != 0) {
 
15578        char shiftX[500] = 
"";
 
15581        char shiftY[500] = 
"";
 
15584        char shiftY2[500] = 
"";
 
15586            sprintf(shiftY2, 
" + consts.workGroupShiftY ");
 
15638                        sc->
tempLen = sprintf(sc->
tempStr, 
"      if ((((sdataID + 1)/2) %% 2) != 0) \n\ 
15644                        sc->
tempLen = sprintf(sc->
tempStr, 
"      if ((((sdataID)/2) %% 2) != 0) \n\ 
15655                        sc->
tempLen = sprintf(sc->
tempStr, 
"      if((sdataID < %" PRIu64 
")&&(sdataID >= %" PRIu64 
")){\n", sc->
fftDim / 2, sc->
fftDim / 4);
 
15661                        sc->
tempLen = sprintf(sc->
tempStr, 
"      if ((((sdataID+1)/2) %% 2) != 0) \n\ 
15667                        sc->
tempLen = sprintf(sc->
tempStr, 
"      if ((((sdataID)/2) %% 2) != 0) \n\ 
15678                        sc->
tempLen = sprintf(sc->
tempStr, 
"      if((sdataID < %" PRIu64 
")&&(sdataID >= %" PRIu64 
")){\n", 3 * sc->
fftDim / 4, sc->
fftDim / 2);
 
15684                        sc->
tempLen = sprintf(sc->
tempStr, 
"      if ((((sdataID+1)/2) %% 2) != 0) \n\ 
15690                        sc->
tempLen = sprintf(sc->
tempStr, 
"      if ((((sdataID)/2) %% 2) != 0) \n\ 
15707                        sc->
tempLen = sprintf(sc->
tempStr, 
"      if ((((sdataID+1)/2) %% 2) != 0) \n\ 
15713                        sc->
tempLen = sprintf(sc->
tempStr, 
"      if ((((sdataID)/2) %% 2) != 0) \n\ 
15722                        sc->
tempLen = sprintf(sc->
tempStr, 
"      %s.x *= 1.41421356237309504880%s;\n", sc->
regIDs[1], LFending);
 
15726                            sc->
tempLen = sprintf(sc->
tempStr, 
"      %s[inoutID] = %s%s.x%s;\n", outputsStruct, convTypeLeft, sc->
regIDs[1], convTypeRight);
 
 
16442    char vecTypeInput[30];
 
16443    char vecTypeOutput[30];
 
16444#if(VKFFT_BACKEND==0) 
16445    if (!strcmp(floatType, 
"half")) sprintf(vecType, 
"f16vec2");
 
16446    if (!strcmp(floatType, 
"float")) sprintf(vecType, 
"vec2");
 
16447    if (!strcmp(floatType, 
"double")) sprintf(vecType, 
"dvec2");
 
16448    if (!strcmp(floatTypeInputMemory, 
"half")) sprintf(vecTypeInput, 
"f16vec2");
 
16449    if (!strcmp(floatTypeInputMemory, 
"float")) sprintf(vecTypeInput, 
"vec2");
 
16450    if (!strcmp(floatTypeInputMemory, 
"double")) sprintf(vecTypeInput, 
"dvec2");
 
16451    if (!strcmp(floatTypeOutputMemory, 
"half")) sprintf(vecTypeOutput, 
"f16vec2");
 
16452    if (!strcmp(floatTypeOutputMemory, 
"float")) sprintf(vecTypeOutput, 
"vec2");
 
16453    if (!strcmp(floatTypeOutputMemory, 
"double")) sprintf(vecTypeOutput, 
"dvec2");
 
16466#elif(VKFFT_BACKEND==1) 
16467    if (!strcmp(floatType, 
"half")) sprintf(vecType, 
"f16vec2");
 
16468    if (!strcmp(floatType, 
"float")) sprintf(vecType, 
"float2");
 
16469    if (!strcmp(floatType, 
"double")) sprintf(vecType, 
"double2");
 
16470    if (!strcmp(floatTypeInputMemory, 
"half")) sprintf(vecTypeInput, 
"f16vec2");
 
16471    if (!strcmp(floatTypeInputMemory, 
"float")) sprintf(vecTypeInput, 
"float2");
 
16472    if (!strcmp(floatTypeInputMemory, 
"double")) sprintf(vecTypeInput, 
"double2");
 
16473    if (!strcmp(floatTypeOutputMemory, 
"half")) sprintf(vecTypeOutput, 
"f16vec2");
 
16474    if (!strcmp(floatTypeOutputMemory, 
"float")) sprintf(vecTypeOutput, 
"float2");
 
16475    if (!strcmp(floatTypeOutputMemory, 
"double")) sprintf(vecTypeOutput, 
"double2");
 
16488#elif(VKFFT_BACKEND==2) 
16489    if (!strcmp(floatType, 
"half")) sprintf(vecType, 
"f16vec2");
 
16490    if (!strcmp(floatType, 
"float")) sprintf(vecType, 
"float2");
 
16491    if (!strcmp(floatType, 
"double")) sprintf(vecType, 
"double2");
 
16492    if (!strcmp(floatTypeInputMemory, 
"half")) sprintf(vecTypeInput, 
"f16vec2");
 
16493    if (!strcmp(floatTypeInputMemory, 
"float")) sprintf(vecTypeInput, 
"float2");
 
16494    if (!strcmp(floatTypeInputMemory, 
"double")) sprintf(vecTypeInput, 
"double2");
 
16495    if (!strcmp(floatTypeOutputMemory, 
"half")) sprintf(vecTypeOutput, 
"f16vec2");
 
16496    if (!strcmp(floatTypeOutputMemory, 
"float")) sprintf(vecTypeOutput, 
"float2");
 
16497    if (!strcmp(floatTypeOutputMemory, 
"double")) sprintf(vecTypeOutput, 
"double2");
 
16510#elif(VKFFT_BACKEND==3) 
16511    if (!strcmp(floatType, 
"half")) sprintf(vecType, 
"f16vec2");
 
16512    if (!strcmp(floatType, 
"float")) sprintf(vecType, 
"float2");
 
16513    if (!strcmp(floatType, 
"double")) sprintf(vecType, 
"double2");
 
16514    if (!strcmp(floatTypeInputMemory, 
"half")) sprintf(vecTypeInput, 
"f16vec2");
 
16515    if (!strcmp(floatTypeInputMemory, 
"float")) sprintf(vecTypeInput, 
"float2");
 
16516    if (!strcmp(floatTypeInputMemory, 
"double")) sprintf(vecTypeInput, 
"double2");
 
16517    if (!strcmp(floatTypeOutputMemory, 
"half")) sprintf(vecTypeOutput, 
"f16vec2");
 
16518    if (!strcmp(floatTypeOutputMemory, 
"float")) sprintf(vecTypeOutput, 
"float2");
 
16519    if (!strcmp(floatTypeOutputMemory, 
"double")) sprintf(vecTypeOutput, 
"double2");
 
16535    sprintf(sc->
tshuffle, 
"tshuffle");
 
16538    sprintf(sc->
inoutID, 
"inoutID");
 
16539    sprintf(sc->
sdataID, 
"sdataID");
 
16558    res = 
appendExtensions(sc, floatType, floatTypeInputMemory, floatTypeOutputMemory, floatTypeKernelMemory);
 
16573    if ((!sc->
LUT) && (!strcmp(floatType, 
"double"))) {
 
16580    if (strcmp(floatType, floatTypeInputMemory)) {
 
16587    if (strcmp(floatType, floatTypeOutputMemory) && strcmp(floatTypeInputMemory, floatTypeOutputMemory)) {
 
16648    uint64_t locType = (((type == 0) || (type == 5) || (type == 6) || (type == 110) || (type == 120) || (type == 130) || (type == 140) || (type == 142) || (type == 144)) && (sc->
axisSwapped)) ? 1 : type;
 
16649#if(VKFFT_BACKEND==0) 
16661#elif(VKFFT_BACKEND==1) 
16662    sc->
tempLen = sprintf(sc->
tempStr, 
"extern __shared__ float shared[];\n");
 
16677        sc->
tempLen = sprintf(sc->
tempStr, 
"(%s* inputs, %s* outputs", floatTypeInputMemory, vecTypeOutput);
 
16682        sc->
tempLen = sprintf(sc->
tempStr, 
"(%s* inputs, %s* outputs", vecTypeInput, floatTypeOutputMemory);
 
16687        sc->
tempLen = sprintf(sc->
tempStr, 
"(%s* inputs, %s* outputs", floatTypeInputMemory, floatTypeOutputMemory);
 
16692        sc->
tempLen = sprintf(sc->
tempStr, 
"(%s* inputs, %s* outputs", floatTypeInputMemory, floatTypeOutputMemory);
 
16697        sc->
tempLen = sprintf(sc->
tempStr, 
"(%s* inputs, %s* outputs", floatTypeInputMemory, floatTypeOutputMemory);
 
16702        sc->
tempLen = sprintf(sc->
tempStr, 
"(%s* inputs, %s* outputs", floatTypeInputMemory, floatTypeOutputMemory);
 
16707        sc->
tempLen = sprintf(sc->
tempStr, 
"(%s* inputs, %s* outputs", floatTypeInputMemory, floatTypeOutputMemory);
 
16712        sc->
tempLen = sprintf(sc->
tempStr, 
"(%s* inputs, %s* outputs", floatTypeInputMemory, floatTypeOutputMemory);
 
16717        sc->
tempLen = sprintf(sc->
tempStr, 
"(%s* inputs, %s* outputs", floatTypeInputMemory, floatTypeOutputMemory);
 
16722        sc->
tempLen = sprintf(sc->
tempStr, 
"(%s* inputs, %s* outputs", floatTypeInputMemory, floatTypeOutputMemory);
 
16727        sc->
tempLen = sprintf(sc->
tempStr, 
"(%s* inputs, %s* outputs", floatTypeInputMemory, floatTypeOutputMemory);
 
16732        sc->
tempLen = sprintf(sc->
tempStr, 
"(%s* inputs, %s* outputs", floatTypeInputMemory, floatTypeOutputMemory);
 
16737        sc->
tempLen = sprintf(sc->
tempStr, 
"(%s* inputs, %s* outputs", floatTypeInputMemory, floatTypeOutputMemory);
 
16742        sc->
tempLen = sprintf(sc->
tempStr, 
"(%s* inputs, %s* outputs", floatTypeInputMemory, floatTypeOutputMemory);
 
16747        sc->
tempLen = sprintf(sc->
tempStr, 
"(%s* inputs, %s* outputs", vecTypeInput, vecTypeOutput);
 
16758        sc->
tempLen = sprintf(sc->
tempStr, 
", %s* kernel_obj", vecType);
 
16766        sc->
tempLen = sprintf(sc->
tempStr, 
", %s* twiddleLUT", vecType);
 
16774        sc->
tempLen = sprintf(sc->
tempStr, 
", %s* BluesteinConvolutionKernel", vecType);
 
16782        sc->
tempLen = sprintf(sc->
tempStr, 
", %s* BluesteinMultiplication", vecType);
 
16801#elif(VKFFT_BACKEND==2) 
16802    sc->
tempLen = sprintf(sc->
tempStr, 
"extern __shared__ float shared[];\n");
 
16817        sc->
tempLen = sprintf(sc->
tempStr, 
"(%s* inputs, %s* outputs", floatTypeInputMemory, vecTypeOutput);
 
16822        sc->
tempLen = sprintf(sc->
tempStr, 
"(%s* inputs, %s* outputs", vecTypeInput, floatTypeOutputMemory);
 
16827        sc->
tempLen = sprintf(sc->
tempStr, 
"(%s* inputs, %s* outputs", floatTypeInputMemory, floatTypeOutputMemory);
 
16832        sc->
tempLen = sprintf(sc->
tempStr, 
"(%s* inputs, %s* outputs", floatTypeInputMemory, floatTypeOutputMemory);
 
16837        sc->
tempLen = sprintf(sc->
tempStr, 
"(%s* inputs, %s* outputs", floatTypeInputMemory, floatTypeOutputMemory);
 
16842        sc->
tempLen = sprintf(sc->
tempStr, 
"(%s* inputs, %s* outputs", floatTypeInputMemory, floatTypeOutputMemory);
 
16847        sc->
tempLen = sprintf(sc->
tempStr, 
"(%s* inputs, %s* outputs", floatTypeInputMemory, floatTypeOutputMemory);
 
16852        sc->
tempLen = sprintf(sc->
tempStr, 
"(%s* inputs, %s* outputs", floatTypeInputMemory, floatTypeOutputMemory);
 
16857        sc->
tempLen = sprintf(sc->
tempStr, 
"(%s* inputs, %s* outputs", floatTypeInputMemory, floatTypeOutputMemory);
 
16862        sc->
tempLen = sprintf(sc->
tempStr, 
"(%s* inputs, %s* outputs", floatTypeInputMemory, floatTypeOutputMemory);
 
16867        sc->
tempLen = sprintf(sc->
tempStr, 
"(%s* inputs, %s* outputs", floatTypeInputMemory, floatTypeOutputMemory);
 
16872        sc->
tempLen = sprintf(sc->
tempStr, 
"(%s* inputs, %s* outputs", floatTypeInputMemory, floatTypeOutputMemory);
 
16877        sc->
tempLen = sprintf(sc->
tempStr, 
"(%s* inputs, %s* outputs", floatTypeInputMemory, floatTypeOutputMemory);
 
16882        sc->
tempLen = sprintf(sc->
tempStr, 
"(%s* inputs, %s* outputs", floatTypeInputMemory, floatTypeOutputMemory);
 
16887        sc->
tempLen = sprintf(sc->
tempStr, 
"(%s* inputs, %s* outputs", vecTypeInput, vecTypeOutput);
 
16897        sc->
tempLen = sprintf(sc->
tempStr, 
", %s* kernel_obj", vecType);
 
16905        sc->
tempLen = sprintf(sc->
tempStr, 
", %s* twiddleLUT", vecType);
 
16913        sc->
tempLen = sprintf(sc->
tempStr, 
", %s* BluesteinConvolutionKernel", vecType);
 
16921        sc->
tempLen = sprintf(sc->
tempStr, 
", %s* BluesteinMultiplication", vecType);
 
16940#elif(VKFFT_BACKEND==3) 
16950        sc->
tempLen = sprintf(sc->
tempStr, 
"(__global %s* inputs, __global %s* outputs", floatTypeInputMemory, vecTypeOutput);
 
16955        sc->
tempLen = sprintf(sc->
tempStr, 
"(__global %s* inputs, __global %s* outputs", vecTypeInput, floatTypeOutputMemory);
 
16960        sc->
tempLen = sprintf(sc->
tempStr, 
"(__global %s* inputs, __global %s* outputs", floatTypeInputMemory, floatTypeOutputMemory);
 
16965        sc->
tempLen = sprintf(sc->
tempStr, 
"(__global %s* inputs, __global %s* outputs", floatTypeInputMemory, floatTypeOutputMemory);
 
16970        sc->
tempLen = sprintf(sc->
tempStr, 
"(__global %s* inputs, __global %s* outputs", floatTypeInputMemory, floatTypeOutputMemory);
 
16975        sc->
tempLen = sprintf(sc->
tempStr, 
"(__global %s* inputs, __global %s* outputs", floatTypeInputMemory, floatTypeOutputMemory);
 
16980        sc->
tempLen = sprintf(sc->
tempStr, 
"(__global %s* inputs, __global %s* outputs", floatTypeInputMemory, floatTypeOutputMemory);
 
16985        sc->
tempLen = sprintf(sc->
tempStr, 
"(__global %s* inputs, __global %s* outputs", floatTypeInputMemory, floatTypeOutputMemory);
 
16990        sc->
tempLen = sprintf(sc->
tempStr, 
"(__global %s* inputs, __global %s* outputs", floatTypeInputMemory, floatTypeOutputMemory);
 
16995        sc->
tempLen = sprintf(sc->
tempStr, 
"(__global %s* inputs, __global %s* outputs", floatTypeInputMemory, floatTypeOutputMemory);
 
17000        sc->
tempLen = sprintf(sc->
tempStr, 
"(__global %s* inputs, __global %s* outputs", floatTypeInputMemory, floatTypeOutputMemory);
 
17005        sc->
tempLen = sprintf(sc->
tempStr, 
"(__global %s* inputs, __global %s* outputs", floatTypeInputMemory, floatTypeOutputMemory);
 
17010        sc->
tempLen = sprintf(sc->
tempStr, 
"(__global %s* inputs, __global %s* outputs", floatTypeInputMemory, floatTypeOutputMemory);
 
17015        sc->
tempLen = sprintf(sc->
tempStr, 
"(__global %s* inputs, __global %s* outputs", floatTypeInputMemory, floatTypeOutputMemory);
 
17020        sc->
tempLen = sprintf(sc->
tempStr, 
"(__global %s* inputs, __global %s* outputs", vecTypeInput, vecTypeOutput);
 
17030        sc->
tempLen = sprintf(sc->
tempStr, 
", __global %s* kernel_obj", vecType);
 
17038        sc->
tempLen = sprintf(sc->
tempStr, 
", __global %s* twiddleLUT", vecType);
 
17046        sc->
tempLen = sprintf(sc->
tempStr, 
", __global %s* BluesteinConvolutionKernel", vecType);
 
17054        sc->
tempLen = sprintf(sc->
tempStr, 
", __global %s* BluesteinMultiplication", vecType);
 
17097        sc->
tempLen = sprintf(sc->
tempStr, 
"  for (%s coordinate=%" PRIu64 
"; coordinate > 0; coordinate--){\n\ 
17128    uint64_t stageSize = 1;
 
17129    uint64_t stageSizeSum = 0;
 
17130    double PI_const = 3.1415926535897932384626433832795;
 
17131    double stageAngle = (sc->
inverse) ? PI_const : -PI_const;
 
17132    for (uint64_t i = 0; i < sc->
numStages; i++) {
 
17154                stageSizeSum += stageSize;
 
17157                stageSizeSum += stageSize * 2;
 
17160                stageSizeSum += stageSize * 2;
 
17163                stageSizeSum += stageSize * 4;
 
17166                stageSizeSum += stageSize * 6;
 
17169                stageSizeSum += stageSize * 3;
 
17172                stageSizeSum += stageSize * 10;
 
17175                stageSizeSum += stageSize * 12;
 
17250        stageAngle = PI_const;
 
17252        for (uint64_t i = 0; i < sc->
numStages; i++) {
 
17260                stageSizeSum += stageSize;
 
17263                stageSizeSum += stageSize * 2;
 
17266                stageSizeSum += stageSize * 2;
 
17269                stageSizeSum += stageSize * 4;
 
17272                stageSizeSum += stageSize * 6;
 
17275                stageSizeSum += stageSize * 3;
 
17278                stageSizeSum += stageSize * 10;
 
17281                stageSizeSum += stageSize * 12;
 
 
17683static inline VkFFTResult VkFFTGetRegistersPerThread(uint64_t* loc_multipliers, uint64_t* registers_per_thread_per_radix, uint64_t* registers_per_thread, uint64_t* min_registers_per_thread, uint64_t* isGoodSequence) {
 
17684    for (uint64_t i = 0; i < 14; i++) {
 
17685        registers_per_thread_per_radix[i] = 0;
 
17687    registers_per_thread[0] = 0;
 
17688    min_registers_per_thread[0] = -1;
 
17690    if (loc_multipliers[2] > 0) {
 
17691        if (loc_multipliers[3] > 0) {
 
17692            if (loc_multipliers[5] > 0) {
 
17693                if (loc_multipliers[7] > 0) {
 
17694                    if (loc_multipliers[11] > 0) {
 
17695                        if (loc_multipliers[13] > 0) {
 
17696                            switch (loc_multipliers[2]) {
 
17698                                registers_per_thread_per_radix[2] = 14;
 
17699                                registers_per_thread_per_radix[3] = 15;
 
17702                                registers_per_thread_per_radix[2] = 12;
 
17703                                registers_per_thread_per_radix[3] = 12;
 
17706                                registers_per_thread_per_radix[2] = 12;
 
17707                                registers_per_thread_per_radix[3] = 12;
 
17710                                registers_per_thread_per_radix[2] = 16;
 
17711                                registers_per_thread_per_radix[3] = 12;
 
17714                            registers_per_thread_per_radix[5] = 15;
 
17715                            registers_per_thread_per_radix[7] = 14;
 
17716                            registers_per_thread_per_radix[11] = 11;
 
17717                            registers_per_thread_per_radix[13] = 13;
 
17720                            switch (loc_multipliers[2]) {
 
17722                                registers_per_thread_per_radix[2] = 14;
 
17723                                registers_per_thread_per_radix[3] = 15;
 
17726                                registers_per_thread_per_radix[2] = 12;
 
17727                                registers_per_thread_per_radix[3] = 12;
 
17730                                registers_per_thread_per_radix[2] = 12;
 
17731                                registers_per_thread_per_radix[3] = 12;
 
17734                                registers_per_thread_per_radix[2] = 16;
 
17735                                registers_per_thread_per_radix[3] = 12;
 
17738                            registers_per_thread_per_radix[5] = 15;
 
17739                            registers_per_thread_per_radix[7] = 14;
 
17740                            registers_per_thread_per_radix[11] = 11;
 
17741                            registers_per_thread_per_radix[13] = 0;
 
17745                        if (loc_multipliers[13] > 0) {
 
17746                            switch (loc_multipliers[2]) {
 
17748                                registers_per_thread_per_radix[2] = 14;
 
17749                                registers_per_thread_per_radix[3] = 15;
 
17752                                registers_per_thread_per_radix[2] = 12;
 
17753                                registers_per_thread_per_radix[3] = 12;
 
17756                                registers_per_thread_per_radix[2] = 12;
 
17757                                registers_per_thread_per_radix[3] = 12;
 
17760                                registers_per_thread_per_radix[2] = 16;
 
17761                                registers_per_thread_per_radix[3] = 12;
 
17764                            registers_per_thread_per_radix[5] = 15;
 
17765                            registers_per_thread_per_radix[7] = 14;
 
17766                            registers_per_thread_per_radix[11] = 0;
 
17767                            registers_per_thread_per_radix[13] = 13;
 
17771                            switch (loc_multipliers[2]) {
 
17773                                registers_per_thread_per_radix[2] = 14;
 
17774                                registers_per_thread_per_radix[3] = 15;
 
17778                                registers_per_thread_per_radix[2] = 12;
 
17779                                registers_per_thread_per_radix[3] = 12;
 
17782                                registers_per_thread_per_radix[2] = 12;
 
17783                                registers_per_thread_per_radix[3] = 12;
 
17786                                registers_per_thread_per_radix[2] = 16;
 
17787                                registers_per_thread_per_radix[3] = 12;
 
17790                            registers_per_thread_per_radix[5] = 15;
 
17791                            registers_per_thread_per_radix[7] = 14;
 
17792                            registers_per_thread_per_radix[11] = 0;
 
17793                            registers_per_thread_per_radix[13] = 0;
 
17798                    if (loc_multipliers[11] > 0) {
 
17799                        if (loc_multipliers[13] > 0) {
 
17800                            switch (loc_multipliers[2]) {
 
17802                                registers_per_thread_per_radix[2] = 10;
 
17803                                registers_per_thread_per_radix[3] = 15;
 
17806                                registers_per_thread_per_radix[2] = 12;
 
17807                                registers_per_thread_per_radix[3] = 12;
 
17810                                registers_per_thread_per_radix[2] = 12;
 
17811                                registers_per_thread_per_radix[3] = 12;
 
17814                            registers_per_thread_per_radix[5] = 10;
 
17815                            registers_per_thread_per_radix[7] = 0;
 
17816                            registers_per_thread_per_radix[11] = 11;
 
17817                            registers_per_thread_per_radix[13] = 13;
 
17820                            switch (loc_multipliers[2]) {
 
17822                                registers_per_thread_per_radix[2] = 10;
 
17823                                registers_per_thread_per_radix[3] = 15;
 
17826                                registers_per_thread_per_radix[2] = 12;
 
17827                                registers_per_thread_per_radix[3] = 12;
 
17830                                registers_per_thread_per_radix[2] = 12;
 
17831                                registers_per_thread_per_radix[3] = 12;
 
17834                            registers_per_thread_per_radix[5] = 10;
 
17835                            registers_per_thread_per_radix[7] = 0;
 
17836                            registers_per_thread_per_radix[11] = 11;
 
17837                            registers_per_thread_per_radix[13] = 0;
 
17841                        if (loc_multipliers[13] > 0) {
 
17842                            switch (loc_multipliers[2]) {
 
17844                                registers_per_thread_per_radix[2] = 10;
 
17845                                registers_per_thread_per_radix[3] = 15;
 
17848                                registers_per_thread_per_radix[2] = 12;
 
17849                                registers_per_thread_per_radix[3] = 12;
 
17852                                registers_per_thread_per_radix[2] = 12;
 
17853                                registers_per_thread_per_radix[3] = 12;
 
17856                            registers_per_thread_per_radix[5] = 10;
 
17857                            registers_per_thread_per_radix[7] = 0;
 
17858                            registers_per_thread_per_radix[11] = 0;
 
17859                            registers_per_thread_per_radix[13] = 13;
 
17862                            switch (loc_multipliers[2]) {
 
17864                                registers_per_thread_per_radix[2] = 6;
 
17865                                registers_per_thread_per_radix[3] = 6;
 
17866                                registers_per_thread_per_radix[5] = 5;
 
17869                                registers_per_thread_per_radix[2] = 12;
 
17870                                registers_per_thread_per_radix[3] = 12;
 
17871                                registers_per_thread_per_radix[5] = 10;
 
17874                                registers_per_thread_per_radix[2] = 12;
 
17875                                registers_per_thread_per_radix[3] = 12;
 
17876                                registers_per_thread_per_radix[5] = 10;
 
17879                            registers_per_thread_per_radix[7] = 0;
 
17880                            registers_per_thread_per_radix[11] = 0;
 
17881                            registers_per_thread_per_radix[13] = 0;
 
17889                if (loc_multipliers[7] > 0) {
 
17890                    if (loc_multipliers[11] > 0) {
 
17891                        if (loc_multipliers[13] > 0) {
 
17892                            switch (loc_multipliers[2]) {
 
17894                                registers_per_thread_per_radix[2] = 22;
 
17895                                registers_per_thread_per_radix[3] = 21;
 
17896                                registers_per_thread_per_radix[5] = 0;
 
17897                                registers_per_thread_per_radix[7] = 21;
 
17898                                registers_per_thread_per_radix[11] = 22;
 
17899                                registers_per_thread_per_radix[13] = 26;
 
17902                                registers_per_thread_per_radix[2] = 12;
 
17903                                registers_per_thread_per_radix[3] = 12;
 
17904                                registers_per_thread_per_radix[5] = 0;
 
17905                                registers_per_thread_per_radix[7] = 14;
 
17906                                registers_per_thread_per_radix[11] = 11;
 
17907                                registers_per_thread_per_radix[13] = 13;
 
17910                                registers_per_thread_per_radix[2] = 12;
 
17911                                registers_per_thread_per_radix[3] = 12;
 
17912                                registers_per_thread_per_radix[5] = 0;
 
17913                                registers_per_thread_per_radix[7] = 14;
 
17914                                registers_per_thread_per_radix[11] = 11;
 
17915                                registers_per_thread_per_radix[13] = 13;
 
17920                            switch (loc_multipliers[2]) {
 
17922                                registers_per_thread_per_radix[2] = 22;
 
17923                                registers_per_thread_per_radix[3] = 21;
 
17924                                registers_per_thread_per_radix[5] = 0;
 
17925                                registers_per_thread_per_radix[7] = 21;
 
17926                                registers_per_thread_per_radix[11] = 22;
 
17927                                registers_per_thread_per_radix[13] = 0;
 
17930                                registers_per_thread_per_radix[2] = 12;
 
17931                                registers_per_thread_per_radix[3] = 12;
 
17932                                registers_per_thread_per_radix[5] = 0;
 
17933                                registers_per_thread_per_radix[7] = 14;
 
17934                                registers_per_thread_per_radix[11] = 11;
 
17935                                registers_per_thread_per_radix[13] = 0;
 
17938                                registers_per_thread_per_radix[2] = 12;
 
17939                                registers_per_thread_per_radix[3] = 12;
 
17940                                registers_per_thread_per_radix[5] = 0;
 
17941                                registers_per_thread_per_radix[7] = 14;
 
17942                                registers_per_thread_per_radix[11] = 11;
 
17943                                registers_per_thread_per_radix[13] = 0;
 
17949                        if (loc_multipliers[13] > 0) {
 
17950                            switch (loc_multipliers[2]) {
 
17952                                registers_per_thread_per_radix[2] = 26;
 
17953                                registers_per_thread_per_radix[3] = 21;
 
17954                                registers_per_thread_per_radix[5] = 0;
 
17955                                registers_per_thread_per_radix[7] = 21;
 
17956                                registers_per_thread_per_radix[11] = 0;
 
17957                                registers_per_thread_per_radix[13] = 26;
 
17960                                registers_per_thread_per_radix[2] = 12;
 
17961                                registers_per_thread_per_radix[3] = 12;
 
17962                                registers_per_thread_per_radix[5] = 0;
 
17963                                registers_per_thread_per_radix[7] = 14;
 
17964                                registers_per_thread_per_radix[11] = 0;
 
17965                                registers_per_thread_per_radix[13] = 13;
 
17968                                registers_per_thread_per_radix[2] = 12;
 
17969                                registers_per_thread_per_radix[3] = 12;
 
17970                                registers_per_thread_per_radix[5] = 0;
 
17971                                registers_per_thread_per_radix[7] = 14;
 
17972                                registers_per_thread_per_radix[11] = 0;
 
17973                                registers_per_thread_per_radix[13] = 13;
 
17978                            switch (loc_multipliers[2]) {
 
17980                                registers_per_thread_per_radix[2] = 6;
 
17981                                registers_per_thread_per_radix[3] = 6;
 
17982                                registers_per_thread_per_radix[5] = 0;
 
17983                                registers_per_thread_per_radix[7] = 7;
 
17984                                registers_per_thread_per_radix[11] = 0;
 
17985                                registers_per_thread_per_radix[13] = 0;
 
17988                                registers_per_thread_per_radix[2] = 6;
 
17989                                registers_per_thread_per_radix[3] = 6;
 
17990                                registers_per_thread_per_radix[5] = 0;
 
17991                                registers_per_thread_per_radix[7] = 7;
 
17992                                registers_per_thread_per_radix[11] = 0;
 
17993                                registers_per_thread_per_radix[13] = 0;
 
17996                                registers_per_thread_per_radix[2] = 8;
 
17997                                registers_per_thread_per_radix[3] = 6;
 
17998                                registers_per_thread_per_radix[5] = 0;
 
17999                                registers_per_thread_per_radix[7] = 7;
 
18000                                registers_per_thread_per_radix[11] = 0;
 
18001                                registers_per_thread_per_radix[13] = 0;
 
18008                    if (loc_multipliers[11] > 0) {
 
18009                        if (loc_multipliers[13] > 0) {
 
18010                            switch (loc_multipliers[2]) {
 
18012                                registers_per_thread_per_radix[2] = 6;
 
18013                                registers_per_thread_per_radix[3] = 6;
 
18014                                registers_per_thread_per_radix[5] = 0;
 
18015                                registers_per_thread_per_radix[7] = 0;
 
18016                                registers_per_thread_per_radix[11] = 11;
 
18017                                registers_per_thread_per_radix[13] = 13;
 
18020                                registers_per_thread_per_radix[2] = 12;
 
18021                                registers_per_thread_per_radix[3] = 12;
 
18022                                registers_per_thread_per_radix[5] = 0;
 
18023                                registers_per_thread_per_radix[7] = 0;
 
18024                                registers_per_thread_per_radix[11] = 11;
 
18025                                registers_per_thread_per_radix[13] = 13;
 
18028                                registers_per_thread_per_radix[2] = 12;
 
18029                                registers_per_thread_per_radix[3] = 12;
 
18030                                registers_per_thread_per_radix[5] = 0;
 
18031                                registers_per_thread_per_radix[7] = 0;
 
18032                                registers_per_thread_per_radix[11] = 11;
 
18033                                registers_per_thread_per_radix[13] = 13;
 
18038                            switch (loc_multipliers[2]) {
 
18040                                registers_per_thread_per_radix[2] = 6;
 
18041                                registers_per_thread_per_radix[3] = 6;
 
18042                                registers_per_thread_per_radix[5] = 0;
 
18043                                registers_per_thread_per_radix[7] = 0;
 
18044                                registers_per_thread_per_radix[11] = 11;
 
18045                                registers_per_thread_per_radix[13] = 0;
 
18048                                registers_per_thread_per_radix[2] = 12;
 
18049                                registers_per_thread_per_radix[3] = 12;
 
18050                                registers_per_thread_per_radix[5] = 0;
 
18051                                registers_per_thread_per_radix[7] = 0;
 
18052                                registers_per_thread_per_radix[11] = 11;
 
18053                                registers_per_thread_per_radix[13] = 0;
 
18056                                registers_per_thread_per_radix[2] = 12;
 
18057                                registers_per_thread_per_radix[3] = 12;
 
18058                                registers_per_thread_per_radix[5] = 0;
 
18059                                registers_per_thread_per_radix[7] = 0;
 
18060                                registers_per_thread_per_radix[11] = 11;
 
18061                                registers_per_thread_per_radix[13] = 0;
 
18067                        if (loc_multipliers[13] > 0) {
 
18068                            switch (loc_multipliers[2]) {
 
18070                                registers_per_thread_per_radix[2] = 6;
 
18071                                registers_per_thread_per_radix[3] = 6;
 
18072                                registers_per_thread_per_radix[5] = 0;
 
18073                                registers_per_thread_per_radix[7] = 0;
 
18074                                registers_per_thread_per_radix[11] = 0;
 
18075                                registers_per_thread_per_radix[13] = 13;
 
18078                                registers_per_thread_per_radix[2] = 12;
 
18079                                registers_per_thread_per_radix[3] = 12;
 
18080                                registers_per_thread_per_radix[5] = 0;
 
18081                                registers_per_thread_per_radix[7] = 0;
 
18082                                registers_per_thread_per_radix[11] = 0;
 
18083                                registers_per_thread_per_radix[13] = 13;
 
18086                                registers_per_thread_per_radix[2] = 12;
 
18087                                registers_per_thread_per_radix[3] = 12;
 
18088                                registers_per_thread_per_radix[5] = 0;
 
18089                                registers_per_thread_per_radix[7] = 0;
 
18090                                registers_per_thread_per_radix[11] = 0;
 
18091                                registers_per_thread_per_radix[13] = 13;
 
18096                            switch (loc_multipliers[2]) {
 
18098                                registers_per_thread_per_radix[2] = 6;
 
18099                                registers_per_thread_per_radix[3] = 6;
 
18100                                registers_per_thread_per_radix[5] = 0;
 
18101                                registers_per_thread_per_radix[7] = 0;
 
18102                                registers_per_thread_per_radix[11] = 0;
 
18103                                registers_per_thread_per_radix[13] = 0;
 
18106                                registers_per_thread_per_radix[2] = 12;
 
18107                                registers_per_thread_per_radix[3] = 12;
 
18108                                registers_per_thread_per_radix[5] = 0;
 
18109                                registers_per_thread_per_radix[7] = 0;
 
18110                                registers_per_thread_per_radix[11] = 0;
 
18111                                registers_per_thread_per_radix[13] = 0;
 
18114                                registers_per_thread_per_radix[2] = 12;
 
18115                                registers_per_thread_per_radix[3] = 12;
 
18116                                registers_per_thread_per_radix[5] = 0;
 
18117                                registers_per_thread_per_radix[7] = 0;
 
18118                                registers_per_thread_per_radix[11] = 0;
 
18119                                registers_per_thread_per_radix[13] = 0;
 
18128            if (loc_multipliers[5] > 0) {
 
18129                if (loc_multipliers[7] > 0) {
 
18130                    if (loc_multipliers[11] > 0) {
 
18131                        if (loc_multipliers[13] > 0) {
 
18132                            switch (loc_multipliers[2]) {
 
18134                                registers_per_thread_per_radix[2] = 10;
 
18135                                registers_per_thread_per_radix[3] = 0;
 
18136                                registers_per_thread_per_radix[5] = 10;
 
18137                                registers_per_thread_per_radix[7] = 14;
 
18138                                registers_per_thread_per_radix[11] = 11;
 
18139                                registers_per_thread_per_radix[13] = 13;
 
18142                                registers_per_thread_per_radix[2] = 10;
 
18143                                registers_per_thread_per_radix[3] = 0;
 
18144                                registers_per_thread_per_radix[5] = 10;
 
18145                                registers_per_thread_per_radix[7] = 14;
 
18146                                registers_per_thread_per_radix[11] = 11;
 
18147                                registers_per_thread_per_radix[13] = 13;
 
18150                                registers_per_thread_per_radix[2] = 8;
 
18151                                registers_per_thread_per_radix[3] = 0;
 
18152                                registers_per_thread_per_radix[5] = 10;
 
18153                                registers_per_thread_per_radix[7] = 14;
 
18154                                registers_per_thread_per_radix[11] = 11;
 
18155                                registers_per_thread_per_radix[13] = 13;
 
18158                                registers_per_thread_per_radix[2] = 16;
 
18159                                registers_per_thread_per_radix[3] = 0;
 
18160                                registers_per_thread_per_radix[5] = 10;
 
18161                                registers_per_thread_per_radix[7] = 14;
 
18162                                registers_per_thread_per_radix[11] = 11;
 
18163                                registers_per_thread_per_radix[13] = 13;
 
18168                            switch (loc_multipliers[2]) {
 
18170                                registers_per_thread_per_radix[2] = 10;
 
18171                                registers_per_thread_per_radix[3] = 0;
 
18172                                registers_per_thread_per_radix[5] = 10;
 
18173                                registers_per_thread_per_radix[7] = 14;
 
18174                                registers_per_thread_per_radix[11] = 11;
 
18175                                registers_per_thread_per_radix[13] = 0;
 
18178                                registers_per_thread_per_radix[2] = 10;
 
18179                                registers_per_thread_per_radix[3] = 0;
 
18180                                registers_per_thread_per_radix[5] = 10;
 
18181                                registers_per_thread_per_radix[7] = 14;
 
18182                                registers_per_thread_per_radix[11] = 11;
 
18183                                registers_per_thread_per_radix[13] = 0;
 
18186                                registers_per_thread_per_radix[2] = 8;
 
18187                                registers_per_thread_per_radix[3] = 0;
 
18188                                registers_per_thread_per_radix[5] = 10;
 
18189                                registers_per_thread_per_radix[7] = 14;
 
18190                                registers_per_thread_per_radix[11] = 11;
 
18191                                registers_per_thread_per_radix[13] = 0;
 
18194                                registers_per_thread_per_radix[2] = 16;
 
18195                                registers_per_thread_per_radix[3] = 0;
 
18196                                registers_per_thread_per_radix[5] = 10;
 
18197                                registers_per_thread_per_radix[7] = 14;
 
18198                                registers_per_thread_per_radix[11] = 11;
 
18199                                registers_per_thread_per_radix[13] = 0;
 
18205                        if (loc_multipliers[13] > 0) {
 
18206                            switch (loc_multipliers[2]) {
 
18208                                registers_per_thread_per_radix[2] = 10;
 
18209                                registers_per_thread_per_radix[3] = 0;
 
18210                                registers_per_thread_per_radix[5] = 10;
 
18211                                registers_per_thread_per_radix[7] = 14;
 
18212                                registers_per_thread_per_radix[11] = 0;
 
18213                                registers_per_thread_per_radix[13] = 13;
 
18216                                registers_per_thread_per_radix[2] = 10;
 
18217                                registers_per_thread_per_radix[3] = 0;
 
18218                                registers_per_thread_per_radix[5] = 10;
 
18219                                registers_per_thread_per_radix[7] = 14;
 
18220                                registers_per_thread_per_radix[11] = 0;
 
18221                                registers_per_thread_per_radix[13] = 13;
 
18224                                registers_per_thread_per_radix[2] = 8;
 
18225                                registers_per_thread_per_radix[3] = 0;
 
18226                                registers_per_thread_per_radix[5] = 10;
 
18227                                registers_per_thread_per_radix[7] = 14;
 
18228                                registers_per_thread_per_radix[11] = 0;
 
18229                                registers_per_thread_per_radix[13] = 13;
 
18232                                registers_per_thread_per_radix[2] = 16;
 
18233                                registers_per_thread_per_radix[3] = 0;
 
18234                                registers_per_thread_per_radix[5] = 10;
 
18235                                registers_per_thread_per_radix[7] = 14;
 
18236                                registers_per_thread_per_radix[11] = 0;
 
18237                                registers_per_thread_per_radix[13] = 13;
 
18242                            switch (loc_multipliers[2]) {
 
18244                                registers_per_thread_per_radix[2] = 10;
 
18245                                registers_per_thread_per_radix[3] = 0;
 
18246                                registers_per_thread_per_radix[5] = 10;
 
18247                                registers_per_thread_per_radix[7] = 7;
 
18248                                registers_per_thread_per_radix[11] = 0;
 
18249                                registers_per_thread_per_radix[13] = 0;
 
18252                                registers_per_thread_per_radix[2] = 10;
 
18253                                registers_per_thread_per_radix[3] = 0;
 
18254                                registers_per_thread_per_radix[5] = 10;
 
18255                                registers_per_thread_per_radix[7] = 7;
 
18256                                registers_per_thread_per_radix[11] = 0;
 
18257                                registers_per_thread_per_radix[13] = 0;
 
18260                                registers_per_thread_per_radix[2] = 8;
 
18261                                registers_per_thread_per_radix[3] = 0;
 
18262                                registers_per_thread_per_radix[5] = 10;
 
18263                                registers_per_thread_per_radix[7] = 7;
 
18264                                registers_per_thread_per_radix[11] = 0;
 
18265                                registers_per_thread_per_radix[13] = 0;
 
18272                    if (loc_multipliers[11] > 0) {
 
18273                        if (loc_multipliers[13] > 0) {
 
18274                            switch (loc_multipliers[2]) {
 
18276                                registers_per_thread_per_radix[2] = 10;
 
18277                                registers_per_thread_per_radix[3] = 0;
 
18278                                registers_per_thread_per_radix[5] = 10;
 
18279                                registers_per_thread_per_radix[7] = 0;
 
18280                                registers_per_thread_per_radix[11] = 11;
 
18281                                registers_per_thread_per_radix[13] = 13;
 
18284                                registers_per_thread_per_radix[2] = 10;
 
18285                                registers_per_thread_per_radix[3] = 0;
 
18286                                registers_per_thread_per_radix[5] = 10;
 
18287                                registers_per_thread_per_radix[7] = 0;
 
18288                                registers_per_thread_per_radix[11] = 11;
 
18289                                registers_per_thread_per_radix[13] = 13;
 
18292                                registers_per_thread_per_radix[2] = 8;
 
18293                                registers_per_thread_per_radix[3] = 0;
 
18294                                registers_per_thread_per_radix[5] = 10;
 
18295                                registers_per_thread_per_radix[7] = 0;
 
18296                                registers_per_thread_per_radix[11] = 11;
 
18297                                registers_per_thread_per_radix[13] = 13;
 
18302                            switch (loc_multipliers[2]) {
 
18304                                registers_per_thread_per_radix[2] = 10;
 
18305                                registers_per_thread_per_radix[3] = 0;
 
18306                                registers_per_thread_per_radix[5] = 10;
 
18307                                registers_per_thread_per_radix[7] = 0;
 
18308                                registers_per_thread_per_radix[11] = 11;
 
18309                                registers_per_thread_per_radix[13] = 0;
 
18312                                registers_per_thread_per_radix[2] = 10;
 
18313                                registers_per_thread_per_radix[3] = 0;
 
18314                                registers_per_thread_per_radix[5] = 10;
 
18315                                registers_per_thread_per_radix[7] = 0;
 
18316                                registers_per_thread_per_radix[11] = 11;
 
18317                                registers_per_thread_per_radix[13] = 0;
 
18320                                registers_per_thread_per_radix[2] = 8;
 
18321                                registers_per_thread_per_radix[3] = 0;
 
18322                                registers_per_thread_per_radix[5] = 10;
 
18323                                registers_per_thread_per_radix[7] = 0;
 
18324                                registers_per_thread_per_radix[11] = 11;
 
18325                                registers_per_thread_per_radix[13] = 0;
 
18331                        if (loc_multipliers[13] > 0) {
 
18332                            switch (loc_multipliers[2]) {
 
18334                                registers_per_thread_per_radix[2] = 10;
 
18335                                registers_per_thread_per_radix[3] = 0;
 
18336                                registers_per_thread_per_radix[5] = 10;
 
18337                                registers_per_thread_per_radix[7] = 0;
 
18338                                registers_per_thread_per_radix[11] = 0;
 
18339                                registers_per_thread_per_radix[13] = 13;
 
18342                                registers_per_thread_per_radix[2] = 10;
 
18343                                registers_per_thread_per_radix[3] = 0;
 
18344                                registers_per_thread_per_radix[5] = 10;
 
18345                                registers_per_thread_per_radix[7] = 0;
 
18346                                registers_per_thread_per_radix[11] = 0;
 
18347                                registers_per_thread_per_radix[13] = 13;
 
18350                                registers_per_thread_per_radix[2] = 8;
 
18351                                registers_per_thread_per_radix[3] = 0;
 
18352                                registers_per_thread_per_radix[5] = 10;
 
18353                                registers_per_thread_per_radix[7] = 0;
 
18354                                registers_per_thread_per_radix[11] = 0;
 
18355                                registers_per_thread_per_radix[13] = 13;
 
18360                            switch (loc_multipliers[2]) {
 
18362                                registers_per_thread_per_radix[2] = 10;
 
18363                                registers_per_thread_per_radix[3] = 0;
 
18364                                registers_per_thread_per_radix[5] = 10;
 
18365                                registers_per_thread_per_radix[7] = 0;
 
18366                                registers_per_thread_per_radix[11] = 0;
 
18367                                registers_per_thread_per_radix[13] = 0;
 
18370                                registers_per_thread_per_radix[2] = 10;
 
18371                                registers_per_thread_per_radix[3] = 0;
 
18372                                registers_per_thread_per_radix[5] = 10;
 
18373                                registers_per_thread_per_radix[7] = 0;
 
18374                                registers_per_thread_per_radix[11] = 0;
 
18375                                registers_per_thread_per_radix[13] = 0;
 
18378                                registers_per_thread_per_radix[2] = 8;
 
18379                                registers_per_thread_per_radix[3] = 0;
 
18380                                registers_per_thread_per_radix[5] = 10;
 
18381                                registers_per_thread_per_radix[7] = 0;
 
18382                                registers_per_thread_per_radix[11] = 0;
 
18383                                registers_per_thread_per_radix[13] = 0;
 
18392                if (loc_multipliers[7] > 0) {
 
18393                    if (loc_multipliers[11] > 0) {
 
18394                        if (loc_multipliers[13] > 0) {
 
18395                            switch (loc_multipliers[2]) {
 
18397                                registers_per_thread_per_radix[2] = 14;
 
18398                                registers_per_thread_per_radix[3] = 0;
 
18399                                registers_per_thread_per_radix[5] = 0;
 
18400                                registers_per_thread_per_radix[7] = 14;
 
18401                                registers_per_thread_per_radix[11] = 11;
 
18402                                registers_per_thread_per_radix[13] = 13;
 
18405                                registers_per_thread_per_radix[2] = 14;
 
18406                                registers_per_thread_per_radix[3] = 0;
 
18407                                registers_per_thread_per_radix[5] = 0;
 
18408                                registers_per_thread_per_radix[7] = 14;
 
18409                                registers_per_thread_per_radix[11] = 11;
 
18410                                registers_per_thread_per_radix[13] = 13;
 
18413                                registers_per_thread_per_radix[2] = 8;
 
18414                                registers_per_thread_per_radix[3] = 0;
 
18415                                registers_per_thread_per_radix[5] = 0;
 
18416                                registers_per_thread_per_radix[7] = 14;
 
18417                                registers_per_thread_per_radix[11] = 11;
 
18418                                registers_per_thread_per_radix[13] = 13;
 
18421                                registers_per_thread_per_radix[2] = 16;
 
18422                                registers_per_thread_per_radix[3] = 0;
 
18423                                registers_per_thread_per_radix[5] = 0;
 
18424                                registers_per_thread_per_radix[7] = 14;
 
18425                                registers_per_thread_per_radix[11] = 11;
 
18426                                registers_per_thread_per_radix[13] = 13;
 
18431                            switch (loc_multipliers[2]) {
 
18433                                registers_per_thread_per_radix[2] = 14;
 
18434                                registers_per_thread_per_radix[3] = 0;
 
18435                                registers_per_thread_per_radix[5] = 0;
 
18436                                registers_per_thread_per_radix[7] = 14;
 
18437                                registers_per_thread_per_radix[11] = 11;
 
18438                                registers_per_thread_per_radix[13] = 0;
 
18441                                registers_per_thread_per_radix[2] = 14;
 
18442                                registers_per_thread_per_radix[3] = 0;
 
18443                                registers_per_thread_per_radix[5] = 0;
 
18444                                registers_per_thread_per_radix[7] = 14;
 
18445                                registers_per_thread_per_radix[11] = 11;
 
18446                                registers_per_thread_per_radix[13] = 0;
 
18449                                registers_per_thread_per_radix[2] = 8;
 
18450                                registers_per_thread_per_radix[3] = 0;
 
18451                                registers_per_thread_per_radix[5] = 0;
 
18452                                registers_per_thread_per_radix[7] = 14;
 
18453                                registers_per_thread_per_radix[11] = 11;
 
18454                                registers_per_thread_per_radix[13] = 0;
 
18457                                registers_per_thread_per_radix[2] = 16;
 
18458                                registers_per_thread_per_radix[3] = 0;
 
18459                                registers_per_thread_per_radix[5] = 0;
 
18460                                registers_per_thread_per_radix[7] = 14;
 
18461                                registers_per_thread_per_radix[11] = 11;
 
18462                                registers_per_thread_per_radix[13] = 0;
 
18468                        if (loc_multipliers[13] > 0) {
 
18469                            switch (loc_multipliers[2]) {
 
18471                                registers_per_thread_per_radix[2] = 14;
 
18472                                registers_per_thread_per_radix[3] = 0;
 
18473                                registers_per_thread_per_radix[5] = 0;
 
18474                                registers_per_thread_per_radix[7] = 14;
 
18475                                registers_per_thread_per_radix[11] = 0;
 
18476                                registers_per_thread_per_radix[13] = 13;
 
18479                                registers_per_thread_per_radix[2] = 14;
 
18480                                registers_per_thread_per_radix[3] = 0;
 
18481                                registers_per_thread_per_radix[5] = 0;
 
18482                                registers_per_thread_per_radix[7] = 14;
 
18483                                registers_per_thread_per_radix[11] = 0;
 
18484                                registers_per_thread_per_radix[13] = 13;
 
18487                                registers_per_thread_per_radix[2] = 8;
 
18488                                registers_per_thread_per_radix[3] = 0;
 
18489                                registers_per_thread_per_radix[5] = 0;
 
18490                                registers_per_thread_per_radix[7] = 14;
 
18491                                registers_per_thread_per_radix[11] = 0;
 
18492                                registers_per_thread_per_radix[13] = 13;
 
18495                                registers_per_thread_per_radix[2] = 16;
 
18496                                registers_per_thread_per_radix[3] = 0;
 
18497                                registers_per_thread_per_radix[5] = 0;
 
18498                                registers_per_thread_per_radix[7] = 14;
 
18499                                registers_per_thread_per_radix[11] = 0;
 
18500                                registers_per_thread_per_radix[13] = 13;
 
18505                            switch (loc_multipliers[2]) {
 
18507                                registers_per_thread_per_radix[2] = 14;
 
18508                                registers_per_thread_per_radix[3] = 0;
 
18509                                registers_per_thread_per_radix[5] = 0;
 
18510                                registers_per_thread_per_radix[7] = 14;
 
18511                                registers_per_thread_per_radix[11] = 0;
 
18512                                registers_per_thread_per_radix[13] = 0;
 
18515                                registers_per_thread_per_radix[2] = 14;
 
18516                                registers_per_thread_per_radix[3] = 0;
 
18517                                registers_per_thread_per_radix[5] = 0;
 
18518                                registers_per_thread_per_radix[7] = 14;
 
18519                                registers_per_thread_per_radix[11] = 0;
 
18520                                registers_per_thread_per_radix[13] = 0;
 
18523                                registers_per_thread_per_radix[2] = 14;
 
18524                                registers_per_thread_per_radix[3] = 0;
 
18525                                registers_per_thread_per_radix[5] = 0;
 
18526                                registers_per_thread_per_radix[7] = 14;
 
18527                                registers_per_thread_per_radix[11] = 0;
 
18528                                registers_per_thread_per_radix[13] = 0;
 
18531                                registers_per_thread_per_radix[2] = 14;
 
18532                                registers_per_thread_per_radix[3] = 0;
 
18533                                registers_per_thread_per_radix[5] = 0;
 
18534                                registers_per_thread_per_radix[7] = 14;
 
18535                                registers_per_thread_per_radix[11] = 0;
 
18536                                registers_per_thread_per_radix[13] = 0;
 
18543                    if (loc_multipliers[11] > 0) {
 
18544                        if (loc_multipliers[13] > 0) {
 
18545                            switch (loc_multipliers[2]) {
 
18547                                registers_per_thread_per_radix[2] = 22;
 
18548                                registers_per_thread_per_radix[3] = 0;
 
18549                                registers_per_thread_per_radix[5] = 0;
 
18550                                registers_per_thread_per_radix[7] = 0;
 
18551                                registers_per_thread_per_radix[11] = 22;
 
18552                                registers_per_thread_per_radix[13] = 26;
 
18555                                registers_per_thread_per_radix[2] = 22;
 
18556                                registers_per_thread_per_radix[3] = 0;
 
18557                                registers_per_thread_per_radix[5] = 0;
 
18558                                registers_per_thread_per_radix[7] = 0;
 
18559                                registers_per_thread_per_radix[11] = 22;
 
18560                                registers_per_thread_per_radix[13] = 26;
 
18563                                registers_per_thread_per_radix[2] = 8;
 
18564                                registers_per_thread_per_radix[3] = 0;
 
18565                                registers_per_thread_per_radix[5] = 0;
 
18566                                registers_per_thread_per_radix[7] = 0;
 
18567                                registers_per_thread_per_radix[11] = 11;
 
18568                                registers_per_thread_per_radix[13] = 13;
 
18573                            switch (loc_multipliers[2]) {
 
18575                                registers_per_thread_per_radix[2] = 22;
 
18576                                registers_per_thread_per_radix[3] = 0;
 
18577                                registers_per_thread_per_radix[5] = 0;
 
18578                                registers_per_thread_per_radix[7] = 0;
 
18579                                registers_per_thread_per_radix[11] = 22;
 
18580                                registers_per_thread_per_radix[13] = 0;
 
18583                                registers_per_thread_per_radix[2] = 22;
 
18584                                registers_per_thread_per_radix[3] = 0;
 
18585                                registers_per_thread_per_radix[5] = 0;
 
18586                                registers_per_thread_per_radix[7] = 0;
 
18587                                registers_per_thread_per_radix[11] = 22;
 
18588                                registers_per_thread_per_radix[13] = 0;
 
18591                                registers_per_thread_per_radix[2] = 8;
 
18592                                registers_per_thread_per_radix[3] = 0;
 
18593                                registers_per_thread_per_radix[5] = 0;
 
18594                                registers_per_thread_per_radix[7] = 0;
 
18595                                registers_per_thread_per_radix[11] = 11;
 
18596                                registers_per_thread_per_radix[13] = 0;
 
18599                                registers_per_thread_per_radix[2] = 8;
 
18600                                registers_per_thread_per_radix[3] = 0;
 
18601                                registers_per_thread_per_radix[5] = 0;
 
18602                                registers_per_thread_per_radix[7] = 0;
 
18603                                registers_per_thread_per_radix[11] = 11;
 
18604                                registers_per_thread_per_radix[13] = 0;
 
18610                        if (loc_multipliers[13] > 0) {
 
18611                            switch (loc_multipliers[2]) {
 
18613                                registers_per_thread_per_radix[2] = 26;
 
18614                                registers_per_thread_per_radix[3] = 0;
 
18615                                registers_per_thread_per_radix[5] = 0;
 
18616                                registers_per_thread_per_radix[7] = 0;
 
18617                                registers_per_thread_per_radix[11] = 0;
 
18618                                registers_per_thread_per_radix[13] = 26;
 
18621                                registers_per_thread_per_radix[2] = 26;
 
18622                                registers_per_thread_per_radix[3] = 0;
 
18623                                registers_per_thread_per_radix[5] = 0;
 
18624                                registers_per_thread_per_radix[7] = 0;
 
18625                                registers_per_thread_per_radix[11] = 0;
 
18626                                registers_per_thread_per_radix[13] = 26;
 
18629                                registers_per_thread_per_radix[2] = 8;
 
18630                                registers_per_thread_per_radix[3] = 0;
 
18631                                registers_per_thread_per_radix[5] = 0;
 
18632                                registers_per_thread_per_radix[7] = 0;
 
18633                                registers_per_thread_per_radix[11] = 0;
 
18634                                registers_per_thread_per_radix[13] = 13;
 
18639                            registers_per_thread_per_radix[2] = (loc_multipliers[2] > 2) ? 8 : (uint64_t)pow(2, loc_multipliers[2]);
 
18640                            registers_per_thread_per_radix[3] = 0;
 
18641                            registers_per_thread_per_radix[5] = 0;
 
18642                            registers_per_thread_per_radix[7] = 0;
 
18643                            registers_per_thread_per_radix[11] = 0;
 
18644                            registers_per_thread_per_radix[13] = 0;
 
18652        if (loc_multipliers[3] > 0) {
 
18653            if (loc_multipliers[5] > 0) {
 
18654                if (loc_multipliers[7] > 0) {
 
18655                    if (loc_multipliers[11] > 0) {
 
18656                        if (loc_multipliers[13] > 0) {
 
18657                            registers_per_thread_per_radix[2] = 0;
 
18658                            registers_per_thread_per_radix[3] = 15;
 
18659                            registers_per_thread_per_radix[5] = 15;
 
18660                            registers_per_thread_per_radix[7] = 21;
 
18661                            registers_per_thread_per_radix[11] = 11;
 
18662                            registers_per_thread_per_radix[13] = 13;
 
18665                            registers_per_thread_per_radix[2] = 0;
 
18666                            registers_per_thread_per_radix[3] = 15;
 
18667                            registers_per_thread_per_radix[5] = 15;
 
18668                            registers_per_thread_per_radix[7] = 21;
 
18669                            registers_per_thread_per_radix[11] = 11;
 
18670                            registers_per_thread_per_radix[13] = 0;
 
18674                        if (loc_multipliers[13] > 0) {
 
18675                            registers_per_thread_per_radix[2] = 0;
 
18676                            registers_per_thread_per_radix[3] = 15;
 
18677                            registers_per_thread_per_radix[5] = 15;
 
18678                            registers_per_thread_per_radix[7] = 21;
 
18679                            registers_per_thread_per_radix[11] = 0;
 
18680                            registers_per_thread_per_radix[13] = 13;
 
18683                            registers_per_thread_per_radix[2] = 0;
 
18684                            registers_per_thread_per_radix[3] = 15;
 
18685                            registers_per_thread_per_radix[5] = 15;
 
18686                            registers_per_thread_per_radix[7] = 21;
 
18687                            registers_per_thread_per_radix[11] = 0;
 
18688                            registers_per_thread_per_radix[13] = 0;
 
18693                    if (loc_multipliers[11] > 0) {
 
18694                        if (loc_multipliers[13] > 0) {
 
18695                            registers_per_thread_per_radix[2] = 0;
 
18696                            registers_per_thread_per_radix[3] = 15;
 
18697                            registers_per_thread_per_radix[5] = 15;
 
18698                            registers_per_thread_per_radix[7] = 0;
 
18699                            registers_per_thread_per_radix[11] = 11;
 
18700                            registers_per_thread_per_radix[13] = 13;
 
18703                            registers_per_thread_per_radix[2] = 0;
 
18704                            registers_per_thread_per_radix[3] = 15;
 
18705                            registers_per_thread_per_radix[5] = 15;
 
18706                            registers_per_thread_per_radix[7] = 0;
 
18707                            registers_per_thread_per_radix[11] = 11;
 
18708                            registers_per_thread_per_radix[13] = 0;
 
18712                        if (loc_multipliers[13] > 0) {
 
18713                            registers_per_thread_per_radix[2] = 0;
 
18714                            registers_per_thread_per_radix[3] = 15;
 
18715                            registers_per_thread_per_radix[5] = 15;
 
18716                            registers_per_thread_per_radix[7] = 0;
 
18717                            registers_per_thread_per_radix[11] = 0;
 
18718                            registers_per_thread_per_radix[13] = 13;
 
18721                            registers_per_thread_per_radix[2] = 0;
 
18722                            registers_per_thread_per_radix[3] = 15;
 
18723                            registers_per_thread_per_radix[5] = 15;
 
18724                            registers_per_thread_per_radix[7] = 0;
 
18725                            registers_per_thread_per_radix[11] = 0;
 
18726                            registers_per_thread_per_radix[13] = 0;
 
18733                if (loc_multipliers[7] > 0) {
 
18734                    if (loc_multipliers[3] == 1) {
 
18735                        if (loc_multipliers[11] > 0) {
 
18736                            if (loc_multipliers[13] > 0) {
 
18737                                registers_per_thread_per_radix[2] = 0;
 
18738                                registers_per_thread_per_radix[3] = 21;
 
18739                                registers_per_thread_per_radix[5] = 0;
 
18740                                registers_per_thread_per_radix[7] = 21;
 
18741                                registers_per_thread_per_radix[11] = 11;
 
18742                                registers_per_thread_per_radix[13] = 13;
 
18745                                registers_per_thread_per_radix[2] = 0;
 
18746                                registers_per_thread_per_radix[3] = 21;
 
18747                                registers_per_thread_per_radix[5] = 0;
 
18748                                registers_per_thread_per_radix[7] = 21;
 
18749                                registers_per_thread_per_radix[11] = 11;
 
18750                                registers_per_thread_per_radix[13] = 0;
 
18754                            if (loc_multipliers[13] > 0) {
 
18755                                registers_per_thread_per_radix[2] = 0;
 
18756                                registers_per_thread_per_radix[3] = 21;
 
18757                                registers_per_thread_per_radix[5] = 0;
 
18758                                registers_per_thread_per_radix[7] = 21;
 
18759                                registers_per_thread_per_radix[11] = 0;
 
18760                                registers_per_thread_per_radix[13] = 13;
 
18763                                registers_per_thread_per_radix[2] = 0;
 
18764                                registers_per_thread_per_radix[3] = 21;
 
18765                                registers_per_thread_per_radix[5] = 0;
 
18766                                registers_per_thread_per_radix[7] = 21;
 
18767                                registers_per_thread_per_radix[11] = 0;
 
18768                                registers_per_thread_per_radix[13] = 0;
 
18773                        if (loc_multipliers[11] > 0) {
 
18774                            if (loc_multipliers[13] > 0) {
 
18775                                registers_per_thread_per_radix[2] = 0;
 
18776                                registers_per_thread_per_radix[3] = 9;
 
18777                                registers_per_thread_per_radix[5] = 0;
 
18778                                registers_per_thread_per_radix[7] = 7;
 
18779                                registers_per_thread_per_radix[11] = 11;
 
18780                                registers_per_thread_per_radix[13] = 13;
 
18783                                registers_per_thread_per_radix[2] = 0;
 
18784                                registers_per_thread_per_radix[3] = 9;
 
18785                                registers_per_thread_per_radix[5] = 0;
 
18786                                registers_per_thread_per_radix[7] = 7;
 
18787                                registers_per_thread_per_radix[11] = 11;
 
18788                                registers_per_thread_per_radix[13] = 0;
 
18792                            if (loc_multipliers[13] > 0) {
 
18793                                registers_per_thread_per_radix[2] = 0;
 
18794                                registers_per_thread_per_radix[3] = 9;
 
18795                                registers_per_thread_per_radix[5] = 0;
 
18796                                registers_per_thread_per_radix[7] = 7;
 
18797                                registers_per_thread_per_radix[11] = 0;
 
18798                                registers_per_thread_per_radix[13] = 13;
 
18801                                registers_per_thread_per_radix[2] = 0;
 
18802                                registers_per_thread_per_radix[3] = 9;
 
18803                                registers_per_thread_per_radix[5] = 0;
 
18804                                registers_per_thread_per_radix[7] = 7;
 
18805                                registers_per_thread_per_radix[11] = 0;
 
18806                                registers_per_thread_per_radix[13] = 0;
 
18812                    if (loc_multipliers[3] == 1) {
 
18813                        if (loc_multipliers[11] > 0) {
 
18814                            if (loc_multipliers[13] > 0) {
 
18815                                registers_per_thread_per_radix[2] = 0;
 
18816                                registers_per_thread_per_radix[3] = 33;
 
18817                                registers_per_thread_per_radix[5] = 0;
 
18818                                registers_per_thread_per_radix[7] = 0;
 
18819                                registers_per_thread_per_radix[11] = 33;
 
18820                                registers_per_thread_per_radix[13] = 39;
 
18823                                registers_per_thread_per_radix[2] = 0;
 
18824                                registers_per_thread_per_radix[3] = 33;
 
18825                                registers_per_thread_per_radix[5] = 0;
 
18826                                registers_per_thread_per_radix[7] = 0;
 
18827                                registers_per_thread_per_radix[11] = 33;
 
18828                                registers_per_thread_per_radix[13] = 0;
 
18832                            if (loc_multipliers[13] > 0) {
 
18833                                registers_per_thread_per_radix[2] = 0;
 
18834                                registers_per_thread_per_radix[3] = 39;
 
18835                                registers_per_thread_per_radix[5] = 0;
 
18836                                registers_per_thread_per_radix[7] = 0;
 
18837                                registers_per_thread_per_radix[11] = 0;
 
18838                                registers_per_thread_per_radix[13] = 39;
 
18841                                registers_per_thread_per_radix[2] = 0;
 
18842                                registers_per_thread_per_radix[3] = 3;
 
18843                                registers_per_thread_per_radix[5] = 0;
 
18844                                registers_per_thread_per_radix[7] = 0;
 
18845                                registers_per_thread_per_radix[11] = 0;
 
18846                                registers_per_thread_per_radix[13] = 0;
 
18851                        if (loc_multipliers[11] > 0) {
 
18852                            if (loc_multipliers[13] > 0) {
 
18853                                registers_per_thread_per_radix[2] = 0;
 
18854                                registers_per_thread_per_radix[3] = 9;
 
18855                                registers_per_thread_per_radix[5] = 0;
 
18856                                registers_per_thread_per_radix[7] = 0;
 
18857                                registers_per_thread_per_radix[11] = 11;
 
18858                                registers_per_thread_per_radix[13] = 13;
 
18861                                registers_per_thread_per_radix[2] = 0;
 
18862                                registers_per_thread_per_radix[3] = 9;
 
18863                                registers_per_thread_per_radix[5] = 0;
 
18864                                registers_per_thread_per_radix[7] = 0;
 
18865                                registers_per_thread_per_radix[11] = 11;
 
18866                                registers_per_thread_per_radix[13] = 0;
 
18870                            if (loc_multipliers[13] > 0) {
 
18871                                registers_per_thread_per_radix[2] = 0;
 
18872                                registers_per_thread_per_radix[3] = 9;
 
18873                                registers_per_thread_per_radix[5] = 0;
 
18874                                registers_per_thread_per_radix[7] = 0;
 
18875                                registers_per_thread_per_radix[11] = 0;
 
18876                                registers_per_thread_per_radix[13] = 13;
 
18879                                registers_per_thread_per_radix[2] = 0;
 
18880                                registers_per_thread_per_radix[3] = 9;
 
18881                                registers_per_thread_per_radix[5] = 0;
 
18882                                registers_per_thread_per_radix[7] = 0;
 
18883                                registers_per_thread_per_radix[11] = 0;
 
18884                                registers_per_thread_per_radix[13] = 0;
 
18892            if (loc_multipliers[5] > 0) {
 
18893                if (loc_multipliers[7] > 0) {
 
18894                    if (loc_multipliers[11] > 0) {
 
18895                        if (loc_multipliers[13] > 0) {
 
18896                            registers_per_thread_per_radix[2] = 0;
 
18897                            registers_per_thread_per_radix[3] = 0;
 
18898                            registers_per_thread_per_radix[5] = 5;
 
18899                            registers_per_thread_per_radix[7] = 7;
 
18900                            registers_per_thread_per_radix[11] = 11;
 
18901                            registers_per_thread_per_radix[13] = 13;
 
18904                            registers_per_thread_per_radix[2] = 0;
 
18905                            registers_per_thread_per_radix[3] = 0;
 
18906                            registers_per_thread_per_radix[5] = 5;
 
18907                            registers_per_thread_per_radix[7] = 7;
 
18908                            registers_per_thread_per_radix[11] = 11;
 
18909                            registers_per_thread_per_radix[13] = 0;
 
18913                        if (loc_multipliers[13] > 0) {
 
18914                            registers_per_thread_per_radix[2] = 0;
 
18915                            registers_per_thread_per_radix[3] = 0;
 
18916                            registers_per_thread_per_radix[5] = 5;
 
18917                            registers_per_thread_per_radix[7] = 7;
 
18918                            registers_per_thread_per_radix[11] = 0;
 
18919                            registers_per_thread_per_radix[13] = 13;
 
18922                            registers_per_thread_per_radix[2] = 0;
 
18923                            registers_per_thread_per_radix[3] = 0;
 
18924                            registers_per_thread_per_radix[5] = 5;
 
18925                            registers_per_thread_per_radix[7] = 7;
 
18926                            registers_per_thread_per_radix[11] = 0;
 
18927                            registers_per_thread_per_radix[13] = 0;
 
18932                    if (loc_multipliers[11] > 0) {
 
18933                        if (loc_multipliers[13] > 0) {
 
18934                            registers_per_thread_per_radix[2] = 0;
 
18935                            registers_per_thread_per_radix[3] = 0;
 
18936                            registers_per_thread_per_radix[5] = 5;
 
18937                            registers_per_thread_per_radix[7] = 0;
 
18938                            registers_per_thread_per_radix[11] = 11;
 
18939                            registers_per_thread_per_radix[13] = 13;
 
18942                            registers_per_thread_per_radix[2] = 0;
 
18943                            registers_per_thread_per_radix[3] = 0;
 
18944                            registers_per_thread_per_radix[5] = 5;
 
18945                            registers_per_thread_per_radix[7] = 0;
 
18946                            registers_per_thread_per_radix[11] = 11;
 
18947                            registers_per_thread_per_radix[13] = 0;
 
18951                        if (loc_multipliers[13] > 0) {
 
18952                            registers_per_thread_per_radix[2] = 0;
 
18953                            registers_per_thread_per_radix[3] = 0;
 
18954                            registers_per_thread_per_radix[5] = 5;
 
18955                            registers_per_thread_per_radix[7] = 0;
 
18956                            registers_per_thread_per_radix[11] = 0;
 
18957                            registers_per_thread_per_radix[13] = 13;
 
18960                            registers_per_thread_per_radix[2] = 0;
 
18961                            registers_per_thread_per_radix[3] = 0;
 
18962                            registers_per_thread_per_radix[5] = 5;
 
18963                            registers_per_thread_per_radix[7] = 0;
 
18964                            registers_per_thread_per_radix[11] = 0;
 
18965                            registers_per_thread_per_radix[13] = 0;
 
18972                if (loc_multipliers[7] > 0) {
 
18973                    if (loc_multipliers[11] > 0) {
 
18974                        if (loc_multipliers[13] > 0) {
 
18975                            registers_per_thread_per_radix[2] = 0;
 
18976                            registers_per_thread_per_radix[3] = 0;
 
18977                            registers_per_thread_per_radix[5] = 0;
 
18978                            registers_per_thread_per_radix[7] = 7;
 
18979                            registers_per_thread_per_radix[11] = 11;
 
18980                            registers_per_thread_per_radix[13] = 13;
 
18983                            registers_per_thread_per_radix[2] = 0;
 
18984                            registers_per_thread_per_radix[3] = 0;
 
18985                            registers_per_thread_per_radix[5] = 0;
 
18986                            registers_per_thread_per_radix[7] = 7;
 
18987                            registers_per_thread_per_radix[11] = 11;
 
18988                            registers_per_thread_per_radix[13] = 0;
 
18992                        if (loc_multipliers[13] > 0) {
 
18993                            registers_per_thread_per_radix[2] = 0;
 
18994                            registers_per_thread_per_radix[3] = 0;
 
18995                            registers_per_thread_per_radix[5] = 0;
 
18996                            registers_per_thread_per_radix[7] = 7;
 
18997                            registers_per_thread_per_radix[11] = 0;
 
18998                            registers_per_thread_per_radix[13] = 13;
 
19001                            registers_per_thread_per_radix[2] = 0;
 
19002                            registers_per_thread_per_radix[3] = 0;
 
19003                            registers_per_thread_per_radix[5] = 0;
 
19004                            registers_per_thread_per_radix[7] = 7;
 
19005                            registers_per_thread_per_radix[11] = 0;
 
19006                            registers_per_thread_per_radix[13] = 0;
 
19011                    if (loc_multipliers[11] > 0) {
 
19012                        if (loc_multipliers[13] > 0) {
 
19013                            registers_per_thread_per_radix[2] = 0;
 
19014                            registers_per_thread_per_radix[3] = 0;
 
19015                            registers_per_thread_per_radix[5] = 0;
 
19016                            registers_per_thread_per_radix[7] = 0;
 
19017                            registers_per_thread_per_radix[11] = 11;
 
19018                            registers_per_thread_per_radix[13] = 13;
 
19021                            registers_per_thread_per_radix[2] = 0;
 
19022                            registers_per_thread_per_radix[3] = 0;
 
19023                            registers_per_thread_per_radix[5] = 0;
 
19024                            registers_per_thread_per_radix[7] = 0;
 
19025                            registers_per_thread_per_radix[11] = 11;
 
19026                            registers_per_thread_per_radix[13] = 0;
 
19030                        if (loc_multipliers[13] > 0) {
 
19031                            registers_per_thread_per_radix[2] = 0;
 
19032                            registers_per_thread_per_radix[3] = 0;
 
19033                            registers_per_thread_per_radix[5] = 0;
 
19034                            registers_per_thread_per_radix[7] = 0;
 
19035                            registers_per_thread_per_radix[11] = 0;
 
19036                            registers_per_thread_per_radix[13] = 13;
 
19047    for (uint64_t i = 0; i < 14; i++) {
 
19048        if ((registers_per_thread_per_radix[i] != 0) && (registers_per_thread_per_radix[i] < min_registers_per_thread[0])) min_registers_per_thread[0] = registers_per_thread_per_radix[i];
 
19049        if ((registers_per_thread_per_radix[i] != 0) && (registers_per_thread_per_radix[i] > registers_per_thread[0])) registers_per_thread[0] = registers_per_thread_per_radix[i];
 
19051    if ((registers_per_thread[0] > 10) || (registers_per_thread[0] >= 2 * min_registers_per_thread[0])) isGoodSequence[0] = 0;
 
19052    else isGoodSequence[0] = 1;
 
 
19059    uint64_t complexSize;
 
19061        complexSize = (2 * 
sizeof(double));
 
19064            complexSize = (2 * 
sizeof(float));
 
19066            complexSize = (2 * 
sizeof(float));
 
19068    uint64_t maxSingleSizeNonStrided = maxSequenceLengthSharedMemory;
 
19070    for (uint64_t i = 0; i < 3; i++) {
 
19089    if (axis_id != nonStridedAxisId) {
 
19093    uint64_t multipliers[20] = { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 };
 
19095    for (uint64_t i = 2; i < 14; i++) {
 
19096        if (tempSequence % i == 0) {
 
19102    if (tempSequence != 1) {
 
19104        if (axis_id != nonStridedAxisId) {
 
19110        uint64_t FFTSizeSelected = 0;
 
19112            while (!FFTSizeSelected) {
 
19113                uint64_t testSequence = tempSequence;
 
19114                for (uint64_t i = 0; i < 20; i++) {
 
19115                    multipliers[i] = 0;
 
19118                    if (testSequence % i == 0) {
 
19124                if (testSequence == 1) FFTSizeSelected = 1;
 
19125                else tempSequence++;
 
19129            while (!FFTSizeSelected) {
 
19130                if (axis_id == nonStridedAxisId) {
 
19131                    if ((FFTPlan->
actualFFTSizePerAxis[axis_id][axis_id] < 128) || ((((uint64_t)pow(2, (uint64_t)ceil(log2(tempSequence))) * 0.75) <= tempSequence) && (((uint64_t)pow(2, (uint64_t)ceil(log2(tempSequence))) <= maxSequenceLengthSharedMemory) || ((2 * FFTPlan->
actualFFTSizePerAxis[axis_id][axis_id] - 1) > maxSequenceLengthSharedMemory))))  tempSequence = (uint64_t)pow(2, (uint64_t)ceil(log2(tempSequence)));
 
19135                    if ((FFTPlan->
actualFFTSizePerAxis[axis_id][axis_id] < 128) || ((((uint64_t)pow(2, (uint64_t)ceil(log2(tempSequence))) * 0.75) <= tempSequence) && (((uint64_t)pow(2, (uint64_t)ceil(log2(tempSequence))) <= maxSequenceLengthSharedMemoryStrided_temp) || ((2 * FFTPlan->
actualFFTSizePerAxis[axis_id][axis_id] - 1) > maxSequenceLengthSharedMemoryStrided_temp))))  tempSequence = (uint64_t)pow(2, (uint64_t)ceil(log2(tempSequence)));
 
19137                uint64_t testSequence = tempSequence;
 
19138                for (uint64_t i = 0; i < 20; i++) {
 
19139                    multipliers[i] = 0;
 
19141                for (uint64_t i = 2; i < 8; i++) {
 
19142                    if (testSequence % i == 0) {
 
19148                if (testSequence != 1) tempSequence++;
 
19150                    uint64_t registers_per_thread_per_radix[14];
 
19151                    uint64_t registers_per_thread = 0;
 
19152                    uint64_t min_registers_per_thread = -1;
 
19153                    uint64_t isGoodSequence = 0;
 
19154                    res = 
VkFFTGetRegistersPerThread(multipliers, registers_per_thread_per_radix, &registers_per_thread, &min_registers_per_thread, &isGoodSequence);
 
19156                    if (isGoodSequence) FFTSizeSelected = 1;
 
19157                    else tempSequence++;
 
19168            uint64_t FFTSizeSelected = 0;
 
19170                while (!FFTSizeSelected) {
 
19171                    uint64_t testSequence = tempSequence;
 
19172                    for (uint64_t i = 0; i < 20; i++) {
 
19173                        multipliers[i] = 0;
 
19176                        if (testSequence % i == 0) {
 
19182                    if (testSequence == 1) FFTSizeSelected = 1;
 
19183                    else tempSequence++;
 
19187                while (!FFTSizeSelected) {
 
19188                    if (axis_id == nonStridedAxisId) {
 
19189                        if ((FFTPlan->
actualFFTSizePerAxis[axis_id][axis_id] < 128) || ((((uint64_t)pow(2, (uint64_t)ceil(log2(tempSequence))) * 0.75) <= tempSequence) && (((uint64_t)pow(2, (uint64_t)ceil(log2(tempSequence))) <= maxSequenceLengthSharedMemory) || ((2 * FFTPlan->
actualFFTSizePerAxis[axis_id][axis_id] - 1) > maxSequenceLengthSharedMemory))))  tempSequence = (uint64_t)pow(2, (uint64_t)ceil(log2(tempSequence)));
 
19193                        if ((FFTPlan->
actualFFTSizePerAxis[axis_id][axis_id] < 128) || ((((uint64_t)pow(2, (uint64_t)ceil(log2(tempSequence))) * 0.75) <= tempSequence) && (((uint64_t)pow(2, (uint64_t)ceil(log2(tempSequence))) <= maxSequenceLengthSharedMemoryStrided_temp) || ((2 * FFTPlan->
actualFFTSizePerAxis[axis_id][axis_id] - 1) > maxSequenceLengthSharedMemoryStrided_temp))))  tempSequence = (uint64_t)pow(2, (uint64_t)ceil(log2(tempSequence)));
 
19195                    uint64_t testSequence = tempSequence;
 
19196                    for (uint64_t i = 0; i < 20; i++) {
 
19197                        multipliers[i] = 0;
 
19199                    for (uint64_t i = 2; i < 8; i++) {
 
19200                        if (testSequence % i == 0) {
 
19206                    if (testSequence != 1) tempSequence++;
 
19208                        uint64_t registers_per_thread_per_radix[14];
 
19209                        uint64_t registers_per_thread = 0;
 
19210                        uint64_t min_registers_per_thread = -1;
 
19211                        uint64_t isGoodSequence = 0;
 
19212                        res = 
VkFFTGetRegistersPerThread(multipliers, registers_per_thread_per_radix, &registers_per_thread, &min_registers_per_thread, &isGoodSequence);
 
19214                        if (isGoodSequence) FFTSizeSelected = 1;
 
19215                        else tempSequence++;
 
19224            maxSingleSizeNonStrided = maxSequenceLengthSharedMemory;
 
19248    uint64_t registerBoost = 1;
 
19255    uint64_t maxSingleSizeStrided = (!app->
configuration.
performConvolution) ? maxSequenceLengthSharedMemoryStrided * registerBoost : maxSequenceLengthSharedMemoryStrided;
 
19256    uint64_t numPasses = 1;
 
19257    uint64_t numPassesHalfBandwidth = 1;
 
19259    temp = (axis_id == nonStridedAxisId) ? (uint64_t)ceil(FFTPlan->
actualFFTSizePerAxis[axis_id][axis_id] / (
double)maxSingleSizeNonStrided) : (uint64_t)ceil(FFTPlan->
actualFFTSizePerAxis[axis_id][axis_id] / (
double)maxSingleSizeStrided);
 
19270            numPasses = (uint64_t)ceil(log2(FFTPlan->
actualFFTSizePerAxis[axis_id][axis_id]) / log2(maxSingleSizeStrided));
 
19272            numPasses += (uint64_t)ceil(log2(temp) / log2(maxSingleSizeStrided));
 
19274    registerBoost = ((axis_id == nonStridedAxisId) && ((app->
useBluesteinFFT[axis_id]) || (!app->
configuration.
reorderFourStep) || (numPasses == 1))) ? (uint64_t)ceil(FFTPlan->
actualFFTSizePerAxis[axis_id][axis_id] / (
double)(pow(maxSequenceLengthSharedMemoryStrided, numPasses - 1) * maxSequenceLengthSharedMemory)) : (uint64_t)ceil(FFTPlan->
actualFFTSizePerAxis[axis_id][axis_id] / (
double)pow(maxSequenceLengthSharedMemoryStrided, numPasses));
 
19275    uint64_t canBoost = 0;
 
19287    maxSingleSizeNonStrided = maxSequenceLengthSharedMemory * registerBoost;
 
19288    maxSingleSizeStrided = maxSequenceLengthSharedMemoryStrided * registerBoost;
 
19289    uint64_t maxSingleSizeStridedHalfBandwidth = maxSingleSizeStrided;
 
19292        temp = (axis_id == nonStridedAxisId) ? (uint64_t)ceil(FFTPlan->
actualFFTSizePerAxis[axis_id][axis_id] / (
double)maxSingleSizeNonStrided) : (uint64_t)ceil(FFTPlan->
actualFFTSizePerAxis[axis_id][axis_id] / (
double)maxSingleSizeStridedHalfBandwidth);
 
19296            for (uint64_t i = 0; i < 5; i++) {
 
19297                temp = (uint64_t)ceil(temp / (
double)maxSingleSizeStrided);
 
19298                numPassesHalfBandwidth++;
 
19299                if (temp == 1) i = 5;
 
19311        if (numPassesHalfBandwidth < numPasses) numPasses = numPassesHalfBandwidth;
 
19312        else maxSingleSizeStridedHalfBandwidth = maxSingleSizeStrided;
 
19315    uint64_t* locAxisSplit = FFTPlan->
axisSplit[axis_id];
 
19316    if (numPasses == 1) {
 
19319    if (numPasses == 2) {
 
19322                uint64_t maxPow8SharedMemory = (uint64_t)pow(8, ((uint64_t)log2(maxSequenceLengthSharedMemory)) / 3);
 
19324                if (FFTPlan->
actualFFTSizePerAxis[axis_id][axis_id] / maxPow8SharedMemory <= maxSingleSizeStrided) {
 
19325                    locAxisSplit[0] = maxPow8SharedMemory;
 
19328                    if (FFTPlan->
actualFFTSizePerAxis[axis_id][axis_id] / maxSequenceLengthSharedMemory <= maxSingleSizeStrided) {
 
19329                        locAxisSplit[0] = maxSequenceLengthSharedMemory;
 
19332                        if (FFTPlan->
actualFFTSizePerAxis[axis_id][axis_id] / (maxSequenceLengthSharedMemory * registerBoost) < maxSingleSizeStridedHalfBandwidth) {
 
19333                            for (uint64_t i = 1; i <= (uint64_t)log2(registerBoost); i++) {
 
19334                                if (FFTPlan->
actualFFTSizePerAxis[axis_id][axis_id] / (maxSequenceLengthSharedMemory * (uint64_t)pow(2, i)) <= maxSingleSizeStrided) {
 
19335                                    locAxisSplit[0] = (maxSequenceLengthSharedMemory * (uint64_t)pow(2, i));
 
19336                                    i = (uint64_t)log2(registerBoost) + 1;
 
19341                            locAxisSplit[0] = (maxSequenceLengthSharedMemory * registerBoost);
 
19347                uint64_t maxPow8Strided = (uint64_t)pow(8, ((uint64_t)log2(maxSingleSizeStrided)) / 3);
 
19349                if (FFTPlan->
actualFFTSizePerAxis[axis_id][axis_id] / maxPow8Strided <= maxSingleSizeStrided) {
 
19350                    locAxisSplit[0] = maxPow8Strided;
 
19353                    if (FFTPlan->
actualFFTSizePerAxis[axis_id][axis_id] / maxSingleSizeStrided < maxSingleSizeStridedHalfBandwidth) {
 
19354                        locAxisSplit[0] = maxSingleSizeStrided;
 
19357                        locAxisSplit[0] = maxSingleSizeStridedHalfBandwidth;
 
19362            if (locAxisSplit[1] < 64) {
 
19363                locAxisSplit[0] = (locAxisSplit[1] == 0) ? locAxisSplit[0] / (64) : locAxisSplit[0] / (64 / locAxisSplit[1]);
 
19364                locAxisSplit[1] = 64;
 
19366            if (locAxisSplit[1] > locAxisSplit[0]) {
 
19367                uint64_t swap = locAxisSplit[0];
 
19368                locAxisSplit[0] = locAxisSplit[1];
 
19369                locAxisSplit[1] = swap;
 
19373            uint64_t successSplit = 0;
 
19385                uint64_t sqrtSequence = (uint64_t)ceil(sqrt(FFTPlan->
actualFFTSizePerAxis[axis_id][axis_id]));
 
19386                for (uint64_t i = 0; i < sqrtSequence; i++) {
 
19388                        if ((sqrtSequence - i <= maxSingleSizeStrided) && (FFTPlan->
actualFFTSizePerAxis[axis_id][axis_id] / (sqrtSequence - i) <= maxSequenceLengthSharedMemory)) {
 
19390                            locAxisSplit[1] = sqrtSequence - i;
 
19398                uint64_t sqrtSequence = (uint64_t)ceil(sqrt(FFTPlan->
actualFFTSizePerAxis[axis_id][axis_id]));
 
19399                for (uint64_t i = 0; i < sqrtSequence; i++) {
 
19401                        if ((sqrtSequence - i <= maxSingleSizeStrided) && (FFTPlan->
actualFFTSizePerAxis[axis_id][axis_id] / (sqrtSequence - i) <= maxSingleSizeStridedHalfBandwidth)) {
 
19403                            locAxisSplit[1] = sqrtSequence - i;
 
19410            if (successSplit == 0)
 
19414    if (numPasses == 3) {
 
19416            uint64_t maxPow8Strided = (uint64_t)pow(8, ((uint64_t)log2(maxSingleSizeStrided)) / 3);
 
19419                uint64_t maxPow8SharedMemory = (uint64_t)pow(8, ((uint64_t)log2(maxSequenceLengthSharedMemory)) / 3);
 
19420                if (FFTPlan->
actualFFTSizePerAxis[axis_id][axis_id] / maxPow8SharedMemory <= maxPow8Strided * maxPow8Strided)
 
19421                    locAxisSplit[0] = maxPow8SharedMemory;
 
19423                    if (FFTPlan->
actualFFTSizePerAxis[axis_id][axis_id] / maxSequenceLengthSharedMemory <= maxSingleSizeStrided * maxSingleSizeStrided)
 
19424                        locAxisSplit[0] = maxSequenceLengthSharedMemory;
 
19426                        if (FFTPlan->
actualFFTSizePerAxis[axis_id][axis_id] / (maxSequenceLengthSharedMemory * registerBoost) <= maxSingleSizeStrided * maxSingleSizeStrided) {
 
19427                            for (uint64_t i = 0; i <= (uint64_t)log2(registerBoost); i++) {
 
19428                                if (FFTPlan->
actualFFTSizePerAxis[axis_id][axis_id] / (maxSequenceLengthSharedMemory * (uint64_t)pow(2, i)) <= maxSingleSizeStrided * maxSingleSizeStrided) {
 
19429                                    locAxisSplit[0] = (maxSequenceLengthSharedMemory * (uint64_t)pow(2, i));
 
19430                                    i = (uint64_t)log2(registerBoost) + 1;
 
19435                            locAxisSplit[0] = (maxSequenceLengthSharedMemory * registerBoost);
 
19449                uint64_t maxPow8_128 = (uint64_t)pow(8, ((uint64_t)log2(maxSingleSizeStrided128)) / 3);
 
19451                if (FFTPlan->
actualFFTSizePerAxis[axis_id][axis_id] / maxPow8_128 <= maxPow8Strided * maxSingleSizeStrided)
 
19452                    locAxisSplit[0] = maxPow8_128;
 
19456                    if ((FFTPlan->
actualFFTSizePerAxis[axis_id][axis_id] / (maxPow8_128 * 2) <= maxPow8Strided * maxSingleSizeStrided) && (maxPow8_128 * 2 <= maxSingleSizeStrided128)) {
 
19457                        locAxisSplit[0] = maxPow8_128 * 2;
 
19460                        if ((FFTPlan->
actualFFTSizePerAxis[axis_id][axis_id] / (maxPow8_128 * 4) <= maxPow8Strided * maxSingleSizeStrided) && (maxPow8_128 * 4 <= maxSingleSizeStrided128)) {
 
19461                            locAxisSplit[0] = maxPow8_128 * 4;
 
19464                            if (FFTPlan->
actualFFTSizePerAxis[axis_id][axis_id] / maxSingleSizeStrided <= maxSingleSizeStrided * maxSingleSizeStrided) {
 
19465                                for (uint64_t i = 0; i <= (uint64_t)log2(maxSingleSizeStrided / maxSingleSizeStrided128); i++) {
 
19466                                    if (FFTPlan->
actualFFTSizePerAxis[axis_id][axis_id] / (maxSingleSizeStrided128 * (uint64_t)pow(2, i)) <= maxSingleSizeStrided * maxSingleSizeStrided) {
 
19467                                        locAxisSplit[0] = (maxSingleSizeStrided128 * (uint64_t)pow(2, i));
 
19468                                        i = (uint64_t)log2(maxSingleSizeStrided / maxSingleSizeStrided128) + 1;
 
19473                                locAxisSplit[0] = maxSingleSizeStridedHalfBandwidth;
 
19478            if (FFTPlan->
actualFFTSizePerAxis[axis_id][axis_id] / locAxisSplit[0] / maxPow8Strided <= maxSingleSizeStrided) {
 
19479                locAxisSplit[1] = maxPow8Strided;
 
19480                locAxisSplit[2] = FFTPlan->
actualFFTSizePerAxis[axis_id][axis_id] / locAxisSplit[1] / locAxisSplit[0];
 
19483                if (FFTPlan->
actualFFTSizePerAxis[axis_id][axis_id] / locAxisSplit[0] / maxSingleSizeStrided <= maxSingleSizeStrided) {
 
19484                    locAxisSplit[1] = maxSingleSizeStrided;
 
19485                    locAxisSplit[2] = FFTPlan->
actualFFTSizePerAxis[axis_id][axis_id] / locAxisSplit[1] / locAxisSplit[0];
 
19488                    locAxisSplit[1] = maxSingleSizeStridedHalfBandwidth;
 
19489                    locAxisSplit[2] = FFTPlan->
actualFFTSizePerAxis[axis_id][axis_id] / locAxisSplit[1] / locAxisSplit[0];
 
19492            if (locAxisSplit[2] < 64) {
 
19493                locAxisSplit[1] = (locAxisSplit[2] == 0) ? locAxisSplit[1] / (64) : locAxisSplit[1] / (64 / locAxisSplit[2]);
 
19494                locAxisSplit[2] = 64;
 
19496            if (locAxisSplit[2] > locAxisSplit[1]) {
 
19497                uint64_t swap = locAxisSplit[1];
 
19498                locAxisSplit[1] = locAxisSplit[2];
 
19499                locAxisSplit[2] = swap;
 
19503            uint64_t successSplit = 0;
 
19505                for (uint64_t i = 0; i < maxSequenceLengthSharedMemory; i++) {
 
19506                    if (FFTPlan->
actualFFTSizePerAxis[axis_id][axis_id] % (maxSequenceLengthSharedMemory - i) == 0) {
 
19507                        uint64_t sqrt3Sequence = (uint64_t)ceil(sqrt(FFTPlan->
actualFFTSizePerAxis[axis_id][axis_id] / (maxSequenceLengthSharedMemory - i)));
 
19508                        for (uint64_t j = 0; j < sqrt3Sequence; j++) {
 
19509                            if ((FFTPlan->
actualFFTSizePerAxis[axis_id][axis_id] / (maxSequenceLengthSharedMemory - i)) % (sqrt3Sequence - j) == 0) {
 
19510                                if (((maxSequenceLengthSharedMemory - i) <= maxSequenceLengthSharedMemory) && (sqrt3Sequence - j <= maxSingleSizeStrided) && (FFTPlan->
actualFFTSizePerAxis[axis_id][axis_id] / (maxSequenceLengthSharedMemory - i) / (sqrt3Sequence - j) <= maxSingleSizeStrided)) {
 
19511                                    locAxisSplit[0] = (maxSequenceLengthSharedMemory - i);
 
19512                                    locAxisSplit[1] = sqrt3Sequence - j;
 
19513                                    locAxisSplit[2] = FFTPlan->
actualFFTSizePerAxis[axis_id][axis_id] / (maxSequenceLengthSharedMemory - i) / (sqrt3Sequence - j);
 
19514                                    i = maxSequenceLengthSharedMemory;
 
19524                uint64_t sqrt3Sequence = (uint64_t)ceil(pow(FFTPlan->
actualFFTSizePerAxis[axis_id][axis_id], 1.0 / 3.0));
 
19525                for (uint64_t i = 0; i < sqrt3Sequence; i++) {
 
19527                        uint64_t sqrt2Sequence = (uint64_t)ceil(sqrt(FFTPlan->
actualFFTSizePerAxis[axis_id][axis_id] / (sqrt3Sequence - i)));
 
19528                        for (uint64_t j = 0; j < sqrt2Sequence; j++) {
 
19529                            if ((FFTPlan->
actualFFTSizePerAxis[axis_id][axis_id] / (sqrt3Sequence - i)) % (sqrt2Sequence - j) == 0) {
 
19530                                if ((sqrt3Sequence - i <= maxSingleSizeStrided) && (sqrt2Sequence - j <= maxSingleSizeStrided) && (FFTPlan->
actualFFTSizePerAxis[axis_id][axis_id] / (sqrt3Sequence - i) / (sqrt2Sequence - j) <= maxSingleSizeStridedHalfBandwidth)) {
 
19531                                    locAxisSplit[0] = FFTPlan->
actualFFTSizePerAxis[axis_id][axis_id] / (sqrt3Sequence - i) / (sqrt2Sequence - j);
 
19532                                    locAxisSplit[1] = sqrt3Sequence - i;
 
19533                                    locAxisSplit[2] = sqrt2Sequence - j;
 
19543            if (successSplit == 0)
 
19547    if (numPasses > 3) {
 
19560        for (uint64_t i = 0; i < numPasses; i++) {
 
19561            if ((locAxisSplit[0] % 2 != 0) && (locAxisSplit[i] % 2 == 0)) {
 
19562                uint64_t swap = locAxisSplit[0];
 
19563                locAxisSplit[0] = locAxisSplit[i];
 
19564                locAxisSplit[i] = swap;
 
19567        for (uint64_t i = 0; i < numPasses; i++) {
 
19568            if ((locAxisSplit[0] % 4 != 0) && (locAxisSplit[i] % 4 == 0)) {
 
19569                uint64_t swap = locAxisSplit[0];
 
19570                locAxisSplit[0] = locAxisSplit[i];
 
19571                locAxisSplit[i] = swap;
 
19574        for (uint64_t i = 0; i < numPasses; i++) {
 
19575            if ((locAxisSplit[0] % 8 != 0) && (locAxisSplit[i] % 8 == 0)) {
 
19576                uint64_t swap = locAxisSplit[0];
 
19577                locAxisSplit[0] = locAxisSplit[i];
 
19578                locAxisSplit[i] = swap;
 
19583    for (uint64_t k = 0; k < numPasses; k++) {
 
19584        tempSequence = locAxisSplit[k];
 
19585        uint64_t loc_multipliers[20] = { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 }; 
 
19586        for (uint64_t i = 2; i < 14; i++) {
 
19587            if (tempSequence % i == 0) {
 
19589                loc_multipliers[i]++;
 
19593        uint64_t registers_per_thread_per_radix[14];
 
19594        uint64_t registers_per_thread = 0;
 
19595        uint64_t min_registers_per_thread = -1;
 
19596        uint64_t isGoodSequence = 0;
 
19597        res = 
VkFFTGetRegistersPerThread(loc_multipliers, registers_per_thread_per_radix, ®isters_per_thread, &min_registers_per_thread, &isGoodSequence);
 
19599        registers_per_thread_per_radix[8] = registers_per_thread_per_radix[2];
 
19600        registers_per_thread_per_radix[4] = registers_per_thread_per_radix[2];
 
19601        if ((registerBoost == 4) && (registers_per_thread % 4 != 0)) {
 
19602            registers_per_thread *= 2;
 
19603            for (uint64_t i = 2; i < 14; i++) {
 
19604                registers_per_thread_per_radix[i] *= 2;
 
19606            min_registers_per_thread *= 2;
 
19608        if (registers_per_thread_per_radix[8] % 8 == 0) {
 
19609            loc_multipliers[8] = loc_multipliers[2] / 3;
 
19610            loc_multipliers[2] = loc_multipliers[2] - loc_multipliers[8] * 3;
 
19612        if (registers_per_thread_per_radix[4] % 4 == 0) {
 
19613            loc_multipliers[4] = loc_multipliers[2] / 2;
 
19614            loc_multipliers[2] = loc_multipliers[2] - loc_multipliers[4] * 2;
 
19616        if ((registerBoost == 2) && (loc_multipliers[2] == 0)) {
 
19617            if (loc_multipliers[4] > 0) {
 
19618                loc_multipliers[4]--;
 
19619                loc_multipliers[2] = 2;
 
19622                loc_multipliers[8]--;
 
19623                loc_multipliers[4]++;
 
19624                loc_multipliers[2]++;
 
19627        if ((registerBoost == 4) && (loc_multipliers[4] == 0)) {
 
19628            loc_multipliers[8]--;
 
19629            loc_multipliers[4]++;
 
19630            loc_multipliers[2]++;
 
19635            uint64_t scaleRegistersNum = 1;
 
19636            while ((maxBatchCoalesced * locAxisSplit[k] / (min_registers_per_thread * registerBoost * scaleRegistersNum)) > app->
configuration.
maxThreadsNum) {
 
19637                for (uint64_t i = 2; i < 14; i++) {
 
19638                    if (locAxisSplit[k] / (min_registers_per_thread * registerBoost * scaleRegistersNum) % i == 0) {
 
19639                        scaleRegistersNum *= i;
 
19644            min_registers_per_thread *= scaleRegistersNum;
 
19645            uint64_t temp_scaleRegistersNum = scaleRegistersNum;
 
19646            while ((maxBatchCoalesced * locAxisSplit[k] / (registers_per_thread * registerBoost)) % temp_scaleRegistersNum != 0) temp_scaleRegistersNum++;
 
19647            registers_per_thread *= temp_scaleRegistersNum;
 
19648            for (uint64_t i = 2; i < 14; i++) {
 
19649                if (registers_per_thread_per_radix[i] != 0) {
 
19650                    temp_scaleRegistersNum = scaleRegistersNum;
 
19651                    while ((maxBatchCoalesced * locAxisSplit[k] / (registers_per_thread_per_radix[i] * registerBoost)) % temp_scaleRegistersNum != 0) temp_scaleRegistersNum++;
 
19652                    registers_per_thread_per_radix[i] *= temp_scaleRegistersNum;
 
19656            if (min_registers_per_thread > registers_per_thread) {
 
19657                uint64_t temp = min_registers_per_thread;
 
19658                min_registers_per_thread = registers_per_thread;
 
19659                registers_per_thread = temp;
 
19661            for (uint64_t i = 2; i < 14; i++) {
 
19662                if (registers_per_thread_per_radix[i] > registers_per_thread) {
 
19663                    registers_per_thread = registers_per_thread_per_radix[i];
 
19665                if ((registers_per_thread_per_radix[i] > 0) && (registers_per_thread_per_radix[i] < min_registers_per_thread)) {
 
19666                    min_registers_per_thread = registers_per_thread_per_radix[i];
 
19674        for (uint64_t i = 2; i < 14; i++) {
 
19679        uint64_t tempRegisterBoost = registerBoost;
 
19680        uint64_t switchRegisterBoost = 0;
 
19681        if (tempRegisterBoost > 1) {
 
19682            if (loc_multipliers[tempRegisterBoost] > 0) {
 
19683                loc_multipliers[tempRegisterBoost]--;
 
19684                switchRegisterBoost = tempRegisterBoost;
 
19687                for (uint64_t i = 14; i > 1; i--) {
 
19688                    if (loc_multipliers[i] > 0) {
 
19689                        loc_multipliers[i]--;
 
19690                        switchRegisterBoost = i;
 
19696        for (uint64_t i = 14; i > 1; i--) {
 
19697            if (loc_multipliers[i] > 0) {
 
19699                loc_multipliers[i]--;
 
19705        if (switchRegisterBoost > 0) {
 
19710            if (min_registers_per_thread != registers_per_thread) {
 
 
19726    double double_PI = 3.1415926535897932384626433832795;
 
19731    kernelPreparationConfiguration.
FFTdim = 1;
 
19733    kernelPreparationConfiguration.
size[1] = 1;
 
19734    kernelPreparationConfiguration.
size[2] = 1;
 
19736    kernelPreparationConfiguration.
useLUT = 1;
 
19749#if(VKFFT_BACKEND==0) 
19756#elif(VKFFT_BACKEND==3) 
19757    kernelPreparationConfiguration.platform = app->
configuration.platform;
 
19758    kernelPreparationConfiguration.context = app->
configuration.context;
 
19761    uint64_t bufferSize = (uint64_t)
sizeof(
float) * 2 * kernelPreparationConfiguration.
size[0] * kernelPreparationConfiguration.
size[1] * kernelPreparationConfiguration.
size[2];
 
19762    if (kernelPreparationConfiguration.
doublePrecision) bufferSize *= 
sizeof(double) / 
sizeof(
float);
 
19768    resFFT = 
initializeVkFFT(&kernelPreparationApplication, kernelPreparationConfiguration);
 
19771#if(VKFFT_BACKEND==0) 
19772    VkResult res = VK_SUCCESS;
 
19783#elif(VKFFT_BACKEND==1) 
19784    cudaError_t res = cudaSuccess;
 
19785    res = cudaMalloc((
void**)&app->
bufferBluestein[axis_id], bufferSize);
 
19795#elif(VKFFT_BACKEND==2) 
19796    hipError_t res = hipSuccess;
 
19807#elif(VKFFT_BACKEND==3) 
19808    cl_int res = CL_SUCCESS;
 
19822    void* phaseVectors = malloc(bufferSize);
 
19823    if (!phaseVectors) {
 
19832            double* phaseVectors_cast = (
double*)phaseVectors;
 
19834                uint64_t rm = (i * i) % (2 * phaseVectorsNonZeroSize);
 
19835                double angle = double_PI * rm / phaseVectorsNonZeroSize;
 
19836                phaseVectors_cast[2 * i] = (i < phaseVectorsNonZeroSize) ? (
double)cos(angle) : 0;
 
19837                phaseVectors_cast[2 * i + 1] = (i < phaseVectorsNonZeroSize) ? (
double)-sin(angle) : 0;
 
19839            for (uint64_t i = 1; i < phaseVectorsNonZeroSize; i++) {
 
19840                phaseVectors_cast[2 * (FFTPlan->
actualFFTSizePerAxis[axis_id][axis_id] - i)] = phaseVectors_cast[2 * i];
 
19841                phaseVectors_cast[2 * (FFTPlan->
actualFFTSizePerAxis[axis_id][axis_id] - i) + 1] = phaseVectors_cast[2 * i + 1];
 
19845            float* phaseVectors_cast = (
float*)phaseVectors;
 
19847                uint64_t rm = (i * i) % (2 * phaseVectorsNonZeroSize);
 
19848                double angle = double_PI * rm / phaseVectorsNonZeroSize;
 
19849                phaseVectors_cast[2 * i] = (i < phaseVectorsNonZeroSize) ? (
float)cos(angle) : 0;
 
19850                phaseVectors_cast[2 * i + 1] = (i < phaseVectorsNonZeroSize) ? (
float)-sin(angle) : 0;
 
19852            for (uint64_t i = 1; i < phaseVectorsNonZeroSize; i++) {
 
19853                phaseVectors_cast[2 * (FFTPlan->
actualFFTSizePerAxis[axis_id][axis_id] - i)] = phaseVectors_cast[2 * i];
 
19854                phaseVectors_cast[2 * (FFTPlan->
actualFFTSizePerAxis[axis_id][axis_id] - i) + 1] = phaseVectors_cast[2 * i + 1];
 
19857#if(VKFFT_BACKEND==0) 
19860            free(phaseVectors);
 
19864#elif(VKFFT_BACKEND==1) 
19865        res = cudaMemcpy(app->
bufferBluestein[axis_id], phaseVectors, bufferSize, cudaMemcpyHostToDevice);
 
19866        if (res != cudaSuccess) {
 
19867            free(phaseVectors);
 
19871#elif(VKFFT_BACKEND==2) 
19872        res = hipMemcpy(app->
bufferBluestein[axis_id], phaseVectors, bufferSize, hipMemcpyHostToDevice);
 
19873        if (res != hipSuccess) {
 
19874            free(phaseVectors);
 
19878#elif(VKFFT_BACKEND==3) 
19879        res = clEnqueueWriteBuffer(commandQueue, app->
bufferBluestein[axis_id], CL_TRUE, 0, bufferSize, phaseVectors, 0, NULL, NULL);
 
19880        if (res != CL_SUCCESS) {
 
19881            free(phaseVectors);
 
19886#if(VKFFT_BACKEND==0) 
19888            VkCommandBufferAllocateInfo commandBufferAllocateInfo = { VK_STRUCTURE_TYPE_COMMAND_BUFFER_ALLOCATE_INFO };
 
19890            commandBufferAllocateInfo.level = VK_COMMAND_BUFFER_LEVEL_PRIMARY;
 
19891            commandBufferAllocateInfo.commandBufferCount = 1;
 
19892            VkCommandBuffer commandBuffer = {};
 
19893            res = vkAllocateCommandBuffers(kernelPreparationApplication.
configuration.
device[0], &commandBufferAllocateInfo, &commandBuffer);
 
19895                free(phaseVectors);
 
19899            VkCommandBufferBeginInfo commandBufferBeginInfo = { VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO };
 
19900            commandBufferBeginInfo.flags = VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT;
 
19901            res = vkBeginCommandBuffer(commandBuffer, &commandBufferBeginInfo);
 
19903                free(phaseVectors);
 
19912            resFFT = 
VkFFTAppend(&kernelPreparationApplication, -1, &launchParams);
 
19914                free(phaseVectors);
 
19918            res = vkEndCommandBuffer(commandBuffer);
 
19920                free(phaseVectors);
 
19924            VkSubmitInfo submitInfo = { VK_STRUCTURE_TYPE_SUBMIT_INFO };
 
19925            submitInfo.commandBufferCount = 1;
 
19926            submitInfo.pCommandBuffers = &commandBuffer;
 
19929                free(phaseVectors);
 
19935                free(phaseVectors);
 
19941                free(phaseVectors);
 
19947#elif(VKFFT_BACKEND==1) 
19951        resFFT = 
VkFFTAppend(&kernelPreparationApplication, -1, &launchParams);
 
19953            free(phaseVectors);
 
19957        res = cudaDeviceSynchronize();
 
19958        if (res != cudaSuccess) {
 
19959            free(phaseVectors);
 
19963#elif(VKFFT_BACKEND==2) 
19967        resFFT = 
VkFFTAppend(&kernelPreparationApplication, -1, &launchParams);
 
19969            free(phaseVectors);
 
19973        res = hipDeviceSynchronize();
 
19974        if (res != hipSuccess) {
 
19975            free(phaseVectors);
 
19979#elif(VKFFT_BACKEND==3) 
19981        launchParams.commandQueue = &commandQueue;
 
19984        resFFT = 
VkFFTAppend(&kernelPreparationApplication, -1, &launchParams);
 
19986            free(phaseVectors);
 
19990        res = clFinish(commandQueue);
 
19991        if (res != CL_SUCCESS) {
 
19992            free(phaseVectors);
 
19999        double* phaseVectors_cast = (
double*)phaseVectors;
 
20001            uint64_t rm = (i * i) % (2 * phaseVectorsNonZeroSize);
 
20002            double angle = double_PI * rm / phaseVectorsNonZeroSize;
 
20003            phaseVectors_cast[2 * i] = (i < phaseVectorsNonZeroSize) ? (
double)cos(angle) : 0;
 
20004            phaseVectors_cast[2 * i + 1] = (i < phaseVectorsNonZeroSize) ? (
double)sin(angle) : 0;
 
20006        for (uint64_t i = 1; i < phaseVectorsNonZeroSize; i++) {
 
20007            phaseVectors_cast[2 * (FFTPlan->
actualFFTSizePerAxis[axis_id][axis_id] - i)] = phaseVectors_cast[2 * i];
 
20008            phaseVectors_cast[2 * (FFTPlan->
actualFFTSizePerAxis[axis_id][axis_id] - i) + 1] = phaseVectors_cast[2 * i + 1];
 
20012        float* phaseVectors_cast = (
float*)phaseVectors;
 
20014            uint64_t rm = (i * i) % (2 * phaseVectorsNonZeroSize);
 
20015            double angle = double_PI * rm / phaseVectorsNonZeroSize;
 
20016            phaseVectors_cast[2 * i] = (i < phaseVectorsNonZeroSize) ? (
float)cos(angle) : 0;
 
20017            phaseVectors_cast[2 * i + 1] = (i < phaseVectorsNonZeroSize) ? (
float)sin(angle) : 0;
 
20019        for (uint64_t i = 1; i < phaseVectorsNonZeroSize; i++) {
 
20020            phaseVectors_cast[2 * (FFTPlan->
actualFFTSizePerAxis[axis_id][axis_id] - i)] = phaseVectors_cast[2 * i];
 
20021            phaseVectors_cast[2 * (FFTPlan->
actualFFTSizePerAxis[axis_id][axis_id] - i) + 1] = phaseVectors_cast[2 * i + 1];
 
20024#if(VKFFT_BACKEND==0) 
20027        free(phaseVectors);
 
20031#elif(VKFFT_BACKEND==1) 
20032    res = cudaMemcpy(app->
bufferBluestein[axis_id], phaseVectors, bufferSize, cudaMemcpyHostToDevice);
 
20033    if (res != cudaSuccess) {
 
20034        free(phaseVectors);
 
20038#elif(VKFFT_BACKEND==2) 
20039    res = hipMemcpy(app->
bufferBluestein[axis_id], phaseVectors, bufferSize, hipMemcpyHostToDevice);
 
20040    if (res != hipSuccess) {
 
20041        free(phaseVectors);
 
20045#elif(VKFFT_BACKEND==3) 
20046    res = clEnqueueWriteBuffer(commandQueue, app->
bufferBluestein[axis_id], CL_TRUE, 0, bufferSize, phaseVectors, 0, NULL, NULL);
 
20047    if (res != CL_SUCCESS) {
 
20048        free(phaseVectors);
 
20053#if(VKFFT_BACKEND==0) 
20055        VkCommandBufferAllocateInfo commandBufferAllocateInfo = { VK_STRUCTURE_TYPE_COMMAND_BUFFER_ALLOCATE_INFO };
 
20057        commandBufferAllocateInfo.level = VK_COMMAND_BUFFER_LEVEL_PRIMARY;
 
20058        commandBufferAllocateInfo.commandBufferCount = 1;
 
20059        VkCommandBuffer commandBuffer = {};
 
20060        res = vkAllocateCommandBuffers(kernelPreparationApplication.
configuration.
device[0], &commandBufferAllocateInfo, &commandBuffer);
 
20062            free(phaseVectors);
 
20066        VkCommandBufferBeginInfo commandBufferBeginInfo = { VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO };
 
20067        commandBufferBeginInfo.flags = VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT;
 
20068        res = vkBeginCommandBuffer(commandBuffer, &commandBufferBeginInfo);
 
20070            free(phaseVectors);
 
20079        resFFT = 
VkFFTAppend(&kernelPreparationApplication, -1, &launchParams);
 
20081            free(phaseVectors);
 
20085        res = vkEndCommandBuffer(commandBuffer);
 
20087            free(phaseVectors);
 
20091        VkSubmitInfo submitInfo = { VK_STRUCTURE_TYPE_SUBMIT_INFO };
 
20092        submitInfo.commandBufferCount = 1;
 
20093        submitInfo.pCommandBuffers = &commandBuffer;
 
20096            free(phaseVectors);
 
20102            free(phaseVectors);
 
20108            free(phaseVectors);
 
20115        VkCommandBufferAllocateInfo commandBufferAllocateInfo = { VK_STRUCTURE_TYPE_COMMAND_BUFFER_ALLOCATE_INFO };
 
20117        commandBufferAllocateInfo.level = VK_COMMAND_BUFFER_LEVEL_PRIMARY;
 
20118        commandBufferAllocateInfo.commandBufferCount = 1;
 
20119        VkCommandBuffer commandBuffer = {};
 
20120        res = vkAllocateCommandBuffers(kernelPreparationApplication.
configuration.
device[0], &commandBufferAllocateInfo, &commandBuffer);
 
20122            free(phaseVectors);
 
20126        VkCommandBufferBeginInfo commandBufferBeginInfo = { VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO };
 
20127        commandBufferBeginInfo.flags = VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT;
 
20128        res = vkBeginCommandBuffer(commandBuffer, &commandBufferBeginInfo);
 
20130            free(phaseVectors);
 
20139        resFFT = 
VkFFTAppend(&kernelPreparationApplication, 1, &launchParams);
 
20141            free(phaseVectors);
 
20145        res = vkEndCommandBuffer(commandBuffer);
 
20147            free(phaseVectors);
 
20151        VkSubmitInfo submitInfo = { VK_STRUCTURE_TYPE_SUBMIT_INFO };
 
20152        submitInfo.commandBufferCount = 1;
 
20153        submitInfo.pCommandBuffers = &commandBuffer;
 
20156            free(phaseVectors);
 
20162            free(phaseVectors);
 
20168            free(phaseVectors);
 
20174#elif(VKFFT_BACKEND==1) 
20179        resFFT = 
VkFFTAppend(&kernelPreparationApplication, -1, &launchParams);
 
20181            free(phaseVectors);
 
20185        res = cudaDeviceSynchronize();
 
20186        if (res != cudaSuccess) {
 
20187            free(phaseVectors);
 
20194        resFFT = 
VkFFTAppend(&kernelPreparationApplication, 1, &launchParams);
 
20196            free(phaseVectors);
 
20200        res = cudaDeviceSynchronize();
 
20201        if (res != cudaSuccess) {
 
20202            free(phaseVectors);
 
20207#elif(VKFFT_BACKEND==2) 
20212        resFFT = 
VkFFTAppend(&kernelPreparationApplication, -1, &launchParams);
 
20214            free(phaseVectors);
 
20218        res = hipDeviceSynchronize();
 
20219        if (res != hipSuccess) {
 
20220            free(phaseVectors);
 
20227        resFFT = 
VkFFTAppend(&kernelPreparationApplication, 1, &launchParams);
 
20229            free(phaseVectors);
 
20233        res = hipDeviceSynchronize();
 
20234        if (res != hipSuccess) {
 
20235            free(phaseVectors);
 
20240#elif(VKFFT_BACKEND==3) 
20242    launchParams.commandQueue = &commandQueue;
 
20246        resFFT = 
VkFFTAppend(&kernelPreparationApplication, -1, &launchParams);
 
20248            free(phaseVectors);
 
20252        res = clFinish(commandQueue);
 
20253        if (res != CL_SUCCESS) {
 
20254            free(phaseVectors);
 
20261        resFFT = 
VkFFTAppend(&kernelPreparationApplication, 1, &launchParams);
 
20263            free(phaseVectors);
 
20267        res = clFinish(commandQueue);
 
20268        if (res != CL_SUCCESS) {
 
20269            free(phaseVectors);
 
20275    free(phaseVectors);
 
20276#if(VKFFT_BACKEND==0) 
20278#elif(VKFFT_BACKEND==3) 
20279    res = clReleaseCommandQueue(commandQueue);
 
 
21061#if(VKFFT_BACKEND==0) 
21062    VkResult res = VK_SUCCESS;
 
21063#elif(VKFFT_BACKEND==1) 
21064    cudaError_t res = cudaSuccess;
 
21065#elif(VKFFT_BACKEND==2) 
21066    hipError_t res = hipSuccess;
 
21067#elif(VKFFT_BACKEND==3) 
21068    cl_int res = CL_SUCCESS;
 
21074    axis->specializationConstants.numAxisUploads = FFTPlan->
numAxisUploads[0];
 
21075    uint64_t complexSize;
 
21077        complexSize = (2 * 
sizeof(double));
 
21080            complexSize = (2 * 
sizeof(float));
 
21082            complexSize = (2 * 
sizeof(float));
 
21083    axis->specializationConstants.complexSize = complexSize;
 
21084    axis->specializationConstants.supportAxis = 0;
 
21089    axis->specializationConstants.dispatchZactualFFTSize = 1;
 
21092        double double_PI = 3.1415926535897932384626433832795;
 
21095            double* tempLUT = (
double*)malloc(axis->bufferLUTSize);
 
21102                tempLUT[2 * i] = (double)cos(angle);
 
21103                tempLUT[2 * i + 1] = (double)sin(angle);
 
21105            axis->referenceLUT = 0;
 
21108#if(VKFFT_BACKEND==0) 
21112                axis->referenceLUT = 1;
 
21115#if(VKFFT_BACKEND==0) 
21116                resFFT = 
allocateFFTBuffer(app, &axis->bufferLUT, &axis->bufferLUTDeviceMemory, VK_BUFFER_USAGE_STORAGE_BUFFER_BIT | VK_BUFFER_USAGE_TRANSFER_SRC_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT, VK_MEMORY_HEAP_DEVICE_LOCAL_BIT, axis->bufferLUTSize);
 
21130#elif(VKFFT_BACKEND==1) 
21131                res = cudaMalloc((
void**)&axis->bufferLUT, axis->bufferLUTSize);
 
21132                if (res != cudaSuccess) {
 
21138                res = cudaMemcpy(axis->bufferLUT, tempLUT, axis->bufferLUTSize, cudaMemcpyHostToDevice);
 
21139                if (res != cudaSuccess) {
 
21145#elif(VKFFT_BACKEND==2) 
21146                res = hipMalloc((
void**)&axis->bufferLUT, axis->bufferLUTSize);
 
21147                if (res != hipSuccess) {
 
21153                res = hipMemcpy(axis->bufferLUT, tempLUT, axis->bufferLUTSize, hipMemcpyHostToDevice);
 
21154                if (res != hipSuccess) {
 
21160#elif(VKFFT_BACKEND==3) 
21161                axis->bufferLUT = clCreateBuffer(app->
configuration.context[0], CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, axis->bufferLUTSize, tempLUT, &res);
 
21162                if (res != CL_SUCCESS) {
 
21175            float* tempLUT = (
float*)malloc(axis->bufferLUTSize);
 
21182                tempLUT[2 * i] = (float)cos(angle);
 
21183                tempLUT[2 * i + 1] = (float)sin(angle);
 
21185            axis->referenceLUT = 0;
 
21188#if(VKFFT_BACKEND==0) 
21192                axis->referenceLUT = 1;
 
21195#if(VKFFT_BACKEND==0) 
21196                resFFT = 
allocateFFTBuffer(app, &axis->bufferLUT, &axis->bufferLUTDeviceMemory, VK_BUFFER_USAGE_STORAGE_BUFFER_BIT | VK_BUFFER_USAGE_TRANSFER_SRC_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT, VK_MEMORY_HEAP_DEVICE_LOCAL_BIT, axis->bufferLUTSize);
 
21210#elif(VKFFT_BACKEND==1) 
21211                res = cudaMalloc((
void**)&axis->bufferLUT, axis->bufferLUTSize);
 
21212                if (res != cudaSuccess) {
 
21218                res = cudaMemcpy(axis->bufferLUT, tempLUT, axis->bufferLUTSize, cudaMemcpyHostToDevice);
 
21219                if (res != cudaSuccess) {
 
21225#elif(VKFFT_BACKEND==2) 
21226                res = hipMalloc((
void**)&axis->bufferLUT, axis->bufferLUTSize);
 
21227                if (res != hipSuccess) {
 
21233                res = hipMemcpy(axis->bufferLUT, tempLUT, axis->bufferLUTSize, hipMemcpyHostToDevice);
 
21234                if (res != hipSuccess) {
 
21240#elif(VKFFT_BACKEND==3) 
21241                axis->bufferLUT = clCreateBuffer(app->
configuration.context[0], CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, axis->bufferLUTSize, tempLUT, &res);
 
21242                if (res != CL_SUCCESS) {
 
21255    uint64_t* axisStride = axis->specializationConstants.inputStride;
 
21256    uint64_t* usedStride = 0;
 
21269    axisStride[0] = usedStride[0];
 
21270    axisStride[1] = usedStride[1];
 
21271    axisStride[2] = usedStride[2];
 
21272    axisStride[3] = usedStride[3];
 
21273    axisStride[4] = usedStride[4];
 
21275    axisStride = axis->specializationConstants.outputStride;
 
21276    usedStride = axis->specializationConstants.inputStride;
 
21278    axisStride[0] = usedStride[0];
 
21279    axisStride[1] = usedStride[1];
 
21280    axisStride[2] = usedStride[2];
 
21281    axisStride[3] = usedStride[3];
 
21282    axisStride[4] = usedStride[4];
 
21284    axis->specializationConstants.inverse = inverse;
 
21286    uint64_t storageComplexSize;
 
21288        storageComplexSize = (2 * 
sizeof(double));
 
21291            storageComplexSize = (2 * 2);
 
21293            storageComplexSize = (2 * 
sizeof(float));
 
21295    uint64_t initPageSize = -1;
 
21296    uint64_t locBufferNum = 1;
 
21297    uint64_t locBufferSize = 0;
 
21311    uint64_t axis_id = 0;
 
21312    uint64_t axis_upload_id = 0;
 
21315        uint64_t totalSize = 0;
 
21316        uint64_t locPageSize = initPageSize;
 
21319                ((axis_id == app->
firstAxis) && (!inverse))
 
21322                uint64_t totalSize = 0;
 
21323                uint64_t locPageSize = initPageSize;
 
21332                axis->specializationConstants.inputBufferBlockSize = (locBufferNum == 1) ? locBufferSize : (uint64_t)ceil(locPageSize / (
double)storageComplexSize);
 
21333                axis->specializationConstants.inputBufferBlockNum = (locBufferNum == 1) ? 1 : (uint64_t)ceil(totalSize / (
double)(axis->specializationConstants.inputBufferBlockSize * storageComplexSize));
 
21339                    uint64_t totalSize = 0;
 
21340                    uint64_t locPageSize = initPageSize;
 
21349                    axis->specializationConstants.inputBufferBlockSize = (locBufferNum == 1) ? locBufferSize : (uint64_t)ceil(locPageSize / (
double)storageComplexSize);
 
21350                    axis->specializationConstants.inputBufferBlockNum = (locBufferNum == 1) ? 1 : (uint64_t)ceil(totalSize / (
double)(axis->specializationConstants.inputBufferBlockSize * storageComplexSize));
 
21355                    uint64_t totalSize = 0;
 
21356                    uint64_t locPageSize = initPageSize;
 
21366                    axis->specializationConstants.inputBufferBlockSize = (locBufferNum == 1) ? locBufferSize : (uint64_t)ceil(locPageSize / (
double)storageComplexSize);
 
21367                    axis->specializationConstants.inputBufferBlockNum = (locBufferNum == 1) ? 1 : (uint64_t)ceil(totalSize / (
double)(axis->specializationConstants.inputBufferBlockSize * storageComplexSize));
 
21375                ((axis_id == app->
firstAxis) && (inverse))
 
21380                    ((axis_id == app->
firstAxis) && (inverse))
 
21387                uint64_t totalSize = 0;
 
21388                uint64_t locPageSize = initPageSize;
 
21397                axis->specializationConstants.inputBufferBlockSize = (locBufferNum == 1) ? locBufferSize : (uint64_t)ceil(locPageSize / (
double)storageComplexSize);
 
21398                axis->specializationConstants.inputBufferBlockSize = (locBufferNum == 1) ? 1 : (uint64_t)ceil(totalSize / (
double)(axis->specializationConstants.inputBufferBlockSize * storageComplexSize));
 
21403                uint64_t totalSize = 0;
 
21404                uint64_t locPageSize = initPageSize;
 
21414                axis->specializationConstants.inputBufferBlockSize = (locBufferNum == 1) ? locBufferSize : (uint64_t)ceil(locPageSize / (
double)storageComplexSize);
 
21415                axis->specializationConstants.inputBufferBlockSize = (locBufferNum == 1) ? 1 : (uint64_t)ceil(totalSize / (
double)(axis->specializationConstants.inputBufferBlockSize * storageComplexSize));
 
21423    locBufferSize = -1;
 
21427                uint64_t totalSize = 0;
 
21428                uint64_t locPageSize = initPageSize;
 
21437                axis->specializationConstants.outputBufferBlockSize = (locBufferNum == 1) ? locBufferSize : (uint64_t)ceil(locPageSize / (
double)storageComplexSize);
 
21438                axis->specializationConstants.outputBufferBlockNum = (locBufferNum == 1) ? 1 : (uint64_t)ceil(totalSize / (
double)(axis->specializationConstants.outputBufferBlockSize * storageComplexSize));
 
21443                uint64_t totalSize = 0;
 
21444                uint64_t locPageSize = initPageSize;
 
21454                axis->specializationConstants.outputBufferBlockSize = (locBufferNum == 1) ? locBufferSize : (uint64_t)ceil(locPageSize / (
double)storageComplexSize);
 
21455                axis->specializationConstants.outputBufferBlockNum = (locBufferNum == 1) ? 1 : (uint64_t)ceil(totalSize / (
double)(axis->specializationConstants.outputBufferBlockSize * storageComplexSize));
 
21462                ((axis_id == app->
firstAxis) && (inverse))
 
21467                    ((axis_id == app->
firstAxis) && (inverse))
 
21474                uint64_t totalSize = 0;
 
21475                uint64_t locPageSize = initPageSize;
 
21484                axis->specializationConstants.outputBufferBlockSize = (locBufferNum == 1) ? locBufferSize : (uint64_t)ceil(locPageSize / (
double)storageComplexSize);
 
21485                axis->specializationConstants.outputBufferBlockNum = (locBufferNum == 1) ? 1 : (uint64_t)ceil(totalSize / (
double)(axis->specializationConstants.outputBufferBlockSize * storageComplexSize));
 
21490                uint64_t totalSize = 0;
 
21491                uint64_t locPageSize = initPageSize;
 
21501                axis->specializationConstants.outputBufferBlockSize = (locBufferNum == 1) ? locBufferSize : (uint64_t)ceil(locPageSize / (
double)storageComplexSize);
 
21502                axis->specializationConstants.outputBufferBlockNum = (locBufferNum == 1) ? 1 : (uint64_t)ceil(totalSize / (
double)(axis->specializationConstants.outputBufferBlockSize * storageComplexSize));
 
21509    if (axis->specializationConstants.inputBufferBlockNum == 0) axis->specializationConstants.inputBufferBlockNum = 1;
 
21510    if (axis->specializationConstants.outputBufferBlockNum == 0) axis->specializationConstants.outputBufferBlockNum = 1;
 
21513        uint64_t totalSize = 0;
 
21514        uint64_t locPageSize = initPageSize;
 
21521        axis->specializationConstants.kernelBlockSize = (uint64_t)ceil(locPageSize / (
double)storageComplexSize);
 
21522        axis->specializationConstants.kernelBlockNum = (uint64_t)ceil(totalSize / (
double)(axis->specializationConstants.kernelBlockSize * storageComplexSize));
 
21524        if (axis->specializationConstants.kernelBlockNum == 0) axis->specializationConstants.kernelBlockNum = 1;
 
21527        axis->specializationConstants.kernelBlockSize = 0;
 
21528        axis->specializationConstants.kernelBlockNum = 0;
 
21530    axis->numBindings = 2;
 
21531    axis->specializationConstants.numBuffersBound[0] = axis->specializationConstants.inputBufferBlockNum;
 
21532    axis->specializationConstants.numBuffersBound[1] = axis->specializationConstants.outputBufferBlockNum;
 
21533    axis->specializationConstants.numBuffersBound[2] = 0;
 
21534    axis->specializationConstants.numBuffersBound[3] = 0;
 
21536#if(VKFFT_BACKEND==0) 
21537    VkDescriptorPoolSize descriptorPoolSize = { VK_DESCRIPTOR_TYPE_STORAGE_BUFFER };
 
21538    descriptorPoolSize.descriptorCount = (uint32_t)(axis->specializationConstants.numBuffersBound[0] + axis->specializationConstants.numBuffersBound[1]);
 
21541        axis->specializationConstants.numBuffersBound[axis->numBindings] = axis->specializationConstants.kernelBlockNum;
 
21542#if(VKFFT_BACKEND==0) 
21543        descriptorPoolSize.descriptorCount += (uint32_t)axis->specializationConstants.kernelBlockNum;
 
21545        axis->numBindings++;
 
21549        axis->specializationConstants.numBuffersBound[axis->numBindings] = 1;
 
21550#if(VKFFT_BACKEND==0) 
21551        descriptorPoolSize.descriptorCount++;
 
21553        axis->numBindings++;
 
21555#if(VKFFT_BACKEND==0) 
21556    VkDescriptorPoolCreateInfo descriptorPoolCreateInfo = { VK_STRUCTURE_TYPE_DESCRIPTOR_POOL_CREATE_INFO };
 
21557    descriptorPoolCreateInfo.poolSizeCount = 1;
 
21558    descriptorPoolCreateInfo.pPoolSizes = &descriptorPoolSize;
 
21559    descriptorPoolCreateInfo.maxSets = 1;
 
21560    res = vkCreateDescriptorPool(app->
configuration.
device[0], &descriptorPoolCreateInfo, 0, &axis->descriptorPool);
 
21561    if (res != VK_SUCCESS) {
 
21565    const VkDescriptorType descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER;
 
21566    VkDescriptorSetLayoutBinding* descriptorSetLayoutBindings;
 
21567    descriptorSetLayoutBindings = (VkDescriptorSetLayoutBinding*)malloc(axis->numBindings * 
sizeof(VkDescriptorSetLayoutBinding));
 
21568    if (!descriptorSetLayoutBindings) {
 
21572    for (uint64_t i = 0; i < axis->numBindings; ++i) {
 
21573        descriptorSetLayoutBindings[i].binding = (uint32_t)i;
 
21574        descriptorSetLayoutBindings[i].descriptorType = descriptorType;
 
21575        descriptorSetLayoutBindings[i].descriptorCount = (uint32_t)axis->specializationConstants.numBuffersBound[i];
 
21576        descriptorSetLayoutBindings[i].stageFlags = VK_SHADER_STAGE_COMPUTE_BIT;
 
21579    VkDescriptorSetLayoutCreateInfo descriptorSetLayoutCreateInfo = { VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_CREATE_INFO };
 
21580    descriptorSetLayoutCreateInfo.bindingCount = (uint32_t)axis->numBindings;
 
21581    descriptorSetLayoutCreateInfo.pBindings = descriptorSetLayoutBindings;
 
21583    res = vkCreateDescriptorSetLayout(app->
configuration.
device[0], &descriptorSetLayoutCreateInfo, 0, &axis->descriptorSetLayout);
 
21584    if (res != VK_SUCCESS) {
 
21588    free(descriptorSetLayoutBindings);
 
21589    descriptorSetLayoutBindings = 0;
 
21590    VkDescriptorSetAllocateInfo descriptorSetAllocateInfo = { VK_STRUCTURE_TYPE_DESCRIPTOR_SET_ALLOCATE_INFO };
 
21591    descriptorSetAllocateInfo.descriptorPool = axis->descriptorPool;
 
21592    descriptorSetAllocateInfo.descriptorSetCount = 1;
 
21593    descriptorSetAllocateInfo.pSetLayouts = &axis->descriptorSetLayout;
 
21594    res = vkAllocateDescriptorSets(app->
configuration.
device[0], &descriptorSetAllocateInfo, &axis->descriptorSet);
 
21595    if (res != VK_SUCCESS) {
 
21611#if(VKFFT_BACKEND==0) 
21612        VkPipelineLayoutCreateInfo pipelineLayoutCreateInfo = { VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO };
 
21613        pipelineLayoutCreateInfo.setLayoutCount = 1;
 
21614        pipelineLayoutCreateInfo.pSetLayouts = &axis->descriptorSetLayout;
 
21616        VkPushConstantRange pushConstantRange = { VK_SHADER_STAGE_COMPUTE_BIT };
 
21617        pushConstantRange.offset = 0;
 
21620        pipelineLayoutCreateInfo.pushConstantRangeCount = 1;
 
21621        pipelineLayoutCreateInfo.pPushConstantRanges = &pushConstantRange;
 
21623        res = vkCreatePipelineLayout(app->
configuration.
device[0], &pipelineLayoutCreateInfo, 0, &axis->pipelineLayout);
 
21624        if (res != VK_SUCCESS) {
 
21629        axis->axisBlock[0] = 128;
 
21631        axis->axisBlock[1] = 1;
 
21632        axis->axisBlock[2] = 1;
 
21638        else  axis->specializationConstants.performWorkGroupShift[0] = 0;
 
21640        else  axis->specializationConstants.performWorkGroupShift[1] = 0;
 
21642        else  axis->specializationConstants.performWorkGroupShift[2] = 0;
 
21644        axis->specializationConstants.localSize[0] = axis->axisBlock[0];
 
21645        axis->specializationConstants.localSize[1] = axis->axisBlock[1];
 
21646        axis->specializationConstants.localSize[2] = axis->axisBlock[2];
 
21663        axis->specializationConstants.axis_id = 0;
 
21664        axis->specializationConstants.axis_upload_id = 0;
 
21666        for (uint64_t i = 0; i < 3; i++) {
 
21679                axis->specializationConstants.zeropad[0] = 0;
 
21686                axis->specializationConstants.zeropad[1] = 0;
 
21695                axis->specializationConstants.zeropad[0] = 0;
 
21702                axis->specializationConstants.zeropad[1] = 0;
 
21705            axis->specializationConstants.convolutionStep = 1;
 
21708            axis->specializationConstants.convolutionStep = 0;
 
21709        char floatTypeInputMemory[10];
 
21710        char floatTypeOutputMemory[10];
 
21711        char floatTypeKernelMemory[10];
 
21712        char floatType[10];
 
21713        axis->specializationConstants.unroll = 1;
 
21716            sprintf(floatType, 
"double");
 
21717            sprintf(floatTypeInputMemory, 
"double");
 
21718            sprintf(floatTypeOutputMemory, 
"double");
 
21719            sprintf(floatTypeKernelMemory, 
"double");
 
21725                sprintf(floatType, 
"float");
 
21728                    sprintf(floatTypeInputMemory, 
"float");
 
21729                    sprintf(floatTypeOutputMemory, 
"float");
 
21730                    sprintf(floatTypeKernelMemory, 
"float");
 
21733                    sprintf(floatTypeInputMemory, 
"half");
 
21734                    sprintf(floatTypeOutputMemory, 
"half");
 
21735                    sprintf(floatTypeKernelMemory, 
"half");
 
21741                    sprintf(floatType, 
"double");
 
21742                    sprintf(floatTypeInputMemory, 
"float");
 
21743                    sprintf(floatTypeOutputMemory, 
"float");
 
21744                    sprintf(floatTypeKernelMemory, 
"float");
 
21747                    sprintf(floatType, 
"float");
 
21748                    sprintf(floatTypeInputMemory, 
"float");
 
21749                    sprintf(floatTypeOutputMemory, 
"float");
 
21750                    sprintf(floatTypeKernelMemory, 
"float");
 
21754        char uintType[20] = 
"";
 
21756#if(VKFFT_BACKEND==0) 
21757            sprintf(uintType, 
"uint");
 
21758#elif(VKFFT_BACKEND==1) 
21759            sprintf(uintType, 
"unsigned int");
 
21760#elif(VKFFT_BACKEND==2) 
21761            sprintf(uintType, 
"unsigned int");
 
21762#elif(VKFFT_BACKEND==3) 
21763            sprintf(uintType, 
"unsigned int");
 
21767#if(VKFFT_BACKEND==0) 
21768            sprintf(uintType, 
"uint64_t");
 
21769#elif(VKFFT_BACKEND==1) 
21770            sprintf(uintType, 
"unsigned long long");
 
21771#elif(VKFFT_BACKEND==2) 
21772            sprintf(uintType, 
"unsigned long long");
 
21773#elif(VKFFT_BACKEND==3) 
21774            sprintf(uintType, 
"unsigned long");
 
21783        char* code0 = axis->specializationConstants.code0;
 
21788        resFFT = 
shaderGenVkFFT_R2C_decomposition(code0, &axis->specializationConstants, floatType, floatTypeInputMemory, floatTypeOutputMemory, floatTypeKernelMemory, uintType, type);
 
21794#if(VKFFT_BACKEND==0) 
21795        const glslang_resource_t default_resource = {
 
21901        glslang_target_client_version_t client_version = (app->
configuration.
halfPrecision) ? GLSLANG_TARGET_VULKAN_1_1 : GLSLANG_TARGET_VULKAN_1_0;
 
21902        glslang_target_language_version_t target_language_version = (app->
configuration.
halfPrecision) ? GLSLANG_TARGET_SPV_1_3 : GLSLANG_TARGET_SPV_1_0;
 
21903        const glslang_input_t input =
 
21905            GLSLANG_SOURCE_GLSL,
 
21906            GLSLANG_STAGE_COMPUTE,
 
21907            GLSLANG_CLIENT_VULKAN,
 
21909            GLSLANG_TARGET_SPV,
 
21910            target_language_version,
 
21913            GLSLANG_NO_PROFILE,
 
21916            GLSLANG_MSG_DEFAULT_BIT,
 
21920        glslang_shader_t* shader = glslang_shader_create(&input);
 
21922        if (!glslang_shader_preprocess(shader, &input))
 
21924            err = glslang_shader_get_info_log(shader);
 
21925            printf(
"%s\n", code0);
 
21926            printf(
"%s\nVkFFT shader type: %" PRIu64 
"\n", err, type);
 
21927            glslang_shader_delete(shader);
 
21935        if (!glslang_shader_parse(shader, &input))
 
21937            err = glslang_shader_get_info_log(shader);
 
21938            printf(
"%s\n", code0);
 
21939            printf(
"%s\nVkFFT shader type: %" PRIu64 
"\n", err, type);
 
21940            glslang_shader_delete(shader);
 
21947        glslang_program_t* program = glslang_program_create();
 
21948        glslang_program_add_shader(program, shader);
 
21949        if (!glslang_program_link(program, GLSLANG_MSG_SPV_RULES_BIT | GLSLANG_MSG_VULKAN_RULES_BIT))
 
21951            err = glslang_program_get_info_log(program);
 
21952            printf(
"%s\n", code0);
 
21953            printf(
"%s\nVkFFT shader type: %" PRIu64 
"\n", err, type);
 
21954            glslang_shader_delete(shader);
 
21955            glslang_program_delete(program);
 
21978        glslang_shader_delete(shader);
 
21979        VkPipelineShaderStageCreateInfo pipelineShaderStageCreateInfo = { VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO };
 
21980        VkComputePipelineCreateInfo computePipelineCreateInfo = { VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO };
 
21981        pipelineShaderStageCreateInfo.stage = VK_SHADER_STAGE_COMPUTE_BIT;
 
21982        VkShaderModuleCreateInfo createInfo = { VK_STRUCTURE_TYPE_SHADER_MODULE_CREATE_INFO };
 
21986        res = vkCreateShaderModule(app->
configuration.
device[0], &createInfo, 0, &pipelineShaderStageCreateInfo.module);
 
21987        if (res != VK_SUCCESS) {
 
21988            glslang_program_delete(program);
 
21994        pipelineShaderStageCreateInfo.pName = 
"main";
 
21995        pipelineShaderStageCreateInfo.pSpecializationInfo = 0;
 
21996        computePipelineCreateInfo.stage = pipelineShaderStageCreateInfo;
 
21997        computePipelineCreateInfo.layout = axis->pipelineLayout;
 
21998        res = vkCreateComputePipelines(app->
configuration.
device[0], VK_NULL_HANDLE, 1, &computePipelineCreateInfo, 0, &axis->pipeline);
 
21999        if (res != VK_SUCCESS) {
 
22003        vkDestroyShaderModule(app->
configuration.
device[0], pipelineShaderStageCreateInfo.module, 0);
 
22004        glslang_program_delete(program);
 
22005#elif(VKFFT_BACKEND==1) 
22007        nvrtcResult result = nvrtcCreateProgram(&prog,         
 
22015        if (result != NVRTC_SUCCESS) {
 
22016            printf(
"nvrtcCreateProgram error: %s\n", nvrtcGetErrorString(result));
 
22025        result = nvrtcCompileProgram(prog,  
 
22028        if (result != NVRTC_SUCCESS) {
 
22029            printf(
"nvrtcCompileProgram error: %s\n", nvrtcGetErrorString(result));
 
22030            char* log = (
char*)malloc(
sizeof(
char) * 1000000);
 
22038                nvrtcGetProgramLog(prog, log);
 
22039                printf(
"%s\n", log);
 
22042                printf(
"%s\n", code0);
 
22050        result = nvrtcGetPTXSize(prog, &ptxSize);
 
22051        if (result != NVRTC_SUCCESS) {
 
22052            printf(
"nvrtcGetPTXSize error: %s\n", nvrtcGetErrorString(result));
 
22058        char* ptx = (
char*)malloc(ptxSize);
 
22065        result = nvrtcGetPTX(prog, ptx);
 
22066        if (result != NVRTC_SUCCESS) {
 
22067            printf(
"nvrtcGetPTX error: %s\n", nvrtcGetErrorString(result));
 
22075        result = nvrtcDestroyProgram(&prog);
 
22076        if (result != NVRTC_SUCCESS) {
 
22077            printf(
"nvrtcDestroyProgram error: %s\n", nvrtcGetErrorString(result));
 
22086        CUresult result2 = cuModuleLoadDataEx(&axis->VkFFTModule, ptx, 0, 0, 0);
 
22088        if (result2 != CUDA_SUCCESS) {
 
22089            printf(
"cuModuleLoadDataEx error: %d\n", result2);
 
22097        result2 = cuModuleGetFunction(&axis->VkFFTKernel, axis->VkFFTModule, 
"VkFFT_main_R2C");
 
22098        if (result2 != CUDA_SUCCESS) {
 
22099            printf(
"cuModuleGetFunction error: %d\n", result2);
 
22108            result2 = cuFuncSetAttribute(axis->VkFFTKernel, CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, (
int)axis->specializationConstants.usedSharedMemory);
 
22109            if (result2 != CUDA_SUCCESS) {
 
22110                printf(
"cuFuncSetAttribute error: %d\n", result2);
 
22120        result2 = cuModuleGetGlobal(&axis->consts_addr, &size, axis->VkFFTModule, 
"consts");
 
22121        if (result2 != CUDA_SUCCESS) {
 
22122            printf(
"cuModuleGetGlobal error: %d\n", result2);
 
22132#elif(VKFFT_BACKEND==2) 
22133        hiprtcProgram prog;
 
22138        enum hiprtcResult result = hiprtcCreateProgram(&prog,         
 
22144        if (result != HIPRTC_SUCCESS) {
 
22145            printf(
"hiprtcCreateProgram error: %s\n", hiprtcGetErrorString(result));
 
22152        result = hiprtcAddNameExpression(prog, 
"&consts");
 
22153        if (result != HIPRTC_SUCCESS) {
 
22154            printf(
"hiprtcAddNameExpression error: %s\n", hiprtcGetErrorString(result));
 
22161        result = hiprtcCompileProgram(prog,  
 
22164        if (result != HIPRTC_SUCCESS) {
 
22165            printf(
"hiprtcCompileProgram error: %s\n", hiprtcGetErrorString(result));
 
22166            char* log = (
char*)malloc(
sizeof(
char) * 100000);
 
22174                hiprtcGetProgramLog(prog, log);
 
22175                printf(
"%s\n", log);
 
22178                printf(
"%s\n", code0);
 
22186        result = hiprtcGetCodeSize(prog, &codeSize);
 
22187        if (result != HIPRTC_SUCCESS) {
 
22188            printf(
"hiprtcGetCodeSize error: %s\n", hiprtcGetErrorString(result));
 
22194        char* code = (
char*)malloc(codeSize);
 
22201        result = hiprtcGetCode(prog, code);
 
22202        if (result != HIPRTC_SUCCESS) {
 
22203            printf(
"hiprtcGetCode error: %s\n", hiprtcGetErrorString(result));
 
22213        result = hiprtcDestroyProgram(&prog);
 
22214        if (result != HIPRTC_SUCCESS) {
 
22215            printf(
"hiprtcDestroyProgram error: %s\n", hiprtcGetErrorString(result));
 
22223        hipError_t result2 = hipModuleLoadDataEx(&axis->VkFFTModule, code, 0, 0, 0);
 
22225        if (result2 != hipSuccess) {
 
22226            printf(
"hipModuleLoadDataEx error: %d\n", result2);
 
22234        result2 = hipModuleGetFunction(&axis->VkFFTKernel, axis->VkFFTModule, 
"VkFFT_main_R2C");
 
22235        if (result2 != hipSuccess) {
 
22236            printf(
"hipModuleGetFunction error: %d\n", result2);
 
22245            result2 = hipFuncSetAttribute(axis->VkFFTKernel, hipFuncAttributeMaxDynamicSharedMemorySize, (
int)axis->specializationConstants.usedSharedMemory);
 
22247            if (result2 != hipSuccess) {
 
22248                printf(
"hipFuncSetAttribute error: %d\n", result2);
 
22258        result2 = hipModuleGetGlobal(&axis->consts_addr, &size, axis->VkFFTModule, 
"consts");
 
22259        if (result2 != hipSuccess) {
 
22260            printf(
"hipModuleGetGlobal error: %d\n", result2);
 
22271#elif(VKFFT_BACKEND==3) 
22272        size_t codelen = strlen(code0);
 
22273        axis->program = clCreateProgramWithSource(app->
configuration.context[0], 1, (
const char**)&code0, &codelen, &res);
 
22274        if (res != CL_SUCCESS) {
 
22281        if (res != CL_SUCCESS) {
 
22283            clGetProgramBuildInfo(axis->program, app->
configuration.
device[0], CL_PROGRAM_BUILD_LOG, 0, 0, &log_size);
 
22284            char* log = (
char*)malloc(log_size);
 
22292                clGetProgramBuildInfo(axis->program, app->
configuration.
device[0], CL_PROGRAM_BUILD_LOG, log_size, log, 0);
 
22293                printf(
"%s\n", log);
 
22296                printf(
"%s\n", code0);
 
22303        axis->kernel = clCreateKernel(axis->program, 
"VkFFT_main_R2C", &res);
 
22304        if (res != CL_SUCCESS) {
 
22314            axis->specializationConstants.code0 = 0;
 
 
22322#if(VKFFT_BACKEND==0) 
22323    VkResult res = VK_SUCCESS;
 
22324#elif(VKFFT_BACKEND==1) 
22325    cudaError_t res = cudaSuccess;
 
22326#elif(VKFFT_BACKEND==2) 
22327    hipError_t res = hipSuccess;
 
22328#elif(VKFFT_BACKEND==3) 
22329    cl_int res = CL_SUCCESS;
 
22333    axis->specializationConstants.sourceFFTSize = app->
configuration.
size[axis_id];
 
22346    axis->specializationConstants.numAxisUploads = FFTPlan->
numAxisUploads[axis_id];
 
22347    uint64_t complexSize;
 
22349        complexSize = (2 * 
sizeof(double));
 
22352            complexSize = (2 * 
sizeof(float));
 
22354            complexSize = (2 * 
sizeof(float));
 
22355    axis->specializationConstants.complexSize = complexSize;
 
22356    axis->specializationConstants.supportAxis = 0;
 
22366    axis->specializationConstants.stageStartSize = 1;
 
22367    for (uint64_t i = 0; i < axis_upload_id; i++)
 
22368        axis->specializationConstants.stageStartSize *= FFTPlan->
axisSplit[axis_id][i];
 
22373    if (axis_id == 0) {
 
22375        axis->specializationConstants.fft_dim_x = axis->specializationConstants.stageStartSize;
 
22381        axis->specializationConstants.useBluesteinFFT = 1;
 
22385        axis->specializationConstants.actualInverse = inverse;
 
22386        axis->specializationConstants.inverse = !inverse;
 
22390            axis->specializationConstants.actualInverse = inverse;
 
22391            axis->specializationConstants.inverse = 1;
 
22394            axis->specializationConstants.actualInverse = inverse;
 
22395            axis->specializationConstants.inverse = inverse;
 
22399        axis->specializationConstants.actualInverse = inverse;
 
22400        axis->specializationConstants.inverse = reverseBluesteinMultiUpload;
 
22402            axis->specializationConstants.inverseBluestein = !inverse;
 
22406                axis->specializationConstants.inverseBluestein = 1;
 
22409                axis->specializationConstants.inverseBluestein = inverse;
 
22413    axis->specializationConstants.reverseBluesteinMultiUpload = reverseBluesteinMultiUpload;
 
22417    if ((axis_id == 0) && ((FFTPlan->
numAxisUploads[axis_id] == 1) || ((axis_upload_id == 0) && (!axis->specializationConstants.reorderFourStep)))) {
 
22418        maxSequenceLengthSharedMemory *= axis->specializationConstants.registerBoost;
 
22419        maxSequenceLengthSharedMemoryPow2 = (uint64_t)pow(2, (uint64_t)log2(maxSequenceLengthSharedMemory));
 
22422        maxSingleSizeStrided *= axis->specializationConstants.registerBoost;
 
22423        maxSingleSizeStridedPow2 = (uint64_t)pow(2, (uint64_t)log2(maxSingleSizeStrided));
 
22427    axis->specializationConstants.performR2CmultiUpload = FFTPlan->
multiUploadR2C;
 
22429        axis->specializationConstants.performDCT = 2;
 
22441#if(VKFFT_BACKEND==0) 
22452#elif(VKFFT_BACKEND==1) 
22459        if (res != cudaSuccess) {
 
22463#elif(VKFFT_BACKEND==2) 
22470        if (res != hipSuccess) {
 
22474#elif(VKFFT_BACKEND==3) 
22481        if (res != CL_SUCCESS) {
 
22489        double double_PI = 3.1415926535897932384626433832795;
 
22490        uint64_t dimMult = 1;
 
22491        uint64_t maxStageSum = 0;
 
22492        for (uint64_t i = 0; i < axis->specializationConstants.numStages; i++) {
 
22493            switch (axis->specializationConstants.stageRadix[i]) {
 
22495                maxStageSum += dimMult;
 
22498                maxStageSum += dimMult * 2;
 
22501                maxStageSum += dimMult * 2;
 
22504                maxStageSum += dimMult * 4;
 
22507                maxStageSum += dimMult * 6;
 
22510                maxStageSum += dimMult * 3;
 
22513                maxStageSum += dimMult * 10;
 
22516                maxStageSum += dimMult * 12;
 
22519            dimMult *= axis->specializationConstants.stageRadix[i];
 
22521        axis->specializationConstants.maxStageSumLUT = maxStageSum;
 
22524            if (axis_upload_id > 0) {
 
22526                    axis->specializationConstants.startDCT3LUT = (maxStageSum + axis->specializationConstants.stageStartSize * axis->specializationConstants.fftDim);
 
22527                    axis->bufferLUTSize = (maxStageSum + axis->specializationConstants.stageStartSize * axis->specializationConstants.fftDim + (app->
configuration.
size[axis_id] / 2 + 2)) * 2 * 
sizeof(
double);
 
22531                        axis->specializationConstants.startDCT3LUT = (maxStageSum + axis->specializationConstants.stageStartSize * axis->specializationConstants.fftDim);
 
22532                        axis->specializationConstants.startDCT4LUT = (axis->specializationConstants.startDCT3LUT + (app->
configuration.
size[axis_id] / 4 + 2));
 
22533                        axis->bufferLUTSize = (maxStageSum + axis->specializationConstants.stageStartSize * axis->specializationConstants.fftDim + (app->
configuration.
size[axis_id] / 4 + 2) + app->
configuration.
size[axis_id] / 2) * 2 * 
sizeof(
double);
 
22536                        axis->bufferLUTSize = (maxStageSum + axis->specializationConstants.stageStartSize * axis->specializationConstants.fftDim) * 2 * 
sizeof(
double);
 
22541                    axis->specializationConstants.startDCT3LUT = (maxStageSum);
 
22542                    axis->bufferLUTSize = (maxStageSum + (app->
configuration.
size[axis_id] / 2 + 2)) * 2 * 
sizeof(
double);
 
22546                        axis->specializationConstants.startDCT3LUT = (maxStageSum);
 
22547                        axis->specializationConstants.startDCT4LUT = (axis->specializationConstants.startDCT3LUT + (app->
configuration.
size[axis_id] / 4 + 2));
 
22552                        axis->bufferLUTSize = (maxStageSum) * 2 * 
sizeof(
double);
 
22555            double* tempLUT = (
double*)malloc(axis->bufferLUTSize);
 
22560            uint64_t localStageSize = 1;
 
22561            uint64_t localStageSum = 0;
 
22562            for (uint64_t i = 0; i < axis->specializationConstants.numStages; i++) {
 
22563                if ((axis->specializationConstants.stageRadix[i] & (axis->specializationConstants.stageRadix[i] - 1)) == 0) {
 
22564                    for (uint64_t k = 0; k < log2(axis->specializationConstants.stageRadix[i]); k++) {
 
22565                        for (uint64_t j = 0; j < localStageSize; j++) {
 
22566                            tempLUT[2 * (j + localStageSum)] = cos(j * double_PI / localStageSize / pow(2, k));
 
22567                            tempLUT[2 * (j + localStageSum) + 1] = sin(j * double_PI / localStageSize / pow(2, k));
 
22569                        localStageSum += localStageSize;
 
22571                    localStageSize *= axis->specializationConstants.stageRadix[i];
 
22574                    for (uint64_t k = (axis->specializationConstants.stageRadix[i] - 1); k > 0; k--) {
 
22575                        for (uint64_t j = 0; j < localStageSize; j++) {
 
22576                            tempLUT[2 * (j + localStageSum)] = cos(j * 2.0 * k / axis->specializationConstants.stageRadix[i] * double_PI / localStageSize);
 
22577                            tempLUT[2 * (j + localStageSum) + 1] = sin(j * 2.0 * k / axis->specializationConstants.stageRadix[i] * double_PI / localStageSize);
 
22579                        localStageSum += localStageSize;
 
22581                    localStageSize *= axis->specializationConstants.stageRadix[i];
 
22585            if (axis_upload_id > 0) {
 
22586                for (uint64_t i = 0; i < axis->specializationConstants.stageStartSize; i++) {
 
22587                    for (uint64_t j = 0; j < axis->specializationConstants.fftDim; j++) {
 
22588                        double angle = 2 * double_PI * ((i * j) / (double)(axis->specializationConstants.stageStartSize * axis->specializationConstants.fftDim));
 
22589                        tempLUT[maxStageSum * 2 + 2 * (i + j * axis->specializationConstants.stageStartSize)] = cos(angle);
 
22590                        tempLUT[maxStageSum * 2 + 2 * (i + j * axis->specializationConstants.stageStartSize) + 1] = sin(angle);
 
22596                    double angle = (double_PI / 2.0 / (double)(app->
configuration.
size[axis_id])) * j;
 
22597                    tempLUT[2 * axis->specializationConstants.startDCT3LUT + 2 * j] = cos(angle);
 
22598                    tempLUT[2 * axis->specializationConstants.startDCT3LUT + 2 * j + 1] = sin(angle);
 
22603                    double angle = (double_PI / 2.0 / (double)(app->
configuration.
size[axis_id] / 2)) * j;
 
22604                    tempLUT[2 * axis->specializationConstants.startDCT3LUT + 2 * j] = cos(angle);
 
22605                    tempLUT[2 * axis->specializationConstants.startDCT3LUT + 2 * j + 1] = sin(angle);
 
22608                    double angle = (-double_PI / 8.0 / (double)(app->
configuration.
size[axis_id] / 2)) * (2 * j + 1);
 
22609                    tempLUT[2 * axis->specializationConstants.startDCT4LUT + 2 * j] = cos(angle);
 
22610                    tempLUT[2 * axis->specializationConstants.startDCT4LUT + 2 * j + 1] = sin(angle);
 
22613            axis->referenceLUT = 0;
 
22614            if (reverseBluesteinMultiUpload == 1) {
 
22615                axis->bufferLUT = FFTPlan->
axes[axis_id][axis_upload_id].
bufferLUT;
 
22616#if(VKFFT_BACKEND==0) 
22620                axis->referenceLUT = 1;
 
22625#if(VKFFT_BACKEND==0) 
22629                    axis->referenceLUT = 1;
 
22633                        axis->bufferLUT = FFTPlan->
axes[0][axis_upload_id].
bufferLUT;
 
22634#if(VKFFT_BACKEND==0) 
22638                        axis->referenceLUT = 1;
 
22642                            axis->bufferLUT = FFTPlan->
axes[1][axis_upload_id].
bufferLUT;
 
22643#if(VKFFT_BACKEND==0) 
22647                            axis->referenceLUT = 1;
 
22650#if(VKFFT_BACKEND==0) 
22651                            resFFT = 
allocateFFTBuffer(app, &axis->bufferLUT, &axis->bufferLUTDeviceMemory, VK_BUFFER_USAGE_STORAGE_BUFFER_BIT | VK_BUFFER_USAGE_TRANSFER_SRC_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT, VK_MEMORY_HEAP_DEVICE_LOCAL_BIT, axis->bufferLUTSize);
 
22665#elif(VKFFT_BACKEND==1) 
22666                            res = cudaMalloc((
void**)&axis->bufferLUT, axis->bufferLUTSize);
 
22667                            if (res != cudaSuccess) {
 
22673                            res = cudaMemcpy(axis->bufferLUT, tempLUT, axis->bufferLUTSize, cudaMemcpyHostToDevice);
 
22674                            if (res != cudaSuccess) {
 
22680#elif(VKFFT_BACKEND==2) 
22681                            res = hipMalloc((
void**)&axis->bufferLUT, axis->bufferLUTSize);
 
22682                            if (res != hipSuccess) {
 
22688                            res = hipMemcpy(axis->bufferLUT, tempLUT, axis->bufferLUTSize, hipMemcpyHostToDevice);
 
22689                            if (res != hipSuccess) {
 
22695#elif(VKFFT_BACKEND==3) 
22696                            axis->bufferLUT = clCreateBuffer(app->
configuration.context[0], CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, axis->bufferLUTSize, tempLUT, &res);
 
22697                            if (res != CL_SUCCESS) {
 
22712            if (axis_upload_id > 0) {
 
22714                    axis->specializationConstants.startDCT3LUT = (maxStageSum + axis->specializationConstants.stageStartSize * axis->specializationConstants.fftDim);
 
22715                    axis->bufferLUTSize = (maxStageSum + axis->specializationConstants.stageStartSize * axis->specializationConstants.fftDim + (app->
configuration.
size[axis_id] / 2 + 2)) * 2 * 
sizeof(
float);
 
22719                        axis->specializationConstants.startDCT3LUT = (maxStageSum + axis->specializationConstants.stageStartSize * axis->specializationConstants.fftDim);
 
22720                        axis->specializationConstants.startDCT4LUT = (axis->specializationConstants.startDCT3LUT + (axis->specializationConstants.fftDim / 4 + 2));
 
22721                        axis->bufferLUTSize = (maxStageSum + axis->specializationConstants.stageStartSize * axis->specializationConstants.fftDim + (app->
configuration.
size[axis_id] / 4 + 2) + app->
configuration.
size[axis_id] / 2) * 2 * 
sizeof(
float);
 
22724                        axis->bufferLUTSize = (maxStageSum + axis->specializationConstants.stageStartSize * axis->specializationConstants.fftDim) * 2 * 
sizeof(
float);
 
22729                    axis->specializationConstants.startDCT3LUT = (maxStageSum);
 
22730                    axis->bufferLUTSize = (maxStageSum + (app->
configuration.
size[axis_id] / 2 + 2)) * 2 * 
sizeof(
float);
 
22734                        axis->specializationConstants.startDCT3LUT = (maxStageSum);
 
22735                        axis->specializationConstants.startDCT4LUT = (axis->specializationConstants.startDCT3LUT + (app->
configuration.
size[axis_id] / 4 + 2));
 
22739                        axis->bufferLUTSize = (maxStageSum) * 2 * 
sizeof(
float);
 
22742            float* tempLUT = (
float*)malloc(axis->bufferLUTSize);
 
22747            uint64_t localStageSize = 1;
 
22748            uint64_t localStageSum = 0;
 
22749            for (uint64_t i = 0; i < axis->specializationConstants.numStages; i++) {
 
22750                if ((axis->specializationConstants.stageRadix[i] & (axis->specializationConstants.stageRadix[i] - 1)) == 0) {
 
22751                    for (uint64_t k = 0; k < log2(axis->specializationConstants.stageRadix[i]); k++) {
 
22752                        for (uint64_t j = 0; j < localStageSize; j++) {
 
22753                            tempLUT[2 * (j + localStageSum)] = (
float)cos(j * double_PI / localStageSize / pow(2, k));
 
22754                            tempLUT[2 * (j + localStageSum) + 1] = (
float)sin(j * double_PI / localStageSize / pow(2, k));
 
22756                        localStageSum += localStageSize;
 
22758                    localStageSize *= axis->specializationConstants.stageRadix[i];
 
22761                    for (uint64_t k = (axis->specializationConstants.stageRadix[i] - 1); k > 0; k--) {
 
22762                        for (uint64_t j = 0; j < localStageSize; j++) {
 
22763                            tempLUT[2 * (j + localStageSum)] = (
float)cos(j * 2.0 * k / axis->specializationConstants.stageRadix[i] * double_PI / localStageSize);
 
22764                            tempLUT[2 * (j + localStageSum) + 1] = (
float)sin(j * 2.0 * k / axis->specializationConstants.stageRadix[i] * double_PI / localStageSize);
 
22766                        localStageSum += localStageSize;
 
22768                    localStageSize *= axis->specializationConstants.stageRadix[i];
 
22772            if (axis_upload_id > 0) {
 
22773                for (uint64_t i = 0; i < axis->specializationConstants.stageStartSize; i++) {
 
22774                    for (uint64_t j = 0; j < axis->specializationConstants.fftDim; j++) {
 
22775                        double angle = 2 * double_PI * ((i * j) / (double)(axis->specializationConstants.stageStartSize * axis->specializationConstants.fftDim));
 
22776                        tempLUT[maxStageSum * 2 + 2 * (i + j * axis->specializationConstants.stageStartSize)] = (
float)cos(angle);
 
22777                        tempLUT[maxStageSum * 2 + 2 * (i + j * axis->specializationConstants.stageStartSize) + 1] = (
float)sin(angle);
 
22783                    double angle = (double_PI / 2.0 / (double)(app->
configuration.
size[axis_id])) * j;
 
22784                    tempLUT[2 * axis->specializationConstants.startDCT3LUT + 2 * j] = (float)cos(angle);
 
22785                    tempLUT[2 * axis->specializationConstants.startDCT3LUT + 2 * j + 1] = (float)sin(angle);
 
22790                    double angle = (double_PI / 2.0 / (double)(app->
configuration.
size[axis_id] / 2)) * j;
 
22791                    tempLUT[2 * axis->specializationConstants.startDCT3LUT + 2 * j] = (float)cos(angle);
 
22792                    tempLUT[2 * axis->specializationConstants.startDCT3LUT + 2 * j + 1] = (float)sin(angle);
 
22795                    double angle = (-double_PI / 8.0 / (double)(app->
configuration.
size[axis_id] / 2)) * (2 * j + 1);
 
22796                    tempLUT[2 * axis->specializationConstants.startDCT4LUT + 2 * j] = (float)cos(angle);
 
22797                    tempLUT[2 * axis->specializationConstants.startDCT4LUT + 2 * j + 1] = (float)sin(angle);
 
22800            axis->referenceLUT = 0;
 
22801            if (reverseBluesteinMultiUpload == 1) {
 
22802                axis->bufferLUT = FFTPlan->
axes[axis_id][axis_upload_id].
bufferLUT;
 
22803#if(VKFFT_BACKEND==0) 
22807                axis->referenceLUT = 1;
 
22812#if(VKFFT_BACKEND==0) 
22816                    axis->referenceLUT = 1;
 
22820                        axis->bufferLUT = FFTPlan->
axes[0][axis_upload_id].
bufferLUT;
 
22821#if(VKFFT_BACKEND==0) 
22825                        axis->referenceLUT = 1;
 
22829                            axis->bufferLUT = FFTPlan->
axes[1][axis_upload_id].
bufferLUT;
 
22830#if(VKFFT_BACKEND==0) 
22834                            axis->referenceLUT = 1;
 
22837#if(VKFFT_BACKEND==0) 
22838                            resFFT = 
allocateFFTBuffer(app, &axis->bufferLUT, &axis->bufferLUTDeviceMemory, VK_BUFFER_USAGE_STORAGE_BUFFER_BIT | VK_BUFFER_USAGE_TRANSFER_SRC_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT, VK_MEMORY_HEAP_DEVICE_LOCAL_BIT, axis->bufferLUTSize);
 
22852#elif(VKFFT_BACKEND==1) 
22853                            res = cudaMalloc((
void**)&axis->bufferLUT, axis->bufferLUTSize);
 
22854                            if (res != cudaSuccess) {
 
22860                            res = cudaMemcpy(axis->bufferLUT, tempLUT, axis->bufferLUTSize, cudaMemcpyHostToDevice);
 
22861                            if (res != cudaSuccess) {
 
22867#elif(VKFFT_BACKEND==2) 
22868                            res = hipMalloc((
void**)&axis->bufferLUT, axis->bufferLUTSize);
 
22869                            if (res != hipSuccess) {
 
22875                            res = hipMemcpy(axis->bufferLUT, tempLUT, axis->bufferLUTSize, hipMemcpyHostToDevice);
 
22876                            if (res != hipSuccess) {
 
22882#elif(VKFFT_BACKEND==3) 
22883                            axis->bufferLUT = clCreateBuffer(app->
configuration.context[0], CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, axis->bufferLUTSize, tempLUT, &res);
 
22884                            if (res != CL_SUCCESS) {
 
22902    uint64_t* axisStride = axis->specializationConstants.inputStride;
 
22909    if (axis_id == 0) {
 
22910        axisStride[1] = usedStride[0];
 
22911        axisStride[2] = usedStride[1];
 
22915        axisStride[1] = usedStride[0];
 
22916        axisStride[2] = usedStride[1];
 
22920        axisStride[1] = usedStride[1];
 
22921        axisStride[2] = usedStride[0];
 
22924    axisStride[3] = usedStride[2];
 
22930        if (axis_id == 0) {
 
22949    if ((!inverse) && (axis_id == 0) && (axis_upload_id == FFTPlan->
numAxisUploads[axis_id] - 1) && (reverseBluesteinMultiUpload == 0) && (axis->specializationConstants.performR2C) && (!(app->
configuration.
isInputFormatted))) {
 
22950        axisStride[1] *= 2;
 
22951        axisStride[2] *= 2;
 
22952        axisStride[3] *= 2;
 
22953        axisStride[4] *= 2;
 
22955    if ((FFTPlan->
multiUploadR2C) && (!inverse) && (axis_id == 0) && (axis_upload_id == FFTPlan->
numAxisUploads[axis_id] - 1) && (reverseBluesteinMultiUpload == 0)) {
 
22956        for (uint64_t i = 1; i < 5; i++) {
 
22957            axisStride[i] /= 2;
 
22960    axisStride = axis->specializationConstants.outputStride;
 
22968    if (axis_id == 0) {
 
22969        axisStride[1] = usedStride[0];
 
22970        axisStride[2] = usedStride[1];
 
22974        axisStride[1] = usedStride[0];
 
22975        axisStride[2] = usedStride[1];
 
22979        axisStride[1] = usedStride[1];
 
22980        axisStride[2] = usedStride[0];
 
22983    axisStride[3] = usedStride[2];
 
22989        if (axis_id == 0) {
 
23009        axisStride[1] *= 2;
 
23010        axisStride[2] *= 2;
 
23011        axisStride[3] *= 2;
 
23012        axisStride[4] *= 2;
 
23015        for (uint64_t i = 1; i < 5; i++) {
 
23016            axisStride[i] /= 2;
 
23028    uint64_t storageComplexSize;
 
23030        storageComplexSize = (2 * 
sizeof(double));
 
23033            storageComplexSize = (2 * 2);
 
23035            storageComplexSize = (2 * 
sizeof(float));
 
23037    uint64_t initPageSize = -1;
 
23038    uint64_t locBufferNum = 1;
 
23039    uint64_t locBufferSize = -1;
 
23067        ((axis_id == app->
firstAxis) && (!inverse))
 
23070        uint64_t totalSize = 0;
 
23071        uint64_t locPageSize = initPageSize;
 
23080        axis->specializationConstants.inputBufferBlockSize = (locBufferNum == 1) ? locBufferSize : (uint64_t)ceil(locPageSize / (
double)storageComplexSize);
 
23081        axis->specializationConstants.inputBufferBlockNum = (locBufferNum == 1) ? 1 : (uint64_t)ceil(totalSize / (
double)(axis->specializationConstants.inputBufferBlockSize * storageComplexSize));
 
23087            uint64_t totalSize = 0;
 
23088            uint64_t locPageSize = initPageSize;
 
23097            axis->specializationConstants.inputBufferBlockSize = (locBufferNum == 1) ? locBufferSize : (uint64_t)ceil(locPageSize / (
double)storageComplexSize);
 
23098            axis->specializationConstants.inputBufferBlockNum = (locBufferNum == 1) ? 1 : (uint64_t)ceil(totalSize / (
double)(axis->specializationConstants.inputBufferBlockSize * storageComplexSize));
 
23103            uint64_t totalSize = 0;
 
23104            uint64_t locPageSize = initPageSize;
 
23106                if (((axis->specializationConstants.reorderFourStep == 1) && (axis_upload_id > 0)) || (app->
useBluesteinFFT[axis_id] && (reverseBluesteinMultiUpload == 0) && (axis_upload_id == FFTPlan->
numAxisUploads[axis_id] - 1))) {
 
23141            axis->specializationConstants.inputBufferBlockSize = (locBufferNum == 1) ? locBufferSize : (uint64_t)ceil(locPageSize / (
double)storageComplexSize);
 
23142            axis->specializationConstants.inputBufferBlockNum = (locBufferNum == 1) ? 1 : (uint64_t)ceil(totalSize / (
double)(axis->specializationConstants.inputBufferBlockSize * storageComplexSize));
 
23149    locBufferSize = -1;
 
23151        ((axis_id == app->
firstAxis) && (inverse))
 
23156            ((axis_id == app->
firstAxis) && (inverse))
 
23163        uint64_t totalSize = 0;
 
23164        uint64_t locPageSize = initPageSize;
 
23173        axis->specializationConstants.outputBufferBlockSize = (locBufferNum == 1) ? locBufferSize : (uint64_t)ceil(locPageSize / (
double)storageComplexSize);
 
23174        axis->specializationConstants.outputBufferBlockNum = (locBufferNum == 1) ? 1 : (uint64_t)ceil(totalSize / (
double)(axis->specializationConstants.outputBufferBlockSize * storageComplexSize));
 
23179        uint64_t totalSize = 0;
 
23180        uint64_t locPageSize = initPageSize;
 
23182            if (((axis->specializationConstants.reorderFourStep == 1) && (axis_upload_id == 1)) || (app->
useBluesteinFFT[axis_id] && (!((axis_upload_id == FFTPlan->
numAxisUploads[axis_id] - 1) && (axis->specializationConstants.reverseBluesteinMultiUpload == 1))))) {
 
23213        axis->specializationConstants.outputBufferBlockSize = (locBufferNum == 1) ? locBufferSize : (uint64_t)ceil(locPageSize / (
double)storageComplexSize);
 
23214        axis->specializationConstants.outputBufferBlockNum = (locBufferNum == 1) ? 1 : (uint64_t)ceil(totalSize / (
double)(axis->specializationConstants.outputBufferBlockSize * storageComplexSize));
 
23218    if (axis->specializationConstants.inputBufferBlockNum == 0) axis->specializationConstants.inputBufferBlockNum = 1;
 
23219    if (axis->specializationConstants.outputBufferBlockNum == 0) axis->specializationConstants.outputBufferBlockNum = 1;
 
23221        uint64_t totalSize = 0;
 
23222        uint64_t locPageSize = initPageSize;
 
23231        axis->specializationConstants.kernelBlockSize = (locBufferNum == 1) ? locBufferSize : (uint64_t)ceil(locPageSize / (
double)storageComplexSize);
 
23232        axis->specializationConstants.kernelBlockNum = (locBufferNum == 1) ? 1 : (uint64_t)ceil(totalSize / (
double)(axis->specializationConstants.kernelBlockSize * storageComplexSize));
 
23234        if (axis->specializationConstants.kernelBlockNum == 0) axis->specializationConstants.kernelBlockNum = 1;
 
23237        axis->specializationConstants.kernelBlockSize = 0;
 
23238        axis->specializationConstants.kernelBlockNum = 0;
 
23240    axis->numBindings = 2;
 
23241    axis->specializationConstants.numBuffersBound[0] = axis->specializationConstants.inputBufferBlockNum;
 
23242    axis->specializationConstants.numBuffersBound[1] = axis->specializationConstants.outputBufferBlockNum;
 
23243    axis->specializationConstants.numBuffersBound[2] = 0;
 
23244    axis->specializationConstants.numBuffersBound[3] = 0;
 
23245#if(VKFFT_BACKEND==0) 
23246    VkDescriptorPoolSize descriptorPoolSize = { VK_DESCRIPTOR_TYPE_STORAGE_BUFFER };
 
23247    descriptorPoolSize.descriptorCount = (uint32_t)(axis->specializationConstants.inputBufferBlockNum + axis->specializationConstants.outputBufferBlockNum);
 
23249    axis->specializationConstants.convolutionBindingID = -1;
 
23251        axis->specializationConstants.convolutionBindingID = axis->numBindings;
 
23252        axis->specializationConstants.numBuffersBound[axis->numBindings] = axis->specializationConstants.kernelBlockNum;
 
23253#if(VKFFT_BACKEND==0) 
23254        descriptorPoolSize.descriptorCount += (uint32_t)axis->specializationConstants.kernelBlockNum;
 
23256        axis->numBindings++;
 
23259        axis->specializationConstants.convolutionBindingID = axis->numBindings;
 
23260        axis->specializationConstants.numBuffersBound[axis->numBindings] = axis->specializationConstants.kernelBlockNum;
 
23261#if(VKFFT_BACKEND==0) 
23262        descriptorPoolSize.descriptorCount += (uint32_t)axis->specializationConstants.kernelBlockNum;
 
23264        axis->numBindings++;
 
23267        axis->specializationConstants.convolutionBindingID = axis->numBindings;
 
23268        axis->specializationConstants.numBuffersBound[axis->numBindings] = axis->specializationConstants.kernelBlockNum;
 
23269#if(VKFFT_BACKEND==0) 
23270        descriptorPoolSize.descriptorCount += (uint32_t)axis->specializationConstants.kernelBlockNum;
 
23272        axis->numBindings++;
 
23275        axis->specializationConstants.LUTBindingID = axis->numBindings;
 
23276        axis->specializationConstants.numBuffersBound[axis->numBindings] = 1;
 
23277#if(VKFFT_BACKEND==0) 
23278        descriptorPoolSize.descriptorCount++;
 
23280        axis->numBindings++;
 
23283        if (axis->specializationConstants.inverseBluestein)
 
23287        axis->specializationConstants.BluesteinConvolutionBindingID = axis->numBindings;
 
23288        axis->specializationConstants.numBuffersBound[axis->numBindings] = 1;
 
23289#if(VKFFT_BACKEND==0) 
23290        descriptorPoolSize.descriptorCount++;
 
23292        axis->numBindings++;
 
23296        axis->specializationConstants.BluesteinMultiplicationBindingID = axis->numBindings;
 
23297        axis->specializationConstants.numBuffersBound[axis->numBindings] = 1;
 
23298#if(VKFFT_BACKEND==0) 
23299        descriptorPoolSize.descriptorCount++;
 
23301        axis->numBindings++;
 
23303#if(VKFFT_BACKEND==0) 
23304    VkDescriptorPoolCreateInfo descriptorPoolCreateInfo = { VK_STRUCTURE_TYPE_DESCRIPTOR_POOL_CREATE_INFO };
 
23305    descriptorPoolCreateInfo.poolSizeCount = 1;
 
23306    descriptorPoolCreateInfo.pPoolSizes = &descriptorPoolSize;
 
23307    descriptorPoolCreateInfo.maxSets = 1;
 
23308    res = vkCreateDescriptorPool(app->
configuration.
device[0], &descriptorPoolCreateInfo, 0, &axis->descriptorPool);
 
23309    if (res != VK_SUCCESS) {
 
23313    const VkDescriptorType descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER;
 
23314    VkDescriptorSetLayoutBinding* descriptorSetLayoutBindings;
 
23315    descriptorSetLayoutBindings = (VkDescriptorSetLayoutBinding*)malloc(axis->numBindings * 
sizeof(VkDescriptorSetLayoutBinding));
 
23316    if (!descriptorSetLayoutBindings) {
 
23320    for (uint64_t i = 0; i < axis->numBindings; ++i) {
 
23321        descriptorSetLayoutBindings[i].binding = (uint32_t)i;
 
23322        descriptorSetLayoutBindings[i].descriptorType = descriptorType;
 
23323        descriptorSetLayoutBindings[i].descriptorCount = (uint32_t)axis->specializationConstants.numBuffersBound[i];
 
23324        descriptorSetLayoutBindings[i].stageFlags = VK_SHADER_STAGE_COMPUTE_BIT;
 
23327    VkDescriptorSetLayoutCreateInfo descriptorSetLayoutCreateInfo = { VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_CREATE_INFO };
 
23328    descriptorSetLayoutCreateInfo.bindingCount = (uint32_t)axis->numBindings;
 
23329    descriptorSetLayoutCreateInfo.pBindings = descriptorSetLayoutBindings;
 
23331    res = vkCreateDescriptorSetLayout(app->
configuration.
device[0], &descriptorSetLayoutCreateInfo, 0, &axis->descriptorSetLayout);
 
23332    if (res != VK_SUCCESS) {
 
23336    free(descriptorSetLayoutBindings);
 
23337    descriptorSetLayoutBindings = 0;
 
23338    VkDescriptorSetAllocateInfo descriptorSetAllocateInfo = { VK_STRUCTURE_TYPE_DESCRIPTOR_SET_ALLOCATE_INFO };
 
23339    descriptorSetAllocateInfo.descriptorPool = axis->descriptorPool;
 
23340    descriptorSetAllocateInfo.descriptorSetCount = 1;
 
23341    descriptorSetAllocateInfo.pSetLayouts = &axis->descriptorSetLayout;
 
23342    res = vkAllocateDescriptorSets(app->
configuration.
device[0], &descriptorSetAllocateInfo, &axis->descriptorSet);
 
23343    if (res != VK_SUCCESS) {
 
23359#if(VKFFT_BACKEND==0) 
23360        VkPipelineLayoutCreateInfo pipelineLayoutCreateInfo = { VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO };
 
23361        pipelineLayoutCreateInfo.setLayoutCount = 1;
 
23362        pipelineLayoutCreateInfo.pSetLayouts = &axis->descriptorSetLayout;
 
23364        VkPushConstantRange pushConstantRange = { VK_SHADER_STAGE_COMPUTE_BIT };
 
23365        pushConstantRange.offset = 0;
 
23368        pipelineLayoutCreateInfo.pushConstantRangeCount = 1;
 
23369        pipelineLayoutCreateInfo.pPushConstantRanges = &pushConstantRange;
 
23371        res = vkCreatePipelineLayout(app->
configuration.
device[0], &pipelineLayoutCreateInfo, 0, &axis->pipelineLayout);
 
23372        if (res != VK_SUCCESS) {
 
23378        axis->groupedBatch = maxBatchCoalesced;
 
23394        if (((FFTPlan->
numAxisUploads[axis_id] == 1) && (axis_id == 0)) || ((axis_id == 0) && (!axis->specializationConstants.reorderFourStep) && (axis_upload_id == 0))) {
 
23395            axis->groupedBatch = (maxSequenceLengthSharedMemoryPow2 / axis->specializationConstants.fftDim > axis->groupedBatch) ? maxSequenceLengthSharedMemoryPow2 / axis->specializationConstants.fftDim : axis->groupedBatch;
 
23398            axis->groupedBatch = (maxSingleSizeStridedPow2 / axis->specializationConstants.fftDim > 1) ? maxSingleSizeStridedPow2 / axis->specializationConstants.fftDim * axis->groupedBatch : axis->groupedBatch;
 
23403        if ((FFTPlan->
numAxisUploads[axis_id] == 2) && (axis_upload_id == 0) && (axis->specializationConstants.fftDim * maxBatchCoalesced <= maxSequenceLengthSharedMemory)) {
 
23404            axis->groupedBatch = (uint64_t)ceil(axis->groupedBatch / 2.0);
 
23407        if ((FFTPlan->
numAxisUploads[axis_id] == 3) && (axis_upload_id == 0) && (axis->specializationConstants.fftDim < maxSequenceLengthSharedMemory / (2 * complexSize))) {
 
23408            axis->groupedBatch = (uint64_t)ceil(axis->groupedBatch / 2.0);
 
23410        if (axis->groupedBatch < maxBatchCoalesced) axis->groupedBatch = maxBatchCoalesced;
 
23411        axis->groupedBatch = (axis->groupedBatch / maxBatchCoalesced) * maxBatchCoalesced;
 
23413        if (!((axis_id == 0) && (FFTPlan->
numAxisUploads[axis_id] == 1)) && !((axis_id == 0) && (axis_upload_id == 0) && (!axis->specializationConstants.reorderFourStep)) && (axis->specializationConstants.fftDim > maxSingleSizeStrided)) {
 
23414            axis->groupedBatch = (uint64_t)ceil(axis->groupedBatch / 2.0);
 
23418            axis->groupedBatch = (uint64_t)ceil(axis->groupedBatch / 2.0);
 
23420        if (axis->groupedBatch > 2 * maxBatchCoalesced) axis->groupedBatch = (axis->groupedBatch / (2 * maxBatchCoalesced)) * (2 * maxBatchCoalesced);
 
23421        if (axis->groupedBatch > 4 * maxBatchCoalesced) axis->groupedBatch = (axis->groupedBatch / (4 * maxBatchCoalesced)) * (2 * maxBatchCoalesced);
 
23422        uint64_t maxThreadNum = maxSequenceLengthSharedMemory / (axis->specializationConstants.min_registers_per_thread * axis->specializationConstants.registerBoost);
 
23424        axis->specializationConstants.axisSwapped = 0;
 
23425        uint64_t r2cmult = (axis->specializationConstants.mergeSequencesR2C) ? 2 : 1;
 
23426        if (axis_id == 0) {
 
23428            if (axis_upload_id == 0) {
 
23429                axis->axisBlock[0] = (axis->specializationConstants.fftDim / axis->specializationConstants.min_registers_per_thread / axis->specializationConstants.registerBoost > 1) ? axis->specializationConstants.fftDim / axis->specializationConstants.min_registers_per_thread / axis->specializationConstants.registerBoost : 1;
 
23430                if (axis->axisBlock[0] > maxThreadNum) axis->axisBlock[0] = maxThreadNum;
 
23432                if (axis->specializationConstants.reorderFourStep && (FFTPlan->
numAxisUploads[axis_id] > 1))
 
23433                    axis->axisBlock[1] = axis->groupedBatch;
 
23438                uint64_t currentAxisBlock1 = axis->axisBlock[1];
 
23439                for (uint64_t i = currentAxisBlock1; i < 2 * currentAxisBlock1; i++) {
 
23441                        if (i * axis->specializationConstants.fftDim * complexSize <= app->configuration.sharedMemorySize) axis->axisBlock[1] = i;
 
23442                        i = 2 * currentAxisBlock1;
 
23446                if ((FFTPlan->
numAxisUploads[0] > 1) && ((uint64_t)ceil(FFTPlan->
actualFFTSizePerAxis[axis_id][0] / axis->specializationConstants.fftDim) < axis->axisBlock[1])) axis->axisBlock[1] = (uint64_t)ceil(FFTPlan->
actualFFTSizePerAxis[axis_id][0] / axis->specializationConstants.fftDim);
 
23447                if ((axis->specializationConstants.mergeSequencesR2C != 0) && (axis->specializationConstants.fftDim * axis->axisBlock[1] >= maxSequenceLengthSharedMemory)) {
 
23448                    axis->specializationConstants.mergeSequencesR2C = 0;
 
23467                if (axis->axisBlock[0] * axis->axisBlock[1] > maxThreadNum) {
 
23468                    for (uint64_t i = 1; i <= axis->axisBlock[1]; i++) {
 
23469                        if ((axis->axisBlock[1] / i) * axis->axisBlock[0] <= maxThreadNum)
 
23471                            axis->axisBlock[1] /= i;
 
23472                            i = axis->axisBlock[1] + 1;
 
23477                while ((axis->axisBlock[1] * (axis->specializationConstants.fftDim / axis->specializationConstants.registerBoost)) > maxSequenceLengthSharedMemory) axis->axisBlock[1] /= 2;
 
23479#if (VKFFT_BACKEND==0) 
23480                    if (((axis->specializationConstants.fftDim & (axis->specializationConstants.fftDim - 1)) != 0)) {
 
23481                        uint64_t temp = axis->axisBlock[1];
 
23482                        axis->axisBlock[1] = axis->axisBlock[0];
 
23483                        axis->axisBlock[0] = temp;
 
23484                        axis->specializationConstants.axisSwapped = 1;
 
23487                    uint64_t temp = axis->axisBlock[1];
 
23488                    axis->axisBlock[1] = axis->axisBlock[0];
 
23489                    axis->axisBlock[0] = temp;
 
23490                    axis->specializationConstants.axisSwapped = 1;
 
23493                axis->axisBlock[2] = 1;
 
23494                axis->axisBlock[3] = axis->specializationConstants.fftDim;
 
23497                axis->axisBlock[1] = (axis->specializationConstants.fftDim / axis->specializationConstants.min_registers_per_thread / axis->specializationConstants.registerBoost > 1) ? axis->specializationConstants.fftDim / axis->specializationConstants.min_registers_per_thread / axis->specializationConstants.registerBoost : 1;
 
23499                if (scale > 1) axis->groupedBatch *= scale;
 
23500                axis->axisBlock[0] = (axis->specializationConstants.stageStartSize > axis->groupedBatch) ? axis->groupedBatch : axis->specializationConstants.stageStartSize;
 
23502                if (axis->axisBlock[0] * axis->axisBlock[1] > maxThreadNum) {
 
23503                    for (uint64_t i = 1; i <= axis->axisBlock[0]; i++) {
 
23504                        if ((axis->axisBlock[0] / i) * axis->axisBlock[1] <= maxThreadNum)
 
23506                            axis->axisBlock[0] /= i;
 
23507                            i = axis->axisBlock[0] + 1;
 
23512                axis->axisBlock[2] = 1;
 
23513                axis->axisBlock[3] = axis->specializationConstants.fftDim;
 
23517        if (axis_id == 1) {
 
23519            axis->axisBlock[1] = (axis->specializationConstants.fftDim / axis->specializationConstants.min_registers_per_thread / axis->specializationConstants.registerBoost > 1) ? axis->specializationConstants.fftDim / axis->specializationConstants.min_registers_per_thread / axis->specializationConstants.registerBoost : 1;
 
23523            if (axis->axisBlock[0] * axis->axisBlock[1] > maxThreadNum) {
 
23524                for (uint64_t i = 1; i <= axis->axisBlock[0]; i++) {
 
23525                    if ((axis->axisBlock[0] / i) * axis->axisBlock[1] <= maxThreadNum)
 
23527                        axis->axisBlock[0] /= i;
 
23528                        i = axis->axisBlock[0] + 1;
 
23533            axis->axisBlock[2] = 1;
 
23534            axis->axisBlock[3] = axis->specializationConstants.fftDim;
 
23537        if (axis_id == 2) {
 
23538            axis->axisBlock[1] = (axis->specializationConstants.fftDim / axis->specializationConstants.min_registers_per_thread / axis->specializationConstants.registerBoost > 1) ? axis->specializationConstants.fftDim / axis->specializationConstants.min_registers_per_thread / axis->specializationConstants.registerBoost : 1;
 
23543            if (axis->axisBlock[0] * axis->axisBlock[1] > maxThreadNum) {
 
23544                for (uint64_t i = 1; i <= axis->axisBlock[0]; i++) {
 
23545                    if ((axis->axisBlock[0] / i) * axis->axisBlock[1] <= maxThreadNum)
 
23547                        axis->axisBlock[0] /= i;
 
23548                        i = axis->axisBlock[0] + 1;
 
23553            axis->axisBlock[2] = 1;
 
23554            axis->axisBlock[3] = axis->specializationConstants.fftDim;
 
23569        axis->specializationConstants.localSize[0] = axis->axisBlock[0];
 
23570        axis->specializationConstants.localSize[1] = axis->axisBlock[1];
 
23571        axis->specializationConstants.localSize[2] = axis->axisBlock[2];
 
23580        axis->specializationConstants.normalize = (reverseBluesteinMultiUpload) ? 1 : app->
configuration.
normalize;
 
23584        axis->specializationConstants.axis_id = axis_id;
 
23585        axis->specializationConstants.axis_upload_id = axis_upload_id;
 
23587        for (uint64_t i = 0; i < 3; i++) {
 
23593        if (axis->specializationConstants.useBluesteinFFT && (axis_upload_id == FFTPlan->
numAxisUploads[axis_id] - 1) && ((reverseBluesteinMultiUpload == 0) || (FFTPlan->
numAxisUploads[axis_id] == 1))) {
 
23594            axis->specializationConstants.zeropadBluestein[0] = 1;
 
23595            axis->specializationConstants.fft_zeropad_Bluestein_left_read[axis_id] = app->
configuration.
size[axis_id];
 
23596            if (FFTPlan->
multiUploadR2C) axis->specializationConstants.fft_zeropad_Bluestein_left_read[axis_id] /= 2;
 
23597            if (app->
configuration.
performDCT == 1) axis->specializationConstants.fft_zeropad_Bluestein_left_read[axis_id] = 2 * axis->specializationConstants.fft_zeropad_Bluestein_left_read[axis_id]-2;
 
23599            axis->specializationConstants.fft_zeropad_Bluestein_right_read[axis_id] = FFTPlan->
actualFFTSizePerAxis[axis_id][axis_id];
 
23601        if (axis->specializationConstants.useBluesteinFFT && (axis_upload_id == FFTPlan->
numAxisUploads[axis_id] - 1) && ((reverseBluesteinMultiUpload == 1) || (FFTPlan->
numAxisUploads[axis_id] == 1))) {
 
23602            axis->specializationConstants.zeropadBluestein[1] = 1;
 
23603            axis->specializationConstants.fft_zeropad_Bluestein_left_write[axis_id] = app->
configuration.
size[axis_id];
 
23604            if (FFTPlan->
multiUploadR2C) axis->specializationConstants.fft_zeropad_Bluestein_left_write[axis_id] /= 2;
 
23605            if (app->
configuration.
performDCT == 1) axis->specializationConstants.fft_zeropad_Bluestein_left_write[axis_id] = 2 * axis->specializationConstants.fft_zeropad_Bluestein_left_write[axis_id] - 2;
 
23607            axis->specializationConstants.fft_zeropad_Bluestein_right_write[axis_id] = FFTPlan->
actualFFTSizePerAxis[axis_id][axis_id];
 
23610            if ((app->
configuration.
frequencyZeroPadding) && (((!axis->specializationConstants.reorderFourStep) && (axis_upload_id == 0)) || ((axis->specializationConstants.reorderFourStep) && (axis_upload_id == FFTPlan->
numAxisUploads[axis_id] - 1)))) {
 
23616                axis->specializationConstants.zeropad[0] = 0;
 
23617            if ((!app->
configuration.
frequencyZeroPadding) && (((!axis->specializationConstants.reorderFourStep) && (axis_upload_id == FFTPlan->
numAxisUploads[axis_id] - 1)) || ((axis->specializationConstants.reorderFourStep) && (axis_upload_id == 0)))) {
 
23623                axis->specializationConstants.zeropad[1] = 0;
 
23632                axis->specializationConstants.zeropad[0] = 0;
 
23639                axis->specializationConstants.zeropad[1] = 0;
 
23642            axis->specializationConstants.convolutionStep = 1;
 
23645            axis->specializationConstants.convolutionStep = 0;
 
23647            axis->specializationConstants.BluesteinConvolutionStep = 1;
 
23649            axis->specializationConstants.BluesteinConvolutionStep = 0;
 
23652            axis->specializationConstants.BluesteinPreMultiplication = 1;
 
23654            axis->specializationConstants.BluesteinPreMultiplication = 0;
 
23656            axis->specializationConstants.BluesteinPostMultiplication = 1;
 
23658            axis->specializationConstants.BluesteinPostMultiplication = 0;
 
23664        if (axis_id == 0) {
 
23665            if (axis_upload_id == 0)
 
23666                tempSize[0] = FFTPlan->
actualFFTSizePerAxis[axis_id][0] / axis->specializationConstants.fftDim / axis->axisBlock[1];
 
23668                tempSize[0] = FFTPlan->
actualFFTSizePerAxis[axis_id][0] / axis->specializationConstants.fftDim / axis->axisBlock[0];
 
23669            if ((FFTPlan->
actualPerformR2CPerAxis[axis_id] == 1) && (axis->specializationConstants.mergeSequencesR2C)) tempSize[1] = (uint64_t)ceil(tempSize[1] / 2.0);
 
23675            else  axis->specializationConstants.performWorkGroupShift[0] = 0;
 
23677            else  axis->specializationConstants.performWorkGroupShift[1] = 0;
 
23679            else  axis->specializationConstants.performWorkGroupShift[2] = 0;
 
23681        if (axis_id == 1) {
 
23691            else  axis->specializationConstants.performWorkGroupShift[0] = 0;
 
23693            else  axis->specializationConstants.performWorkGroupShift[1] = 0;
 
23695            else  axis->specializationConstants.performWorkGroupShift[2] = 0;
 
23698        if (axis_id == 2) {
 
23707            else  axis->specializationConstants.performWorkGroupShift[0] = 0;
 
23709            else  axis->specializationConstants.performWorkGroupShift[1] = 0;
 
23711            else  axis->specializationConstants.performWorkGroupShift[2] = 0;
 
23715        char floatTypeInputMemory[10];
 
23716        char floatTypeOutputMemory[10];
 
23717        char floatTypeKernelMemory[10];
 
23718        char floatType[10];
 
23719        axis->specializationConstants.unroll = 1;
 
23722            sprintf(floatType, 
"double");
 
23723            sprintf(floatTypeInputMemory, 
"double");
 
23724            sprintf(floatTypeOutputMemory, 
"double");
 
23725            sprintf(floatTypeKernelMemory, 
"double");
 
23731                sprintf(floatType, 
"float");
 
23734                    sprintf(floatTypeKernelMemory, 
"float");
 
23735                    if ((axis_id == app->
firstAxis) && (axis_upload_id == FFTPlan->
numAxisUploads[axis_id] - 1) && (!axis->specializationConstants.actualInverse))
 
23736                        sprintf(floatTypeInputMemory, 
"half");
 
23738                        sprintf(floatTypeInputMemory, 
"float");
 
23739                    if ((axis_id == app->
firstAxis) && (((!axis->specializationConstants.reorderFourStep) && (axis_upload_id == FFTPlan->
numAxisUploads[axis_id] - 1)) || ((axis->specializationConstants.reorderFourStep) && (axis_upload_id == 0))) && (axis->specializationConstants.actualInverse))
 
23740                        sprintf(floatTypeOutputMemory, 
"half");
 
23742                        sprintf(floatTypeOutputMemory, 
"float");
 
23745                    sprintf(floatTypeInputMemory, 
"half");
 
23746                    sprintf(floatTypeOutputMemory, 
"half");
 
23747                    sprintf(floatTypeKernelMemory, 
"half");
 
23753                    sprintf(floatType, 
"double");
 
23754                    sprintf(floatTypeInputMemory, 
"float");
 
23755                    sprintf(floatTypeOutputMemory, 
"float");
 
23756                    sprintf(floatTypeKernelMemory, 
"float");
 
23759                    sprintf(floatType, 
"float");
 
23760                    sprintf(floatTypeInputMemory, 
"float");
 
23761                    sprintf(floatTypeOutputMemory, 
"float");
 
23762                    sprintf(floatTypeKernelMemory, 
"float");
 
23766        char uintType[20] = 
"";
 
23768#if(VKFFT_BACKEND==0) 
23769            sprintf(uintType, 
"uint");
 
23770#elif(VKFFT_BACKEND==1) 
23771            sprintf(uintType, 
"unsigned int");
 
23772#elif(VKFFT_BACKEND==2) 
23773            sprintf(uintType, 
"unsigned int");
 
23774#elif(VKFFT_BACKEND==3) 
23775            sprintf(uintType, 
"unsigned int");
 
23779#if(VKFFT_BACKEND==0) 
23780            sprintf(uintType, 
"uint64_t");
 
23781#elif(VKFFT_BACKEND==1) 
23782            sprintf(uintType, 
"unsigned long long");
 
23783#elif(VKFFT_BACKEND==2) 
23784            sprintf(uintType, 
"unsigned long long");
 
23785#elif(VKFFT_BACKEND==3) 
23786            sprintf(uintType, 
"unsigned long");
 
23792        if ((axis_id == 0) && (axis_upload_id == 0)) type = 0;
 
23793        if (axis_id != 0) type = 1;
 
23794        if ((axis_id == 0) && (axis_upload_id > 0)) type = 2;
 
23796        if ((axis_id == 0) && (!axis->specializationConstants.actualInverse) && (FFTPlan->
actualPerformR2CPerAxis[axis_id])) type = 5;
 
23797        if ((axis_id == 0) && (axis->specializationConstants.actualInverse) && (FFTPlan->
actualPerformR2CPerAxis[axis_id])) type = 6;
 
23808#if(VKFFT_BACKEND==0) 
23810#elif(VKFFT_BACKEND==1) 
23811        axis->specializationConstants.cacheShuffle = 0;
 
23812#elif(VKFFT_BACKEND==2) 
23813        axis->specializationConstants.cacheShuffle = 0;
 
23814#elif(VKFFT_BACKEND==3) 
23815        axis->specializationConstants.cacheShuffle = 0;
 
23821        char* code0 = axis->specializationConstants.code0;
 
23826        resFFT = 
shaderGenVkFFT(code0, &axis->specializationConstants, floatType, floatTypeInputMemory, floatTypeOutputMemory, floatTypeKernelMemory, uintType, type);
 
23832#if(VKFFT_BACKEND==0) 
23833        const glslang_resource_t default_resource = {
 
23939        glslang_target_client_version_t client_version = (app->
configuration.
halfPrecision) ? GLSLANG_TARGET_VULKAN_1_1 : GLSLANG_TARGET_VULKAN_1_0;
 
23940        glslang_target_language_version_t target_language_version = (app->
configuration.
halfPrecision) ? GLSLANG_TARGET_SPV_1_3 : GLSLANG_TARGET_SPV_1_0;
 
23941        const glslang_input_t input =
 
23943            GLSLANG_SOURCE_GLSL,
 
23944            GLSLANG_STAGE_COMPUTE,
 
23945            GLSLANG_CLIENT_VULKAN,
 
23947            GLSLANG_TARGET_SPV,
 
23948            target_language_version,
 
23951            GLSLANG_NO_PROFILE,
 
23954            GLSLANG_MSG_DEFAULT_BIT,
 
23958        glslang_shader_t* shader = glslang_shader_create(&input);
 
23960        if (!glslang_shader_preprocess(shader, &input))
 
23962            err = glslang_shader_get_info_log(shader);
 
23963            printf(
"%s\n", code0);
 
23964            printf(
"%s\nVkFFT shader type: %" PRIu64 
"\n", err, type);
 
23965            glslang_shader_delete(shader);
 
23973        if (!glslang_shader_parse(shader, &input))
 
23975            err = glslang_shader_get_info_log(shader);
 
23976            printf(
"%s\n", code0);
 
23977            printf(
"%s\nVkFFT shader type: %" PRIu64 
"\n", err, type);
 
23978            glslang_shader_delete(shader);
 
23985        glslang_program_t* program = glslang_program_create();
 
23986        glslang_program_add_shader(program, shader);
 
23987        if (!glslang_program_link(program, GLSLANG_MSG_SPV_RULES_BIT | GLSLANG_MSG_VULKAN_RULES_BIT))
 
23989            err = glslang_program_get_info_log(program);
 
23990            printf(
"%s\n", code0);
 
23991            printf(
"%s\nVkFFT shader type: %" PRIu64 
"\n", err, type);
 
23992            glslang_shader_delete(shader);
 
23993            glslang_program_delete(program);
 
24016        glslang_shader_delete(shader);
 
24017        VkPipelineShaderStageCreateInfo pipelineShaderStageCreateInfo = { VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO };
 
24018        VkComputePipelineCreateInfo computePipelineCreateInfo = { VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO };
 
24019        pipelineShaderStageCreateInfo.stage = VK_SHADER_STAGE_COMPUTE_BIT;
 
24020        VkShaderModuleCreateInfo createInfo = { VK_STRUCTURE_TYPE_SHADER_MODULE_CREATE_INFO };
 
24024        res = vkCreateShaderModule(app->
configuration.
device[0], &createInfo, 0, &pipelineShaderStageCreateInfo.module);
 
24025        if (res != VK_SUCCESS) {
 
24026            glslang_program_delete(program);
 
24032        pipelineShaderStageCreateInfo.pName = 
"main";
 
24033        pipelineShaderStageCreateInfo.pSpecializationInfo = 0;
 
24034        computePipelineCreateInfo.stage = pipelineShaderStageCreateInfo;
 
24035        computePipelineCreateInfo.layout = axis->pipelineLayout;
 
24036        res = vkCreateComputePipelines(app->
configuration.
device[0], VK_NULL_HANDLE, 1, &computePipelineCreateInfo, 0, &axis->pipeline);
 
24037        if (res != VK_SUCCESS) {
 
24041        vkDestroyShaderModule(app->
configuration.
device[0], pipelineShaderStageCreateInfo.module, 0);
 
24042        glslang_program_delete(program);
 
24043#elif(VKFFT_BACKEND==1) 
24045        nvrtcResult result = nvrtcCreateProgram(&prog,         
 
24053        if (result != NVRTC_SUCCESS) {
 
24054            printf(
"nvrtcCreateProgram error: %s\n", nvrtcGetErrorString(result));
 
24063        result = nvrtcCompileProgram(prog,  
 
24066        if (result != NVRTC_SUCCESS) {
 
24067            printf(
"nvrtcCompileProgram error: %s\n", nvrtcGetErrorString(result));
 
24068            char* log = (
char*)malloc(
sizeof(
char) * 1000000);
 
24076                nvrtcGetProgramLog(prog, log);
 
24077                printf(
"%s\n", log);
 
24080                printf(
"%s\n", code0);
 
24088        result = nvrtcGetPTXSize(prog, &ptxSize);
 
24089        if (result != NVRTC_SUCCESS) {
 
24090            printf(
"nvrtcGetPTXSize error: %s\n", nvrtcGetErrorString(result));
 
24096        char* ptx = (
char*)malloc(ptxSize);
 
24103        result = nvrtcGetPTX(prog, ptx);
 
24104        if (result != NVRTC_SUCCESS) {
 
24105            printf(
"nvrtcGetPTX error: %s\n", nvrtcGetErrorString(result));
 
24113        result = nvrtcDestroyProgram(&prog);
 
24114        if (result != NVRTC_SUCCESS) {
 
24115            printf(
"nvrtcDestroyProgram error: %s\n", nvrtcGetErrorString(result));
 
24124        CUresult result2 = cuModuleLoadDataEx(&axis->VkFFTModule, ptx, 0, 0, 0);
 
24126        if (result2 != CUDA_SUCCESS) {
 
24127            printf(
"cuModuleLoadDataEx error: %d\n", result2);
 
24135        result2 = cuModuleGetFunction(&axis->VkFFTKernel, axis->VkFFTModule, 
"VkFFT_main");
 
24136        if (result2 != CUDA_SUCCESS) {
 
24137            printf(
"cuModuleGetFunction error: %d\n", result2);
 
24146            result2 = cuFuncSetAttribute(axis->VkFFTKernel, CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, (
int)axis->specializationConstants.usedSharedMemory);
 
24147            if (result2 != CUDA_SUCCESS) {
 
24148                printf(
"cuFuncSetAttribute error: %d\n", result2);
 
24158        result2 = cuModuleGetGlobal(&axis->consts_addr, &size, axis->VkFFTModule, 
"consts");
 
24159        if (result2 != CUDA_SUCCESS) {
 
24160            printf(
"cuModuleGetGlobal error: %d\n", result2);
 
24170#elif(VKFFT_BACKEND==2) 
24171        hiprtcProgram prog;
 
24176        enum hiprtcResult result = hiprtcCreateProgram(&prog,         
 
24182        if (result != HIPRTC_SUCCESS) {
 
24183            printf(
"hiprtcCreateProgram error: %s\n", hiprtcGetErrorString(result));
 
24190        result = hiprtcAddNameExpression(prog, 
"&consts");
 
24191        if (result != HIPRTC_SUCCESS) {
 
24192            printf(
"hiprtcAddNameExpression error: %s\n", hiprtcGetErrorString(result));
 
24199        result = hiprtcCompileProgram(prog,  
 
24202        if (result != HIPRTC_SUCCESS) {
 
24203            printf(
"hiprtcCompileProgram error: %s\n", hiprtcGetErrorString(result));
 
24204            char* log = (
char*)malloc(
sizeof(
char) * 100000);
 
24212                hiprtcGetProgramLog(prog, log);
 
24213                printf(
"%s\n", log);
 
24216                printf(
"%s\n", code0);
 
24224        result = hiprtcGetCodeSize(prog, &codeSize);
 
24225        if (result != HIPRTC_SUCCESS) {
 
24226            printf(
"hiprtcGetCodeSize error: %s\n", hiprtcGetErrorString(result));
 
24232        char* code = (
char*)malloc(codeSize);
 
24239        result = hiprtcGetCode(prog, code);
 
24240        if (result != HIPRTC_SUCCESS) {
 
24241            printf(
"hiprtcGetCode error: %s\n", hiprtcGetErrorString(result));
 
24251        result = hiprtcDestroyProgram(&prog);
 
24252        if (result != HIPRTC_SUCCESS) {
 
24253            printf(
"hiprtcDestroyProgram error: %s\n", hiprtcGetErrorString(result));
 
24261        hipError_t result2 = hipModuleLoadDataEx(&axis->VkFFTModule, code, 0, 0, 0);
 
24263        if (result2 != hipSuccess) {
 
24264            printf(
"hipModuleLoadDataEx error: %d\n", result2);
 
24272        result2 = hipModuleGetFunction(&axis->VkFFTKernel, axis->VkFFTModule, 
"VkFFT_main");
 
24273        if (result2 != hipSuccess) {
 
24274            printf(
"hipModuleGetFunction error: %d\n", result2);
 
24283            result2 = hipFuncSetAttribute(axis->VkFFTKernel, hipFuncAttributeMaxDynamicSharedMemorySize, (
int)axis->specializationConstants.usedSharedMemory);
 
24285            if (result2 != hipSuccess) {
 
24286                printf(
"hipFuncSetAttribute error: %d\n", result2);
 
24296        result2 = hipModuleGetGlobal(&axis->consts_addr, &size, axis->VkFFTModule, 
"consts");
 
24297        if (result2 != hipSuccess) {
 
24298            printf(
"hipModuleGetGlobal error: %d\n", result2);
 
24309#elif(VKFFT_BACKEND==3) 
24310        size_t codelen = strlen(code0);
 
24311        axis->program = clCreateProgramWithSource(app->
configuration.context[0], 1, (
const char**)&code0, &codelen, &res);
 
24312        if (res != CL_SUCCESS) {
 
24319        if (res != CL_SUCCESS) {
 
24321            clGetProgramBuildInfo(axis->program, app->
configuration.
device[0], CL_PROGRAM_BUILD_LOG, 0, 0, &log_size);
 
24322            char* log = (
char*)malloc(log_size);
 
24330                clGetProgramBuildInfo(axis->program, app->
configuration.
device[0], CL_PROGRAM_BUILD_LOG, log_size, log, 0);
 
24331                printf(
"%s\n", log);
 
24334                printf(
"%s\n", code0);
 
24341        axis->kernel = clCreateKernel(axis->program, 
"VkFFT_main", &res);
 
24342        if (res != CL_SUCCESS) {
 
24352            axis->specializationConstants.code0 = 0;
 
24355    if (axis->specializationConstants.axisSwapped) {
 
24356        uint64_t temp = axis->axisBlock[1];
 
24357        axis->axisBlock[1] = axis->axisBlock[0];
 
24358        axis->axisBlock[0] = temp;
 
24359        axis->specializationConstants.axisSwapped = 0;
 
 
24370#if(VKFFT_BACKEND==0) 
24373            int resGlslangInitialize = glslang_initialize_process();
 
24383    if (inputLaunchConfiguration.
device == 0) {
 
24388    if (inputLaunchConfiguration.
queue == 0) {
 
24398    if (inputLaunchConfiguration.
fence == 0) {
 
24404    VkPhysicalDeviceProperties physicalDeviceProperties = { 0 };
 
24417    switch (physicalDeviceProperties.vendorID) {
 
24457#elif(VKFFT_BACKEND==1) 
24458    CUresult res = CUDA_SUCCESS;
 
24459    cudaError_t res_t = cudaSuccess;
 
24460    if (inputLaunchConfiguration.
device == 0) {
 
24465    if (inputLaunchConfiguration.num_streams != 0)  app->
configuration.num_streams = inputLaunchConfiguration.num_streams;
 
24466    if (inputLaunchConfiguration.stream != 0)   app->
configuration.stream = inputLaunchConfiguration.stream;
 
24469    res = cuDeviceGetAttribute(&value, CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK, app->
configuration.
device[0]);
 
24470    if (res != CUDA_SUCCESS) {
 
24475    res = cuDeviceGetAttribute(&value, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X, app->
configuration.
device[0]);
 
24476    if (res != CUDA_SUCCESS) {
 
24481    res = cuDeviceGetAttribute(&value, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y, app->
configuration.
device[0]);
 
24482    if (res != CUDA_SUCCESS) {
 
24487    res = cuDeviceGetAttribute(&value, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z, app->
configuration.
device[0]);
 
24488    if (res != CUDA_SUCCESS) {
 
24493    res = cuDeviceGetAttribute(&value, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X, app->
configuration.
device[0]);
 
24494    if (res != CUDA_SUCCESS) {
 
24499    res = cuDeviceGetAttribute(&value, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y, app->
configuration.
device[0]);
 
24500    if (res != CUDA_SUCCESS) {
 
24505    res = cuDeviceGetAttribute(&value, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z, app->
configuration.
device[0]);
 
24506    if (res != CUDA_SUCCESS) {
 
24511    res = cuDeviceGetAttribute(&value, CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK, app->
configuration.
device[0]);
 
24512    if (res != CUDA_SUCCESS) {
 
24517    res = cuDeviceGetAttribute(&value, CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK_OPTIN, app->
configuration.
device[0]);
 
24518    if (res != CUDA_SUCCESS) {
 
24523    res = cuDeviceGetAttribute(&value, CU_DEVICE_ATTRIBUTE_WARP_SIZE, app->
configuration.
device[0]);
 
24524    if (res != CUDA_SUCCESS) {
 
24536        for (uint64_t i = 0; i < app->
configuration.num_streams; i++) {
 
24537            res_t = cudaEventCreate(&app->
configuration.stream_event[i]);
 
24538            if (res != CUDA_SUCCESS) {
 
24551#elif(VKFFT_BACKEND==2) 
24552    hipError_t res = hipSuccess;
 
24553    if (inputLaunchConfiguration.
device == 0) {
 
24558    if (inputLaunchConfiguration.num_streams != 0)  app->
configuration.num_streams = inputLaunchConfiguration.num_streams;
 
24559    if (inputLaunchConfiguration.stream != 0)   app->
configuration.stream = inputLaunchConfiguration.stream;
 
24562    res = hipDeviceGetAttribute(&value, hipDeviceAttributeMaxThreadsPerBlock, app->
configuration.
device[0]);
 
24563    if (res != hipSuccess) {
 
24568    res = hipDeviceGetAttribute(&value, hipDeviceAttributeMaxGridDimX, app->
configuration.
device[0]);
 
24569    if (res != hipSuccess) {
 
24574    res = hipDeviceGetAttribute(&value, hipDeviceAttributeMaxGridDimY, app->
configuration.
device[0]);
 
24575    if (res != hipSuccess) {
 
24580    res = hipDeviceGetAttribute(&value, hipDeviceAttributeMaxGridDimZ, app->
configuration.
device[0]);
 
24581    if (res != hipSuccess) {
 
24586    res = hipDeviceGetAttribute(&value, hipDeviceAttributeMaxBlockDimX, app->
configuration.
device[0]);
 
24587    if (res != hipSuccess) {
 
24592    res = hipDeviceGetAttribute(&value, hipDeviceAttributeMaxBlockDimY, app->
configuration.
device[0]);
 
24593    if (res != hipSuccess) {
 
24598    res = hipDeviceGetAttribute(&value, hipDeviceAttributeMaxBlockDimZ, app->
configuration.
device[0]);
 
24599    if (res != hipSuccess) {
 
24604    res = hipDeviceGetAttribute(&value, hipDeviceAttributeMaxSharedMemoryPerBlock, app->
configuration.
device[0]);
 
24605    if (res != hipSuccess) {
 
24612    res = hipDeviceGetAttribute(&value, hipDeviceAttributeWarpSize, app->
configuration.
device[0]);
 
24613    if (res != hipSuccess) {
 
24625        for (uint64_t i = 0; i < app->
configuration.num_streams; i++) {
 
24627            if (res != hipSuccess) {
 
24639#elif(VKFFT_BACKEND==3) 
24641    if (inputLaunchConfiguration.
device == 0) {
 
24646    if (inputLaunchConfiguration.context == 0) {
 
24650    app->
configuration.context = inputLaunchConfiguration.context;
 
24651    if (inputLaunchConfiguration.platform == 0) {
 
24655    app->
configuration.platform = inputLaunchConfiguration.platform;
 
24657    size_t value_int64;
 
24658    cl_uint value_cl_uint;
 
24659    res = clGetDeviceInfo(app->
configuration.
device[0], CL_DEVICE_VENDOR_ID, 
sizeof(cl_int), &vendorID, 0);
 
24664    res = clGetDeviceInfo(app->
configuration.
device[0], CL_DEVICE_MAX_WORK_GROUP_SIZE, 
sizeof(
size_t), &value_int64, 0);
 
24670    res = clGetDeviceInfo(app->
configuration.
device[0], CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS, 
sizeof(cl_uint), &value_cl_uint, 0);
 
24675    size_t* dims = (
size_t*)malloc(
sizeof(
size_t) * value_cl_uint);
 
24677        res = clGetDeviceInfo(app->
configuration.
device[0], CL_DEVICE_MAX_WORK_ITEM_SIZES, 
sizeof(
size_t) * value_cl_uint, dims, 0);
 
24696    cl_ulong sharedMemorySize;
 
24697    res = clGetDeviceInfo(app->
configuration.
device[0], CL_DEVICE_LOCAL_MEM_SIZE, 
sizeof(cl_ulong), &sharedMemorySize, 0);
 
24704    switch (vendorID) {
 
24746    if (inputLaunchConfiguration.
FFTdim == 0) {
 
24751    if (inputLaunchConfiguration.
size[0] == 0) {
 
24784    for (uint64_t i = 1; i < 3; i++) {
 
24785        if (inputLaunchConfiguration.
size[i] == 0)
 
24812#if(VKFFT_BACKEND==0)  
24813    if (inputLaunchConfiguration.
bufferSize == 0) {
 
24834#if(VKFFT_BACKEND==0)  
24865#if(VKFFT_BACKEND==0)  
24892#if(VKFFT_BACKEND==0)  
24918#if(VKFFT_BACKEND==0)  
24919        if (inputLaunchConfiguration.
kernelSize == 0) {
 
24943    uint64_t checkBufferSizeFor64BitAddressing = 0;
 
24952    if (checkBufferSizeFor64BitAddressing >= (uint64_t)pow((uint64_t)2, (uint64_t)34)) app->
configuration.
useUint64 = 1;
 
24953    checkBufferSizeFor64BitAddressing = 0;
 
24958    if (checkBufferSizeFor64BitAddressing >= (uint64_t)pow((uint64_t)2, (uint64_t)34)) app->
configuration.
useUint64 = 1;
 
24960    checkBufferSizeFor64BitAddressing = 0;
 
24965    if (checkBufferSizeFor64BitAddressing >= (uint64_t)pow((uint64_t)2, (uint64_t)34)) app->
configuration.
useUint64 = 1;
 
24967    checkBufferSizeFor64BitAddressing = 0;
 
24972    if (checkBufferSizeFor64BitAddressing >= (uint64_t)pow((uint64_t)2, (uint64_t)34)) app->
configuration.
useUint64 = 1;
 
24985    if (inputLaunchConfiguration.
performR2C != 0) {
 
24988    if (inputLaunchConfiguration.
performDCT != 0) {
 
25102#if(VKFFT_BACKEND==0)  
25215#if(VKFFT_BACKEND==0) 
25217        glslang_finalize_process();
 
 
25553#if(VKFFT_BACKEND==0) 
25555    VkMemoryBarrier memory_barrier = {
 
25556            VK_STRUCTURE_TYPE_MEMORY_BARRIER,
 
25558            VK_ACCESS_SHADER_WRITE_BIT,
 
25559            VK_ACCESS_SHADER_READ_BIT,
 
25562#elif(VKFFT_BACKEND==1) 
25564#elif(VKFFT_BACKEND==2) 
25566#elif(VKFFT_BACKEND==3) 
25567    app->
configuration.commandQueue = launchParams->commandQueue;
 
25569    uint64_t localSize0[3];
 
25574    if (inverse == 1) {
 
25588    if (inverse != 1) {
 
25596#if(VKFFT_BACKEND==0) 
25598                vkCmdBindDescriptorSets(app->
configuration.
commandBuffer[0], VK_PIPELINE_BIND_POINT_COMPUTE, axis->pipelineLayout, 0, 1, &axis->descriptorSet, 0, 0);
 
25600                uint64_t dispatchBlock[3];
 
25608                            dispatchBlock[0] = (uint64_t)ceil((uint64_t)ceil(app->
localFFTPlan->
actualFFTSizePerAxis[0][0] / axis->specializationConstants.fftDim / (
double)axis->axisBlock[1]));
 
25622                if (axis->specializationConstants.mergeSequencesR2C == 1) dispatchBlock[1] = (uint64_t)ceil(dispatchBlock[1] / 2.0);
 
25637#if(VKFFT_BACKEND==0) 
25639                    vkCmdBindDescriptorSets(app->
configuration.
commandBuffer[0], VK_PIPELINE_BIND_POINT_COMPUTE, axis->pipelineLayout, 0, 1, &axis->descriptorSet, 0, 0);
 
25641                    uint64_t dispatchBlock[3];
 
25649                                dispatchBlock[0] = (uint64_t)ceil((uint64_t)ceil(app->
localFFTPlan->
actualFFTSizePerAxis[0][0] / axis->specializationConstants.fftDim / (
double)axis->axisBlock[1]));
 
25663                    if (axis->specializationConstants.mergeSequencesR2C == 1) dispatchBlock[1] = (uint64_t)ceil(dispatchBlock[1] / 2.0);
 
25679#if(VKFFT_BACKEND==0) 
25681                vkCmdBindDescriptorSets(app->
configuration.
commandBuffer[0], VK_PIPELINE_BIND_POINT_COMPUTE, axis->pipelineLayout, 0, 1, &axis->descriptorSet, 0, 0);
 
25683                uint64_t dispatchBlock[3];
 
25686                dispatchBlock[1] = 1;
 
25708#if(VKFFT_BACKEND==0) 
25710                        vkCmdBindDescriptorSets(app->
configuration.
commandBuffer[0], VK_PIPELINE_BIND_POINT_COMPUTE, axis->pipelineLayout, 0, 1, &axis->descriptorSet, 0, 0);
 
25712                        uint64_t dispatchBlock[3];
 
25713                        dispatchBlock[0] = (uint64_t)ceil(localSize0[1] / (
double)axis->axisBlock[0] * app->
localFFTPlan->
actualFFTSizePerAxis[1][1] / (
double)axis->specializationConstants.fftDim);
 
25714                        dispatchBlock[1] = 1;
 
25732#if(VKFFT_BACKEND==0) 
25734                        vkCmdBindDescriptorSets(app->
configuration.
commandBuffer[0], VK_PIPELINE_BIND_POINT_COMPUTE, axis->pipelineLayout, 0, 1, &axis->descriptorSet, 0, 0);
 
25736                        uint64_t dispatchBlock[3];
 
25738                        dispatchBlock[0] = (uint64_t)ceil(localSize0[1] / (
double)axis->axisBlock[0] * app->
localFFTPlan->
actualFFTSizePerAxis[1][1] / (
double)axis->specializationConstants.fftDim);
 
25739                        dispatchBlock[1] = 1;
 
25755#if(VKFFT_BACKEND==0) 
25757                            vkCmdBindDescriptorSets(app->
configuration.
commandBuffer[0], VK_PIPELINE_BIND_POINT_COMPUTE, axis->pipelineLayout, 0, 1, &axis->descriptorSet, 0, 0);
 
25759                            uint64_t dispatchBlock[3];
 
25760                            dispatchBlock[0] = (uint64_t)ceil(localSize0[1] / (
double)axis->axisBlock[0] * app->
localFFTPlan->
actualFFTSizePerAxis[1][1] / (
double)axis->specializationConstants.fftDim);
 
25761                            dispatchBlock[1] = 1;
 
25787#if(VKFFT_BACKEND==0) 
25789                        vkCmdBindDescriptorSets(app->
configuration.
commandBuffer[0], VK_PIPELINE_BIND_POINT_COMPUTE, axis->pipelineLayout, 0, 1, &axis->descriptorSet, 0, 0);
 
25791                        uint64_t dispatchBlock[3];
 
25792                        dispatchBlock[0] = (uint64_t)ceil(localSize0[2] / (
double)axis->axisBlock[0] * app->
localFFTPlan->
actualFFTSizePerAxis[2][2] / (
double)axis->specializationConstants.fftDim);
 
25793                        dispatchBlock[1] = 1;
 
25809#if(VKFFT_BACKEND==0) 
25811                        vkCmdBindDescriptorSets(app->
configuration.
commandBuffer[0], VK_PIPELINE_BIND_POINT_COMPUTE, axis->pipelineLayout, 0, 1, &axis->descriptorSet, 0, 0);
 
25813                        uint64_t dispatchBlock[3];
 
25814                        dispatchBlock[0] = (uint64_t)ceil(localSize0[2] / (
double)axis->axisBlock[0] * app->
localFFTPlan->
actualFFTSizePerAxis[2][2] / (
double)axis->specializationConstants.fftDim);
 
25815                        dispatchBlock[1] = 1;
 
25829#if(VKFFT_BACKEND==0) 
25831                            vkCmdBindDescriptorSets(app->
configuration.
commandBuffer[0], VK_PIPELINE_BIND_POINT_COMPUTE, axis->pipelineLayout, 0, 1, &axis->descriptorSet, 0, 0);
 
25833                            uint64_t dispatchBlock[3];
 
25834                            dispatchBlock[0] = (uint64_t)ceil(localSize0[2] / (
double)axis->axisBlock[0] * app->
localFFTPlan->
actualFFTSizePerAxis[2][2] / (
double)axis->specializationConstants.fftDim);
 
25835                            dispatchBlock[1] = 1;
 
25862#if(VKFFT_BACKEND==0) 
25864                    vkCmdBindDescriptorSets(app->
configuration.
commandBuffer[0], VK_PIPELINE_BIND_POINT_COMPUTE, axis->pipelineLayout, 0, 1, &axis->descriptorSet, 0, 0);
 
25866                    uint64_t dispatchBlock[3];
 
25868                    dispatchBlock[1] = 1;
 
25884#if(VKFFT_BACKEND==0) 
25886                vkCmdBindDescriptorSets(app->
configuration.
commandBuffer[0], VK_PIPELINE_BIND_POINT_COMPUTE, axis->pipelineLayout, 0, 1, &axis->descriptorSet, 0, 0);
 
25888                uint64_t dispatchBlock[3];
 
25890                dispatchBlock[1] = 1;
 
25910#if(VKFFT_BACKEND==0) 
25912                    vkCmdBindDescriptorSets(app->
configuration.
commandBuffer[0], VK_PIPELINE_BIND_POINT_COMPUTE, axis->pipelineLayout, 0, 1, &axis->descriptorSet, 0, 0);
 
25914                    uint64_t dispatchBlock[3];
 
25916                    dispatchBlock[1] = 1;
 
25932#if(VKFFT_BACKEND==0) 
25934                vkCmdBindDescriptorSets(app->
configuration.
commandBuffer[0], VK_PIPELINE_BIND_POINT_COMPUTE, axis->pipelineLayout, 0, 1, &axis->descriptorSet, 0, 0);
 
25936                uint64_t dispatchBlock[3];
 
25958                if (axis->specializationConstants.mergeSequencesR2C == 1) dispatchBlock[1] = (uint64_t)ceil(dispatchBlock[1] / 2.0);
 
25974#if(VKFFT_BACKEND==0) 
25976                vkCmdBindDescriptorSets(app->
configuration.
commandBuffer[0], VK_PIPELINE_BIND_POINT_COMPUTE, axis->pipelineLayout, 0, 1, &axis->descriptorSet, 0, 0);
 
25978                uint64_t dispatchBlock[3];
 
25980                dispatchBlock[1] = 1;
 
25993    if (inverse == 1) {
 
26004#if(VKFFT_BACKEND==0) 
26006                    vkCmdBindDescriptorSets(app->
configuration.
commandBuffer[0], VK_PIPELINE_BIND_POINT_COMPUTE, axis->pipelineLayout, 0, 1, &axis->descriptorSet, 0, 0);
 
26008                    uint64_t dispatchBlock[3];
 
26010                    dispatchBlock[1] = 1;
 
26035#if(VKFFT_BACKEND==0) 
26037                    vkCmdBindDescriptorSets(app->
configuration.
commandBuffer[0], VK_PIPELINE_BIND_POINT_COMPUTE, axis->pipelineLayout, 0, 1, &axis->descriptorSet, 0, 0);
 
26039                    uint64_t dispatchBlock[3];
 
26041                    dispatchBlock[1] = 1;
 
26063#if(VKFFT_BACKEND==0) 
26065                vkCmdBindDescriptorSets(app->
configuration.
commandBuffer[0], VK_PIPELINE_BIND_POINT_COMPUTE, axis->pipelineLayout, 0, 1, &axis->descriptorSet, 0, 0);
 
26067                uint64_t dispatchBlock[3];
 
26070                dispatchBlock[1] = 1;
 
26085#if(VKFFT_BACKEND==0) 
26087                vkCmdBindDescriptorSets(app->
configuration.
commandBuffer[0], VK_PIPELINE_BIND_POINT_COMPUTE, axis->pipelineLayout, 0, 1, &axis->descriptorSet, 0, 0);
 
26089                uint64_t dispatchBlock[3];
 
26111                if (axis->specializationConstants.mergeSequencesR2C == 1) dispatchBlock[1] = (uint64_t)ceil(dispatchBlock[1] / 2.0);
 
26127#if(VKFFT_BACKEND==0) 
26129                    vkCmdBindDescriptorSets(app->
configuration.
commandBuffer[0], VK_PIPELINE_BIND_POINT_COMPUTE, axis->pipelineLayout, 0, 1, &axis->descriptorSet, 0, 0);
 
26131                    uint64_t dispatchBlock[3];
 
26153                    if (axis->specializationConstants.mergeSequencesR2C == 1) dispatchBlock[1] = (uint64_t)ceil(dispatchBlock[1] / 2.0);