1297 char LFending[4] =
"";
1298 if (!strcmp(floatType,
"float")) sprintf(LFending,
"f");
1299#if(VKFFT_BACKEND==0)
1300 if (!strcmp(floatType,
"float")) sprintf(vecType,
"vec2");
1301 if (!strcmp(floatType,
"double")) sprintf(vecType,
"dvec2");
1302 char cosDef[20] =
"cos";
1303 char sinDef[20] =
"sin";
1304 if (!strcmp(floatType,
"double")) sprintf(LFending,
"LF");
1305#elif(VKFFT_BACKEND==1)
1306 if (!strcmp(floatType,
"float")) sprintf(vecType,
"float2");
1307 if (!strcmp(floatType,
"double")) sprintf(vecType,
"double2");
1308 char cosDef[20] =
"__cosf";
1309 char sinDef[20] =
"__sinf";
1310 if (!strcmp(floatType,
"double")) sprintf(LFending,
"l");
1311#elif(VKFFT_BACKEND==2)
1312 if (!strcmp(floatType,
"float")) sprintf(vecType,
"float2");
1313 if (!strcmp(floatType,
"double")) sprintf(vecType,
"double2");
1314 char cosDef[20] =
"__cosf";
1315 char sinDef[20] =
"__sinf";
1316 if (!strcmp(floatType,
"double")) sprintf(LFending,
"l");
1317#elif(VKFFT_BACKEND==3)
1318 if (!strcmp(floatType,
"float")) sprintf(vecType,
"float2");
1319 if (!strcmp(floatType,
"double")) sprintf(vecType,
"double2");
1320 char cosDef[20] =
"native_cos";
1321 char sinDef[20] =
"native_sin";
1324 char* temp = sc->
temp;
1330 char convolutionInverse[30] =
"";
1331 if (sc->
convolutionStep) sprintf(convolutionInverse,
", %s inverse", uintType);
1347 sc->
tempLen = sprintf(sc->
tempStr,
" %s = twiddleLUT[LUTId];\n", w);
1357 if (!strcmp(floatType,
"float")) {
1358 sc->
tempLen = sprintf(sc->
tempStr,
" %s.x = %s(angle);\n", w, cosDef);
1361 sc->
tempLen = sprintf(sc->
tempStr,
" %s.y = %s(angle);\n", w, sinDef);
1365 if (!strcmp(floatType,
"double")) {
1366 sc->
tempLen = sprintf(sc->
tempStr,
" %s = sincos_20(angle);\n", w);
1395 for (uint64_t i = 0; i < 2; i++) {
1396 tf[i] = (
char*)malloc(
sizeof(
char) * 50);
1398 for (uint64_t j = 0; j < i; j++) {
1406 sprintf(tf[0],
"-0.5%s", LFending);
1407 sprintf(tf[1],
"-0.8660254037844386467637231707529%s", LFending);
1417 sc->
tempLen = sprintf(sc->
tempStr,
" %s = twiddleLUT[LUTId];\n", w);
1427 if (!strcmp(floatType,
"float")) {
1428 sc->
tempLen = sprintf(sc->
tempStr,
" %s.x = %s(angle*%.17f%s);\n", w, cosDef, 4.0 / 3.0, LFending);
1431 sc->
tempLen = sprintf(sc->
tempStr,
" %s.y = %s(angle*%.17f%s);\n", w, sinDef, 4.0 / 3.0, LFending);
1436 if (!strcmp(floatType,
"double")) {
1437 sc->
tempLen = sprintf(sc->
tempStr,
" %s = sincos_20(angle*%.17f%s);\n", w, 4.0 / 3.0, LFending);
1447 sc->
tempLen = sprintf(sc->
tempStr,
" %s = twiddleLUT[LUTId+%" PRIu64
"];\n", w, stageSize);
1457 if (!strcmp(floatType,
"float")) {
1458 sc->
tempLen = sprintf(sc->
tempStr,
" %s.x = %s(angle*%.17f%s);\n", w, cosDef, 2.0 / 3.0, LFending);
1461 sc->
tempLen = sprintf(sc->
tempStr,
" %s.y = %s(angle*%.17f%s);\n", w, sinDef, 2.0 / 3.0, LFending);
1466 if (!strcmp(floatType,
"double")) {
1467 sc->
tempLen = sprintf(sc->
tempStr,
" %s=sincos_20(angle*%.17f%s);\n", w, 2.0 / 3.0, LFending);
1523 for (uint64_t i = 0; i < 2; i++) {
1542 sc->
tempLen = sprintf(sc->
tempStr,
" %s = twiddleLUT[LUTId];\n", w);
1552 if (!strcmp(floatType,
"float")) {
1553 sc->
tempLen = sprintf(sc->
tempStr,
" %s.x = %s(angle);\n", w, cosDef);
1556 sc->
tempLen = sprintf(sc->
tempStr,
" %s.y = %s(angle);\n", w, sinDef);
1560 if (!strcmp(floatType,
"double")) {
1561 sc->
tempLen = sprintf(sc->
tempStr,
" %s = sincos_20(angle);\n", w);
1589 sc->
tempLen = sprintf(sc->
tempStr,
" %s=twiddleLUT[LUTId+%" PRIu64
"];\n", w, stageSize);
1599 if (!strcmp(floatType,
"float")) {
1600 sc->
tempLen = sprintf(sc->
tempStr,
" %s.x = %s(0.5%s*angle);\n", w, cosDef, LFending);
1603 sc->
tempLen = sprintf(sc->
tempStr,
" %s.y = %s(0.5%s*angle);\n", w, sinDef, LFending);
1607 if (!strcmp(floatType,
"double")) {
1608 sc->
tempLen = sprintf(sc->
tempStr,
" %s=normalize(%s + %s(1.0, 0.0));\n", w, w, vecType);
1624 if (stageAngle < 0) {
1681 for (uint64_t i = 0; i < 5; i++) {
1682 tf[i] = (
char*)malloc(
sizeof(
char) * 50);
1684 for (uint64_t j = 0; j < i; j++) {
1691 sprintf(tf[0],
"-0.5%s", LFending);
1692 sprintf(tf[1],
"1.538841768587626701285145288018455%s", LFending);
1693 sprintf(tf[2],
"-0.363271264002680442947733378740309%s", LFending);
1694 sprintf(tf[3],
"-0.809016994374947424102293417182819%s", LFending);
1695 sprintf(tf[4],
"-0.587785252292473129168705954639073%s", LFending);
1706 for (uint64_t i = radix - 1; i > 0; i--) {
1707 if (i == radix - 1) {
1709 sc->
tempLen = sprintf(sc->
tempStr,
" %s = twiddleLUT[LUTId];\n", w);
1719 if (!strcmp(floatType,
"float")) {
1720 sc->
tempLen = sprintf(sc->
tempStr,
" %s.x = %s(angle*%.17f%s);\n", w, cosDef, 2.0 * i / radix, LFending);
1723 sc->
tempLen = sprintf(sc->
tempStr,
" %s.y = %s(angle*%.17f%s);\n", w, sinDef, 2.0 * i / radix, LFending);
1728 if (!strcmp(floatType,
"double")) {
1729 sc->
tempLen = sprintf(sc->
tempStr,
" %s = sincos_20(angle*%.17f%s);\n", w, 2.0 * i / radix, LFending);
1737 sc->
tempLen = sprintf(sc->
tempStr,
" %s = twiddleLUT[LUTId+%" PRIu64
"];\n", w, (radix - 1 - i) * stageSize);
1747 if (!strcmp(floatType,
"float")) {
1748 sc->
tempLen = sprintf(sc->
tempStr,
" %s.x = %s(angle*%.17f%s);\n", w, cosDef, 2.0 * i / radix, LFending);
1751 sc->
tempLen = sprintf(sc->
tempStr,
" %s.y = %s(angle*%.17f%s);\n", w, sinDef, 2.0 * i / radix, LFending);
1756 if (!strcmp(floatType,
"double")) {
1757 sc->
tempLen = sprintf(sc->
tempStr,
" %s = sincos_20(angle*%.17f%s);\n", w, 2.0 * i / radix, LFending);
1870 for (uint64_t i = 0; i < 5; i++) {
1887 for (uint64_t i = 0; i < 8; i++) {
1888 tf[i] = (
char*)malloc(
sizeof(
char) * 50);
1890 for (uint64_t j = 0; j < i; j++) {
1897 sprintf(tf[0],
"-1.16666666666666651863693004997913%s", LFending);
1898 sprintf(tf[1],
"0.79015646852540022404554065360571%s", LFending);
1899 sprintf(tf[2],
"0.05585426728964774240049351305970%s", LFending);
1900 sprintf(tf[3],
"0.73430220123575240531721419756650%s", LFending);
1901 if (stageAngle < 0) {
1902 sprintf(tf[4],
"0.44095855184409837868031445395900%s", LFending);
1903 sprintf(tf[5],
"0.34087293062393136944265847887436%s", LFending);
1904 sprintf(tf[6],
"-0.53396936033772524066165487965918%s", LFending);
1905 sprintf(tf[7],
"0.87484229096165666561546458979137%s", LFending);
1908 sprintf(tf[4],
"-0.44095855184409837868031445395900%s", LFending);
1909 sprintf(tf[5],
"-0.34087293062393136944265847887436%s", LFending);
1910 sprintf(tf[6],
"0.53396936033772524066165487965918%s", LFending);
1911 sprintf(tf[7],
"-0.87484229096165666561546458979137%s", LFending);
1920 for (uint64_t i = radix - 1; i > 0; i--) {
1921 if (i == radix - 1) {
1923 sc->
tempLen = sprintf(sc->
tempStr,
" %s = twiddleLUT[LUTId];\n", w);
1933 if (!strcmp(floatType,
"float")) {
1934 sc->
tempLen = sprintf(sc->
tempStr,
" %s.x = %s(angle*%.17f%s);\n", w, cosDef, 2.0 * i / radix, LFending);
1937 sc->
tempLen = sprintf(sc->
tempStr,
" %s.y = %s(angle*%.17f%s);\n", w, sinDef, 2.0 * i / radix, LFending);
1942 if (!strcmp(floatType,
"double")) {
1943 sc->
tempLen = sprintf(sc->
tempStr,
" %s = sincos_20(angle*%.17f%s);\n", w, 2.0 * i / radix, LFending);
1951 sc->
tempLen = sprintf(sc->
tempStr,
" %s = twiddleLUT[LUTId+%" PRIu64
"];\n\n", w, (radix - 1 - i) * stageSize);
1961 if (!strcmp(floatType,
"float")) {
1962 sc->
tempLen = sprintf(sc->
tempStr,
" %s.x = %s(angle*%.17f%s);\n", w, cosDef, 2.0 * i / radix, LFending);
1965 sc->
tempLen = sprintf(sc->
tempStr,
" %s.y = %s(angle*%.17f%s);\n", w, sinDef, 2.0 * i / radix, LFending);
1970 if (!strcmp(floatType,
"double")) {
1971 sc->
tempLen = sprintf(sc->
tempStr,
" %s = sincos_20(angle*%.17f%s);\n", w, 2.0 * i / radix, LFending);
2141 for (uint64_t i = 0; i < 8; i++) {
2161 sc->
tempLen = sprintf(sc->
tempStr,
" %s = twiddleLUT[LUTId];\n", w);
2171 if (!strcmp(floatType,
"float")) {
2172 sc->
tempLen = sprintf(sc->
tempStr,
" %s.x = %s(angle);\n", w, cosDef);
2175 sc->
tempLen = sprintf(sc->
tempStr,
" %s.y = %s(angle);\n", w, sinDef);
2179 if (!strcmp(floatType,
"double")) {
2180 sc->
tempLen = sprintf(sc->
tempStr,
" %s = sincos_20(angle);\n", w);
2185 for (uint64_t i = 0; i < 4; i++) {
2199 sc->
tempLen = sprintf(sc->
tempStr,
" %s=twiddleLUT[LUTId+%" PRIu64
"];\n\n", w, stageSize);
2209 if (!strcmp(floatType,
"float")) {
2210 sc->
tempLen = sprintf(sc->
tempStr,
" %s.x = %s(0.5%s*angle);\n", w, cosDef, LFending);
2213 sc->
tempLen = sprintf(sc->
tempStr,
" %s.y = %s(0.5%s*angle);\n", w, sinDef, LFending);
2217 if (!strcmp(floatType,
"double")) {
2218 sc->
tempLen = sprintf(sc->
tempStr,
" %s=normalize(%s + %s(1.0, 0.0));\n", w, w, vecType);
2223 for (uint64_t i = 0; i < 2; i++) {
2236 if (stageAngle < 0) {
2255 for (uint64_t i = 4; i < 6; i++) {
2270 sc->
tempLen = sprintf(sc->
tempStr,
" %s=twiddleLUT[LUTId+%" PRIu64
"];\n\n", w, 2 * stageSize);
2280 if (!strcmp(floatType,
"float")) {
2281 sc->
tempLen = sprintf(sc->
tempStr,
" %s.x = %s(0.25%s*angle);\n", w, cosDef, LFending);
2284 sc->
tempLen = sprintf(sc->
tempStr,
" %s.y = %s(0.25%s*angle);\n", w, sinDef, LFending);
2289 if (!strcmp(floatType,
"double")) {
2290 sc->
tempLen = sprintf(sc->
tempStr,
" %s=normalize(%s + %s(1.0, 0.0));\n", w, w, vecType);
2306 if (stageAngle < 0) {
2335 if (stageAngle < 0) {
2336 sc->
tempLen = sprintf(sc->
tempStr,
" %s.x = %s.x * loc_SQRT1_2 + %s.y * loc_SQRT1_2;\n", iw, w, w);
2339 sc->
tempLen = sprintf(sc->
tempStr,
" %s.y = %s.y * loc_SQRT1_2 - %s.x * loc_SQRT1_2;\n\n", iw, w, w);
2344 sc->
tempLen = sprintf(sc->
tempStr,
" %s.x = %s.x * loc_SQRT1_2 - %s.y * loc_SQRT1_2;\n", iw, w, w);
2347 sc->
tempLen = sprintf(sc->
tempStr,
" %s.y = %s.y * loc_SQRT1_2 + %s.x * loc_SQRT1_2;\n\n", iw, w, w);
2362 if (stageAngle < 0) {
2420 for (uint64_t i = 0; i < 20; i++) {
2421 tf[i] = (
char*)malloc(
sizeof(
char) * 50);
2423 for (uint64_t j = 0; j < i; j++) {
2432 sprintf(tf[0],
"-1.100000000000000%s", LFending);
2434 sprintf(tf[2],
"0.253097611605959%s", LFending);
2435 sprintf(tf[3],
"-1.288200610773679%s", LFending);
2436 sprintf(tf[4],
"0.304632239669212%s", LFending);
2437 sprintf(tf[5],
"-0.391339615511917%s", LFending);
2438 sprintf(tf[6],
"-2.871022253392850%s", LFending);
2439 sprintf(tf[7],
"1.374907986616384%s", LFending);
2440 sprintf(tf[8],
"0.817178135341212%s", LFending);
2441 sprintf(tf[9],
"1.800746506445679%s", LFending);
2442 sprintf(tf[10],
"-0.859492973614498%s", LFending);
2444 if (stageAngle < 0) {
2445 sprintf(tf[1],
"0.331662479035540%s", LFending);
2446 sprintf(tf[11],
"-2.373470454748280%s", LFending);
2447 sprintf(tf[12],
"-0.024836393087493%s", LFending);
2448 sprintf(tf[13],
"0.474017017512829%s", LFending);
2449 sprintf(tf[14],
"0.742183927770612%s", LFending);
2450 sprintf(tf[15],
"1.406473309094609%s", LFending);
2451 sprintf(tf[16],
"-1.191364552195948%s", LFending);
2452 sprintf(tf[17],
"0.708088885039503%s", LFending);
2453 sprintf(tf[18],
"0.258908260614168%s", LFending);
2454 sprintf(tf[19],
"-0.049929922194110%s", LFending);
2457 sprintf(tf[1],
"-0.331662479035540%s", LFending);
2458 sprintf(tf[11],
"2.373470454748280%s", LFending);
2459 sprintf(tf[12],
"0.024836393087493%s", LFending);
2460 sprintf(tf[13],
"-0.474017017512829%s", LFending);
2461 sprintf(tf[14],
"-0.742183927770612%s", LFending);
2462 sprintf(tf[15],
"-1.406473309094609%s", LFending);
2463 sprintf(tf[16],
"1.191364552195948%s", LFending);
2464 sprintf(tf[17],
"-0.708088885039503%s", LFending);
2465 sprintf(tf[18],
"-0.258908260614168%s", LFending);
2466 sprintf(tf[19],
"0.049929922194110%s", LFending);
2468 for (uint64_t i = radix - 1; i > 0; i--) {
2469 if (i == radix - 1) {
2471 sc->
tempLen = sprintf(sc->
tempStr,
" %s = twiddleLUT[LUTId];\n", w);
2481 if (!strcmp(floatType,
"float")) {
2482 sc->
tempLen = sprintf(sc->
tempStr,
" %s.x = %s(angle*%.17f%s);\n", w, cosDef, 2.0 * i / radix, LFending);
2485 sc->
tempLen = sprintf(sc->
tempStr,
" %s.y = %s(angle*%.17f%s);\n", w, sinDef, 2.0 * i / radix, LFending);
2490 if (!strcmp(floatType,
"double")) {
2491 sc->
tempLen = sprintf(sc->
tempStr,
" %s = sincos_20(angle*%.17f%s);\n", w, 2.0 * i / radix, LFending);
2499 sc->
tempLen = sprintf(sc->
tempStr,
" %s = twiddleLUT[LUTId+%" PRIu64
"];\n\n", w, (radix - 1 - i) * stageSize);
2509 if (!strcmp(floatType,
"float")) {
2510 sc->
tempLen = sprintf(sc->
tempStr,
" %s.x = %s(angle*%.17f%s);\n", w, cosDef, 2.0 * i / radix, LFending);
2513 sc->
tempLen = sprintf(sc->
tempStr,
" %s.y = %s(angle*%.17f%s);\n", w, sinDef, 2.0 * i / radix, LFending);
2518 if (!strcmp(floatType,
"double")) {
2519 sc->
tempLen = sprintf(sc->
tempStr,
" %s = sincos_20(angle*%.17f%s);\n", w, 2.0 * i / radix, LFending);
2530 uint64_t permute[11] = { 0,1,9,4,3,5,10,2,7,8,6 };
2533 for (uint64_t i = 0; i < 5; i++) {
2541 for (uint64_t i = 0; i < 4; i++) {
2549 for (uint64_t i = 0; i < 4; i++) {
2562 for (uint64_t k = 0; k < 2; k++) {
2630 res =
VkAddComplex(sc, regID[k * 4 + 3], sc->
locID[k * 4 + 3], regID[k * 4 + 5]);
2632 res =
VkAddComplex(sc, regID[k * 4 + 4], sc->
locID[k * 4 + 4], regID[k * 4 + 5]);
2635 res =
VkAddComplex(sc, regID[k * 4 + 5], sc->
locID[k * 4 + 5], regID[k * 4 + 6]);
2637 res =
VkAddComplex(sc, regID[k * 4 + 6], sc->
locID[k * 4 + 6], regID[k * 4 + 6]);
2646 for (uint64_t i = 0; i < 4; i++) {
2654 for (uint64_t i = 0; i < 4; i++) {
2660 for (uint64_t i = 0; i < 5; i++) {
2666 uint64_t permute2[11] = { 0,10,1,8,7,9,4,2,3,6,5 };
2667 res =
VkPermute(sc, permute2, 11, 1, regID);
2670 for (uint64_t i = 0; i < 20; i++) {
2682 for (uint64_t i = 0; i < 20; i++) {
2683 tf[i] = (
char*)malloc(
sizeof(
char) * 50);
2685 for (uint64_t j = 0; j < i; j++) {
2694 sprintf(tf[0],
"-1.083333333333333%s", LFending);
2695 sprintf(tf[1],
"-0.300462606288666%s", LFending);
2696 sprintf(tf[5],
"1.007074065727533%s", LFending);
2697 sprintf(tf[6],
"0.731245990975348%s", LFending);
2698 sprintf(tf[7],
"-0.579440018900960%s", LFending);
2699 sprintf(tf[8],
"0.531932498429674%s", LFending);
2700 sprintf(tf[9],
"-0.508814921720398%s", LFending);
2701 sprintf(tf[10],
"-0.007705858903092%s", LFending);
2703 if (stageAngle < 0) {
2704 sprintf(tf[2],
"-0.749279330626139%s", LFending);
2705 sprintf(tf[3],
"0.401002128321867%s", LFending);
2706 sprintf(tf[4],
"0.174138601152136%s", LFending);
2707 sprintf(tf[11],
"-2.511393318389568%s", LFending);
2708 sprintf(tf[12],
"-1.823546408682421%s", LFending);
2709 sprintf(tf[13],
"1.444979909023996%s", LFending);
2710 sprintf(tf[14],
"-1.344056915177370%s", LFending);
2711 sprintf(tf[15],
"-0.975932420775946%s", LFending);
2712 sprintf(tf[16],
"0.773329778651105%s", LFending);
2713 sprintf(tf[17],
"1.927725116783469%s", LFending);
2714 sprintf(tf[18],
"1.399739414729183%s", LFending);
2715 sprintf(tf[19],
"-1.109154843837551%s", LFending);
2718 sprintf(tf[2],
"0.749279330626139%s", LFending);
2719 sprintf(tf[3],
"-0.401002128321867%s", LFending);
2720 sprintf(tf[4],
"-0.174138601152136%s", LFending);
2721 sprintf(tf[11],
"2.511393318389568%s", LFending);
2722 sprintf(tf[12],
"1.823546408682421%s", LFending);
2723 sprintf(tf[13],
"-1.444979909023996%s", LFending);
2724 sprintf(tf[14],
"1.344056915177370%s", LFending);
2725 sprintf(tf[15],
"0.975932420775946%s", LFending);
2726 sprintf(tf[16],
"-0.773329778651105%s", LFending);
2727 sprintf(tf[17],
"-1.927725116783469%s", LFending);
2728 sprintf(tf[18],
"-1.399739414729183%s", LFending);
2729 sprintf(tf[19],
"1.109154843837551%s", LFending);
2731 for (uint64_t i = radix - 1; i > 0; i--) {
2732 if (i == radix - 1) {
2734 sc->
tempLen = sprintf(sc->
tempStr,
" %s = twiddleLUT[LUTId];\n", w);
2744 if (!strcmp(floatType,
"float")) {
2745 sc->
tempLen = sprintf(sc->
tempStr,
" %s.x = %s(angle*%.17f%s);\n", w, cosDef, 2.0 * i / radix, LFending);
2748 sc->
tempLen = sprintf(sc->
tempStr,
" %s.y = %s(angle*%.17f%s);\n", w, sinDef, 2.0 * i / radix, LFending);
2753 if (!strcmp(floatType,
"double")) {
2754 sc->
tempLen = sprintf(sc->
tempStr,
" %s = sincos_20(angle*%.17f%s);\n", w, 2.0 * i / radix, LFending);
2762 sc->
tempLen = sprintf(sc->
tempStr,
" %s = twiddleLUT[LUTId+%" PRIu64
"];\n\n", w, (radix - 1 - i) * stageSize);
2772 if (!strcmp(floatType,
"float")) {
2773 sc->
tempLen = sprintf(sc->
tempStr,
" %s.x = %s(angle*%.17f%s);\n", w, cosDef, 2.0 * i / radix, LFending);
2776 sc->
tempLen = sprintf(sc->
tempStr,
" %s.y = %s(angle*%.17f%s);\n", w, sinDef, 2.0 * i / radix, LFending);
2781 if (!strcmp(floatType,
"double")) {
2782 sc->
tempLen = sprintf(sc->
tempStr,
" %s = sincos_20(angle*%.17f%s);\n", w, 2.0 * i / radix, LFending);
2794 uint64_t permute[13] = { 0,1,3,9,5,2,6,12,10,4,8,11,7 };
2797 for (uint64_t i = 0; i < 6; i++) {
2803 for (uint64_t i = 0; i < 3; i++) {
2809 for (uint64_t i = 0; i < 4; i++) {
2812 res =
VkSubComplex(sc, sc->
locID[i * 2 + 5], regID[i * 3 + 1], regID[i * 3 + 3]);
2816 res =
VkSubComplex(sc, sc->
locID[i * 2 + 6], regID[i * 3 + 2], regID[i * 3 + 3]);
2826 for (uint64_t k = 0; k < 3; k++) {
2846 res =
VkAddComplex(sc, regID[k * 2 + 3], sc->
locID[k * 2 + 3], regID[k * 2 + 4]);
2848 res =
VkAddComplex(sc, regID[k * 2 + 4], sc->
locID[k * 2 + 4], regID[k * 2 + 4]);
2907 for (uint64_t i = 0; i < 4; i++) {
2917 for (uint64_t i = 0; i < 3; i++) {
2925 for (uint64_t i = 0; i < 6; i++) {
2931 uint64_t permute2[13] = { 0,12,1,10,5,3,2,8,9,11,4,7,6 };
2932 res =
VkPermute(sc, permute2, 13, 1, regID);
2935 for (uint64_t i = 0; i < 20; i++) {
3855 double double_PI = 3.1415926535897932384626433832795;
3857 char inputsStruct[20] =
"";
3858 char LFending[4] =
"";
3859 if (!strcmp(floatType,
"float")) sprintf(LFending,
"f");
3860#if(VKFFT_BACKEND==0)
3862 sprintf(inputsStruct,
"inputs");
3864 sprintf(inputsStruct,
".inputs");
3865 if (!strcmp(floatType,
"float")) sprintf(vecType,
"vec2");
3866 if (!strcmp(floatType,
"double")) sprintf(vecType,
"dvec2");
3867 if (!strcmp(floatType,
"double")) sprintf(LFending,
"LF");
3868 char cosDef[20] =
"cos";
3869 char sinDef[20] =
"sin";
3870#elif(VKFFT_BACKEND==1)
3871 if (!strcmp(floatType,
"float")) sprintf(vecType,
"float2");
3872 if (!strcmp(floatType,
"double")) sprintf(vecType,
"double2");
3873 if (!strcmp(floatType,
"double")) sprintf(LFending,
"l");
3874 sprintf(inputsStruct,
"inputs");
3875 char cosDef[20] =
"__cosf";
3876 char sinDef[20] =
"__sinf";
3877#elif(VKFFT_BACKEND==2)
3878 if (!strcmp(floatType,
"float")) sprintf(vecType,
"float2");
3879 if (!strcmp(floatType,
"double")) sprintf(vecType,
"double2");
3880 if (!strcmp(floatType,
"double")) sprintf(LFending,
"l");
3881 sprintf(inputsStruct,
"inputs");
3882 char cosDef[20] =
"__cosf";
3883 char sinDef[20] =
"__sinf";
3884#elif(VKFFT_BACKEND==3)
3885 if (!strcmp(floatType,
"float")) sprintf(vecType,
"float2");
3886 if (!strcmp(floatType,
"double")) sprintf(vecType,
"double2");
3887 sprintf(inputsStruct,
"inputs");
3888 char cosDef[20] =
"native_cos";
3889 char sinDef[20] =
"native_sin";
3891 char convTypeLeft[20] =
"";
3892 char convTypeRight[20] =
"";
3893 if ((!strcmp(floatType,
"float")) && (strcmp(floatTypeMemory,
"float"))) {
3894 if ((readType == 5) || (readType == 110) || (readType == 111) || (readType == 120) || (readType == 121) || (readType == 130) || (readType == 131) || (readType == 140) || (readType == 141) || (readType == 142) || (readType == 143) || (readType == 144) || (readType == 145)) {
3895#if(VKFFT_BACKEND==0)
3896 sprintf(convTypeLeft,
"float(");
3897 sprintf(convTypeRight,
")");
3898#elif(VKFFT_BACKEND==1)
3899 sprintf(convTypeLeft,
"(float)");
3901#elif(VKFFT_BACKEND==2)
3902 sprintf(convTypeLeft,
"(float)");
3904#elif(VKFFT_BACKEND==3)
3905 sprintf(convTypeLeft,
"(float)");
3910#if(VKFFT_BACKEND==0)
3911 sprintf(convTypeLeft,
"vec2(");
3912 sprintf(convTypeRight,
")");
3913#elif(VKFFT_BACKEND==1)
3914 sprintf(convTypeLeft,
"conv_float2(");
3915 sprintf(convTypeRight,
")");
3916#elif(VKFFT_BACKEND==2)
3917 sprintf(convTypeLeft,
"conv_float2(");
3918 sprintf(convTypeRight,
")");
3919#elif(VKFFT_BACKEND==3)
3920 sprintf(convTypeLeft,
"conv_float2(");
3921 sprintf(convTypeRight,
")");
3925 if ((!strcmp(floatType,
"double")) && (strcmp(floatTypeMemory,
"double"))) {
3926 if ((readType == 5) || (readType == 110) || (readType == 111) || (readType == 120) || (readType == 121) || (readType == 130) || (readType == 131) || (readType == 140) || (readType == 141) || (readType == 142) || (readType == 143) || (readType == 144) || (readType == 145)) {
3927#if(VKFFT_BACKEND==0)
3928 sprintf(convTypeLeft,
"double(");
3929 sprintf(convTypeRight,
")");
3930#elif(VKFFT_BACKEND==1)
3931 sprintf(convTypeLeft,
"(double)");
3933#elif(VKFFT_BACKEND==2)
3934 sprintf(convTypeLeft,
"(double)");
3936#elif(VKFFT_BACKEND==3)
3937 sprintf(convTypeLeft,
"(double)");
3942#if(VKFFT_BACKEND==0)
3943 sprintf(convTypeLeft,
"dvec2(");
3944 sprintf(convTypeRight,
")");
3945#elif(VKFFT_BACKEND==1)
3946 sprintf(convTypeLeft,
"conv_double2(");
3947 sprintf(convTypeRight,
")");
3948#elif(VKFFT_BACKEND==2)
3949 sprintf(convTypeLeft,
"conv_double2(");
3950 sprintf(convTypeRight,
")");
3951#elif(VKFFT_BACKEND==3)
3952 sprintf(convTypeLeft,
"conv_double2(");
3953 sprintf(convTypeRight,
")");
3957 char index_x[2000] =
"";
3958 char index_y[2000] =
"";
3959 char requestCoordinate[100] =
"";
3962 sprintf(requestCoordinate,
"coordinate");
3965 char requestBatch[100] =
"";
3968 sprintf(requestBatch,
"0");
3976 char shiftX[500] =
"";
3978 sprintf(shiftX,
" + consts.workGroupShiftX ");
3979 char shiftY[500] =
"";
3988 char shiftY2[100] =
"";
3990 sprintf(shiftY,
" + consts.workGroupShiftY ");
3998 sprintf(sc->
disableThreadsStart,
" if(%s * %" PRIu64
" + (((%s%s) %% %" PRIu64
") * %" PRIu64
" + ((%s%s) / %" PRIu64
") * %" PRIu64
") < %" PRIu64
") {\n", sc->
gl_LocalInvocationID_x, sc->
firstStageStartSize, sc->
gl_WorkGroupID_x, shiftX, sc->
firstStageStartSize / sc->
fftDim, sc->
fftDim, sc->
gl_WorkGroupID_x, shiftX, sc->
firstStageStartSize / sc->
fftDim, sc->
localSize[0] * sc->
firstStageStartSize, sc->
fft_dim_full);
4005 sprintf(sc->
disableThreadsStart,
" if(%s * %" PRIu64
" + (((%s%s) %% %" PRIu64
") * %" PRIu64
" + ((%s%s) / %" PRIu64
") * %" PRIu64
") < %" PRIu64
") {\n", sc->
gl_LocalInvocationID_y, sc->
firstStageStartSize, sc->
gl_WorkGroupID_x, shiftX, sc->
firstStageStartSize / sc->
fftDim, sc->
fftDim, sc->
gl_WorkGroupID_x, shiftX, sc->
firstStageStartSize / sc->
fftDim, sc->
localSize[1] * sc->
firstStageStartSize, sc->
fft_dim_full);
4076 sc->
tempLen = sprintf(sc->
tempStr,
" sdata[(combinedID %% %" PRIu64
") * sharedStride + (combinedID / %" PRIu64
")] = %s%s[%s]%s;\n", sc->
fftDim, sc->
fftDim, convTypeLeft, inputsStruct, sc->
inoutID, convTypeRight);
4078 sc->
tempLen = sprintf(sc->
tempStr,
" sdata[(combinedID %% %" PRIu64
") * sharedStride + (combinedID / %" PRIu64
")] = %sinputBlocks[%s / %" PRIu64
"]%s[%s %% %" PRIu64
"]%s;\n", sc->
fftDim, sc->
fftDim, convTypeLeft, sc->
inoutID, sc->
inputBufferBlockSize, inputsStruct, sc->
inoutID, sc->
inputBufferBlockSize, convTypeRight);
4082 sc->
tempLen = sprintf(sc->
tempStr,
" sdata[(combinedID %% %" PRIu64
") + (combinedID / %" PRIu64
") * sharedStride] = %s%s[%s]%s;\n", sc->
fftDim, sc->
fftDim, convTypeLeft, inputsStruct, sc->
inoutID, convTypeRight);
4084 sc->
tempLen = sprintf(sc->
tempStr,
" sdata[(combinedID %% %" PRIu64
") + (combinedID / %" PRIu64
") * sharedStride] = %sinputBlocks[%s / %" PRIu64
"]%s[%s %% %" PRIu64
"]%s;\n", sc->
fftDim, sc->
fftDim, convTypeLeft, sc->
inoutID, sc->
inputBufferBlockSize, inputsStruct, sc->
inoutID, sc->
inputBufferBlockSize, convTypeRight);
4102 sc->
tempLen = sprintf(sc->
tempStr,
" sdata[(combinedID %% %" PRIu64
") * sharedStride + (combinedID / %" PRIu64
")].x = 0;\n", sc->
fftDim, sc->
fftDim);
4105 sc->
tempLen = sprintf(sc->
tempStr,
" sdata[(combinedID %% %" PRIu64
") * sharedStride + (combinedID / %" PRIu64
")].y = 0;\n", sc->
fftDim, sc->
fftDim);
4110 sc->
tempLen = sprintf(sc->
tempStr,
" sdata[(combinedID %% %" PRIu64
") + (combinedID / %" PRIu64
") * sharedStride].x = 0;\n", sc->
fftDim, sc->
fftDim);
4113 sc->
tempLen = sprintf(sc->
tempStr,
" sdata[(combinedID %% %" PRIu64
") + (combinedID / %" PRIu64
") * sharedStride].y = 0;\n", sc->
fftDim, sc->
fftDim);
4133 sc->
tempLen = sprintf(sc->
tempStr,
" sdata[(combinedID %% %" PRIu64
") * sharedStride + (combinedID / %" PRIu64
")].x = 0;\n", sc->
fftDim, sc->
fftDim);
4136 sc->
tempLen = sprintf(sc->
tempStr,
" sdata[(combinedID %% %" PRIu64
") * sharedStride + (combinedID / %" PRIu64
")].y = 0;\n", sc->
fftDim, sc->
fftDim);
4141 sc->
tempLen = sprintf(sc->
tempStr,
" sdata[(combinedID %% %" PRIu64
") + (combinedID / %" PRIu64
") * sharedStride].x = 0;\n", sc->
fftDim, sc->
fftDim);
4144 sc->
tempLen = sprintf(sc->
tempStr,
" sdata[(combinedID %% %" PRIu64
") + (combinedID / %" PRIu64
") * sharedStride].y = 0;\n", sc->
fftDim, sc->
fftDim);
4189 sc->
tempLen = sprintf(sc->
tempStr,
" inoutID = (combinedID %% %" PRIu64
") + (combinedID / %" PRIu64
") * %" PRIu64
" + (((%s%s) %% %" PRIu64
") * %" PRIu64
" + ((%s%s) / %" PRIu64
") * %" PRIu64
");\n", sc->
fftDim, sc->
fftDim, sc->
firstStageStartSize, sc->
gl_WorkGroupID_x, shiftX, sc->
firstStageStartSize / sc->
fftDim, sc->
fftDim, sc->
gl_WorkGroupID_x, shiftX, sc->
firstStageStartSize / sc->
fftDim, sc->
localSize[0] * sc->
firstStageStartSize);
4194 sc->
tempLen = sprintf(sc->
tempStr,
" inoutID = %s+%" PRIu64
"+%s * %" PRIu64
" + (((%s%s) %% %" PRIu64
") * %" PRIu64
" + ((%s%s) / %" PRIu64
") * %" PRIu64
");\n", sc->
gl_LocalInvocationID_x, (i + k * sc->
min_registers_per_thread) * sc->
localSize[0], sc->
gl_LocalInvocationID_y, sc->
firstStageStartSize, sc->
gl_WorkGroupID_x, shiftX, sc->
firstStageStartSize / sc->
fftDim, sc->
fftDim, sc->
gl_WorkGroupID_x, shiftX, sc->
firstStageStartSize / sc->
fftDim, sc->
localSize[1] * sc->
firstStageStartSize);
4225 sc->
tempLen = sprintf(sc->
tempStr,
" sdata[(combinedID / %" PRIu64
") + sharedStride*(combinedID %% %" PRIu64
")] = %s%s[inoutID]%s;\n", sc->
fftDim, sc->
fftDim, convTypeLeft, inputsStruct, convTypeRight);
4253 sc->
tempLen = sprintf(sc->
tempStr,
" sdata[(combinedID / %" PRIu64
") + sharedStride*(combinedID %% %" PRIu64
")].x = 0;\n", sc->
fftDim, sc->
fftDim);
4256 sc->
tempLen = sprintf(sc->
tempStr,
" sdata[(combinedID / %" PRIu64
") + sharedStride*(combinedID %% %" PRIu64
")].y = 0;\n", sc->
fftDim, sc->
fftDim);
4283 char shiftX[500] =
"";
4287 sprintf(sc->
disableThreadsStart,
" if (((%s%s) / %" PRIu64
") %% (%" PRIu64
")+((%s%s) / %" PRIu64
") * (%" PRIu64
") < %" PRIu64
") {\n", sc->
gl_GlobalInvocationID_x, shiftX, sc->
fft_dim_x, sc->
stageStartSize, sc->
gl_GlobalInvocationID_x, shiftX, sc->
fft_dim_x * sc->
stageStartSize, sc->
fftDim * sc->
stageStartSize, sc->
size[sc->
axis_id]);
4293 sc->
tempLen = sprintf(sc->
tempStr,
" inoutID = (%" PRIu64
" * (%s + %" PRIu64
") + ((%s%s) / %" PRIu64
") %% (%" PRIu64
")+((%s%s) / %" PRIu64
") * (%" PRIu64
"));\n", sc->
stageStartSize, sc->
gl_LocalInvocationID_y, (i + k * sc->
min_registers_per_thread) * sc->
localSize[1], sc->
gl_GlobalInvocationID_x, shiftX, sc->
fft_dim_x, sc->
stageStartSize, sc->
gl_GlobalInvocationID_x, shiftX, sc->
fft_dim_x * sc->
stageStartSize, sc->
fftDim * sc->
stageStartSize);
4329 sc->
tempLen = sprintf(sc->
tempStr,
" sdata[%s*(%s+%" PRIu64
")+%s]=%sinputBlocks[%s / %" PRIu64
"]%s[%s %% %" PRIu64
"]%s;\n", sc->
sharedStride, sc->
gl_LocalInvocationID_y, (i + k * sc->
min_registers_per_thread) * sc->
localSize[1], sc->
gl_LocalInvocationID_x, convTypeLeft, sc->
inoutID, sc->
inputBufferBlockSize, inputsStruct, sc->
inoutID, sc->
inputBufferBlockSize, convTypeRight);
4386 char shiftX[500] =
"";
4397 sc->
tempLen = sprintf(sc->
tempStr,
" inoutID = (%s%s) %% (%" PRIu64
") + %" PRIu64
" * (%s + %" PRIu64
") + ((%s%s) / %" PRIu64
") * (%" PRIu64
");\n", sc->
gl_GlobalInvocationID_x, shiftX, sc->
stageStartSize, sc->
stageStartSize, sc->
gl_LocalInvocationID_y, (i + k * sc->
min_registers_per_thread) * sc->
localSize[1], sc->
gl_GlobalInvocationID_x, shiftX, sc->
stageStartSize, sc->
stageStartSize * sc->
fftDim);
4432 sc->
tempLen = sprintf(sc->
tempStr,
" sdata[%s*(%s+%" PRIu64
")+%s]=%sinputBlocks[%s / %" PRIu64
"]%s[%s %% %" PRIu64
"]%s;\n", sc->
sharedStride, sc->
gl_LocalInvocationID_y, (i + k * sc->
min_registers_per_thread) * sc->
localSize[1], sc->
gl_LocalInvocationID_x, convTypeLeft, sc->
inoutID, sc->
inputBufferBlockSize, inputsStruct, sc->
inoutID, sc->
inputBufferBlockSize, convTypeRight);
4489 char shiftX[500] =
"";
4491 sprintf(shiftX,
" + consts.workGroupShiftX ");
4492 char shiftY[500] =
"";
4494 sprintf(shiftY,
" + consts.workGroupShiftY ");
4514 if ((uint64_t)ceil(sc->
size[1] / (
double)mult) % sc->
localSize[0] != 0) {
4521 if ((uint64_t)ceil(sc->
size[1] / (
double)mult) % sc->
localSize[1] != 0) {
4558 sc->
tempLen = sprintf(sc->
tempStr,
" %s.y = %sinputBlocks[(%s + %" PRIu64
")/ %" PRIu64
"]%s[(%s + %" PRIu64
") %% %" PRIu64
"]%s;\n", sc->
regIDs[i + k * sc->
registers_per_thread], convTypeLeft, sc->
inoutID, sc->
inputStride[1], sc->
inputBufferBlockSize, inputsStruct, sc->
inoutID, sc->
inputStride[1], sc->
inputBufferBlockSize, convTypeRight);
4575 sc->
tempLen = sprintf(sc->
tempStr,
" sdata[(combinedID %% %" PRIu64
") * sharedStride + (combinedID / %" PRIu64
")].x = %s%s[%s]%s;\n", sc->
fftDim, sc->
fftDim, convTypeLeft, inputsStruct, sc->
inoutID, convTypeRight);
4577 sc->
tempLen = sprintf(sc->
tempStr,
" sdata[(combinedID %% %" PRIu64
") * sharedStride + (combinedID / %" PRIu64
")].x = %sinputBlocks[%s / %" PRIu64
"]%s[%s %% %" PRIu64
"]%s;\n", sc->
fftDim, sc->
fftDim, convTypeLeft, sc->
inoutID, sc->
inputBufferBlockSize, inputsStruct, sc->
inoutID, sc->
inputBufferBlockSize, convTypeRight);
4586 sc->
tempLen = sprintf(sc->
tempStr,
" sdata[(combinedID %% %" PRIu64
")* sharedStride + (combinedID / %" PRIu64
")].y = %s%s[inoutID]%s;\n", sc->
fftDim, sc->
fftDim, convTypeLeft, inputsStruct, convTypeRight);
4594 sc->
tempLen = sprintf(sc->
tempStr,
" sdata[(combinedID %% %" PRIu64
") * sharedStride+ (combinedID / %" PRIu64
")].y = 0;\n", sc->
fftDim, sc->
fftDim);
4596 sc->
tempLen = sprintf(sc->
tempStr,
" sdata[(combinedID %% %" PRIu64
") * sharedStride + (combinedID / %" PRIu64
")].y = 0;\n", sc->
fftDim, sc->
fftDim);
4603 sc->
tempLen = sprintf(sc->
tempStr,
" sdata[(combinedID %% %" PRIu64
") + (combinedID / %" PRIu64
") * sharedStride].x = %s%s[inoutID]%s;\n", sc->
fftDim, sc->
fftDim, convTypeLeft, inputsStruct, convTypeRight);
4613 sc->
tempLen = sprintf(sc->
tempStr,
" sdata[(combinedID %% %" PRIu64
") + (combinedID / %" PRIu64
") * sharedStride].y = %s%s[inoutID]%s;\n", sc->
fftDim, sc->
fftDim, convTypeLeft, inputsStruct, convTypeRight);
4621 sc->
tempLen = sprintf(sc->
tempStr,
" sdata[(combinedID %% %" PRIu64
") + (combinedID / %" PRIu64
") * sharedStride].y = 0;\n", sc->
fftDim, sc->
fftDim);
4623 sc->
tempLen = sprintf(sc->
tempStr,
" sdata[(combinedID %% %" PRIu64
") + (combinedID / %" PRIu64
") * sharedStride].y = 0;\n", sc->
fftDim, sc->
fftDim);
4643 sc->
tempLen = sprintf(sc->
tempStr,
" sdata[(combinedID %% %" PRIu64
") * sharedStride + (combinedID / %" PRIu64
")].x = 0;\n", sc->
fftDim, sc->
fftDim);
4646 sc->
tempLen = sprintf(sc->
tempStr,
" sdata[(combinedID %% %" PRIu64
") * sharedStride + (combinedID / %" PRIu64
")].y = 0;\n", sc->
fftDim, sc->
fftDim);
4651 sc->
tempLen = sprintf(sc->
tempStr,
" sdata[(combinedID %% %" PRIu64
") + (combinedID / %" PRIu64
") * sharedStride].x = 0;\n", sc->
fftDim, sc->
fftDim);
4654 sc->
tempLen = sprintf(sc->
tempStr,
" sdata[(combinedID %% %" PRIu64
") + (combinedID / %" PRIu64
") * sharedStride].y = 0;\n", sc->
fftDim, sc->
fftDim);
4676 sc->
tempLen = sprintf(sc->
tempStr,
" sdata[(combinedID %% %" PRIu64
") * sharedStride + (combinedID / %" PRIu64
")].x = 0;\n", sc->
fftDim, sc->
fftDim);
4679 sc->
tempLen = sprintf(sc->
tempStr,
" sdata[(combinedID %% %" PRIu64
") * sharedStride + (combinedID / %" PRIu64
")].y = 0;\n", sc->
fftDim, sc->
fftDim);
4684 sc->
tempLen = sprintf(sc->
tempStr,
" sdata[(combinedID %% %" PRIu64
") + (combinedID / %" PRIu64
") * sharedStride].x = 0;\n", sc->
fftDim, sc->
fftDim);
4687 sc->
tempLen = sprintf(sc->
tempStr,
" sdata[(combinedID %% %" PRIu64
") + (combinedID / %" PRIu64
") * sharedStride].y = 0;\n", sc->
fftDim, sc->
fftDim);
4699 if ((uint64_t)ceil(sc->
size[1] / (
double)mult) % sc->
localSize[0] != 0) {
4706 if ((uint64_t)ceil(sc->
size[1] / (
double)mult) % sc->
localSize[1] != 0) {
4722 char shiftX[500] =
"";
4724 sprintf(shiftX,
" + consts.workGroupShiftX ");
4725 char shiftY[500] =
"";
4728 char shiftY2[100] =
"";
4730 sprintf(shiftY,
" + consts.workGroupShiftY ");
4733 sprintf(sc->
disableThreadsStart,
" if(%s * %" PRIu64
" + (((%s%s) %% %" PRIu64
") * %" PRIu64
" + ((%s%s) / %" PRIu64
") * %" PRIu64
") < %" PRIu64
") {\n", sc->
gl_LocalInvocationID_x, sc->
firstStageStartSize, sc->
gl_WorkGroupID_x, shiftX, sc->
firstStageStartSize / sc->
fftDim, sc->
fftDim, sc->
gl_WorkGroupID_x, shiftX, sc->
firstStageStartSize / sc->
fftDim, sc->
localSize[0] * sc->
firstStageStartSize, sc->
fft_dim_full);
4735 sprintf(sc->
disableThreadsStart,
" if(%s * %" PRIu64
" + (((%s%s) %% %" PRIu64
") * %" PRIu64
" + ((%s%s) / %" PRIu64
") * %" PRIu64
") < %" PRIu64
") {\n", sc->
gl_LocalInvocationID_y, sc->
firstStageStartSize, sc->
gl_WorkGroupID_x, shiftX, sc->
firstStageStartSize / sc->
fftDim, sc->
fftDim, sc->
gl_WorkGroupID_x, shiftX, sc->
firstStageStartSize / sc->
fftDim, sc->
localSize[1] * sc->
firstStageStartSize, sc->
fft_dim_full);
4753 for (uint64_t i = 0; i < num_in; i++) {
4764 sc->
tempLen = sprintf(sc->
tempStr,
" inoutID = (combinedID %% %" PRIu64
") + (combinedID / %" PRIu64
") * %" PRIu64
";\n", sc->
fftDim / 2 + 1, sc->
fftDim / 2 + 1, sc->
inputStride[1]);
4817 sc->
tempLen = sprintf(sc->
tempStr,
" sdata[(combinedID %% %" PRIu64
") + (combinedID / %" PRIu64
") * sharedStride] = %s%s[%s]%s;\n", mult * (sc->
fftDim / 2 + 1), mult * (sc->
fftDim / 2 + 1), convTypeLeft, inputsStruct, sc->
inoutID, convTypeRight);
4819 sc->
tempLen = sprintf(sc->
tempStr,
" sdata[(combinedID %% %" PRIu64
") + (combinedID / %" PRIu64
") * sharedStride] = %sinputBlocks[%s / %" PRIu64
"]%s[%s %% %" PRIu64
"]%s;\n", mult * (sc->
fftDim / 2 + 1), mult * (sc->
fftDim / 2 + 1), convTypeLeft, sc->
inoutID, sc->
inputBufferBlockSize, inputsStruct, sc->
inoutID, sc->
inputBufferBlockSize, convTypeRight);
4825 sc->
tempLen = sprintf(sc->
tempStr,
" sdata[(combinedID %% %" PRIu64
") * sharedStride + (combinedID / %" PRIu64
")] = %s%s[%s]%s;\n", mult * (sc->
fftDim / 2 + 1), mult * (sc->
fftDim / 2 + 1), convTypeLeft, inputsStruct, sc->
inoutID, convTypeRight);
4827 sc->
tempLen = sprintf(sc->
tempStr,
" sdata[(combinedID %% %" PRIu64
") * sharedStride + (combinedID / %" PRIu64
")] = %sinputBlocks[%s / %" PRIu64
"]%s[%s %% %" PRIu64
"]%s;\n", mult * (sc->
fftDim / 2 + 1), mult * (sc->
fftDim / 2 + 1), convTypeLeft, sc->
inoutID, sc->
inputBufferBlockSize, inputsStruct, sc->
inoutID, sc->
inputBufferBlockSize, convTypeRight);
4846 sc->
tempLen = sprintf(sc->
tempStr,
" sdata[(combinedID %% %" PRIu64
") + (combinedID / %" PRIu64
") * sharedStride].x = 0;\n", mult * (sc->
fftDim / 2 + 1), mult * (sc->
fftDim / 2 + 1));
4849 sc->
tempLen = sprintf(sc->
tempStr,
" sdata[(combinedID %% %" PRIu64
") + (combinedID / %" PRIu64
") * sharedStride].y = 0;\n", mult * (sc->
fftDim / 2 + 1), mult * (sc->
fftDim / 2 + 1));
4854 sc->
tempLen = sprintf(sc->
tempStr,
" sdata[(combinedID %% %" PRIu64
") * sharedStride + (combinedID / %" PRIu64
")].x = 0;\n", mult * (sc->
fftDim / 2 + 1), mult * (sc->
fftDim / 2 + 1));
4857 sc->
tempLen = sprintf(sc->
tempStr,
" sdata[(combinedID %% %" PRIu64
") * sharedStride + (combinedID / %" PRIu64
")].y = 0;\n", mult * (sc->
fftDim / 2 + 1), mult * (sc->
fftDim / 2 + 1));
4910 if (i >= (uint64_t)ceil((sc->
fftDim / 2 + 1) / (
double)sc->
localSize[1])) {
4982 if (i >= (uint64_t)ceil((sc->
fftDim / 2 + 1) / (
double)sc->
localSize[0])) {
5056 if (i >= (uint64_t)ceil((sc->
fftDim / 2 + 1) / (
double)sc->
localSize[1])) {
5128 if (i >= (uint64_t)ceil((sc->
fftDim / 2 + 1) / (
double)sc->
localSize[0])) {
5207 char shiftX[500] =
"";
5209 sprintf(shiftX,
" + consts.workGroupShiftX ");
5210 char shiftY[500] =
"";
5212 sprintf(shiftY,
" + consts.workGroupShiftY ");
5225 for (uint64_t i = 0; i < num_in; i++) {
5240 if ((uint64_t)ceil(sc->
size[1] / (
double)mult) % sc->
localSize[0] != 0) {
5257 if ((uint64_t)ceil(sc->
size[1] / (
double)mult) % sc->
localSize[1] != 0) {
5289 sc->
tempLen = sprintf(sc->
tempStr,
" sdataID = (combinedID %% %" PRIu64
") * sharedStride + (combinedID / %" PRIu64
");\n", sc->
fftDim, sc->
fftDim);
5294 sc->
tempLen = sprintf(sc->
tempStr,
" sdata[sdataID].x = %s%s[%s]%s;\n", convTypeLeft, inputsStruct, sc->
inoutID, convTypeRight);
5306 sc->
tempLen = sprintf(sc->
tempStr,
" sdata[sdataID].y = %s%s[inoutID]%s;\n", convTypeLeft, inputsStruct, convTypeRight);
5320 sc->
tempLen = sprintf(sc->
tempStr,
" if (((combinedID %% %" PRIu64
")>0)&&((combinedID %% %" PRIu64
") < %" PRIu64
")){\n", sc->
fftDim, sc->
fftDim, sc->
fftDim-1);
5323 sc->
tempLen = sprintf(sc->
tempStr,
" inoutID = (%" PRIu64
" - combinedID %% %" PRIu64
") * sharedStride + (combinedID / %" PRIu64
");\n", 2*sc->
fftDim - 2, sc->
fftDim, sc->
fftDim);
5326 sc->
tempLen = sprintf(sc->
tempStr,
" sdata[inoutID] = sdata[sdataID];\n");
5334 sc->
tempLen = sprintf(sc->
tempStr,
" sdataID = (combinedID %% %" PRIu64
") + (combinedID / %" PRIu64
") * sharedStride;\n", sc->
fftDim, sc->
fftDim);
5338 sc->
tempLen = sprintf(sc->
tempStr,
" sdata[sdataID].x = %s%s[inoutID]%s;\n", convTypeLeft, inputsStruct, convTypeRight);
5348 sc->
tempLen = sprintf(sc->
tempStr,
" sdata[sdataID].y = %s%s[inoutID]%s;\n", convTypeLeft, inputsStruct, convTypeRight);
5362 sc->
tempLen = sprintf(sc->
tempStr,
" if (((combinedID %% %" PRIu64
")>0)&&((combinedID %% %" PRIu64
") < %" PRIu64
")){\n", sc->
fftDim, sc->
fftDim, sc->
fftDim - 1);
5365 sc->
tempLen = sprintf(sc->
tempStr,
" inoutID = (%" PRIu64
" - combinedID %% %" PRIu64
") + (combinedID / %" PRIu64
") * sharedStride;\n", 2 * sc->
fftDim - 2, sc->
fftDim, sc->
fftDim);
5368 sc->
tempLen = sprintf(sc->
tempStr,
" sdata[inoutID] = sdata[sdataID];\n");
5388 sc->
tempLen = sprintf(sc->
tempStr,
" sdataID = (combinedID %% %" PRIu64
") * sharedStride + (combinedID / %" PRIu64
");\n", sc->
fftDim, sc->
fftDim);
5393 sc->
tempLen = sprintf(sc->
tempStr,
" sdataID = (combinedID %% %" PRIu64
") + (combinedID / %" PRIu64
") * sharedStride;\n", sc->
fftDim, sc->
fftDim);
5405 sc->
tempLen = sprintf(sc->
tempStr,
" if (((combinedID %% %" PRIu64
")>0)&&((combinedID %% %" PRIu64
") < %" PRIu64
")){\n", sc->
fftDim, sc->
fftDim, sc->
fftDim - 1);
5408 sc->
tempLen = sprintf(sc->
tempStr,
" inoutID = (%" PRIu64
" - combinedID %% %" PRIu64
") * sharedStride + (combinedID / %" PRIu64
");\n", 2 * sc->
fftDim - 2, sc->
fftDim, sc->
fftDim);
5411 sc->
tempLen = sprintf(sc->
tempStr,
" sdata[inoutID] = sdata[sdataID];\n");
5419 sc->
tempLen = sprintf(sc->
tempStr,
" if (((combinedID %% %" PRIu64
")>0)&&((combinedID %% %" PRIu64
") < %" PRIu64
")){\n", sc->
fftDim, sc->
fftDim, sc->
fftDim - 1);
5422 sc->
tempLen = sprintf(sc->
tempStr,
" inoutID = (%" PRIu64
" - combinedID %% %" PRIu64
") + (combinedID / %" PRIu64
") * sharedStride;\n", 2 * sc->
fftDim - 2, sc->
fftDim, sc->
fftDim);
5425 sc->
tempLen = sprintf(sc->
tempStr,
" sdata[inoutID] = sdata[sdataID];\n");
5442 if ((uint64_t)ceil(sc->
size[1] / (
double)mult) % sc->
localSize[0] != 0) {
5454 if ((uint64_t)ceil(sc->
size[1] / (
double)mult) % sc->
localSize[1] != 0) {
5472 char shiftX[500] =
"";
5474 sprintf(shiftX,
" + consts.workGroupShiftX ");
5475 char shiftX2[500] =
"";
5478 char shiftY[500] =
"";
5480 sprintf(shiftY,
" + consts.workGroupShiftY ");
5491 uint64_t num_in = (uint64_t)ceil((sc->
fftDim) / (
double)sc->
localSize[1]);
5493 for (uint64_t i = 0; i < num_in; i++) {
5499 if ((uint64_t)ceil(sc->
size[0] / (
double)mult) % sc->
localSize[0] != 0) {
5521 sc->
tempLen = sprintf(sc->
tempStr,
" //sdataID = (((combinedID %% %" PRIu64
") %% 2) * %" PRIu64
" + (1-2*((combinedID %% %" PRIu64
") %% 2)) * ((combinedID %% %" PRIu64
")/2)) * sharedStride + (%s + ((%s + %" PRIu64
") %% %" PRIu64
") * %" PRIu64
") / %" PRIu64
";\n", sc->
fftDim, sc->
fftDim - 1, sc->
fftDim, sc->
fftDim, sc->
gl_LocalInvocationID_x, sc->
gl_LocalInvocationID_y, (i + k * sc->
min_registers_per_thread) * sc->
localSize[1], mult, sc->
localSize[0], mult);
5539 res =
indexInputVkFFT(sc, uintType, readType, index_x, index_y, requestCoordinate, requestBatch);
5552 sc->
tempLen = sprintf(sc->
tempStr,
" %s.x = %s%s[%s]%s;\n", sc->
regIDs[0], convTypeLeft, inputsStruct, sc->
inoutID, convTypeRight);
5583 sc->
tempLen = sprintf(sc->
tempStr,
" if (((combinedID %% %" PRIu64
")>0)&&((combinedID %% %" PRIu64
") < %" PRIu64
")){\n", sc->
fftDim, sc->
fftDim, sc->
fftDim - 1);
5589 sc->
tempLen = sprintf(sc->
tempStr,
" sdata[inoutID] = sdata[sdataID];\n");
5627 sc->
tempLen = sprintf(sc->
tempStr,
" if (((combinedID %% %" PRIu64
")>0)&&((combinedID %% %" PRIu64
") < %" PRIu64
")){\n", sc->
fftDim, sc->
fftDim, sc->
fftDim - 1);
5633 sc->
tempLen = sprintf(sc->
tempStr,
" sdata[inoutID] = sdata[sdataID];\n");
5653 if ((uint64_t)ceil(sc->
size[0] / (
double)mult) % sc->
localSize[0] != 0) {
5670 char shiftX[500] =
"";
5672 sprintf(shiftX,
" + consts.workGroupShiftX ");
5673 char shiftY[500] =
"";
5675 sprintf(shiftY,
" + consts.workGroupShiftY ");
5701 if ((uint64_t)ceil(sc->
size[1] / (
double)mult) % sc->
localSize[0] != 0) {
5713 if ((uint64_t)ceil(sc->
size[1] / (
double)mult) % sc->
localSize[1] != 0) {
5740 sc->
tempLen = sprintf(sc->
tempStr,
" sdataID = (((combinedID %% %" PRIu64
") %% 2) * %" PRIu64
" + (1-2*((combinedID %% %" PRIu64
") %% 2)) * ((combinedID %% %" PRIu64
")/2)) * sharedStride + (combinedID / %" PRIu64
");\n", sc->
fftDim, sc->
fftDim - 1, sc->
fftDim, sc->
fftDim, sc->
fftDim);
5745 sc->
tempLen = sprintf(sc->
tempStr,
" sdata[sdataID].x = %s%s[%s]%s;\n", convTypeLeft, inputsStruct, sc->
inoutID, convTypeRight);
5757 sc->
tempLen = sprintf(sc->
tempStr,
" sdata[sdataID].y = %s%s[inoutID]%s;\n", convTypeLeft, inputsStruct, convTypeRight);
5773 sc->
tempLen = sprintf(sc->
tempStr,
" sdataID = (((combinedID %% %" PRIu64
") %% 2) * %" PRIu64
" + (1-2*((combinedID %% %" PRIu64
") %% 2)) * ((combinedID %% %" PRIu64
")/2)) + (combinedID / %" PRIu64
") * sharedStride;\n", sc->
fftDim, sc->
fftDim - 1, sc->
fftDim, sc->
fftDim, sc->
fftDim);
5777 sc->
tempLen = sprintf(sc->
tempStr,
" sdata[sdataID].x = %s%s[inoutID]%s;\n", convTypeLeft, inputsStruct, convTypeRight);
5787 sc->
tempLen = sprintf(sc->
tempStr,
" sdata[sdataID].y = %s%s[inoutID]%s;\n", convTypeLeft, inputsStruct, convTypeRight);
5815 sc->
tempLen = sprintf(sc->
tempStr,
" sdataID = (((combinedID %% %" PRIu64
") %% 2) * %" PRIu64
" + (1-2*((combinedID %% %" PRIu64
") %% 2)) * ((combinedID %% %" PRIu64
")/2)) * sharedStride + (combinedID / %" PRIu64
");\n", sc->
fftDim, sc->
fftDim - 1, sc->
fftDim, sc->
fftDim, sc->
fftDim);
5820 sc->
tempLen = sprintf(sc->
tempStr,
" sdataID = (((combinedID %% %" PRIu64
") %% 2) * %" PRIu64
" + (1-2*((combinedID %% %" PRIu64
") %% 2)) * ((combinedID %% %" PRIu64
")/2)) + (combinedID / %" PRIu64
") * sharedStride;\n", sc->
fftDim, sc->
fftDim - 1, sc->
fftDim, sc->
fftDim, sc->
fftDim);
5836 if ((uint64_t)ceil(sc->
size[1] / (
double)mult) % sc->
localSize[0] != 0) {
5843 if ((uint64_t)ceil(sc->
size[1] / (
double)mult) % sc->
localSize[1] != 0) {
5860 char shiftX[500] =
"";
5862 sprintf(shiftX,
" + consts.workGroupShiftX ");
5863 char shiftX2[500] =
"";
5866 char shiftY[500] =
"";
5868 sprintf(shiftY,
" + consts.workGroupShiftY ");
5885 if ((uint64_t)ceil(sc->
size[0] / (
double)mult) % sc->
localSize[0] != 0) {
5902 sc->
tempLen = sprintf(sc->
tempStr,
" sdataID = (((combinedID %% %" PRIu64
") %% 2) * %" PRIu64
" + (1-2*((combinedID %% %" PRIu64
") %% 2)) * ((combinedID %% %" PRIu64
")/2)) * sharedStride + (%s + ((%s + %" PRIu64
") %% %" PRIu64
") * %" PRIu64
") / %" PRIu64
";\n", sc->
fftDim, sc->
fftDim - 1, sc->
fftDim, sc->
fftDim, sc->
gl_LocalInvocationID_x, sc->
gl_LocalInvocationID_y, (i + k * sc->
min_registers_per_thread) * sc->
localSize[1], mult, sc->
localSize[0], mult);
5920 res =
indexInputVkFFT(sc, uintType, readType, index_x, index_y, requestCoordinate, requestBatch);
5933 sc->
tempLen = sprintf(sc->
tempStr,
" %s.x = %s%s[%s]%s;\n", sc->
regIDs[0], convTypeLeft, inputsStruct, sc->
inoutID, convTypeRight);
6005 if ((uint64_t)ceil(sc->
size[0] / (
double)mult) % sc->
localSize[0] != 0) {
6021 char shiftX[500] =
"";
6023 sprintf(shiftX,
" + consts.workGroupShiftX ");
6024 char shiftY[500] =
"";
6026 sprintf(shiftY,
" + consts.workGroupShiftY ");
6038 for (uint64_t i = 0; i < num_in; i++) {
6095 sc->
tempLen = sprintf(sc->
tempStr,
" mult.x = %s(%.17f%s * (combinedID %% %" PRIu64
") );\n", cosDef, double_PI / 2 / sc->
fftDim, LFending, sc->
fftDim / 2 + 1);
6098 sc->
tempLen = sprintf(sc->
tempStr,
" mult.y = %s(%.17f%s * (combinedID %% %" PRIu64
") );\n", sinDef, double_PI / 2 / sc->
fftDim, LFending, sc->
fftDim / 2 + 1);
6108 sc->
tempLen = sprintf(sc->
tempStr,
" %s.x = %s%s[%s]%s;\n", sc->
regIDs[0], convTypeLeft, inputsStruct, sc->
inoutID, convTypeRight);
6120 sc->
tempLen = sprintf(sc->
tempStr,
" %s.y = %s%s[inoutID]%s;\n", sc->
regIDs[0], convTypeLeft, inputsStruct, convTypeRight);
6147 sc->
tempLen = sprintf(sc->
tempStr,
" sdataID = (combinedID %% %" PRIu64
") * sharedStride + (combinedID / %" PRIu64
");\n", sc->
fftDim / 2 + 1, sc->
fftDim / 2 + 1);
6152 sc->
tempLen = sprintf(sc->
tempStr,
" sdataID = (combinedID %% %" PRIu64
") + (combinedID / %" PRIu64
") * sharedStride ;\n", sc->
fftDim / 2 + 1, sc->
fftDim / 2 + 1);
6156 sc->
tempLen = sprintf(sc->
tempStr,
" if (combinedID %% %" PRIu64
" > 0){\n", sc->
fftDim / 2 + 1);
6177 sc->
tempLen = sprintf(sc->
tempStr,
" %s.x = %s%s[%s]%s;\n", sc->
regIDs[1], convTypeLeft, inputsStruct, sc->
inoutID, convTypeRight);
6189 sc->
tempLen = sprintf(sc->
tempStr,
" %s.y = %s%s[inoutID]%s;\n", sc->
regIDs[1], convTypeLeft, inputsStruct, convTypeRight);
6221 sc->
tempLen = sprintf(sc->
tempStr,
" sdataID = (%" PRIu64
" - combinedID %% %" PRIu64
") * sharedStride + (combinedID / %" PRIu64
");\n", sc->
fftDim, sc->
fftDim / 2 + 1, sc->
fftDim / 2 + 1);
6226 sc->
tempLen = sprintf(sc->
tempStr,
" sdataID = (%" PRIu64
" - combinedID %% %" PRIu64
") + (combinedID / %" PRIu64
") * sharedStride;\n", sc->
fftDim, sc->
fftDim / 2 + 1, sc->
fftDim / 2 + 1);
6295 char shiftX[500] =
"";
6297 sprintf(shiftX,
" + consts.workGroupShiftX ");
6298 char shiftX2[500] =
"";
6301 char shiftY[500] =
"";
6303 sprintf(shiftY,
" + consts.workGroupShiftY ");
6305 uint64_t num_in = (uint64_t)ceil((sc->
fftDim / 2 + 1) / (
double)sc->
localSize[1]);
6316 for (uint64_t i = 0; i < num_in; i++) {
6322 if ((uint64_t)ceil(sc->
size[0] / (
double)mult) % sc->
localSize[0] != 0) {
6353 res =
indexInputVkFFT(sc, uintType, readType, index_x, index_y, requestCoordinate, requestBatch);
6367 sc->
tempLen = sprintf(sc->
tempStr,
" %s.x = %s%s[%s]%s;\n", sc->
regIDs[0], convTypeLeft, inputsStruct, sc->
inoutID, convTypeRight);
6398 sc->
tempLen = sprintf(sc->
tempStr,
" mult.x = %s(%.17f%s * (combinedID) );\n", cosDef, double_PI / 2 / sc->
fftDim, LFending);
6401 sc->
tempLen = sprintf(sc->
tempStr,
" mult.y = %s(%.17f%s * (combinedID) );\n", sinDef, double_PI / 2 / sc->
fftDim, LFending);
6432 res =
indexInputVkFFT(sc, uintType, readType, index_x, index_y, requestCoordinate, requestBatch);
6443 sc->
tempLen = sprintf(sc->
tempStr,
" %s.x = %s%s[%s]%s;\n", sc->
regIDs[1], convTypeLeft, inputsStruct, sc->
inoutID, convTypeRight);
6502 if ((uint64_t)ceil(sc->
size[0] / (
double)mult) % sc->
localSize[0] != 0) {
6524 char shiftX[500] =
"";
6526 sprintf(shiftX,
" + consts.workGroupShiftX ");
6527 char shiftY[500] =
"";
6536 char shiftY2[100] =
"";
6538 sprintf(shiftY,
" + consts.workGroupShiftY ");
6544 sprintf(sc->
disableThreadsStart,
" if(%s * %" PRIu64
" + (((%s%s) %% %" PRIu64
") * %" PRIu64
" + ((%s%s) / %" PRIu64
") * %" PRIu64
") < %" PRIu64
") {\n", sc->
gl_LocalInvocationID_x, sc->
firstStageStartSize, sc->
gl_WorkGroupID_x, shiftX, sc->
firstStageStartSize / sc->
fftDim, sc->
fftDim, sc->
gl_WorkGroupID_x, shiftX, sc->
firstStageStartSize / sc->
fftDim, sc->
localSize[0] * sc->
firstStageStartSize, sc->
fft_dim_full);
6551 sprintf(sc->
disableThreadsStart,
" if(%s * %" PRIu64
" + (((%s%s) %% %" PRIu64
") * %" PRIu64
" + ((%s%s) / %" PRIu64
") * %" PRIu64
") < %" PRIu64
") {\n", sc->
gl_LocalInvocationID_y, sc->
firstStageStartSize, sc->
gl_WorkGroupID_x, shiftX, sc->
firstStageStartSize / sc->
fftDim, sc->
fftDim, sc->
gl_WorkGroupID_x, shiftX, sc->
firstStageStartSize / sc->
fftDim, sc->
localSize[1] * sc->
firstStageStartSize, sc->
fft_dim_full);
6621 sc->
tempLen = sprintf(sc->
tempStr,
" %s.x = %s%s[%s]%s;\n", sc->
regIDs[0], convTypeLeft, inputsStruct, sc->
inoutID, convTypeRight);
6630 sc->
tempLen = sprintf(sc->
tempStr,
" sdata[2*(combinedID %% %" PRIu64
") * sharedStride + (combinedID / %" PRIu64
")] = %s;\n", sc->
fftDim / 8, sc->
fftDim / 8, sc->
regIDs[1]);
6633 sc->
tempLen = sprintf(sc->
tempStr,
" sdata[(2*(combinedID %% %" PRIu64
")+1) * sharedStride + (combinedID / %" PRIu64
")] = %s;\n", sc->
fftDim / 8, sc->
fftDim / 8, sc->
regIDs[0]);
6636 sc->
tempLen = sprintf(sc->
tempStr,
" sdata[(%" PRIu64
" - 2*(combinedID %% %" PRIu64
")) * sharedStride + (combinedID / %" PRIu64
")] = %s;\n", sc->
fftDim - 2, sc->
fftDim / 8, sc->
fftDim / 8, sc->
regIDs[1]);
6639 sc->
tempLen = sprintf(sc->
tempStr,
" sdata[(%" PRIu64
" - 2*(combinedID %% %" PRIu64
")) * sharedStride + (combinedID / %" PRIu64
")] = %s;\n", sc->
fftDim - 1, sc->
fftDim / 8, sc->
fftDim / 8, sc->
regIDs[0]);
6645 sc->
tempLen = sprintf(sc->
tempStr,
" sdata[(%" PRIu64
" - 2*(combinedID %% %" PRIu64
")) * sharedStride + (combinedID / %" PRIu64
")] = %s;\n", sc->
fftDim / 2 - 2, sc->
fftDim / 8, sc->
fftDim / 8, sc->
regIDs[1]);
6648 sc->
tempLen = sprintf(sc->
tempStr,
" sdata[(%" PRIu64
" - 2*(combinedID %% %" PRIu64
")) * sharedStride + (combinedID / %" PRIu64
")] = %s;\n", sc->
fftDim / 2 - 1, sc->
fftDim / 8, sc->
fftDim / 8, sc->
regIDs[0]);
6651 sc->
tempLen = sprintf(sc->
tempStr,
" sdata[(%" PRIu64
" + 2*(combinedID %% %" PRIu64
")) * sharedStride + (combinedID / %" PRIu64
")] = %s;\n", sc->
fftDim / 2, sc->
fftDim / 8, sc->
fftDim / 8, sc->
regIDs[1]);
6654 sc->
tempLen = sprintf(sc->
tempStr,
" sdata[(%" PRIu64
" + 2*(combinedID %% %" PRIu64
")) * sharedStride + (combinedID / %" PRIu64
")] = %s;\n", sc->
fftDim / 2 + 1, sc->
fftDim / 8, sc->
fftDim / 8, sc->
regIDs[0]);
6659 sc->
tempLen = sprintf(sc->
tempStr,
" sdata[2*(combinedID %% %" PRIu64
") + (combinedID / %" PRIu64
") * sharedStride] = %s;\n", sc->
fftDim / 8, sc->
fftDim / 8, sc->
regIDs[1]);
6662 sc->
tempLen = sprintf(sc->
tempStr,
" sdata[(2*(combinedID %% %" PRIu64
")+1) + (combinedID / %" PRIu64
") * sharedStride] = %s;\n", sc->
fftDim / 8, sc->
fftDim / 8, sc->
regIDs[0]);
6665 sc->
tempLen = sprintf(sc->
tempStr,
" sdata[(%" PRIu64
" - 2*(combinedID %% %" PRIu64
")) + (combinedID / %" PRIu64
") * sharedStride] = %s;\n", sc->
fftDim - 2, sc->
fftDim / 8, sc->
fftDim / 8, sc->
regIDs[1]);
6668 sc->
tempLen = sprintf(sc->
tempStr,
" sdata[(%" PRIu64
" - 2*(combinedID %% %" PRIu64
")) + (combinedID / %" PRIu64
") * sharedStride] = %s;\n", sc->
fftDim - 1, sc->
fftDim / 8, sc->
fftDim / 8, sc->
regIDs[0]);
6674 sc->
tempLen = sprintf(sc->
tempStr,
" sdata[(%" PRIu64
" - 2*(combinedID %% %" PRIu64
")) + (combinedID / %" PRIu64
") * sharedStride] = %s;\n", sc->
fftDim / 2 - 2, sc->
fftDim / 8, sc->
fftDim / 8, sc->
regIDs[1]);
6677 sc->
tempLen = sprintf(sc->
tempStr,
" sdata[(%" PRIu64
" - 2*(combinedID %% %" PRIu64
")) + (combinedID / %" PRIu64
") * sharedStride] = %s;\n", sc->
fftDim / 2 - 1, sc->
fftDim / 8, sc->
fftDim / 8, sc->
regIDs[0]);
6680 sc->
tempLen = sprintf(sc->
tempStr,
" sdata[(%" PRIu64
" + 2*(combinedID %% %" PRIu64
")) + (combinedID / %" PRIu64
") * sharedStride] = %s;\n", sc->
fftDim / 2, sc->
fftDim / 8, sc->
fftDim / 8, sc->
regIDs[1]);
6683 sc->
tempLen = sprintf(sc->
tempStr,
" sdata[(%" PRIu64
" + 2*(combinedID %% %" PRIu64
")) + (combinedID / %" PRIu64
") * sharedStride] = %s;\n", sc->
fftDim / 2 + 1, sc->
fftDim / 8, sc->
fftDim / 8, sc->
regIDs[0]);
6700 sc->
tempLen = sprintf(sc->
tempStr,
" sdata[(combinedID %% %" PRIu64
") * sharedStride + (combinedID / %" PRIu64
")].x = 0;\n", sc->
fftDim, sc->
fftDim);
6703 sc->
tempLen = sprintf(sc->
tempStr,
" sdata[(combinedID %% %" PRIu64
") * sharedStride + (combinedID / %" PRIu64
")].y = 0;\n", sc->
fftDim, sc->
fftDim);
6708 sc->
tempLen = sprintf(sc->
tempStr,
" sdata[(combinedID %% %" PRIu64
") + (combinedID / %" PRIu64
") * sharedStride].x = 0;\n", sc->
fftDim, sc->
fftDim);
6711 sc->
tempLen = sprintf(sc->
tempStr,
" sdata[(combinedID %% %" PRIu64
") + (combinedID / %" PRIu64
") * sharedStride].y = 0;\n", sc->
fftDim, sc->
fftDim);
6845 char shiftX[500] =
"";
6849 sprintf(sc->
disableThreadsStart,
" if (((%s%s) / %" PRIu64
") %% (%" PRIu64
")+((%s%s) / %" PRIu64
") * (%" PRIu64
") < %" PRIu64
") {\n", sc->
gl_GlobalInvocationID_x, shiftX, sc->
fft_dim_x, sc->
stageStartSize, sc->
gl_GlobalInvocationID_x, shiftX, sc->
fft_dim_x * sc->
stageStartSize, sc->
fftDim * sc->
stageStartSize, sc->
size[sc->
axis_id]);
6873 sc->
tempLen = sprintf(sc->
tempStr,
" inoutID = (%" PRIu64
" * (%s + %" PRIu64
") + ((%s%s) / %" PRIu64
") %% (%" PRIu64
")+((%s%s) / %" PRIu64
") * (%" PRIu64
"));\n", sc->
stageStartSize, sc->
gl_LocalInvocationID_y, (i + k * sc->
min_registers_per_thread) * sc->
localSize[1], sc->
gl_GlobalInvocationID_x, shiftX, sc->
fft_dim_x, sc->
stageStartSize, sc->
gl_GlobalInvocationID_x, shiftX, sc->
fft_dim_x * sc->
stageStartSize, sc->
fftDim * sc->
stageStartSize);
6896 sc->
tempLen = sprintf(sc->
tempStr,
" %s.x = %s%s[%s]%s;\n", sc->
regIDs[0], convTypeLeft, inputsStruct, sc->
inoutID, convTypeRight);
6967 char shiftX[500] =
"";
6969 sprintf(shiftX,
" + consts.workGroupShiftX ");
6970 char shiftY[500] =
"";
6972 sprintf(shiftY,
" + consts.workGroupShiftY ");
6981 uint64_t maxBluesteinCutOff = 1;
7019 sc->
tempLen = sprintf(sc->
tempStr,
" if(combinedID < %" PRIu64
"){\n", maxBluesteinCutOff);
7038#if(VKFFT_BACKEND!=3)
7040 sc->
tempLen = sprintf(sc->
tempStr,
" %s.x = %s%s[%s]%s;\n", sc->
regIDs[0], convTypeLeft, inputsStruct, sc->
inoutID, convTypeRight);
7046 if (i < sc->min_registers_per_thread) {
7063#if(VKFFT_BACKEND!=3)
7066 sc->
tempLen = sprintf(sc->
tempStr,
" sdataID = ((combinedID %% %" PRIu64
")/2) * sharedStride + (combinedID / %" PRIu64
");\n", 2 * sc->
fftDim, 2 * sc->
fftDim);
7070 sc->
tempLen = sprintf(sc->
tempStr,
" sdataID = ((combinedID %% %" PRIu64
")/2) + (combinedID / %" PRIu64
") * sharedStride;\n", 2 * sc->
fftDim, 2 * sc->
fftDim);
7075 sc->
tempLen = sprintf(sc->
tempStr,
" if (((combinedID %% %" PRIu64
")%%2) == 0) {\n", 2 * sc->
fftDim);
7103 sc->
tempLen = sprintf(sc->
tempStr,
" sdataID = ((combinedID %% %" PRIu64
")/2) * sharedStride + (combinedID / %" PRIu64
");\n", 2 * sc->
fftDim, 2 * sc->
fftDim);
7107 sc->
tempLen = sprintf(sc->
tempStr,
" sdataID = ((combinedID %% %" PRIu64
")/2) + (combinedID / %" PRIu64
") * sharedStride;\n", 2 * sc->
fftDim, 2 * sc->
fftDim);
7111 sc->
tempLen = sprintf(sc->
tempStr,
" if (((combinedID %% %" PRIu64
")%%2) == 0) {\n", 2 * sc->
fftDim);
7120 sc->
tempLen = sprintf(sc->
tempStr,
" if (((combinedID %% %" PRIu64
")%%2) == 1) {\n", 2 * sc->
fftDim);
7154#if(VKFFT_BACKEND==3)
7186 sc->
tempLen = sprintf(sc->
tempStr,
" if(combinedID < %" PRIu64
"){\n", maxBluesteinCutOff);
7207 sc->
tempLen = sprintf(sc->
tempStr,
" sdataID = ((combinedID %% %" PRIu64
")/2) * sharedStride + (combinedID / %" PRIu64
");\n", 2 * sc->
fftDim, 2 * sc->
fftDim);
7211 sc->
tempLen = sprintf(sc->
tempStr,
" sdataID = ((combinedID %% %" PRIu64
")/2) + (combinedID / %" PRIu64
") * sharedStride;\n", 2 * sc->
fftDim, 2 * sc->
fftDim);
7215 if (i < sc->min_registers_per_thread) {
7216 sc->
tempLen = sprintf(sc->
tempStr,
" if (((combinedID %% %" PRIu64
")%%2) == 0) {\n", 2 * sc->
fftDim);
7227 sc->
tempLen = sprintf(sc->
tempStr,
" if (((combinedID %% %" PRIu64
")%%2) == 0) {\n", 2 * sc->
fftDim);
7246 sc->
tempLen = sprintf(sc->
tempStr,
" sdataID = ((combinedID %% %" PRIu64
")/2) * sharedStride + (combinedID / %" PRIu64
");\n", 2 * sc->
fftDim, 2 * sc->
fftDim);
7250 sc->
tempLen = sprintf(sc->
tempStr,
" sdataID = ((combinedID %% %" PRIu64
")/2) + (combinedID / %" PRIu64
") * sharedStride;\n", 2 * sc->
fftDim, 2 * sc->
fftDim);
7254 sc->
tempLen = sprintf(sc->
tempStr,
" if (((combinedID %% %" PRIu64
")%%2) == 0) {\n", 2 * sc->
fftDim);
7263 sc->
tempLen = sprintf(sc->
tempStr,
" if (((combinedID %% %" PRIu64
")%%2) == 1) {\n", 2 * sc->
fftDim);
7330 sc->
tempLen = sprintf(sc->
tempStr,
" if(combinedID < %" PRIu64
"){\n", maxBluesteinCutOff);
7351 sc->
tempLen = sprintf(sc->
tempStr,
" sdataID = ((combinedID %% %" PRIu64
")/2) * sharedStride + (combinedID / %" PRIu64
");\n", 2 * sc->
fftDim, 2 * sc->
fftDim);
7355 sc->
tempLen = sprintf(sc->
tempStr,
" sdataID = ((combinedID %% %" PRIu64
")/2) + (combinedID / %" PRIu64
") * sharedStride;\n", 2 * sc->
fftDim, 2 * sc->
fftDim);
7359 if (i < sc->min_registers_per_thread) {
7360 sc->
tempLen = sprintf(sc->
tempStr,
" if (((combinedID %% %" PRIu64
")%%2) == 1) {\n", 2 * sc->
fftDim);
7371 sc->
tempLen = sprintf(sc->
tempStr,
" if (((combinedID %% %" PRIu64
")%%2) == 1) {\n", 2 * sc->
fftDim);
7390 sc->
tempLen = sprintf(sc->
tempStr,
" sdataID = ((combinedID %% %" PRIu64
")/2) * sharedStride + (combinedID / %" PRIu64
");\n", 2 * sc->
fftDim, 2 * sc->
fftDim);
7394 sc->
tempLen = sprintf(sc->
tempStr,
" sdataID = ((combinedID %% %" PRIu64
")/2) + (combinedID / %" PRIu64
") * sharedStride;\n", 2 * sc->
fftDim, 2 * sc->
fftDim);
7398 sc->
tempLen = sprintf(sc->
tempStr,
" if (((combinedID %% %" PRIu64
")%%2) == 0) {\n", 2 * sc->
fftDim);
7407 sc->
tempLen = sprintf(sc->
tempStr,
" if (((combinedID %% %" PRIu64
")%%2) == 1) {\n", 2 * sc->
fftDim);
7463 sc->
tempLen = sprintf(sc->
tempStr,
" if(combinedID < %" PRIu64
"){\n", maxBluesteinCutOff);
7468 sc->
tempLen = sprintf(sc->
tempStr,
" sdataID = (combinedID %% %" PRIu64
") * sharedStride + (combinedID / %" PRIu64
");\n", sc->
fftDim, sc->
fftDim);
7471 sc->
tempLen = sprintf(sc->
tempStr,
" sdataID = (combinedID %% %" PRIu64
") + (combinedID / %" PRIu64
") * sharedStride;\n", sc->
fftDim, sc->
fftDim);
7479 sc->
tempLen = sprintf(sc->
tempStr,
" %s.x = sdata[sdataID-sharedStride].y;\n", sc->
w);
7482 sc->
tempLen = sprintf(sc->
tempStr,
" %s.x = sdata[sdataID-1].y;\n", sc->
w);
7486 sc->
tempLen = sprintf(sc->
tempStr,
" %s.y = sdata[sdataID].x;\n", sc->
w);
7502 sc->
tempLen = sprintf(sc->
tempStr,
" sdataID = (%" PRIu64
") * sharedStride + (combinedID / %" PRIu64
");\n", sc->
fftDim - 1, sc->
fftDim);
7505 sc->
tempLen = sprintf(sc->
tempStr,
" sdataID = (%" PRIu64
") + (combinedID / %" PRIu64
") * sharedStride;\n", sc->
fftDim - 1, sc->
fftDim);
7542 sc->
tempLen = sprintf(sc->
tempStr,
" if(combinedID < %" PRIu64
"){\n", maxBluesteinCutOff);
7547 sc->
tempLen = sprintf(sc->
tempStr,
" sdataID = (combinedID %% %" PRIu64
") * sharedStride + (combinedID / %" PRIu64
");\n", sc->
fftDim, sc->
fftDim);
7550 sc->
tempLen = sprintf(sc->
tempStr,
" sdataID = (combinedID %% %" PRIu64
") + (combinedID / %" PRIu64
") * sharedStride;\n", sc->
fftDim, sc->
fftDim);
7560#if(VKFFT_BACKEND!=3)
7562 sc->
tempLen = sprintf(sc->
tempStr,
" sdataID = (%" PRIu64
" - combinedID %% %" PRIu64
") * sharedStride + (combinedID / %" PRIu64
");\n", sc->
fftDim, sc->
fftDim, sc->
fftDim);
7565 sc->
tempLen = sprintf(sc->
tempStr,
" sdataID = (%" PRIu64
" - combinedID %% %" PRIu64
") + (combinedID / %" PRIu64
") * sharedStride;\n", sc->
fftDim, sc->
fftDim, sc->
fftDim);
7596#if(VKFFT_BACKEND==3)
7609 sc->
tempLen = sprintf(sc->
tempStr,
" if(combinedID < %" PRIu64
"){\n", maxBluesteinCutOff);
7617 sc->
tempLen = sprintf(sc->
tempStr,
" sdataID = (%" PRIu64
" - combinedID %% %" PRIu64
") * sharedStride + (combinedID / %" PRIu64
");\n", sc->
fftDim, sc->
fftDim, sc->
fftDim);
7620 sc->
tempLen = sprintf(sc->
tempStr,
" sdataID = (%" PRIu64
" - combinedID %% %" PRIu64
") + (combinedID / %" PRIu64
") * sharedStride;\n", sc->
fftDim, sc->
fftDim, sc->
fftDim);
7650 for (uint64_t i = 0; i < num_in; i++) {
7678 sc->
tempLen = sprintf(sc->
tempStr,
" mult.x = %s(%.17f%s * (combinedID %% %" PRIu64
") );\n", cosDef, double_PI / 2 / sc->
fftDim, LFending, sc->
fftDim / 2 + 1);
7681 sc->
tempLen = sprintf(sc->
tempStr,
" mult.y = %s(%.17f%s * (combinedID %% %" PRIu64
") );\n", sinDef, double_PI / 2 / sc->
fftDim, LFending, sc->
fftDim / 2 + 1);
7687 sc->
tempLen = sprintf(sc->
tempStr,
" sdataID = (combinedID %% %" PRIu64
") * sharedStride + (combinedID / %" PRIu64
");\n", sc->
fftDim / 2 + 1, sc->
fftDim / 2 + 1);
7692 sc->
tempLen = sprintf(sc->
tempStr,
" sdataID = (combinedID %% %" PRIu64
") + (combinedID / %" PRIu64
") * sharedStride ;\n", sc->
fftDim / 2 + 1, sc->
fftDim / 2 + 1);
7700 sc->
tempLen = sprintf(sc->
tempStr,
" if (combinedID %% %" PRIu64
" > 0){\n", sc->
fftDim / 2 + 1);
7705 sc->
tempLen = sprintf(sc->
tempStr,
" inoutID = (%" PRIu64
" - combinedID %% %" PRIu64
") * sharedStride + (combinedID / %" PRIu64
");\n", sc->
fftDim, sc->
fftDim / 2 + 1, sc->
fftDim / 2 + 1);
7710 sc->
tempLen = sprintf(sc->
tempStr,
" inoutID = (%" PRIu64
" - combinedID %% %" PRIu64
") + (combinedID / %" PRIu64
") * sharedStride ;\n", sc->
fftDim, sc->
fftDim / 2 + 1, sc->
fftDim / 2 + 1);
7736 sc->
tempLen = sprintf(sc->
tempStr,
" if (combinedID %% %" PRIu64
" == 0){\n", sc->
fftDim / 2 + 1);
7775 char shiftX[500] =
"";
7777 sprintf(shiftX,
" + consts.workGroupShiftX ");
7778 char shiftX2[500] =
"";
7781 char shiftY[500] =
"";
7783 sprintf(shiftY,
" + consts.workGroupShiftY ");
7816 res =
indexInputVkFFT(sc, uintType, readType, index_x, index_y, requestCoordinate, requestBatch);
7829#if(VKFFT_BACKEND!=3)
7831 sc->
tempLen = sprintf(sc->
tempStr,
" %s.x = %s%s[%s]%s;\n", sc->
regIDs[0], convTypeLeft, inputsStruct, sc->
inoutID, convTypeRight);
7837 if (i < sc->min_registers_per_thread) {
7854#if(VKFFT_BACKEND!=3)
7855 sc->
tempLen = sprintf(sc->
tempStr,
" sdataID = ((combinedID / %" PRIu64
")/2) * sharedStride + (combinedID %% %" PRIu64
");\n", sc->
localSize[0], sc->
localSize[0]);
7882 sc->
tempLen = sprintf(sc->
tempStr,
" sdataID = ((combinedID / %" PRIu64
")/2) * sharedStride + (combinedID %% %" PRIu64
");\n", sc->
localSize[0], sc->
localSize[0]);
7916#if(VKFFT_BACKEND==3)
7941 res =
indexInputVkFFT(sc, uintType, readType, index_x, index_y, requestCoordinate, requestBatch);
7954 sc->
tempLen = sprintf(sc->
tempStr,
" sdataID = ((combinedID / %" PRIu64
")/2) * sharedStride + (combinedID %% %" PRIu64
");\n", sc->
localSize[0], sc->
localSize[0]);
7958 if (i < sc->min_registers_per_thread) {
7987 sc->
tempLen = sprintf(sc->
tempStr,
" sdataID = ((combinedID / %" PRIu64
")/2) * sharedStride + (combinedID %% %" PRIu64
");\n", sc->
localSize[0], sc->
localSize[0]);
8047 res =
indexInputVkFFT(sc, uintType, readType, index_x, index_y, requestCoordinate, requestBatch);
8060 sc->
tempLen = sprintf(sc->
tempStr,
" sdataID = ((combinedID / %" PRIu64
")/2) * sharedStride + (combinedID %% %" PRIu64
");\n", sc->
localSize[0], sc->
localSize[0]);
8064 if (i < sc->min_registers_per_thread) {
8093 sc->
tempLen = sprintf(sc->
tempStr,
" sdataID = ((combinedID / %" PRIu64
")/2) * sharedStride + (combinedID %% %" PRIu64
");\n", sc->
localSize[0], sc->
localSize[0]);
8146 sc->
tempLen = sprintf(sc->
tempStr,
" sdataID = (combinedID / %" PRIu64
") * sharedStride + (combinedID %% %" PRIu64
");\n", sc->
localSize[0], sc->
localSize[0]);
8153 sc->
tempLen = sprintf(sc->
tempStr,
" %s.x = sdata[sdataID-sharedStride].y;\n", sc->
w);
8157 sc->
tempLen = sprintf(sc->
tempStr,
" %s.y = sdata[sdataID].x;\n", sc->
w);
8172 sc->
tempLen = sprintf(sc->
tempStr,
" sdataID = (%" PRIu64
") * sharedStride + (combinedID %% %" PRIu64
");\n", sc->
fftDim - 1, sc->
localSize[0]);
8212 sc->
tempLen = sprintf(sc->
tempStr,
" sdataID = (combinedID / %" PRIu64
") * sharedStride + (combinedID %% %" PRIu64
");\n", sc->
localSize[0], sc->
localSize[0]);
8222#if(VKFFT_BACKEND!=3)
8223 sc->
tempLen = sprintf(sc->
tempStr,
" sdataID = (%" PRIu64
" - combinedID / %" PRIu64
") * sharedStride + (combinedID %% %" PRIu64
");\n", sc->
fftDim, sc->
localSize[0], sc->
localSize[0]);
8251#if(VKFFT_BACKEND==3)
8271 sc->
tempLen = sprintf(sc->
tempStr,
" sdataID = (%" PRIu64
" - combinedID / %" PRIu64
") * sharedStride + (combinedID %% %" PRIu64
");\n", sc->
fftDim, sc->
localSize[0], sc->
localSize[0]);
/* Host-side trip count for the loop that follows:
 * ceil((fftDim / 2 + 1) / localSize[1]).
 * The fftDim / 2 + 1 term is evaluated before the cast, so only the final
 * division by localSize[1] is carried out in double. */
8295 uint64_t num_in = (uint64_t)ceil((sc->
fftDim / 2 + 1) / (
double)sc->
localSize[1]);
8298 for (uint64_t i = 0; i < num_in; i++) {
8318 sc->
tempLen = sprintf(sc->
tempStr,
" mult.x = %s(%.17f%s * (combinedID / %" PRIu64
") );\n", cosDef, double_PI / 2 / sc->
fftDim, LFending, sc->
localSize[0]);
8321 sc->
tempLen = sprintf(sc->
tempStr,
" mult.y = %s(%.17f%s * (combinedID / %" PRIu64
") );\n", sinDef, double_PI / 2 / sc->
fftDim, LFending, sc->
localSize[0]);
8326 sc->
tempLen = sprintf(sc->
tempStr,
" sdataID = (combinedID / %" PRIu64
") * sharedStride + (combinedID %% %" PRIu64
");\n", sc->
localSize[0], sc->
localSize[0]);
8338 sc->
tempLen = sprintf(sc->
tempStr,
" inoutID = (%" PRIu64
" - combinedID / %" PRIu64
") * sharedStride + (combinedID %% %" PRIu64
");\n", sc->
fftDim, sc->
localSize[0], sc->
localSize[0]);
8391 char shiftX[500] =
"";
8393 sprintf(shiftX,
" + consts.workGroupShiftX ");
8394 char shiftY[500] =
"";
8396 sprintf(shiftY,
" + consts.workGroupShiftY ");
8422 if ((uint64_t)ceil(sc->
size[1] / (
double)mult) % sc->
localSize[0] != 0) {
8434 if ((uint64_t)ceil(sc->
size[1] / (
double)mult) % sc->
localSize[1] != 0) {
8461 sc->
tempLen = sprintf(sc->
tempStr,
" sdataID = (combinedID %% %" PRIu64
") * sharedStride + (combinedID / %" PRIu64
");\n", sc->
fftDim, sc->
fftDim);
8466 sc->
tempLen = sprintf(sc->
tempStr,
" sdata[sdataID].x = %s%s[%s]%s;\n", convTypeLeft, inputsStruct, sc->
inoutID, convTypeRight);
8478 sc->
tempLen = sprintf(sc->
tempStr,
" sdata[sdataID].y = %s%s[inoutID]%s;\n", convTypeLeft, inputsStruct, convTypeRight);
8494 sc->
tempLen = sprintf(sc->
tempStr,
" sdataID = (combinedID %% %" PRIu64
") + (combinedID / %" PRIu64
") * sharedStride ;\n", sc->
fftDim, sc->
fftDim);
8498 sc->
tempLen = sprintf(sc->
tempStr,
" sdata[sdataID].x = %s%s[inoutID]%s;\n", convTypeLeft, inputsStruct, convTypeRight);
8508 sc->
tempLen = sprintf(sc->
tempStr,
" sdata[sdataID].y = %s%s[inoutID]%s;\n", convTypeLeft, inputsStruct, convTypeRight);
8536 sc->
tempLen = sprintf(sc->
tempStr,
" sdataID = (combinedID %% %" PRIu64
") * sharedStride + (combinedID / %" PRIu64
");\n", sc->
fftDim, sc->
fftDim);
8541 sc->
tempLen = sprintf(sc->
tempStr,
" sdataID = (combinedID %% %" PRIu64
") + (combinedID / %" PRIu64
") * sharedStride ;\n", sc->
fftDim, sc->
fftDim);
8557 if ((uint64_t)ceil(sc->
size[1] / (
double)mult) % sc->
localSize[0] != 0) {
8564 if ((uint64_t)ceil(sc->
size[1] / (
double)mult) % sc->
localSize[1] != 0) {
/* Formats the generated-code lines that derive sdataID from inoutID.
 * With F = fftDim (substituted as a literal at generation time):
 *   inoutID = F/2 + 4 * (combinedID % F)      (F/2 is computed on the host)
 *   inoutID <  F            -> sdataID = inoutID
 *   F   <= inoutID < 2F     -> sdataID = (2F - 1) - inoutID
 *   2F  <= inoutID < 3F     -> sdataID = inoutID - 2F
 *   3F  <= inoutID < 4F     -> sdataID = (4F - 1) - inoutID
 *   inoutID >= 4F           -> sdataID = inoutID - 4F
 * Each sprintf overwrites sc->tempStr and stores the length in sc->tempLen;
 * NOTE(review): the code that consumes tempStr between these calls is not
 * visible in this excerpt - confirm against the full file. */
8596 sc->
tempLen = sprintf(sc->
tempStr,
" inoutID = %" PRIu64
" + 4 * (combinedID %% %" PRIu64
");\n", sc->
fftDim / 2, sc->
fftDim);
/* Range [0, F): index used unchanged. */
8600 sc->
tempLen = sprintf(sc->
tempStr,
" if (inoutID < %" PRIu64
") sdataID = inoutID;\n", sc->
fftDim);
/* Range [F, 2F): index reflected, (2F - 1) - inoutID. */
8603 sc->
tempLen = sprintf(sc->
tempStr,
" if ((inoutID < %" PRIu64
")&&(inoutID >= %" PRIu64
")) sdataID = %" PRIu64
" - inoutID;\n", 2 * sc->
fftDim, sc->
fftDim, 2 * sc->
fftDim - 1);
/* Range [2F, 3F): index shifted down by 2F. */
8606 sc->
tempLen = sprintf(sc->
tempStr,
" if ((inoutID < %" PRIu64
")&&(inoutID >= %" PRIu64
")) sdataID = inoutID - %" PRIu64
";\n", 3 * sc->
fftDim, 2 * sc->
fftDim, 2 * sc->
fftDim);
/* Range [3F, 4F): index reflected, (4F - 1) - inoutID. */
8609 sc->
tempLen = sprintf(sc->
tempStr,
" if ((inoutID < %" PRIu64
")&&(inoutID >= %" PRIu64
")) sdataID = %" PRIu64
" - inoutID;\n", 4 * sc->
fftDim, 3 * sc->
fftDim, 4 * sc->
fftDim - 1);
/* Range [4F, ...): index shifted down by 4F. */
8612 sc->
tempLen = sprintf(sc->
tempStr,
" if (inoutID >= %" PRIu64
") sdataID = inoutID - %" PRIu64
";\n", 4 * sc->
fftDim, 4 * sc->
fftDim);
8621 sc->
tempLen = sprintf(sc->
tempStr,
" if ((inoutID < %" PRIu64
")&&(inoutID >= %" PRIu64
")){ \n\
8626 sc->
tempLen = sprintf(sc->
tempStr,
" if ((inoutID < %" PRIu64
")&&(inoutID >= %" PRIu64
")){ \n\
/* Formats the generated-code lines that derive sdataID from inoutID.
 * With F = fftDim (substituted as a literal at generation time):
 *   inoutID = F/2 + 4 * combinedID            (F/2 is computed on the host)
 *   inoutID <  F            -> sdataID = inoutID
 *   F   <= inoutID < 2F     -> sdataID = (2F - 1) - inoutID
 *   2F  <= inoutID < 3F     -> sdataID = inoutID - 2F
 *   3F  <= inoutID < 4F     -> sdataID = (4F - 1) - inoutID
 *   inoutID >= 4F           -> sdataID = inoutID - 4F
 * Each sprintf overwrites sc->tempStr and stores the length in sc->tempLen;
 * NOTE(review): the code that consumes tempStr between these calls is not
 * visible in this excerpt - confirm against the full file. */
8655 sc->
tempLen = sprintf(sc->
tempStr,
" inoutID = %" PRIu64
" + 4 * combinedID;\n", sc->
fftDim / 2);
/* Range [0, F): index used unchanged. */
8659 sc->
tempLen = sprintf(sc->
tempStr,
" if (inoutID < %" PRIu64
") sdataID = inoutID;\n", sc->
fftDim);
/* Range [F, 2F): index reflected, (2F - 1) - inoutID. */
8662 sc->
tempLen = sprintf(sc->
tempStr,
" if ((inoutID < %" PRIu64
")&&(inoutID >= %" PRIu64
")) sdataID = %" PRIu64
" - inoutID;\n", 2 * sc->
fftDim, sc->
fftDim, 2 * sc->
fftDim - 1);
/* Range [2F, 3F): index shifted down by 2F. */
8665 sc->
tempLen = sprintf(sc->
tempStr,
" if ((inoutID < %" PRIu64
")&&(inoutID >= %" PRIu64
")) sdataID = inoutID - %" PRIu64
";\n", 3 * sc->
fftDim, 2 * sc->
fftDim, 2 * sc->
fftDim);
/* Range [3F, 4F): index reflected, (4F - 1) - inoutID. */
8668 sc->
tempLen = sprintf(sc->
tempStr,
" if ((inoutID < %" PRIu64
")&&(inoutID >= %" PRIu64
")) sdataID = %" PRIu64
" - inoutID;\n", 4 * sc->
fftDim, 3 * sc->
fftDim, 4 * sc->
fftDim - 1);
/* Range [4F, ...): index shifted down by 4F. */
8671 sc->
tempLen = sprintf(sc->
tempStr,
" if (inoutID >= %" PRIu64
") sdataID = inoutID - %" PRIu64
";\n", 4 * sc->
fftDim, 4 * sc->
fftDim);
8680 sc->
tempLen = sprintf(sc->
tempStr,
" if ((inoutID < %" PRIu64
")&&(inoutID >= %" PRIu64
")){ \n\
8685 sc->
tempLen = sprintf(sc->
tempStr,
" if ((inoutID < %" PRIu64
")&&(inoutID >= %" PRIu64
")){ \n\
8709 char shiftX[500] =
"";
8711 sprintf(shiftX,
" + consts.workGroupShiftX ");
8712 char shiftX2[500] =
"";
8715 char shiftY[500] =
"";
8717 sprintf(shiftY,
" + consts.workGroupShiftY ");
8734 if ((uint64_t)ceil(sc->
size[0] / (
double)mult) % sc->
localSize[0] != 0) {
8766 res =
indexInputVkFFT(sc, uintType, readType, index_x, index_y, requestCoordinate, requestBatch);
8779 sc->
tempLen = sprintf(sc->
tempStr,
" sdata[sdataID].x = %s%s[%s]%s;\n", convTypeLeft, inputsStruct, sc->
inoutID, convTypeRight);
8810 if ((uint64_t)ceil(sc->
size[0] / (
double)mult) % sc->
localSize[0] != 0) {
/* Formats the generated-code lines that derive sdataID from inoutID.
 * With F = fftDim (substituted as a literal at generation time):
 *   inoutID = F/2 + 4 * combinedID            (F/2 is computed on the host)
 *   inoutID <  F            -> sdataID = inoutID
 *   F   <= inoutID < 2F     -> sdataID = (2F - 1) - inoutID
 *   2F  <= inoutID < 3F     -> sdataID = inoutID - 2F
 *   3F  <= inoutID < 4F     -> sdataID = (4F - 1) - inoutID
 *   inoutID >= 4F           -> sdataID = inoutID - 4F
 * Each sprintf overwrites sc->tempStr and stores the length in sc->tempLen;
 * NOTE(review): the code that consumes tempStr between these calls is not
 * visible in this excerpt - confirm against the full file. */
8840 sc->
tempLen = sprintf(sc->
tempStr,
" inoutID = %" PRIu64
" + 4 * combinedID;\n", sc->
fftDim / 2);
/* Range [0, F): index used unchanged. */
8844 sc->
tempLen = sprintf(sc->
tempStr,
" if (inoutID < %" PRIu64
") sdataID = inoutID;\n", sc->
fftDim);
/* Range [F, 2F): index reflected, (2F - 1) - inoutID. */
8847 sc->
tempLen = sprintf(sc->
tempStr,
" if ((inoutID < %" PRIu64
")&&(inoutID >= %" PRIu64
")) sdataID = %" PRIu64
" - inoutID;\n", 2 * sc->
fftDim, sc->
fftDim, 2 * sc->
fftDim - 1);
/* Range [2F, 3F): index shifted down by 2F. */
8850 sc->
tempLen = sprintf(sc->
tempStr,
" if ((inoutID < %" PRIu64
")&&(inoutID >= %" PRIu64
")) sdataID = inoutID - %" PRIu64
";\n", 3 * sc->
fftDim, 2 * sc->
fftDim, 2 * sc->
fftDim);
/* Range [3F, 4F): index reflected, (4F - 1) - inoutID. */
8853 sc->
tempLen = sprintf(sc->
tempStr,
" if ((inoutID < %" PRIu64
")&&(inoutID >= %" PRIu64
")) sdataID = %" PRIu64
" - inoutID;\n", 4 * sc->
fftDim, 3 * sc->
fftDim, 4 * sc->
fftDim - 1);
/* Range [4F, ...): index shifted down by 4F. */
8856 sc->
tempLen = sprintf(sc->
tempStr,
" if (inoutID >= %" PRIu64
") sdataID = inoutID - %" PRIu64
";\n", 4 * sc->
fftDim, 4 * sc->
fftDim);
8865 sc->
tempLen = sprintf(sc->
tempStr,
" if ((inoutID < %" PRIu64
")&&(inoutID >= %" PRIu64
")){ \n\
8870 sc->
tempLen = sprintf(sc->
tempStr,
" if ((inoutID < %" PRIu64
")&&(inoutID >= %" PRIu64
")){ \n\
11538 double double_PI = 3.1415926535897932384626433832795;
11540 char outputsStruct[20] =
"";
11541 char LFending[4] =
"";
11542 if (!strcmp(floatType,
"float")) sprintf(LFending,
"f");
11543#if(VKFFT_BACKEND==0)
11544 if (!strcmp(floatType,
"float")) sprintf(vecType,
"vec2");
11545 if (!strcmp(floatType,
"double")) sprintf(vecType,
"dvec2");
11547 sprintf(outputsStruct,
"outputs");
11549 sprintf(outputsStruct,
".outputs");
11550 if (!strcmp(floatType,
"double")) sprintf(LFending,
"LF");
11551 char cosDef[20] =
"cos";
11552 char sinDef[20] =
"sin";
11553#elif(VKFFT_BACKEND==1)
11554 if (!strcmp(floatType,
"float")) sprintf(vecType,
"float2");
11555 if (!strcmp(floatType,
"double")) sprintf(vecType,
"double2");
11556 sprintf(outputsStruct,
"outputs");
11557 if (!strcmp(floatType,
"double")) sprintf(LFending,
"l");
11558 char cosDef[20] =
"__cosf";
11559 char sinDef[20] =
"__sinf";
11560#elif(VKFFT_BACKEND==2)
11561 if (!strcmp(floatType,
"float")) sprintf(vecType,
"float2");
11562 if (!strcmp(floatType,
"double")) sprintf(vecType,
"double2");
11563 sprintf(outputsStruct,
"outputs");
11564 if (!strcmp(floatType,
"double")) sprintf(LFending,
"l");
11565 char cosDef[20] =
"__cosf";
11566 char sinDef[20] =
"__sinf";
11567#elif(VKFFT_BACKEND==3)
11568 if (!strcmp(floatType,
"float")) sprintf(vecType,
"float2");
11569 if (!strcmp(floatType,
"double")) sprintf(vecType,
"double2");
11570 sprintf(outputsStruct,
"outputs");
11572 char cosDef[20] =
"native_cos";
11573 char sinDef[20] =
"native_sin";
11575 char convTypeLeft[20] =
"";
11576 char convTypeRight[20] =
"";
11577 if ((!strcmp(floatTypeMemory,
"half")) && (strcmp(floatType,
"half"))) {
11578 if ((writeType == 6) || (writeType == 110) || (writeType == 111) || (writeType == 120) || (writeType == 121) || (writeType == 130) || (writeType == 131) || (writeType == 140) || (writeType == 141) || (writeType == 142) || (writeType == 143) || (writeType == 144) || (writeType == 145)) {
11579 sprintf(convTypeLeft,
"float16_t(");
11580 sprintf(convTypeRight,
")");
11583 sprintf(convTypeLeft,
"f16vec2(");
11584 sprintf(convTypeRight,
")");
11587 if ((!strcmp(floatTypeMemory,
"float")) && (strcmp(floatType,
"float"))) {
11588 if ((writeType == 6) || (writeType == 110) || (writeType == 111) || (writeType == 120) || (writeType == 121) || (writeType == 130) || (writeType == 131) || (writeType == 140) || (writeType == 141) || (writeType == 142) || (writeType == 143) || (writeType == 144) || (writeType == 145)) {
11589#if(VKFFT_BACKEND==0)
11590 sprintf(convTypeLeft,
"float(");
11591 sprintf(convTypeRight,
")");
11592#elif(VKFFT_BACKEND==1)
11593 sprintf(convTypeLeft,
"(float)");
11595#elif(VKFFT_BACKEND==2)
11596 sprintf(convTypeLeft,
"(float)");
11598#elif(VKFFT_BACKEND==3)
11599 sprintf(convTypeLeft,
"(float)");
11604#if(VKFFT_BACKEND==0)
11605 sprintf(convTypeLeft,
"vec2(");
11606 sprintf(convTypeRight,
")");
11607#elif(VKFFT_BACKEND==1)
11608 sprintf(convTypeLeft,
"conv_float2(");
11609 sprintf(convTypeRight,
")");
11610#elif(VKFFT_BACKEND==2)
11611 sprintf(convTypeLeft,
"conv_float2(");
11612 sprintf(convTypeRight,
")");
11613#elif(VKFFT_BACKEND==3)
11614 sprintf(convTypeLeft,
"conv_float2(");
11615 sprintf(convTypeRight,
")");
11619 if ((!strcmp(floatTypeMemory,
"double")) && (strcmp(floatType,
"double"))) {
11620 if ((writeType == 6) || (writeType == 110) || (writeType == 111) || (writeType == 120) || (writeType == 121) || (writeType == 130) || (writeType == 131) || (writeType == 140) || (writeType == 141) || (writeType == 142) || (writeType == 143) || (writeType == 144) || (writeType == 145)) {
11621#if(VKFFT_BACKEND==0)
11622 sprintf(convTypeLeft,
"double(");
11623 sprintf(convTypeRight,
")");
11624#elif(VKFFT_BACKEND==1)
11625 sprintf(convTypeLeft,
"(double)");
11627#elif(VKFFT_BACKEND==2)
11628 sprintf(convTypeLeft,
"(double)");
11630#elif(VKFFT_BACKEND==3)
11631 sprintf(convTypeLeft,
"(double)");
11636#if(VKFFT_BACKEND==0)
11637 sprintf(convTypeLeft,
"dvec2(");
11638 sprintf(convTypeRight,
")");
11639#elif(VKFFT_BACKEND==1)
11640 sprintf(convTypeLeft,
"conv_double2(");
11641 sprintf(convTypeRight,
")");
11642#elif(VKFFT_BACKEND==2)
11643 sprintf(convTypeLeft,
"conv_double2(");
11644 sprintf(convTypeRight,
")");
11645#elif(VKFFT_BACKEND==3)
11646 sprintf(convTypeLeft,
"conv_double2(");
11647 sprintf(convTypeRight,
")");
11652 char index_x[2000] =
"";
11653 char index_y[2000] =
"";
11654 char requestCoordinate[100] =
"";
11657 sprintf(requestCoordinate,
"coordinate");
11660 char requestBatch[100] =
"";
11663 sprintf(requestBatch,
"batchID");
11666 switch (writeType) {
11675 char shiftX[500] =
"";
11677 sprintf(shiftX,
" + consts.workGroupShiftX ");
11678 char shiftY[500] =
"";
11688 char shiftY2[100] =
"";
11690 sprintf(shiftY,
" + consts.workGroupShiftY ");
11699 sc->
tempLen = sprintf(sc->
tempStr,
" if (((%s + %" PRIu64
" * %s) %% %" PRIu64
" + ((%s%s) / %" PRIu64
")*%" PRIu64
" < %" PRIu64
")){\n", sc->
gl_LocalInvocationID_x, sc->
localSize[0], sc->
gl_LocalInvocationID_y, sc->
localSize[0], sc->
gl_WorkGroupID_x, shiftX, sc->
firstStageStartSize / sc->
fftDim, sc->
localSize[0], sc->
fft_dim_full / sc->
firstStageStartSize);
11709 sc->
tempLen = sprintf(sc->
tempStr,
" if (((%s + %" PRIu64
" * %s) %% %" PRIu64
" + ((%s%s) / %" PRIu64
")*%" PRIu64
" < %" PRIu64
")){\n", sc->
gl_LocalInvocationID_x, sc->
localSize[0], sc->
gl_LocalInvocationID_y, sc->
localSize[1], sc->
gl_WorkGroupID_x, shiftX, sc->
firstStageStartSize / sc->
fftDim, sc->
localSize[1], sc->
fft_dim_full / sc->
firstStageStartSize);
11783 sc->
tempLen = sprintf(sc->
tempStr,
" %s[%s] = %ssdata[(combinedID %% %" PRIu64
") * sharedStride + (combinedID / %" PRIu64
")]%s;\n", outputsStruct, sc->
inoutID, convTypeLeft, sc->
fftDim, sc->
fftDim, convTypeRight);
11785 sc->
tempLen = sprintf(sc->
tempStr,
" outputBlocks[%s / %" PRIu64
"]%s[%s %% %" PRIu64
"] = %ssdata[(combinedID %% %" PRIu64
") * sharedStride + (combinedID / %" PRIu64
")]%s;\n", sc->
inoutID, sc->
outputBufferBlockSize, outputsStruct, sc->
inoutID, sc->
outputBufferBlockSize, convTypeLeft, sc->
fftDim, sc->
fftDim, convTypeRight);
11791 sc->
tempLen = sprintf(sc->
tempStr,
" %s[%s] = %ssdata[(combinedID %% %" PRIu64
") + (combinedID / %" PRIu64
") * sharedStride]%s;\n", outputsStruct, sc->
inoutID, convTypeLeft, sc->
fftDim, sc->
fftDim, convTypeRight);
11793 sc->
tempLen = sprintf(sc->
tempStr,
" outputBlocks[%s / %" PRIu64
"]%s[%s %% %" PRIu64
"] = %ssdata[(combinedID %% %" PRIu64
") + (combinedID / %" PRIu64
") * sharedStride]%s;\n", sc->
inoutID, sc->
outputBufferBlockSize, outputsStruct, sc->
inoutID, sc->
outputBufferBlockSize, convTypeLeft, sc->
fftDim, sc->
fftDim, convTypeRight);
11837 sc->
tempLen = sprintf(sc->
tempStr,
" inoutID = combinedID %% %" PRIu64
" + ((%s%s) / %" PRIu64
")*%" PRIu64
" + ((combinedID/%" PRIu64
") * %" PRIu64
")+ ((%s%s) %% %" PRIu64
") * %" PRIu64
";\n", sc->
localSize[0], sc->
gl_WorkGroupID_x, shiftX, sc->
firstStageStartSize / sc->
fftDim, sc->
localSize[0], sc->
localSize[0], sc->
fft_dim_full / sc->
fftDim, sc->
gl_WorkGroupID_x, shiftX, sc->
firstStageStartSize / sc->
fftDim, sc->
fft_dim_full / sc->
firstStageStartSize);
11845 sc->
tempLen = sprintf(sc->
tempStr,
" inoutID = combinedID %% %" PRIu64
" + ((%s%s) / %" PRIu64
")*%" PRIu64
" + ((combinedID/%" PRIu64
") * %" PRIu64
")+ ((%s%s) %% %" PRIu64
") * %" PRIu64
";\n", sc->
localSize[1], sc->
gl_WorkGroupID_x, shiftX, sc->
firstStageStartSize / sc->
fftDim, sc->
localSize[1], sc->
localSize[1], sc->
fft_dim_full / sc->
fftDim, sc->
gl_WorkGroupID_x, shiftX, sc->
firstStageStartSize / sc->
fftDim, sc->
fft_dim_full / sc->
firstStageStartSize);
11877 sc->
tempLen = sprintf(sc->
tempStr,
" outputBlocks[%s / %" PRIu64
"]%s[%s %% %" PRIu64
"] = %ssdata[(combinedID %% %s)+(combinedID/%s)*sharedStride]%s;\n", sc->
inoutID, sc->
outputBufferBlockSize, outputsStruct, sc->
inoutID, sc->
outputBufferBlockSize, convTypeLeft, sc->
gl_WorkGroupSize_x, sc->
gl_WorkGroupSize_x, convTypeRight);
11885 sc->
tempLen = sprintf(sc->
tempStr,
" outputBlocks[%s / %" PRIu64
"]%s[%s %% %" PRIu64
"] = %ssdata[(combinedID %% %s)*sharedStride+combinedID/%s]%s;\n", sc->
inoutID, sc->
outputBufferBlockSize, outputsStruct, sc->
inoutID, sc->
outputBufferBlockSize, convTypeLeft, sc->
gl_WorkGroupSize_y, sc->
gl_WorkGroupSize_y, convTypeRight);
11975 sc->
tempLen = sprintf(sc->
tempStr,
" %s[%s] = %ssdata[(combinedID %% %" PRIu64
") * sharedStride + (combinedID / %" PRIu64
")]%s;\n", outputsStruct, sc->
inoutID, convTypeLeft, sc->
fftDim, sc->
fftDim, convTypeRight);
11977 sc->
tempLen = sprintf(sc->
tempStr,
" outputBlocks[%s / %" PRIu64
"]%s[%s %% %" PRIu64
"] = %ssdata[(combinedID %% %" PRIu64
") * sharedStride + (combinedID / %" PRIu64
")]%s;\n", sc->
inoutID, sc->
outputBufferBlockSize, outputsStruct, sc->
inoutID, sc->
outputBufferBlockSize, convTypeLeft, sc->
fftDim, sc->
fftDim, convTypeRight);
11983 sc->
tempLen = sprintf(sc->
tempStr,
" %s[%s] = %ssdata[(combinedID %% %" PRIu64
") + (combinedID / %" PRIu64
") * sharedStride]%s;\n", outputsStruct, sc->
inoutID, convTypeLeft, sc->
fftDim, sc->
fftDim, convTypeRight);
11985 sc->
tempLen = sprintf(sc->
tempStr,
" outputBlocks[%s / %" PRIu64
"]%s[%s %% %" PRIu64
"] = %ssdata[(combinedID %% %" PRIu64
") + (combinedID / %" PRIu64
") * sharedStride]%s;\n", sc->
inoutID, sc->
outputBufferBlockSize, outputsStruct, sc->
inoutID, sc->
outputBufferBlockSize, convTypeLeft, sc->
fftDim, sc->
fftDim, convTypeRight);
12033 sc->
tempLen = sprintf(sc->
tempStr,
" inoutID = (combinedID %% %" PRIu64
")+(combinedID / %" PRIu64
") * %" PRIu64
" + (((%s%s) %% %" PRIu64
") * %" PRIu64
" + ((%s%s) / %" PRIu64
") * %" PRIu64
");", sc->
fftDim, sc->
fftDim, sc->
firstStageStartSize, sc->
gl_WorkGroupID_x, shiftX, sc->
firstStageStartSize / sc->
fftDim, sc->
fftDim, sc->
gl_WorkGroupID_x, shiftX, sc->
firstStageStartSize / sc->
fftDim, sc->
localSize[0] * sc->
firstStageStartSize);
12038 sc->
tempLen = sprintf(sc->
tempStr,
" inoutID = %s+%" PRIu64
"+%s * %" PRIu64
" + (((%s%s) %% %" PRIu64
") * %" PRIu64
" + ((%s%s) / %" PRIu64
") * %" PRIu64
");", sc->
gl_LocalInvocationID_x, (i + k * sc->
min_registers_per_thread) * sc->
localSize[0], sc->
gl_LocalInvocationID_y, sc->
firstStageStartSize, sc->
gl_WorkGroupID_x, shiftX, sc->
firstStageStartSize / sc->
fftDim, sc->
fftDim, sc->
gl_WorkGroupID_x, shiftX, sc->
firstStageStartSize / sc->
fftDim, sc->
localSize[1] * sc->
firstStageStartSize);
12108 char shiftX[500] =
"";
12111 sc->
tempLen = sprintf(sc->
tempStr,
" if (((%s%s) / %" PRIu64
") %% (%" PRIu64
")+((%s%s) / %" PRIu64
") * (%" PRIu64
") < %" PRIu64
") {\n", sc->
gl_GlobalInvocationID_x, shiftX, sc->
fft_dim_x, sc->
stageStartSize, sc->
gl_GlobalInvocationID_x, shiftX, sc->
fft_dim_x * sc->
stageStartSize, sc->
fftDim * sc->
stageStartSize, sc->
size[sc->
axis_id]);
12117 sc->
tempLen = sprintf(sc->
tempStr,
" inoutID = (%s + %" PRIu64
") * (%" PRIu64
") + (((%s%s) / %" PRIu64
") %% (%" PRIu64
")) * (%" PRIu64
") + ((%s%s) / %" PRIu64
");\n", sc->
gl_LocalInvocationID_y, (i + k * sc->
min_registers_per_thread) * sc->
localSize[1], sc->
fft_dim_full / sc->
fftDim, sc->
gl_GlobalInvocationID_x, shiftX, sc->
fft_dim_x, sc->
firstStageStartSize / sc->
fftDim, sc->
fft_dim_full / sc->
firstStageStartSize, sc->
gl_GlobalInvocationID_x, shiftX, sc->
fft_dim_x * (sc->
firstStageStartSize / sc->
fftDim));
12149 sc->
tempLen = sprintf(sc->
tempStr,
" outputBlocks[%s / %" PRIu64
"]%s[%s %% %" PRIu64
"] = %ssdata[%s*(%s+%" PRIu64
") + %s]%s;\n", sc->
inoutID, sc->
outputBufferBlockSize, outputsStruct, sc->
inoutID, sc->
outputBufferBlockSize, convTypeLeft, sc->
sharedStride, sc->
gl_LocalInvocationID_y, (i + k * sc->
min_registers_per_thread) * sc->
localSize[1], sc->
gl_LocalInvocationID_x, convTypeRight);
12169 sc->
tempLen = sprintf(sc->
tempStr,
" inoutID = (%s + %" PRIu64
") * %" PRIu64
" + ((%s%s) / %" PRIu64
") %% (%" PRIu64
")+((%s%s) / %" PRIu64
") * (%" PRIu64
");\n", sc->
gl_LocalInvocationID_y, (i + k * sc->
min_registers_per_thread) * sc->
localSize[1], sc->
stageStartSize, sc->
gl_GlobalInvocationID_x, shiftX, sc->
fft_dim_x, sc->
stageStartSize, sc->
gl_GlobalInvocationID_x, shiftX, sc->
fft_dim_x * sc->
stageStartSize, sc->
stageStartSize * sc->
fftDim);
12178 sc->
tempLen = sprintf(sc->
tempStr,
" inoutID = (%s + %" PRIu64
") * %" PRIu64
" + ((%s%s) / %" PRIu64
") %% (%" PRIu64
")+((%s%s) / %" PRIu64
") * (%" PRIu64
");\n", sc->
gl_LocalInvocationID_y, (i + k * sc->
min_registers_per_thread) * sc->
localSize[1], sc->
stageStartSize, sc->
gl_GlobalInvocationID_x, shiftX, sc->
fft_dim_x, sc->
stageStartSize, sc->
gl_GlobalInvocationID_x, shiftX, sc->
fft_dim_x * sc->
stageStartSize, sc->
stageStartSize * sc->
fftDim);
12190 sprintf(index_y,
"%" PRIu64
" * (%s + %" PRIu64
") + ((%s%s) / %" PRIu64
") %% (%" PRIu64
")+((%s%s) / %" PRIu64
") * (%" PRIu64
")", sc->
stageStartSize, sc->
gl_LocalInvocationID_y, (i + k * sc->
min_registers_per_thread) * sc->
localSize[1], sc->
gl_GlobalInvocationID_x, shiftX, sc->
fft_dim_x, sc->
stageStartSize, sc->
gl_GlobalInvocationID_x, shiftX, sc->
fft_dim_x * sc->
stageStartSize, sc->
stageStartSize * sc->
fftDim);
12191 res =
indexOutputVkFFT(sc, uintType, writeType, index_x, index_y, requestCoordinate, requestBatch);
12211 sc->
tempLen = sprintf(sc->
tempStr,
" outputBlocks[inoutID / %" PRIu64
"]%s[inoutID %% %" PRIu64
"] = %ssdata[%s*(%s+%" PRIu64
") + %s]%s;\n", sc->
outputBufferBlockSize, outputsStruct, sc->
outputBufferBlockSize, convTypeLeft, sc->
sharedStride, sc->
gl_LocalInvocationID_y, (i + k * sc->
min_registers_per_thread) * sc->
localSize[1], sc->
gl_LocalInvocationID_x, convTypeRight);
12244 char shiftX[500] =
"";
12252 sc->
tempLen = sprintf(sc->
tempStr,
" inoutID = (%s%s) %% (%" PRIu64
") + %" PRIu64
" * (%s + %" PRIu64
") + ((%s%s) / %" PRIu64
") * (%" PRIu64
");\n", sc->
gl_GlobalInvocationID_x, shiftX, sc->
stageStartSize, sc->
stageStartSize, sc->
gl_LocalInvocationID_y, (i + k * sc->
min_registers_per_thread) * sc->
localSize[1], sc->
gl_GlobalInvocationID_x, shiftX, sc->
stageStartSize, sc->
stageStartSize * sc->
fftDim);
12287 sc->
tempLen = sprintf(sc->
tempStr,
" outputBlocks[inoutID / %" PRIu64
"]%s[inoutID %% %" PRIu64
"] = %ssdata[%s*(%s+%" PRIu64
") + %s]%s;\n", sc->
outputBufferBlockSize, outputsStruct, sc->
outputBufferBlockSize, convTypeLeft, sc->
sharedStride, sc->
gl_LocalInvocationID_y, (i + k * sc->
min_registers_per_thread) * sc->
localSize[1], sc->
gl_LocalInvocationID_x, convTypeRight);
12319 char shiftX[500] =
"";
12321 sprintf(shiftX,
" + consts.workGroupShiftX ");
12322 char shiftY[500] =
"";
12326 sprintf(shiftY,
" + consts.workGroupShiftY ");
12342 sdata[%s + %" PRIu64
"* sharedStride] = sdata[%s];\n\
12357 sdata[%s * sharedStride + %" PRIu64
"] = sdata[%s * sharedStride];\n\
12371 for (uint64_t i = 0; i < num_out; i++) {
12391 if ((uint64_t)ceil(sc->
size[1] / (
double)mult) % sc->
localSize[0] != 0) {
12403 if ((uint64_t)ceil(sc->
size[1] / (
double)mult) % sc->
localSize[1] != 0) {
12440 sc->
tempLen = sprintf(sc->
tempStr,
"if ( (combinedID / %" PRIu64
") %% 2 == 0){\n", sc->
fftDim / 2 + 1);
12443 sc->
tempLen = sprintf(sc->
tempStr,
" %s.x = 0.5%s*(sdata[(combinedID %% %" PRIu64
")* sharedStride + (combinedID / %" PRIu64
")].x+sdata[(%" PRIu64
"-combinedID %% %" PRIu64
")* sharedStride + (combinedID / %" PRIu64
")].x);\n", sc->
regIDs[0], LFending, sc->
fftDim / 2 + 1, 2 * (sc->
fftDim / 2 + 1), sc->
fftDim, sc->
fftDim / 2 + 1, 2 * (sc->
fftDim / 2 + 1));
12446 sc->
tempLen = sprintf(sc->
tempStr,
" %s.y = 0.5%s*(sdata[(combinedID %% %" PRIu64
")* sharedStride + (combinedID / %" PRIu64
")].y-sdata[(%" PRIu64
"-combinedID %% %" PRIu64
")* sharedStride + (combinedID / %" PRIu64
")].y);\n", sc->
regIDs[0], LFending, sc->
fftDim / 2 + 1, 2 * (sc->
fftDim / 2 + 1), sc->
fftDim, sc->
fftDim / 2 + 1, 2 * (sc->
fftDim / 2 + 1));
12452 sc->
tempLen = sprintf(sc->
tempStr,
" %s.x = 0.5%s*(sdata[(combinedID %% %" PRIu64
")* sharedStride + (combinedID / %" PRIu64
")].y+sdata[(%" PRIu64
"-combinedID %% %" PRIu64
")* sharedStride + (combinedID / %" PRIu64
")].y);\n", sc->
regIDs[0], LFending, sc->
fftDim / 2 + 1, 2 * (sc->
fftDim / 2 + 1), sc->
fftDim, sc->
fftDim / 2 + 1, 2 * (sc->
fftDim / 2 + 1));
12455 sc->
tempLen = sprintf(sc->
tempStr,
" %s.y = 0.5%s*(-sdata[(combinedID %% %" PRIu64
")* sharedStride + (combinedID / %" PRIu64
")].x+sdata[(%" PRIu64
"-combinedID %% %" PRIu64
")* sharedStride + (combinedID / %" PRIu64
")].x);\n", sc->
regIDs[0], LFending, sc->
fftDim / 2 + 1, 2 * (sc->
fftDim / 2 + 1), sc->
fftDim, sc->
fftDim / 2 + 1, 2 * (sc->
fftDim / 2 + 1));
12462 sc->
tempLen = sprintf(sc->
tempStr,
" %s[%s] = %s%s%s;\n", outputsStruct, sc->
inoutID, convTypeLeft, sc->
regIDs[0], convTypeRight);
12469 sc->
tempLen = sprintf(sc->
tempStr,
"if ( (combinedID / %" PRIu64
") %% 2 == 0){\n", sc->
fftDim / 2 + 1);
12472 sc->
tempLen = sprintf(sc->
tempStr,
" %s.x = 0.5%s*(sdata[(combinedID %% %" PRIu64
") + (combinedID / %" PRIu64
") * sharedStride].x+sdata[(%" PRIu64
"-combinedID %% %" PRIu64
") + (combinedID / %" PRIu64
") * sharedStride].x);\n", sc->
regIDs[0], LFending, sc->
fftDim / 2 + 1, 2 * (sc->
fftDim / 2 + 1), sc->
fftDim, sc->
fftDim / 2 + 1, 2 * (sc->
fftDim / 2 + 1));
12475 sc->
tempLen = sprintf(sc->
tempStr,
" %s.y = 0.5%s*(sdata[(combinedID %% %" PRIu64
") + (combinedID / %" PRIu64
") * sharedStride].y-sdata[(%" PRIu64
"-combinedID %% %" PRIu64
") + (combinedID / %" PRIu64
") * sharedStride].y);\n", sc->
regIDs[0], LFending, sc->
fftDim / 2 + 1, 2 * (sc->
fftDim / 2 + 1), sc->
fftDim, sc->
fftDim / 2 + 1, 2 * (sc->
fftDim / 2 + 1));
12481 sc->
tempLen = sprintf(sc->
tempStr,
" %s.x = 0.5%s*(sdata[(combinedID %% %" PRIu64
") + (combinedID / %" PRIu64
") * sharedStride].y+sdata[(%" PRIu64
"-combinedID %% %" PRIu64
") + (combinedID / %" PRIu64
") * sharedStride].y);\n", sc->
regIDs[0], LFending, sc->
fftDim / 2 + 1, 2 * (sc->
fftDim / 2 + 1), sc->
fftDim, sc->
fftDim / 2 + 1, 2 * (sc->
fftDim / 2 + 1));
12484 sc->
tempLen = sprintf(sc->
tempStr,
" %s.y = 0.5%s*(-sdata[(combinedID %% %" PRIu64
") + (combinedID / %" PRIu64
") * sharedStride].x+sdata[(%" PRIu64
"-combinedID %% %" PRIu64
") + (combinedID / %" PRIu64
") * sharedStride].x);\n", sc->
regIDs[0], LFending, sc->
fftDim / 2 + 1, 2 * (sc->
fftDim / 2 + 1), sc->
fftDim, sc->
fftDim / 2 + 1, 2 * (sc->
fftDim / 2 + 1));
12491 sc->
tempLen = sprintf(sc->
tempStr,
" %s[%s] = %s%s%s;\n", outputsStruct, sc->
inoutID, convTypeLeft, sc->
regIDs[0], convTypeRight);
12501 sc->
tempLen = sprintf(sc->
tempStr,
" %s[inoutID] = %ssdata[(combinedID %% %" PRIu64
") + (combinedID / %" PRIu64
") * sharedStride]%s;\n", outputsStruct, convTypeLeft, sc->
fftDim / 2 + 1, sc->
fftDim / 2 + 1, convTypeRight);
12509 sc->
tempLen = sprintf(sc->
tempStr,
" %s[inoutID] = %ssdata[(combinedID %% %" PRIu64
") * sharedStride + (combinedID / %" PRIu64
")]%s;\n", outputsStruct, convTypeLeft, sc->
fftDim / 2 + 1, sc->
fftDim / 2 + 1, convTypeRight);
12540 if ((uint64_t)ceil(sc->
size[1] / (
double)mult) % sc->
localSize[0] != 0) {
12547 if ((uint64_t)ceil(sc->
size[1] / (
double)mult) % sc->
localSize[1] != 0) {
12580 char shiftY[500] =
"";
12582 sprintf(shiftY,
" + consts.workGroupShiftY * %" PRIu64
"", sc->
localSize[1]);
12613 if ((uint64_t)ceil(sc->
size[1] / (
double)mult) % sc->
localSize[0] != 0) {
12620 if ((uint64_t)ceil(sc->
size[1] / (
double)mult) % sc->
localSize[1] != 0) {
12649 sc->
tempLen = sprintf(sc->
tempStr,
" %s[%s] = %s%s.x%s;\n", outputsStruct, sc->
inoutID, convTypeLeft, sc->
regIDs[i], convTypeRight);
12660 sc->
tempLen = sprintf(sc->
tempStr,
" %s[%s] = %s%s.y%s;\n", outputsStruct, sc->
inoutID, convTypeLeft, sc->
regIDs[i], convTypeRight);
12670 sc->
tempLen = sprintf(sc->
tempStr,
" %s[%s] = %ssdata[(combinedID %% %" PRIu64
") * sharedStride + (combinedID / %" PRIu64
")].x%s;\n", outputsStruct, sc->
inoutID, convTypeLeft, sc->
fftDim, sc->
fftDim, convTypeRight);
12672 sc->
tempLen = sprintf(sc->
tempStr,
" outputBlocks[%s / %" PRIu64
"]%s[%s %% %" PRIu64
"] = %ssdata[(combinedID %% %" PRIu64
") * sharedStride + (combinedID / %" PRIu64
")].x%s;\n", sc->
inoutID, sc->
outputBufferBlockSize, outputsStruct, sc->
inoutID, sc->
outputBufferBlockSize, convTypeLeft, sc->
fftDim, sc->
fftDim, convTypeRight);
12680 sc->
tempLen = sprintf(sc->
tempStr,
" %s[%s] = %ssdata[(combinedID %% %" PRIu64
")* sharedStride + (combinedID / %" PRIu64
")].y%s;\n", outputsStruct, sc->
inoutID, convTypeLeft, sc->
fftDim, sc->
fftDim, convTypeRight);
12682 sc->
tempLen = sprintf(sc->
tempStr,
" outputBlocks[%s / %" PRIu64
"]%s[%s %% %" PRIu64
"] = %ssdata[(combinedID %% %" PRIu64
") * sharedStride+ (combinedID / %" PRIu64
")].y%s;\n", sc->
inoutID, sc->
outputBufferBlockSize, outputsStruct, sc->
inoutID, sc->
outputBufferBlockSize, convTypeLeft, sc->
fftDim, sc->
fftDim, convTypeRight);
12689 sc->
tempLen = sprintf(sc->
tempStr,
" %s[%s] = %ssdata[(combinedID %% %" PRIu64
") + (combinedID / %" PRIu64
") * sharedStride].x%s;\n", outputsStruct, sc->
inoutID, convTypeLeft, sc->
fftDim, sc->
fftDim, convTypeRight);
12691 sc->
tempLen = sprintf(sc->
tempStr,
" outputBlocks[%s / %" PRIu64
"]%s[%s %% %" PRIu64
"] = %ssdata[(combinedID %% %" PRIu64
") + (combinedID / %" PRIu64
") * sharedStride].x%s;\n", sc->
inoutID, sc->
outputBufferBlockSize, outputsStruct, sc->
inoutID, sc->
outputBufferBlockSize, convTypeLeft, sc->
fftDim, sc->
fftDim, convTypeRight);
12699 sc->
tempLen = sprintf(sc->
tempStr,
" %s[%s] = %ssdata[(combinedID %% %" PRIu64
") + (combinedID / %" PRIu64
") * sharedStride].y%s;\n", outputsStruct, sc->
inoutID, convTypeLeft, sc->
fftDim, sc->
fftDim, convTypeRight);
12701 sc->
tempLen = sprintf(sc->
tempStr,
" outputBlocks[%s / %" PRIu64
"]%s[%s %% %" PRIu64
"] = %ssdata[(combinedID %% %" PRIu64
") + (combinedID / %" PRIu64
") * sharedStride].y%s;\n", sc->
inoutID, sc->
outputBufferBlockSize, outputsStruct, sc->
inoutID, sc->
outputBufferBlockSize, convTypeLeft, sc->
fftDim, sc->
fftDim, convTypeRight);
12720 if ((uint64_t)ceil(sc->
size[1] / (
double)mult) % sc->
localSize[0] != 0) {
12727 if ((uint64_t)ceil(sc->
size[1] / (
double)mult) % sc->
localSize[1] != 0) {
12751 char shiftX[500] =
"";
12753 sprintf(shiftX,
" + consts.workGroupShiftX ");
12754 char shiftY[500] =
"";
12757 char shiftY2[500] =
"";
12759 sprintf(shiftY2,
" + consts.workGroupShiftY ");
12776 sdata[%s + %" PRIu64
"* sharedStride] = sdata[%s];\n\
12791 sdata[%s * sharedStride + %" PRIu64
"] = sdata[%s * sharedStride];\n\
12805 for (uint64_t i = 0; i < num_out; i++) {
12864 sc->
tempLen = sprintf(sc->
tempStr,
" %s = (sdata[(combinedID %% %" PRIu64
")* sharedStride + (combinedID / %" PRIu64
")]);\n", sc->
regIDs[0], sc->
fftDim, sc->
fftDim);
12868 sc->
tempLen = sprintf(sc->
tempStr,
" %s[%s] = %s(%s.x)%s;\n", outputsStruct, sc->
inoutID, convTypeLeft, sc->
regIDs[0], convTypeRight);
12877 sc->
tempLen = sprintf(sc->
tempStr,
/* Emits the store of regIDs[0].y into the blocked output buffer at element
 * (inoutID + outputStride[1]). That offset must appear in BOTH index
 * expressions: the block selector ((inoutID + stride) / blockSize) and the
 * in-block index ((inoutID + stride) % blockSize). The block selector
 * previously lacked the '+', producing "(<inoutID> <stride>)" in the
 * generated source, which is not a valid expression. */
" outputBlocks[(%s+%" PRIu64
")/ %" PRIu64
"]%s[(%s+%" PRIu64
") %% %" PRIu64
"] = %s(%s.y)%s;\n", sc->
inoutID, sc->
outputStride[1], sc->
outputBufferBlockSize, outputsStruct, sc->
inoutID, sc->
outputStride[1], sc->
outputBufferBlockSize, convTypeLeft, sc->
regIDs[0], convTypeRight);
12887 sc->
tempLen = sprintf(sc->
tempStr,
" %s = (sdata[(combinedID %% %" PRIu64
") + (combinedID / %" PRIu64
") * sharedStride]);\n", sc->
regIDs[0], sc->
fftDim, sc->
fftDim);
12891 sc->
tempLen = sprintf(sc->
tempStr,
" %s[%s] = %s(%s.x)%s;\n", outputsStruct, sc->
inoutID, convTypeLeft, sc->
regIDs[0], convTypeRight);
12899 sc->
tempLen = sprintf(sc->
tempStr,
/* Emits the store of regIDs[0].y into the blocked output buffer at element
 * (inoutID + outputStride[1]). That offset must appear in BOTH index
 * expressions: the block selector ((inoutID + stride) / blockSize) and the
 * in-block index ((inoutID + stride) % blockSize). The block selector
 * previously lacked the '+', producing "(<inoutID> <stride>)" in the
 * generated source, which is not a valid expression. */
" outputBlocks[(%s+%" PRIu64
")/ %" PRIu64
"]%s[(%s+%" PRIu64
") %% %" PRIu64
"] = %s(%s.y)%s;\n", sc->
inoutID, sc->
outputStride[1], sc->
outputBufferBlockSize, outputsStruct, sc->
inoutID, sc->
outputStride[1], sc->
outputBufferBlockSize, convTypeLeft, sc->
regIDs[0], convTypeRight);
12911 sc->
tempLen = sprintf(sc->
tempStr,
" sdataID = (combinedID %% %" PRIu64
") + (combinedID / %" PRIu64
") * sharedStride;\n", sc->
fftDim, sc->
fftDim);
12915 sc->
tempLen = sprintf(sc->
tempStr,
" %s[inoutID] = %s(sdata[sdataID].x)%s;\n", outputsStruct, convTypeLeft, convTypeRight);
12927 sc->
tempLen = sprintf(sc->
tempStr,
" sdataID = (combinedID %% %" PRIu64
") * sharedStride + (combinedID / %" PRIu64
");\n", sc->
fftDim, sc->
fftDim);
12931 sc->
tempLen = sprintf(sc->
tempStr,
" %s[inoutID] = %s(sdata[sdataID].x)%s;\n", outputsStruct, convTypeLeft, convTypeRight);
12994 char shiftX[500] =
"";
12997 char shiftY[500] =
"";
13000 char shiftY2[500] =
"";
13002 sprintf(shiftY2,
" + consts.workGroupShiftY ");
13018 sdata[%s + %" PRIu64
"* sharedStride] = sdata[%s];\n\
/* Host-side trip count for the loop that follows:
 * ceil(mult * fftDim / localSize[1]).
 * The product mult * fftDim is formed first; the division by localSize[1]
 * is carried out in double before rounding up. */
13029 uint64_t num_out = (uint64_t)ceil(mult * (sc->
fftDim) / (
double)sc->
localSize[1]);
13031 for (uint64_t i = 0; i < num_out; i++) {
13065 sc->
tempLen = sprintf(sc->
tempStr,
" %s.x = 0.5%s*(sdata[(combinedID / %" PRIu64
")* sharedStride + (combinedID %% %" PRIu64
")].x+sdata[(%" PRIu64
"-combinedID / %" PRIu64
")* sharedStride + (combinedID %% %" PRIu64
")].x);\n", sc->
regIDs[0], LFending, sc->
localSize[0], sc->
localSize[0], sc->
fftDim, sc->
localSize[0], sc->
localSize[0]);
13068 sc->
tempLen = sprintf(sc->
tempStr,
" %s.y = 0.5%s*(sdata[(combinedID / %" PRIu64
")* sharedStride + (combinedID %% %" PRIu64
")].y-sdata[(%" PRIu64
"-combinedID / %" PRIu64
")* sharedStride + (combinedID %% %" PRIu64
")].y);\n", sc->
regIDs[0], LFending, sc->
localSize[0], sc->
localSize[0], sc->
fftDim, sc->
localSize[0], sc->
localSize[0]);
13072 sc->
tempLen = sprintf(sc->
tempStr,
" %s[%s] = %s(%s.x*mult.x-%s.y*mult.y)%s;\n", outputsStruct, sc->
inoutID, convTypeLeft, sc->
regIDs[0], sc->
regIDs[0], convTypeRight);
13074 sc->
tempLen = sprintf(sc->
tempStr,
" outputBlocks[%s / %" PRIu64
"]%s[%s %% %" PRIu64
"] = %s(%s.x*mult.x-%s.y*mult.y)%s;\n", sc->
inoutID, sc->
outputBufferBlockSize, outputsStruct, sc->
inoutID, sc->
outputBufferBlockSize, convTypeLeft, sc->
regIDs[0], sc->
regIDs[0], convTypeRight);
13079 sc->
tempLen = sprintf(sc->
tempStr,
" %s.x = 0.5%s*(sdata[(combinedID / %" PRIu64
")* sharedStride + (combinedID %% %" PRIu64
")].y+sdata[(%" PRIu64
"-combinedID / %" PRIu64
")* sharedStride + (combinedID %% %" PRIu64
")].y);\n", sc->
regIDs[1], LFending, sc->
localSize[0], sc->
localSize[0], sc->
fftDim, sc->
localSize[0], sc->
localSize[0]);
13082 sc->
tempLen = sprintf(sc->
tempStr,
" %s.y = 0.5%s*(-sdata[(combinedID / %" PRIu64
")* sharedStride + (combinedID %% %" PRIu64
")].x+sdata[(%" PRIu64
"-combinedID / %" PRIu64
")* sharedStride + (combinedID %% %" PRIu64
")].x);\n", sc->
regIDs[1], LFending, sc->
localSize[0], sc->
localSize[0], sc->
fftDim, sc->
localSize[0], sc->
localSize[0]);
13089 /* Emit the multi-block store of (<reg>.x*mult.x - <reg>.y*mult.y) at index
 * (<inoutID> + outputStride[1]). Both the block selector (/ outputBufferBlockSize)
 * and the in-block offset (% outputBufferBlockSize) must use the same shifted
 * index; the first bracket previously lacked the '+', producing
 * "(<id> <N>)/ <M>" in the emitted line. */
sc->tempLen = sprintf(sc->tempStr,
" outputBlocks[(%s+%" PRIu64 ")/ %" PRIu64 "]%s[(%s+%" PRIu64 ") %% %" PRIu64 "] = %s(%s.x*mult.x-%s.y*mult.y)%s;\n",
sc->inoutID, sc->outputStride[1], sc->outputBufferBlockSize, outputsStruct,
sc->inoutID, sc->outputStride[1], sc->outputBufferBlockSize,
convTypeLeft, sc->regIDs[1], sc->regIDs[1], convTypeRight);
13100 sc->
tempLen = sprintf(sc->
tempStr,
" %s[%s] = -%s(%s.y*mult.x+%s.x*mult.y)%s;\n", outputsStruct, sc->
inoutID, convTypeLeft, sc->
regIDs[0], sc->
regIDs[0], convTypeRight);
13102 sc->
tempLen = sprintf(sc->
tempStr,
" outputBlocks[%s / %" PRIu64
"]%s[%s %% %" PRIu64
"] = -%s(%s.y*mult.x+%s.x*mult.y)%s;\n", sc->
inoutID, sc->
outputBufferBlockSize, outputsStruct, sc->
inoutID, sc->
outputBufferBlockSize, convTypeLeft, sc->
regIDs[0], sc->
regIDs[0], convTypeRight);
13108 /* Emit the multi-block store of -(<reg>.y*mult.x + <reg>.x*mult.y) at index
 * (<inoutID> + outputStride[1]). Both the block selector (/ outputBufferBlockSize)
 * and the in-block offset (% outputBufferBlockSize) must use the same shifted
 * index; the first bracket previously lacked the '+', producing
 * "(<id> <N>)/ <M>" in the emitted line. */
sc->tempLen = sprintf(sc->tempStr,
" outputBlocks[(%s+%" PRIu64 ")/ %" PRIu64 "]%s[(%s+%" PRIu64 ") %% %" PRIu64 "] = -%s(%s.y*mult.x+%s.x*mult.y)%s;\n",
sc->inoutID, sc->outputStride[1], sc->outputBufferBlockSize, outputsStruct,
sc->inoutID, sc->outputStride[1], sc->outputBufferBlockSize,
convTypeLeft, sc->regIDs[1], sc->regIDs[1], convTypeRight);
13116 sc->
tempLen = sprintf(sc->
tempStr,
" sdataID = (combinedID %% %" PRIu64
") + (combinedID / %" PRIu64
") * sharedStride;\n", sc->
localSize[0], sc->
localSize[0]);
13120 sc->
tempLen = sprintf(sc->
tempStr,
" %s[inoutID] = %s(sdata[sdataID].x)%s;\n", outputsStruct, convTypeLeft, convTypeRight);
13162 char shiftX[500] =
"";
13164 sprintf(shiftX,
" + consts.workGroupShiftX ");
13165 char shiftY[500] =
"";
13168 char shiftY2[500] =
"";
13170 sprintf(shiftY2,
" + consts.workGroupShiftY ");
13186 sdata[%s + %" PRIu64
"* sharedStride] = sdata[%s];\n\
13201 sdata[%s * sharedStride + %" PRIu64
"] = sdata[%s * sharedStride];\n\
13215 for (uint64_t i = 0; i < num_out; i++) {
13283 sc->
tempLen = sprintf(sc->
tempStr,
" mult.x = 2*%s(%.17f%s * (combinedID %% %" PRIu64
") );\n", cosDef, -double_PI / 2 / sc->
fftDim, LFending, sc->
fftDim / 2 + 1);
13286 sc->
tempLen = sprintf(sc->
tempStr,
" mult.y = 2*%s(%.17f%s * (combinedID %% %" PRIu64
") );\n", sinDef, -double_PI / 2 / sc->
fftDim, LFending, sc->
fftDim / 2 + 1);
13293 sc->
tempLen = sprintf(sc->
tempStr,
" %s.x = 0.5%s*(sdata[(combinedID %% %" PRIu64
")* sharedStride + (combinedID / %" PRIu64
")].x+sdata[(%" PRIu64
"-combinedID %% %" PRIu64
")* sharedStride + (combinedID / %" PRIu64
")].x);\n", sc->
regIDs[0], LFending, sc->
fftDim / 2 + 1, (sc->
fftDim / 2 + 1), sc->
fftDim, sc->
fftDim / 2 + 1, (sc->
fftDim / 2 + 1));
13296 sc->
tempLen = sprintf(sc->
tempStr,
" %s.y = 0.5%s*(sdata[(combinedID %% %" PRIu64
")* sharedStride + (combinedID / %" PRIu64
")].y-sdata[(%" PRIu64
"-combinedID %% %" PRIu64
")* sharedStride + (combinedID / %" PRIu64
")].y);\n", sc->
regIDs[0], LFending, sc->
fftDim / 2 + 1, (sc->
fftDim / 2 + 1), sc->
fftDim, sc->
fftDim / 2 + 1, (sc->
fftDim / 2 + 1));
13300 sc->
tempLen = sprintf(sc->
tempStr,
" %s[%s] = %s(%s.x*mult.x-%s.y*mult.y)%s;\n", outputsStruct, sc->
inoutID, convTypeLeft, sc->
regIDs[0], sc->
regIDs[0], convTypeRight);
13302 sc->
tempLen = sprintf(sc->
tempStr,
" outputBlocks[%s / %" PRIu64
"]%s[%s %% %" PRIu64
"] = %s(%s.x*mult.x-%s.y*mult.y)%s;\n", sc->
inoutID, sc->
outputBufferBlockSize, outputsStruct, sc->
inoutID, sc->
outputBufferBlockSize, convTypeLeft, sc->
regIDs[0], sc->
regIDs[0], convTypeRight);
13307 sc->
tempLen = sprintf(sc->
tempStr,
" %s.x = 0.5%s*(sdata[(combinedID %% %" PRIu64
")* sharedStride + (combinedID / %" PRIu64
")].y+sdata[(%" PRIu64
"-combinedID %% %" PRIu64
")* sharedStride + (combinedID / %" PRIu64
")].y);\n", sc->
regIDs[1], LFending, sc->
fftDim / 2 + 1, (sc->
fftDim / 2 + 1), sc->
fftDim, sc->
fftDim / 2 + 1, (sc->
fftDim / 2 + 1));
13310 sc->
tempLen = sprintf(sc->
tempStr,
" %s.y = 0.5%s*(-sdata[(combinedID %% %" PRIu64
")* sharedStride + (combinedID / %" PRIu64
")].x+sdata[(%" PRIu64
"-combinedID %% %" PRIu64
")* sharedStride + (combinedID / %" PRIu64
")].x);\n", sc->
regIDs[1], LFending, sc->
fftDim / 2 + 1, (sc->
fftDim / 2 + 1), sc->
fftDim, sc->
fftDim / 2 + 1, (sc->
fftDim / 2 + 1));
13317 /* Emit the multi-block store of (<reg>.x*mult.x - <reg>.y*mult.y) at index
 * (<inoutID> + outputStride[1]). Both the block selector (/ outputBufferBlockSize)
 * and the in-block offset (% outputBufferBlockSize) must use the same shifted
 * index; the first bracket previously lacked the '+', producing
 * "(<id> <N>)/ <M>" in the emitted line. */
sc->tempLen = sprintf(sc->tempStr,
" outputBlocks[(%s+%" PRIu64 ")/ %" PRIu64 "]%s[(%s+%" PRIu64 ") %% %" PRIu64 "] = %s(%s.x*mult.x-%s.y*mult.y)%s;\n",
sc->inoutID, sc->outputStride[1], sc->outputBufferBlockSize, outputsStruct,
sc->inoutID, sc->outputStride[1], sc->outputBufferBlockSize,
convTypeLeft, sc->regIDs[1], sc->regIDs[1], convTypeRight);
13325 sc->
tempLen = sprintf(sc->
tempStr,
" if(combinedID %% %" PRIu64
" > 0){\n", sc->
fftDim / 2 + 1);
13344 sc->
tempLen = sprintf(sc->
tempStr,
" %s[%s] = -%s(%s.y*mult.x+%s.x*mult.y)%s;\n", outputsStruct, sc->
inoutID, convTypeLeft, sc->
regIDs[0], sc->
regIDs[0], convTypeRight);
13346 sc->
tempLen = sprintf(sc->
tempStr,
" outputBlocks[%s / %" PRIu64
"]%s[%s %% %" PRIu64
"] = -%s(%s.y*mult.x+%s.x*mult.y)%s;\n", sc->
inoutID, sc->
outputBufferBlockSize, outputsStruct, sc->
inoutID, sc->
outputBufferBlockSize, convTypeLeft, sc->
regIDs[0], sc->
regIDs[0], convTypeRight);
13352 /* Emit the multi-block store of -(<reg>.y*mult.x + <reg>.x*mult.y) at index
 * (<inoutID> + outputStride[1]). Both the block selector (/ outputBufferBlockSize)
 * and the in-block offset (% outputBufferBlockSize) must use the same shifted
 * index; the first bracket previously lacked the '+', producing
 * "(<id> <N>)/ <M>" in the emitted line. */
sc->tempLen = sprintf(sc->tempStr,
" outputBlocks[(%s+%" PRIu64 ")/ %" PRIu64 "]%s[(%s+%" PRIu64 ") %% %" PRIu64 "] = -%s(%s.y*mult.x+%s.x*mult.y)%s;\n",
sc->inoutID, sc->outputStride[1], sc->outputBufferBlockSize, outputsStruct,
sc->inoutID, sc->outputStride[1], sc->outputBufferBlockSize,
convTypeLeft, sc->regIDs[1], sc->regIDs[1], convTypeRight);
13365 sc->
tempLen = sprintf(sc->
tempStr,
" %s.x = 0.5%s*(sdata[(combinedID %% %" PRIu64
") + (combinedID / %" PRIu64
") * sharedStride].x+sdata[(%" PRIu64
"-combinedID %% %" PRIu64
") + (combinedID / %" PRIu64
") * sharedStride].x);\n", sc->
regIDs[0], LFending, sc->
fftDim / 2 + 1, (sc->
fftDim / 2 + 1), sc->
fftDim, sc->
fftDim / 2 + 1, (sc->
fftDim / 2 + 1));
13368 sc->
tempLen = sprintf(sc->
tempStr,
" %s.y = 0.5%s*(sdata[(combinedID %% %" PRIu64
") + (combinedID / %" PRIu64
") * sharedStride].y-sdata[(%" PRIu64
"-combinedID %% %" PRIu64
") + (combinedID / %" PRIu64
") * sharedStride].y);\n", sc->
regIDs[0], LFending, sc->
fftDim / 2 + 1, (sc->
fftDim / 2 + 1), sc->
fftDim, sc->
fftDim / 2 + 1, (sc->
fftDim / 2 + 1));
13372 sc->
tempLen = sprintf(sc->
tempStr,
" %s[%s] = %s(%s.x*mult.x-%s.y*mult.y)%s;\n", outputsStruct, sc->
inoutID, convTypeLeft, sc->
regIDs[0], sc->
regIDs[0], convTypeRight);
13374 sc->
tempLen = sprintf(sc->
tempStr,
" outputBlocks[%s / %" PRIu64
"]%s[%s %% %" PRIu64
"] = %s(%s.x*mult.x-%s.y*mult.y)%s;\n", sc->
inoutID, sc->
outputBufferBlockSize, outputsStruct, sc->
inoutID, sc->
outputBufferBlockSize, convTypeLeft, sc->
regIDs[0], sc->
regIDs[0], convTypeRight);
13377 sc->
tempLen = sprintf(sc->
tempStr,
" %s.x = 0.5%s*(sdata[(combinedID %% %" PRIu64
") + (combinedID / %" PRIu64
") * sharedStride].y+sdata[(%" PRIu64
"-combinedID %% %" PRIu64
") + (combinedID / %" PRIu64
") * sharedStride].y);\n", sc->
regIDs[1], LFending, sc->
fftDim / 2 + 1, (sc->
fftDim / 2 + 1), sc->
fftDim, sc->
fftDim / 2 + 1, (sc->
fftDim / 2 + 1));
13380 sc->
tempLen = sprintf(sc->
tempStr,
" %s.y = 0.5%s*(-sdata[(combinedID %% %" PRIu64
") + (combinedID / %" PRIu64
") * sharedStride].x+sdata[(%" PRIu64
"-combinedID %% %" PRIu64
") + (combinedID / %" PRIu64
") * sharedStride].x);\n", sc->
regIDs[1], LFending, sc->
fftDim / 2 + 1, (sc->
fftDim / 2 + 1), sc->
fftDim, sc->
fftDim / 2 + 1, (sc->
fftDim / 2 + 1));
13387 /* Emit the multi-block store of (<reg>.x*mult.x - <reg>.y*mult.y) at index
 * (<inoutID> + outputStride[1]). Both the block selector (/ outputBufferBlockSize)
 * and the in-block offset (% outputBufferBlockSize) must use the same shifted
 * index; the first bracket previously lacked the '+', producing
 * "(<id> <N>)/ <M>" in the emitted line. */
sc->tempLen = sprintf(sc->tempStr,
" outputBlocks[(%s+%" PRIu64 ")/ %" PRIu64 "]%s[(%s+%" PRIu64 ") %% %" PRIu64 "] = %s(%s.x*mult.x-%s.y*mult.y)%s;\n",
sc->inoutID, sc->outputStride[1], sc->outputBufferBlockSize, outputsStruct,
sc->inoutID, sc->outputStride[1], sc->outputBufferBlockSize,
convTypeLeft, sc->regIDs[1], sc->regIDs[1], convTypeRight);
13395 sc->
tempLen = sprintf(sc->
tempStr,
" if(combinedID %% %" PRIu64
" > 0){\n", sc->
fftDim / 2 + 1);
13414 sc->
tempLen = sprintf(sc->
tempStr,
" %s[%s] = -%s(%s.y*mult.x+%s.x*mult.y)%s;\n", outputsStruct, sc->
inoutID, convTypeLeft, sc->
regIDs[0], sc->
regIDs[0], convTypeRight);
13416 sc->
tempLen = sprintf(sc->
tempStr,
" outputBlocks[%s / %" PRIu64
"]%s[%s %% %" PRIu64
"] = -%s(%s.y*mult.x+%s.x*mult.y)%s;\n", sc->
inoutID, sc->
outputBufferBlockSize, outputsStruct, sc->
inoutID, sc->
outputBufferBlockSize, convTypeLeft, sc->
regIDs[0], sc->
regIDs[0], convTypeRight);
13422 /* Emit the multi-block store of -(<reg>.y*mult.x + <reg>.x*mult.y) at index
 * (<inoutID> + outputStride[1]). Both the block selector (/ outputBufferBlockSize)
 * and the in-block offset (% outputBufferBlockSize) must use the same shifted
 * index; the first bracket previously lacked the '+', producing
 * "(<id> <N>)/ <M>" in the emitted line. */
sc->tempLen = sprintf(sc->tempStr,
" outputBlocks[(%s+%" PRIu64 ")/ %" PRIu64 "]%s[(%s+%" PRIu64 ") %% %" PRIu64 "] = -%s(%s.y*mult.x+%s.x*mult.y)%s;\n",
sc->inoutID, sc->outputStride[1], sc->outputBufferBlockSize, outputsStruct,
sc->inoutID, sc->outputStride[1], sc->outputBufferBlockSize,
convTypeLeft, sc->regIDs[1], sc->regIDs[1], convTypeRight);
13437 sc->
tempLen = sprintf(sc->
tempStr,
" sdataID = (combinedID %% %" PRIu64
") + (combinedID / %" PRIu64
") * sharedStride;\n", sc->
fftDim / 2 + 1, sc->
fftDim / 2 + 1);
13441 sc->
tempLen = sprintf(sc->
tempStr,
" %s[inoutID] = %s(sdata[sdataID].x*mult.x - sdata[sdataID].y*mult.y)%s;\n", outputsStruct, convTypeLeft, convTypeRight);
13451 sc->
tempLen = sprintf(sc->
tempStr,
" if(combinedID %% %" PRIu64
" > 0){\n", sc->
fftDim / 2 + 1);
13470 sc->
tempLen = sprintf(sc->
tempStr,
" %s[inoutID] = -%s(sdata[sdataID].y*mult.x + sdata[sdataID].x*mult.y)%s;\n", outputsStruct, convTypeLeft, convTypeRight);
13485 sc->
tempLen = sprintf(sc->
tempStr,
" sdataID = (combinedID %% %" PRIu64
") * sharedStride + (combinedID / %" PRIu64
");\n", sc->
fftDim / 2 + 1, sc->
fftDim / 2 + 1);
13489 sc->
tempLen = sprintf(sc->
tempStr,
" %s[inoutID] = %s(sdata[sdataID].x*mult.x -sdata[sdataID].y*mult.y)%s;\n", outputsStruct, convTypeLeft, convTypeRight);
13499 sc->
tempLen = sprintf(sc->
tempStr,
" if(combinedID %% %" PRIu64
" > 0){\n", sc->
fftDim / 2 + 1);
13518 sc->
tempLen = sprintf(sc->
tempStr,
" %s[inoutID] = -%s(sdata[sdataID].y*mult.x +sdata[sdataID].x*mult.y)%s;\n", outputsStruct, convTypeLeft, convTypeRight);
13583 char shiftX[500] =
"";
13586 char shiftY[500] =
"";
13589 char shiftY2[500] =
"";
13591 sprintf(shiftY2,
" + consts.workGroupShiftY ");
13606 sdata[%s + %" PRIu64
"* sharedStride] = sdata[%s];\n\
13617 uint64_t num_out = (uint64_t)ceil(mult * (sc->
fftDim / 2 + 1) / (
double)sc->
localSize[1]);
13619 for (uint64_t i = 0; i < num_out; i++) {
13663 sc->
tempLen = sprintf(sc->
tempStr,
" mult.x = 2*%s(%.17f%s * (combinedID / %" PRIu64
") );\n", cosDef, -double_PI / 2 / sc->
fftDim, LFending, sc->
localSize[0]);
13666 sc->
tempLen = sprintf(sc->
tempStr,
" mult.y = 2*%s(%.17f%s * (combinedID / %" PRIu64
") );\n", sinDef, -double_PI / 2 / sc->
fftDim, LFending, sc->
localSize[0]);
13672 sc->
tempLen = sprintf(sc->
tempStr,
" %s.x = 0.5%s*(sdata[(combinedID / %" PRIu64
")* sharedStride + (combinedID %% %" PRIu64
")].x+sdata[(%" PRIu64
"-combinedID / %" PRIu64
")* sharedStride + (combinedID %% %" PRIu64
")].x);\n", sc->
regIDs[0], LFending, sc->
localSize[0], sc->
localSize[0], sc->
fftDim, sc->
localSize[0], sc->
localSize[0]);
13675 sc->
tempLen = sprintf(sc->
tempStr,
" %s.y = 0.5%s*(sdata[(combinedID / %" PRIu64
")* sharedStride + (combinedID %% %" PRIu64
")].y-sdata[(%" PRIu64
"-combinedID / %" PRIu64
")* sharedStride + (combinedID %% %" PRIu64
")].y);\n", sc->
regIDs[0], LFending, sc->
localSize[0], sc->
localSize[0], sc->
fftDim, sc->
localSize[0], sc->
localSize[0]);
13679 sc->
tempLen = sprintf(sc->
tempStr,
" %s[%s] = %s(%s.x*mult.x-%s.y*mult.y)%s;\n", outputsStruct, sc->
inoutID, convTypeLeft, sc->
regIDs[0], sc->
regIDs[0], convTypeRight);
13681 sc->
tempLen = sprintf(sc->
tempStr,
" outputBlocks[%s / %" PRIu64
"]%s[%s %% %" PRIu64
"] = %s(%s.x*mult.x-%s.y*mult.y)%s;\n", sc->
inoutID, sc->
outputBufferBlockSize, outputsStruct, sc->
inoutID, sc->
outputBufferBlockSize, convTypeLeft, sc->
regIDs[0], sc->
regIDs[0], convTypeRight);
13686 sc->
tempLen = sprintf(sc->
tempStr,
" %s.x = 0.5%s*(sdata[(combinedID / %" PRIu64
")* sharedStride + (combinedID %% %" PRIu64
")].y+sdata[(%" PRIu64
"-combinedID / %" PRIu64
")* sharedStride + (combinedID %% %" PRIu64
")].y);\n", sc->
regIDs[1], LFending, sc->
localSize[0], sc->
localSize[0], sc->
fftDim, sc->
localSize[0], sc->
localSize[0]);
13689 sc->
tempLen = sprintf(sc->
tempStr,
" %s.y = 0.5%s*(-sdata[(combinedID / %" PRIu64
")* sharedStride + (combinedID %% %" PRIu64
")].x+sdata[(%" PRIu64
"-combinedID / %" PRIu64
")* sharedStride + (combinedID %% %" PRIu64
")].x);\n", sc->
regIDs[1], LFending, sc->
localSize[0], sc->
localSize[0], sc->
fftDim, sc->
localSize[0], sc->
localSize[0]);
13696 /* Emit the multi-block store of (<reg>.x*mult.x - <reg>.y*mult.y) at index
 * (<inoutID> + outputStride[1]). Both the block selector (/ outputBufferBlockSize)
 * and the in-block offset (% outputBufferBlockSize) must use the same shifted
 * index; the first bracket previously lacked the '+', producing
 * "(<id> <N>)/ <M>" in the emitted line. */
sc->tempLen = sprintf(sc->tempStr,
" outputBlocks[(%s+%" PRIu64 ")/ %" PRIu64 "]%s[(%s+%" PRIu64 ") %% %" PRIu64 "] = %s(%s.x*mult.x-%s.y*mult.y)%s;\n",
sc->inoutID, sc->outputStride[1], sc->outputBufferBlockSize, outputsStruct,
sc->inoutID, sc->outputStride[1], sc->outputBufferBlockSize,
convTypeLeft, sc->regIDs[1], sc->regIDs[1], convTypeRight);
13707 sc->
tempLen = sprintf(sc->
tempStr,
" %s[%s] = -%s(%s.y*mult.x+%s.x*mult.y)%s;\n", outputsStruct, sc->
inoutID, convTypeLeft, sc->
regIDs[0], sc->
regIDs[0], convTypeRight);
13709 sc->
tempLen = sprintf(sc->
tempStr,
" outputBlocks[%s / %" PRIu64
"]%s[%s %% %" PRIu64
"] = -%s(%s.y*mult.x+%s.x*mult.y)%s;\n", sc->
inoutID, sc->
outputBufferBlockSize, outputsStruct, sc->
inoutID, sc->
outputBufferBlockSize, convTypeLeft, sc->
regIDs[0], sc->
regIDs[0], convTypeRight);
13715 /* Emit the multi-block store of -(<reg>.y*mult.x + <reg>.x*mult.y) at index
 * (<inoutID> + outputStride[1]). Both the block selector (/ outputBufferBlockSize)
 * and the in-block offset (% outputBufferBlockSize) must use the same shifted
 * index; the first bracket previously lacked the '+', producing
 * "(<id> <N>)/ <M>" in the emitted line. */
sc->tempLen = sprintf(sc->tempStr,
" outputBlocks[(%s+%" PRIu64 ")/ %" PRIu64 "]%s[(%s+%" PRIu64 ") %% %" PRIu64 "] = -%s(%s.y*mult.x+%s.x*mult.y)%s;\n",
sc->inoutID, sc->outputStride[1], sc->outputBufferBlockSize, outputsStruct,
sc->inoutID, sc->outputStride[1], sc->outputBufferBlockSize,
convTypeLeft, sc->regIDs[1], sc->regIDs[1], convTypeRight);
13723 sc->
tempLen = sprintf(sc->
tempStr,
" sdataID = (combinedID %% %" PRIu64
") + (combinedID / %" PRIu64
") * sharedStride;\n", sc->
localSize[0], sc->
localSize[0]);
13727 sc->
tempLen = sprintf(sc->
tempStr,
" %s[inoutID] = %s(sdata[sdataID].x*mult.x -sdata[sdataID].y*mult.y)%s;\n", outputsStruct, convTypeLeft, convTypeRight);
13756 sc->
tempLen = sprintf(sc->
tempStr,
" %s[inoutID] = -%s(sdata[sdataID].y*mult.x +sdata[sdataID].x*mult.y)%s;\n", outputsStruct, convTypeLeft, convTypeRight);
13800 char shiftX[500] =
"";
13802 sprintf(shiftX,
" + consts.workGroupShiftX ");
13803 char shiftY[500] =
"";
13806 char shiftY2[500] =
"";
13808 sprintf(shiftY2,
" + consts.workGroupShiftY ");
13818 uint64_t maxBluesteinCutOff = 1;
13835 sc->
tempLen = sprintf(sc->
tempStr,
" if(combinedID < %" PRIu64
"){\n", maxBluesteinCutOff);
13881 sc->
tempLen = sprintf(sc->
tempStr,
" sdataID = (((combinedID %% %" PRIu64
") %% 2) * %" PRIu64
" + (1-2*((combinedID %% %" PRIu64
") %% 2)) * ((combinedID %% %" PRIu64
")/2)) * sharedStride + (combinedID / %" PRIu64
");\n", sc->
fftDim, sc->
fftDim - 1, sc->
fftDim, sc->
fftDim, sc->
fftDim);
13886 sc->
tempLen = sprintf(sc->
tempStr,
" sdataID = (((combinedID %% %" PRIu64
") %% 2) * %" PRIu64
" + (1-2*((combinedID %% %" PRIu64
") %% 2)) * ((combinedID %% %" PRIu64
")/2)) + (combinedID / %" PRIu64
")* sharedStride;\n", sc->
fftDim, sc->
fftDim - 1, sc->
fftDim, sc->
fftDim, sc->
fftDim);
13892 sc->
tempLen = sprintf(sc->
tempStr,
" %s[%s] = %s(sdata[sdataID].x)%s;\n", outputsStruct, sc->
inoutID, convTypeLeft, convTypeRight);
13901 sc->
tempLen = sprintf(sc->
tempStr,
" %s[%s] = %s(sdata[sdataID].y)%s;\n", outputsStruct, sc->
inoutID, convTypeLeft, convTypeRight);
13909 sc->
tempLen = sprintf(sc->
tempStr,
" sdataID = (((combinedID %% %" PRIu64
") %% 2) * %" PRIu64
" + (1-2*((combinedID %% %" PRIu64
") %% 2)) * ((combinedID %% %" PRIu64
")/2)) + (combinedID / %" PRIu64
") * sharedStride;\n", sc->
fftDim, sc->
fftDim - 1, sc->
fftDim, sc->
fftDim, sc->
fftDim);
13913 sc->
tempLen = sprintf(sc->
tempStr,
" %s[inoutID] = %s(sdata[sdataID].x)%s;\n", outputsStruct, convTypeLeft, convTypeRight);
13920 sc->
tempLen = sprintf(sc->
tempStr,
" sdataID = (((combinedID %% %" PRIu64
") %% 2) * %" PRIu64
" + (1-2*((combinedID %% %" PRIu64
") %% 2)) * ((combinedID %% %" PRIu64
")/2)) * sharedStride + (combinedID / %" PRIu64
");\n", sc->
fftDim, sc->
fftDim - 1, sc->
fftDim, sc->
fftDim, sc->
fftDim);
13924 sc->
tempLen = sprintf(sc->
tempStr,
" %s[inoutID] = %s(sdata[sdataID].x)%s;\n", outputsStruct, convTypeLeft, convTypeRight);
13976 char shiftX[500] =
"";
13979 char shiftY[500] =
"";
13982 char shiftY2[500] =
"";
13984 sprintf(shiftY2,
" + consts.workGroupShiftY ");
13999 sdata[%s + %" PRIu64
"* sharedStride] = sdata[%s];\n\
14048 sc->
tempLen = sprintf(sc->
tempStr,
" %s.x = 0.5%s*(sdata[(combinedID / %" PRIu64
")* sharedStride + (combinedID %% %" PRIu64
")].x+sdata[(%" PRIu64
"-combinedID / %" PRIu64
")* sharedStride + (combinedID %% %" PRIu64
")].x);\n", sc->
regIDs[0], LFending, sc->
localSize[0], sc->
localSize[0], sc->
fftDim, sc->
localSize[0], sc->
localSize[0]);
14051 sc->
tempLen = sprintf(sc->
tempStr,
" %s.y = 0.5%s*(sdata[(combinedID / %" PRIu64
")* sharedStride + (combinedID %% %" PRIu64
")].y-sdata[(%" PRIu64
"-combinedID / %" PRIu64
")* sharedStride + (combinedID %% %" PRIu64
")].y);\n", sc->
regIDs[0], LFending, sc->
localSize[0], sc->
localSize[0], sc->
fftDim, sc->
localSize[0], sc->
localSize[0]);
14055 sc->
tempLen = sprintf(sc->
tempStr,
" %s[%s] = %s(%s.x*mult.x-%s.y*mult.y)%s;\n", outputsStruct, sc->
inoutID, convTypeLeft, sc->
regIDs[0], sc->
regIDs[0], convTypeRight);
14057 sc->
tempLen = sprintf(sc->
tempStr,
" outputBlocks[%s / %" PRIu64
"]%s[%s %% %" PRIu64
"] = %s(%s.x*mult.x-%s.y*mult.y)%s;\n", sc->
inoutID, sc->
outputBufferBlockSize, outputsStruct, sc->
inoutID, sc->
outputBufferBlockSize, convTypeLeft, sc->
regIDs[0], sc->
regIDs[0], convTypeRight);
14062 sc->
tempLen = sprintf(sc->
tempStr,
" %s.x = 0.5%s*(sdata[(combinedID / %" PRIu64
")* sharedStride + (combinedID %% %" PRIu64
")].y+sdata[(%" PRIu64
"-combinedID / %" PRIu64
")* sharedStride + (combinedID %% %" PRIu64
")].y);\n", sc->
regIDs[1], LFending, sc->
localSize[0], sc->
localSize[0], sc->
fftDim, sc->
localSize[0], sc->
localSize[0]);
14065 sc->
tempLen = sprintf(sc->
tempStr,
" %s.y = 0.5%s*(-sdata[(combinedID / %" PRIu64
")* sharedStride + (combinedID %% %" PRIu64
")].x+sdata[(%" PRIu64
"-combinedID / %" PRIu64
")* sharedStride + (combinedID %% %" PRIu64
")].x);\n", sc->
regIDs[1], LFending, sc->
localSize[0], sc->
localSize[0], sc->
fftDim, sc->
localSize[0], sc->
localSize[0]);
14072 /* Emit the multi-block store of (<reg>.x*mult.x - <reg>.y*mult.y) at index
 * (<inoutID> + outputStride[1]). Both the block selector (/ outputBufferBlockSize)
 * and the in-block offset (% outputBufferBlockSize) must use the same shifted
 * index; the first bracket previously lacked the '+', producing
 * "(<id> <N>)/ <M>" in the emitted line. */
sc->tempLen = sprintf(sc->tempStr,
" outputBlocks[(%s+%" PRIu64 ")/ %" PRIu64 "]%s[(%s+%" PRIu64 ") %% %" PRIu64 "] = %s(%s.x*mult.x-%s.y*mult.y)%s;\n",
sc->inoutID, sc->outputStride[1], sc->outputBufferBlockSize, outputsStruct,
sc->inoutID, sc->outputStride[1], sc->outputBufferBlockSize,
convTypeLeft, sc->regIDs[1], sc->regIDs[1], convTypeRight);
14083 sc->
tempLen = sprintf(sc->
tempStr,
" %s[%s] = -%s(%s.y*mult.x+%s.x*mult.y)%s;\n", outputsStruct, sc->
inoutID, convTypeLeft, sc->
regIDs[0], sc->
regIDs[0], convTypeRight);
14085 sc->
tempLen = sprintf(sc->
tempStr,
" outputBlocks[%s / %" PRIu64
"]%s[%s %% %" PRIu64
"] = -%s(%s.y*mult.x+%s.x*mult.y)%s;\n", sc->
inoutID, sc->
outputBufferBlockSize, outputsStruct, sc->
inoutID, sc->
outputBufferBlockSize, convTypeLeft, sc->
regIDs[0], sc->
regIDs[0], convTypeRight);
14091 /* Emit the multi-block store of -(<reg>.y*mult.x + <reg>.x*mult.y) at index
 * (<inoutID> + outputStride[1]). Both the block selector (/ outputBufferBlockSize)
 * and the in-block offset (% outputBufferBlockSize) must use the same shifted
 * index; the first bracket previously lacked the '+', producing
 * "(<id> <N>)/ <M>" in the emitted line. */
sc->tempLen = sprintf(sc->tempStr,
" outputBlocks[(%s+%" PRIu64 ")/ %" PRIu64 "]%s[(%s+%" PRIu64 ") %% %" PRIu64 "] = -%s(%s.y*mult.x+%s.x*mult.y)%s;\n",
sc->inoutID, sc->outputStride[1], sc->outputBufferBlockSize, outputsStruct,
sc->inoutID, sc->outputStride[1], sc->outputBufferBlockSize,
convTypeLeft, sc->regIDs[1], sc->regIDs[1], convTypeRight);
14099 sc->
tempLen = sprintf(sc->
tempStr,
" sdataID = (((combinedID / %" PRIu64
") %% 2) * %" PRIu64
" + (1-2*((combinedID / %" PRIu64
") %% 2)) * ((combinedID / %" PRIu64
")/2)) * sharedStride + (combinedID %% %" PRIu64
");\n", sc->
localSize[0], sc->
fftDim - 1, sc->
localSize[0], sc->
localSize[0], sc->
localSize[0]);
14103 sc->
tempLen = sprintf(sc->
tempStr,
" %s[inoutID] = %s(sdata[sdataID].x)%s;\n", outputsStruct, convTypeLeft, convTypeRight);
14149 char shiftX[500] =
"";
14151 sprintf(shiftX,
" + consts.workGroupShiftX ");
14152 char shiftY[500] =
"";
14162 char shiftY2[100] =
"";
14164 sprintf(shiftY,
" + consts.workGroupShiftY ");
14173 sc->
tempLen = sprintf(sc->
tempStr,
" if (((%s + %" PRIu64
" * %s) %% %" PRIu64
" + ((%s%s) / %" PRIu64
")*%" PRIu64
" < %" PRIu64
")){\n", sc->
gl_LocalInvocationID_x, sc->
localSize[0], sc->
gl_LocalInvocationID_y, sc->
localSize[0], sc->
gl_WorkGroupID_x, shiftX, sc->
firstStageStartSize / sc->
fftDim, sc->
localSize[0], sc->
fft_dim_full / sc->
firstStageStartSize);
14179 sc->
tempLen = sprintf(sc->
tempStr,
" if (((%s + %" PRIu64
" * %s) %% %" PRIu64
" + ((%s%s) / %" PRIu64
")*%" PRIu64
" < %" PRIu64
")){\n", sc->
gl_LocalInvocationID_x, sc->
localSize[0], sc->
gl_LocalInvocationID_y, sc->
localSize[1], sc->
gl_WorkGroupID_x, shiftX, sc->
firstStageStartSize / sc->
fftDim, sc->
localSize[1], sc->
fft_dim_full / sc->
firstStageStartSize);
14252 sc->
tempLen = sprintf(sc->
tempStr,
" %s[%s] = %ssdata[(2*(combinedID %% %" PRIu64
")+1) * sharedStride + (combinedID / %" PRIu64
")].x/2%s;\n", outputsStruct, sc->
inoutID, convTypeLeft, sc->
fftDim / 8, sc->
fftDim / 8, convTypeRight);
14254 sc->
tempLen = sprintf(sc->
tempStr,
" outputBlocks[%s / %" PRIu64
"]%s[%s %% %" PRIu64
"] = %ssdata[(2*(combinedID %% %" PRIu64
")+1) * sharedStride + (combinedID / %" PRIu64
")].x/2%s;\n", sc->
inoutID, sc->
outputBufferBlockSize, outputsStruct, sc->
inoutID, sc->
outputBufferBlockSize, convTypeLeft, sc->
fftDim / 8, sc->
fftDim / 8, convTypeRight);
14260 sc->
tempLen = sprintf(sc->
tempStr,
" %s[%s] = %ssdata[2*(combinedID %% %" PRIu64
")+1 + (combinedID / %" PRIu64
") * sharedStride].x/2%s;\n", outputsStruct, sc->
inoutID, convTypeLeft, sc->
fftDim / 8, sc->
fftDim / 8, convTypeRight);
14262 sc->
tempLen = sprintf(sc->
tempStr,
" outputBlocks[%s / %" PRIu64
"]%s[%s %% %" PRIu64
"] = %ssdata[2*(combinedID %% %" PRIu64
")+1 + (combinedID / %" PRIu64
") * sharedStride].x/2%s;\n", sc->
inoutID, sc->
outputBufferBlockSize, outputsStruct, sc->
inoutID, sc->
outputBufferBlockSize, convTypeLeft, sc->
fftDim / 8, sc->
fftDim / 8, convTypeRight);
14549 char shiftX[500] =
"";
14553 sc->
tempLen = sprintf(sc->
tempStr,
" if (((%s%s) / %" PRIu64
") %% (%" PRIu64
")+((%s%s) / %" PRIu64
") * (%" PRIu64
") < %" PRIu64
") {\n", sc->
gl_GlobalInvocationID_x, shiftX, sc->
fft_dim_x, sc->
stageStartSize, sc->
gl_GlobalInvocationID_x, shiftX, sc->
fft_dim_x * sc->
stageStartSize, sc->
fftDim * sc->
stageStartSize, sc->
size[sc->
axis_id]);
14564 sc->
tempLen = sprintf(sc->
tempStr,
" inoutID = (%s + %" PRIu64
") * (%" PRIu64
") + (((%s%s) / %" PRIu64
") %% (%" PRIu64
")) * (%" PRIu64
") + ((%s%s) / %" PRIu64
");\n", sc->
gl_LocalInvocationID_y, (i + k * sc->
min_registers_per_thread) * sc->
localSize[1], sc->
fft_dim_full / sc->
fftDim, sc->
gl_GlobalInvocationID_x, shiftX, sc->
fft_dim_x, sc->
firstStageStartSize / sc->
fftDim, sc->
fft_dim_full / sc->
firstStageStartSize, sc->
gl_GlobalInvocationID_x, shiftX, sc->
fft_dim_x * (sc->
firstStageStartSize / sc->
fftDim));
14587 sc->
tempLen = sprintf(sc->
tempStr,
" outputBlocks[%s / %" PRIu64
"]%s[%s %% %" PRIu64
"] = %ssdata[%s*(2*(%s+%" PRIu64
")+1) + %s].x/2%s;\n", sc->
inoutID, sc->
outputBufferBlockSize, outputsStruct, sc->
inoutID, sc->
outputBufferBlockSize, convTypeLeft, sc->
sharedStride, sc->
gl_LocalInvocationID_y, (i + k * sc->
min_registers_per_thread) * sc->
localSize[1], sc->
gl_LocalInvocationID_x, convTypeRight);
14662 char shiftX[500] =
"";
14664 sprintf(shiftX,
" + consts.workGroupShiftX ");
14665 char shiftY[500] =
"";
14668 char shiftY2[500] =
"";
14670 sprintf(shiftY2,
" + consts.workGroupShiftY ");
14679 uint64_t maxBluesteinCutOff = 1;
14696 sc->
tempLen = sprintf(sc->
tempStr,
" if(combinedID < %" PRIu64
"){\n", maxBluesteinCutOff);
14701 sc->
tempLen = sprintf(sc->
tempStr,
" sdataID = (((combinedID %% %" PRIu64
") %% 2) * %" PRIu64
" + (1-2*((combinedID %% %" PRIu64
") %% 2)) * ((combinedID %% %" PRIu64
")/2)) * sharedStride + (combinedID / %" PRIu64
");\n", sc->
fftDim, sc->
fftDim - 1, sc->
fftDim, sc->
fftDim, sc->
fftDim);
14706 sc->
tempLen = sprintf(sc->
tempStr,
" sdataID = (((combinedID %% %" PRIu64
") %% 2) * %" PRIu64
" + (1-2*((combinedID %% %" PRIu64
") %% 2)) * ((combinedID %% %" PRIu64
")/2)) + (combinedID / %" PRIu64
")* sharedStride;\n", sc->
fftDim, sc->
fftDim - 1, sc->
fftDim, sc->
fftDim, sc->
fftDim);
14760 res =
indexOutputVkFFT(sc, uintType, writeType, index_x, 0, requestCoordinate, requestBatch);
14772 sc->
tempLen = sprintf(sc->
tempStr,
" mult.x = %s(%.17f%s * (2*(combinedID %% %" PRIu64
")+1) );\n", cosDef, -double_PI / 8 / sc->
fftDim, LFending, sc->
fftDim);
14775 sc->
tempLen = sprintf(sc->
tempStr,
" mult.y = %s(%.17f%s * (2*(combinedID %% %" PRIu64
")+1) );\n", sinDef, -double_PI / 8 / sc->
fftDim, LFending, sc->
fftDim);
14795 sc->
tempLen = sprintf(index_x,
"%" PRIu64
" - combinedID %% %" PRIu64
" + ((combinedID/%" PRIu64
") * %" PRIu64
")", 2 * sc->
fftDim - 1, sc->
fftDim, sc->
fftDim, sc->
outputStride[1]);
14799 res =
indexOutputVkFFT(sc, uintType, writeType, index_x, 0, requestCoordinate, requestBatch);
14867 char shiftX[500] =
"";
14869 sprintf(shiftX,
" + consts.workGroupShiftX ");
14870 char shiftX2[500] =
"";
14873 char shiftY[500] =
"";
14876 char shiftY2[500] =
"";
14878 sprintf(shiftY2,
" + consts.workGroupShiftY ");
14901 sc->
tempLen = sprintf(sc->
tempStr,
" sdataID = (((combinedID / %" PRIu64
") %% 2) * %" PRIu64
" + (1-2*((combinedID / %" PRIu64
") %% 2)) * ((combinedID / %" PRIu64
")/2)) * sharedStride + (combinedID %% %" PRIu64
");\n", sc->
localSize[0], sc->
fftDim - 1, sc->
localSize[0], sc->
localSize[0], sc->
localSize[0]);
14946 res =
indexOutputVkFFT(sc, uintType, writeType, index_x, index_y, requestCoordinate, requestBatch);
14959 sc->
tempLen = sprintf(sc->
tempStr,
" mult.x = %s(%.17f%s * (2*(combinedID / %" PRIu64
")+1) );\n", cosDef, -double_PI / 8 / sc->
fftDim, LFending, sc->
localSize[0]);
14962 sc->
tempLen = sprintf(sc->
tempStr,
" mult.y = %s(%.17f%s * (2*(combinedID / %" PRIu64
")+1) );\n", sinDef, -double_PI / 8 / sc->
fftDim, LFending, sc->
localSize[0]);
14987 res =
indexOutputVkFFT(sc, uintType, writeType, index_x, index_y, requestCoordinate, requestBatch);
15043 char shiftX[500] =
"";
15045 sprintf(shiftX,
" + consts.workGroupShiftX ");
15046 char shiftY[500] =
"";
15049 char shiftY2[500] =
"";
15051 sprintf(shiftY2,
" + consts.workGroupShiftY ");
15067 sdata[%s + %" PRIu64
"* sharedStride] = sdata[%s];\n\
15082 sdata[%s * sharedStride + %" PRIu64
"] = sdata[%s * sharedStride];\n\
15109 if ((uint64_t)ceil(sc->
size[1] / (
double)mult) % sc->
localSize[0] != 0) {
15121 if ((uint64_t)ceil(sc->
size[1] / (
double)mult) % sc->
localSize[1] != 0) {
15164 sc->
tempLen = sprintf(sc->
tempStr,
"if ( (combinedID / %" PRIu64
") %% 2 == 0){\n", sc->
fftDim);
15167 sc->
tempLen = sprintf(sc->
tempStr,
" %s.x = 0.5%s*(sdata[(2*sdataID+1) * sharedStride + (combinedID / %" PRIu64
")].x+sdata[(%" PRIu64
"- (2*sdataID+1)) * sharedStride + (combinedID / %" PRIu64
")].x);\n", sc->
regIDs[0], LFending, mult*sc->
fftDim, sc->
fftDim, mult * sc->
fftDim);
15170 sc->
tempLen = sprintf(sc->
tempStr,
" %s.y = 0.5%s*(sdata[(2*sdataID+1) * sharedStride + (combinedID / %" PRIu64
")].y-sdata[(%" PRIu64
"- (2*sdataID+1)) * sharedStride + (combinedID / %" PRIu64
")].y);\n", sc->
regIDs[0], LFending, mult * sc->
fftDim, sc->
fftDim, mult * sc->
fftDim);
15176 sc->
tempLen = sprintf(sc->
tempStr,
" %s.x = 0.5%s*(sdata[(2*sdataID+1) * sharedStride + (combinedID / %" PRIu64
")].y+sdata[(%" PRIu64
"- (2*sdataID+1)) * sharedStride + (combinedID / %" PRIu64
")].y);\n", sc->
regIDs[0], LFending, mult * sc->
fftDim, sc->
fftDim, mult * sc->
fftDim);
15179 sc->
tempLen = sprintf(sc->
tempStr,
" %s.y = 0.5%s*(-sdata[(2*sdataID+1) * sharedStride + (combinedID / %" PRIu64
")].x+sdata[(%" PRIu64
"- (2*sdataID+1)) * sharedStride + (combinedID / %" PRIu64
")].x);\n", sc->
regIDs[0], LFending, mult * sc->
fftDim, sc->
fftDim, mult * sc->
fftDim);
15187 sc->
tempLen = sprintf(sc->
tempStr,
"if ( (combinedID / %" PRIu64
") %% 2 == 0){\n", sc->
fftDim);
15190 sc->
tempLen = sprintf(sc->
tempStr,
" %s.x = 0.5%s*(sdata[(2*sdataID+1) + (combinedID / %" PRIu64
") * sharedStride].x+sdata[(%" PRIu64
"- (2*sdataID+1)) + (combinedID / %" PRIu64
") * sharedStride].x);\n", sc->
regIDs[0], LFending, mult * sc->
fftDim, sc->
fftDim, mult * sc->
fftDim);
15193 sc->
tempLen = sprintf(sc->
tempStr,
" %s.y = 0.5%s*(sdata[(2*sdataID+1) + (combinedID / %" PRIu64
") * sharedStride].y-sdata[(%" PRIu64
"- (2*sdataID+1)) + (combinedID / %" PRIu64
") * sharedStride].y);\n", sc->
regIDs[0], LFending, mult * sc->
fftDim, sc->
fftDim, mult * sc->
fftDim);
15199 sc->
tempLen = sprintf(sc->
tempStr,
" %s.x = 0.5%s*(sdata[(2*sdataID+1) + (combinedID / %" PRIu64
") * sharedStride].y+sdata[(%" PRIu64
"- (2*sdataID+1)) + (combinedID / %" PRIu64
") * sharedStride].y);\n", sc->
regIDs[0], LFending, mult * sc->
fftDim, sc->
fftDim, mult * sc->
fftDim);
15202 sc->
tempLen = sprintf(sc->
tempStr,
" %s.y = 0.5%s*(-sdata[(2*sdataID+1) + (combinedID / %" PRIu64
") * sharedStride].x+sdata[(%" PRIu64
"- (2*sdataID+1)) + (combinedID / %" PRIu64
") * sharedStride].x);\n", sc->
regIDs[0], LFending, mult * sc->
fftDim, sc->
fftDim, mult * sc->
fftDim);
15212 sc->
tempLen = sprintf(sc->
tempStr,
" %s = sdata[(2*sdataID+1) + (combinedID / %" PRIu64
") * sharedStride];\n", sc->
regIDs[0], sc->
fftDim);
15214 sc->
tempLen = sprintf(sc->
tempStr,
" %s = sdata[(2*sdataID+1) * sharedStride + (combinedID / %" PRIu64
")];\n", sc->
regIDs[0], sc->
fftDim);
15218 sc->
tempLen = sprintf(sc->
tempStr,
" if ((((sdataID + 1)/2) %% 2) != 0) \n\
15224 sc->
tempLen = sprintf(sc->
tempStr,
" if ((((sdataID)/2) %% 2) != 0) \n\
15235 sc->
tempLen = sprintf(sc->
tempStr,
" if((sdataID < %" PRIu64
")&&(sdataID >= %" PRIu64
")){\n", sc->
fftDim/2, sc->
fftDim/4);
15240 sc->
tempLen = sprintf(sc->
tempStr,
"if ( (combinedID / %" PRIu64
") %% 2 == 0){\n", sc->
fftDim);
15243 sc->
tempLen = sprintf(sc->
tempStr,
" %s.x = 0.5%s*(sdata[(%" PRIu64
" - 2*sdataID) * sharedStride + (combinedID / %" PRIu64
")].x+sdata[(%" PRIu64
" + 2*sdataID) * sharedStride + (combinedID / %" PRIu64
")].x);\n", sc->
regIDs[0], LFending, 2 * (sc->
fftDim / 2), mult * sc->
fftDim, sc->
fftDim - 2 * (sc->
fftDim / 2), mult * sc->
fftDim);
15246 sc->
tempLen = sprintf(sc->
tempStr,
" %s.y = 0.5%s*(sdata[(%" PRIu64
" - 2*sdataID) * sharedStride + (combinedID / %" PRIu64
")].y-sdata[(%" PRIu64
" + 2*sdataID) * sharedStride + (combinedID / %" PRIu64
")].y);\n", sc->
regIDs[0], LFending, 2 * (sc->
fftDim / 2), mult * sc->
fftDim, sc->
fftDim - 2 * (sc->
fftDim / 2), mult * sc->
fftDim);
15252 sc->
tempLen = sprintf(sc->
tempStr,
" %s.x = 0.5%s*(sdata[(%" PRIu64
" - 2*sdataID) * sharedStride + (combinedID / %" PRIu64
")].y+sdata[(%" PRIu64
" + 2*sdataID) * sharedStride + (combinedID / %" PRIu64
")].y);\n", sc->
regIDs[0], LFending, 2 * (sc->
fftDim / 2), mult * sc->
fftDim, sc->
fftDim - 2 * (sc->
fftDim / 2), mult * sc->
fftDim);
15255 sc->
tempLen = sprintf(sc->
tempStr,
" %s.y = 0.5%s*(-sdata[(%" PRIu64
" - 2*sdataID) * sharedStride + (combinedID / %" PRIu64
")].x+sdata[(%" PRIu64
" + 2*sdataID) * sharedStride + (combinedID / %" PRIu64
")].x);\n", sc->
regIDs[0], LFending, 2 * (sc->
fftDim / 2), mult * sc->
fftDim, sc->
fftDim - 2 * (sc->
fftDim / 2), mult * sc->
fftDim);
15263 sc->
tempLen = sprintf(sc->
tempStr,
"if ( (combinedID / %" PRIu64
") %% 2 == 0){\n", sc->
fftDim);
15266 sc->
tempLen = sprintf(sc->
tempStr,
" %s.x = 0.5%s*(sdata[(%" PRIu64
" - 2*sdataID) + (combinedID / %" PRIu64
") * sharedStride].x+sdata[(%" PRIu64
" + 2*sdataID) + (combinedID / %" PRIu64
") * sharedStride].x);\n", sc->
regIDs[0], LFending, 2 * (sc->
fftDim / 2), mult * sc->
fftDim, sc->
fftDim - 2 * (sc->
fftDim / 2), mult * sc->
fftDim);
15269 sc->
tempLen = sprintf(sc->
tempStr,
" %s.y = 0.5%s*(sdata[(%" PRIu64
" - 2*sdataID) + (combinedID / %" PRIu64
") * sharedStride].y-sdata[(%" PRIu64
" + 2*sdataID) + (combinedID / %" PRIu64
") * sharedStride].y);\n", sc->
regIDs[0], LFending, 2 * (sc->
fftDim / 2), mult * sc->
fftDim, sc->
fftDim - 2 * (sc->
fftDim / 2), mult * sc->
fftDim);
15275 sc->
tempLen = sprintf(sc->
tempStr,
" %s.x = 0.5%s*(sdata[(%" PRIu64
" - 2*sdataID) + (combinedID / %" PRIu64
") * sharedStride].y+sdata[(%" PRIu64
" + 2*sdataID) + (combinedID / %" PRIu64
") * sharedStride].y);\n", sc->
regIDs[0], LFending, 2 * (sc->
fftDim / 2), mult * sc->
fftDim, sc->
fftDim - 2 * (sc->
fftDim / 2), mult * sc->
fftDim);
15278 sc->
tempLen = sprintf(sc->
tempStr,
" %s.y = 0.5%s*(-sdata[(%" PRIu64
" - 2*sdataID) + (combinedID / %" PRIu64
") * sharedStride].x+sdata[(%" PRIu64
" + 2*sdataID) + (combinedID / %" PRIu64
") * sharedStride].x);\n", sc->
regIDs[0], LFending, 2 * (sc->
fftDim / 2), mult * sc->
fftDim, sc->
fftDim - 2 * (sc->
fftDim / 2), mult * sc->
fftDim);
15288 sc->
tempLen = sprintf(sc->
tempStr,
" %s = sdata[(%" PRIu64
" - 2*sdataID) + (combinedID / %" PRIu64
") * sharedStride];\n", sc->
regIDs[0], 2 * (sc->
fftDim / 2), sc->
fftDim);
15290 sc->
tempLen = sprintf(sc->
tempStr,
" %s = sdata[(%" PRIu64
" - 2*sdataID) * sharedStride + (combinedID / %" PRIu64
")];\n", sc->
regIDs[0], 2 * (sc->
fftDim / 2), sc->
fftDim);
15294 sc->
tempLen = sprintf(sc->
tempStr,
" if ((((sdataID+1)/2) %% 2) != 0) \n\
15300 sc->
tempLen = sprintf(sc->
tempStr,
" if ((((sdataID)/2) %% 2) != 0) \n\
15311 sc->
tempLen = sprintf(sc->
tempStr,
" if((sdataID < %" PRIu64
")&&(sdataID >= %" PRIu64
")){\n", 3 * sc->
fftDim / 4, sc->
fftDim / 2);
15316 sc->
tempLen = sprintf(sc->
tempStr,
"if ( (combinedID / %" PRIu64
") %% 2 == 0){\n", sc->
fftDim);
15319 sc->
tempLen = sprintf(sc->
tempStr,
" %s.x = 0.5%s*(sdata[(2*sdataID - %" PRIu64
") * sharedStride + (combinedID / %" PRIu64
")].x+sdata[(%" PRIu64
" - 2*sdataID) * sharedStride + (combinedID / %" PRIu64
")].x);\n", sc->
regIDs[0], LFending, 2 * (sc->
fftDim / 2), mult * sc->
fftDim, sc->
fftDim + 2 * (sc->
fftDim / 2), mult * sc->
fftDim);
15322 sc->
tempLen = sprintf(sc->
tempStr,
" %s.y = 0.5%s*(sdata[(2*sdataID - %" PRIu64
") * sharedStride + (combinedID / %" PRIu64
")].y-sdata[(%" PRIu64
" - 2*sdataID) * sharedStride + (combinedID / %" PRIu64
")].y);\n", sc->
regIDs[0], LFending, 2 * (sc->
fftDim / 2), mult * sc->
fftDim, sc->
fftDim + 2 * (sc->
fftDim / 2), mult * sc->
fftDim);
15328 sc->
tempLen = sprintf(sc->
tempStr,
" %s.x = 0.5%s*(sdata[(2*sdataID - %" PRIu64
") * sharedStride + (combinedID / %" PRIu64
")].y+sdata[(%" PRIu64
" - 2*sdataID) * sharedStride + (combinedID / %" PRIu64
")].y);\n", sc->
regIDs[0], LFending, 2 * (sc->
fftDim / 2), mult * sc->
fftDim, sc->
fftDim + 2 * (sc->
fftDim / 2), mult * sc->
fftDim);
15331 sc->
tempLen = sprintf(sc->
tempStr,
" %s.y = 0.5%s*(-sdata[(2*sdataID - %" PRIu64
") * sharedStride + (combinedID / %" PRIu64
")].x+sdata[(%" PRIu64
" - 2*sdataID) * sharedStride + (combinedID / %" PRIu64
")].x);\n", sc->
regIDs[0], LFending, 2 * (sc->
fftDim / 2), mult * sc->
fftDim, sc->
fftDim + 2 * (sc->
fftDim / 2), mult * sc->
fftDim);
15339 sc->
tempLen = sprintf(sc->
tempStr,
"if ( (combinedID / %" PRIu64
") %% 2 == 0){\n", sc->
fftDim);
15342 sc->
tempLen = sprintf(sc->
tempStr,
" %s.x = 0.5%s*(sdata[(2*sdataID - %" PRIu64
") + (combinedID / %" PRIu64
") * sharedStride].x+sdata[(%" PRIu64
" - 2*sdataID) + (combinedID / %" PRIu64
") * sharedStride].x);\n", sc->
regIDs[0], LFending, 2 * (sc->
fftDim / 2), mult * sc->
fftDim, sc->
fftDim + 2 * (sc->
fftDim / 2), mult * sc->
fftDim);
15345 sc->
tempLen = sprintf(sc->
tempStr,
" %s.y = 0.5%s*(sdata[(2*sdataID - %" PRIu64
") + (combinedID / %" PRIu64
") * sharedStride].y-sdata[(%" PRIu64
" - 2*sdataID) + (combinedID / %" PRIu64
") * sharedStride].y);\n", sc->
regIDs[0], LFending, 2 * (sc->
fftDim / 2), mult * sc->
fftDim, sc->
fftDim + 2 * (sc->
fftDim / 2), mult * sc->
fftDim);
15351 sc->
tempLen = sprintf(sc->
tempStr,
" %s.x = 0.5%s*(sdata[(2*sdataID - %" PRIu64
") + (combinedID / %" PRIu64
") * sharedStride].y+sdata[(%" PRIu64
" - 2*sdataID) + (combinedID / %" PRIu64
") * sharedStride].y);\n", sc->
regIDs[0], LFending, 2 * (sc->
fftDim / 2), mult * sc->
fftDim, sc->
fftDim + 2 * (sc->
fftDim / 2), mult * sc->
fftDim);
15354 sc->
tempLen = sprintf(sc->
tempStr,
" %s.y = 0.5%s*(-sdata[(2*sdataID - %" PRIu64
") + (combinedID / %" PRIu64
") * sharedStride].x+sdata[(%" PRIu64
" - 2*sdataID) + (combinedID / %" PRIu64
") * sharedStride].x);\n", sc->
regIDs[0], LFending, 2 * (sc->
fftDim / 2), mult * sc->
fftDim, sc->
fftDim + 2 * (sc->
fftDim / 2), mult * sc->
fftDim);
15364 sc->
tempLen = sprintf(sc->
tempStr,
" %s = sdata[(2*sdataID - %" PRIu64
") + (combinedID / %" PRIu64
") * sharedStride];\n", sc->
regIDs[0], 2 * (sc->
fftDim / 2), sc->
fftDim);
15366 sc->
tempLen = sprintf(sc->
tempStr,
" %s = sdata[(2*sdataID - %" PRIu64
") * sharedStride + (combinedID / %" PRIu64
")];\n", sc->
regIDs[0], 2 * (sc->
fftDim / 2), sc->
fftDim);
15370 sc->
tempLen = sprintf(sc->
tempStr,
" if ((((sdataID+1)/2) %% 2) != 0) \n\
15376 sc->
tempLen = sprintf(sc->
tempStr,
" if ((((sdataID)/2) %% 2) != 0) \n\
15392 sc->
tempLen = sprintf(sc->
tempStr,
"if ( (combinedID / %" PRIu64
") %% 2 == 0){\n", sc->
fftDim);
15395 sc->
tempLen = sprintf(sc->
tempStr,
" %s.x = 0.5%s*(sdata[(%" PRIu64
" - 2*sdataID) * sharedStride + (combinedID / %" PRIu64
")].x+sdata[(2*sdataID - %" PRIu64
") * sharedStride + (combinedID / %" PRIu64
")].x);\n", sc->
regIDs[0], LFending, 2 * sc->
fftDim - 1, mult * sc->
fftDim, sc->
fftDim - 1, mult * sc->
fftDim);
15398 sc->
tempLen = sprintf(sc->
tempStr,
" %s.y = 0.5%s*(sdata[(%" PRIu64
" - 2*sdataID) * sharedStride + (combinedID / %" PRIu64
")].y-sdata[(2*sdataID - %" PRIu64
") * sharedStride + (combinedID / %" PRIu64
")].y);\n", sc->
regIDs[0], LFending, 2 * sc->
fftDim - 1, mult * sc->
fftDim, sc->
fftDim - 1, mult * sc->
fftDim);
15404 sc->
tempLen = sprintf(sc->
tempStr,
" %s.x = 0.5%s*(sdata[(%" PRIu64
" - 2*sdataID) * sharedStride + (combinedID / %" PRIu64
")].y+sdata[(2*sdataID - %" PRIu64
") * sharedStride + (combinedID / %" PRIu64
")].y);\n", sc->
regIDs[0], LFending, 2 * sc->
fftDim - 1, mult * sc->
fftDim, sc->
fftDim - 1, mult * sc->
fftDim);
15407 sc->
tempLen = sprintf(sc->
tempStr,
" %s.y = 0.5%s*(-sdata[(%" PRIu64
" - 2*sdataID) * sharedStride + (combinedID / %" PRIu64
")].x+sdata[(2*sdataID - %" PRIu64
") * sharedStride + (combinedID / %" PRIu64
")].x);\n", sc->
regIDs[0], LFending, 2 * sc->
fftDim - 1, mult * sc->
fftDim, sc->
fftDim - 1, mult * sc->
fftDim);
15415 sc->
tempLen = sprintf(sc->
tempStr,
"if ( (combinedID / %" PRIu64
") %% 2 == 0){\n", sc->
fftDim);
15418 sc->
tempLen = sprintf(sc->
tempStr,
" %s.x = 0.5%s*(sdata[(%" PRIu64
" - 2*sdataID) + (combinedID / %" PRIu64
") * sharedStride].x+sdata[(2*sdataID - %" PRIu64
") + (combinedID / %" PRIu64
") * sharedStride].x);\n", sc->
regIDs[0], LFending, 2 * sc->
fftDim - 1, mult * sc->
fftDim, sc->
fftDim - 1, mult * sc->
fftDim);
15421 sc->
tempLen = sprintf(sc->
tempStr,
" %s.y = 0.5%s*(sdata[(%" PRIu64
" - 2*sdataID) + (combinedID / %" PRIu64
") * sharedStride].y-sdata[(2*sdataID - %" PRIu64
") + (combinedID / %" PRIu64
") * sharedStride].y);\n", sc->
regIDs[0], LFending, 2 * sc->
fftDim - 1, mult * sc->
fftDim, sc->
fftDim - 1, mult * sc->
fftDim);
15427 sc->
tempLen = sprintf(sc->
tempStr,
" %s.x = 0.5%s*(sdata[(%" PRIu64
" - 2*sdataID) + (combinedID / %" PRIu64
") * sharedStride].y+sdata[(2*sdataID - %" PRIu64
") + (combinedID / %" PRIu64
") * sharedStride].y);\n", sc->
regIDs[0], LFending, 2 * sc->
fftDim - 1, mult * sc->
fftDim, sc->
fftDim - 1, mult * sc->
fftDim);
15430 sc->
tempLen = sprintf(sc->
tempStr,
" %s.y = 0.5%s*(-sdata[(%" PRIu64
" - 2*sdataID) + (combinedID / %" PRIu64
") * sharedStride].x+sdata[(2*sdataID - %" PRIu64
") + (combinedID / %" PRIu64
") * sharedStride].x);\n", sc->
regIDs[0], LFending, 2 * sc->
fftDim - 1, mult * sc->
fftDim, sc->
fftDim - 1, mult * sc->
fftDim);
15440 sc->
tempLen = sprintf(sc->
tempStr,
" %s = sdata[(%" PRIu64
" - 2*sdataID) + (combinedID / %" PRIu64
") * sharedStride];\n", sc->
regIDs[0], 2 * sc->
fftDim - 1, sc->
fftDim);
15442 sc->
tempLen = sprintf(sc->
tempStr,
" %s = sdata[(%" PRIu64
" - 2*sdataID) * sharedStride + (combinedID / %" PRIu64
")];\n", sc->
regIDs[0], 2 * sc->
fftDim - 1, sc->
fftDim);
15446 sc->
tempLen = sprintf(sc->
tempStr,
" if ((((sdataID+1)/2) %% 2) != 0) \n\
15452 sc->
tempLen = sprintf(sc->
tempStr,
" if ((((sdataID)/2) %% 2) != 0) \n\
15461 sc->
tempLen = sprintf(sc->
tempStr,
" %s.x *= 1.41421356237309504880%s;\n", sc->
regIDs[1], LFending);
15465 sc->
tempLen = sprintf(sc->
tempStr,
" %s[inoutID] = %s%s.x%s;\n", outputsStruct, convTypeLeft, sc->
regIDs[1], convTypeRight);
15494 if ((uint64_t)ceil(sc->
size[1] / (
double)mult) % sc->
localSize[0] != 0) {
15501 if ((uint64_t)ceil(sc->
size[1] / (
double)mult) % sc->
localSize[1] != 0) {
15578 char shiftX[500] =
"";
15581 char shiftY[500] =
"";
15584 char shiftY2[500] =
"";
15586 sprintf(shiftY2,
" + consts.workGroupShiftY ");
15638 sc->
tempLen = sprintf(sc->
tempStr,
" if ((((sdataID + 1)/2) %% 2) != 0) \n\
15644 sc->
tempLen = sprintf(sc->
tempStr,
" if ((((sdataID)/2) %% 2) != 0) \n\
15655 sc->
tempLen = sprintf(sc->
tempStr,
" if((sdataID < %" PRIu64
")&&(sdataID >= %" PRIu64
")){\n", sc->
fftDim / 2, sc->
fftDim / 4);
15661 sc->
tempLen = sprintf(sc->
tempStr,
" if ((((sdataID+1)/2) %% 2) != 0) \n\
15667 sc->
tempLen = sprintf(sc->
tempStr,
" if ((((sdataID)/2) %% 2) != 0) \n\
15678 sc->
tempLen = sprintf(sc->
tempStr,
" if((sdataID < %" PRIu64
")&&(sdataID >= %" PRIu64
")){\n", 3 * sc->
fftDim / 4, sc->
fftDim / 2);
15684 sc->
tempLen = sprintf(sc->
tempStr,
" if ((((sdataID+1)/2) %% 2) != 0) \n\
15690 sc->
tempLen = sprintf(sc->
tempStr,
" if ((((sdataID)/2) %% 2) != 0) \n\
15707 sc->
tempLen = sprintf(sc->
tempStr,
" if ((((sdataID+1)/2) %% 2) != 0) \n\
15713 sc->
tempLen = sprintf(sc->
tempStr,
" if ((((sdataID)/2) %% 2) != 0) \n\
15722 sc->
tempLen = sprintf(sc->
tempStr,
" %s.x *= 1.41421356237309504880%s;\n", sc->
regIDs[1], LFending);
15726 sc->
tempLen = sprintf(sc->
tempStr,
" %s[inoutID] = %s%s.x%s;\n", outputsStruct, convTypeLeft, sc->
regIDs[1], convTypeRight);
// Two-component vector type names derived from floatTypeInputMemory and
// floatTypeOutputMemory. The statements below fill them (and vecType, from
// floatType) by comparing each type name against "half", "float" and "double";
// the strings written depend on the VKFFT_BACKEND branch that is compiled in.
// NOTE(review): neither array has an initializer, so if a type name matches none of
// the three strings the array is not written by the statements visible here -
// confirm callers only pass these three names.
16442 char vecTypeInput[30];
16443 char vecTypeOutput[30];
// VKFFT_BACKEND==0: half -> f16vec2, float -> vec2, double -> dvec2.
16444#if(VKFFT_BACKEND==0)
16445 if (!strcmp(floatType,
"half")) sprintf(vecType,
"f16vec2");
16446 if (!strcmp(floatType,
"float")) sprintf(vecType,
"vec2");
16447 if (!strcmp(floatType,
"double")) sprintf(vecType,
"dvec2");
16448 if (!strcmp(floatTypeInputMemory,
"half")) sprintf(vecTypeInput,
"f16vec2");
16449 if (!strcmp(floatTypeInputMemory,
"float")) sprintf(vecTypeInput,
"vec2");
16450 if (!strcmp(floatTypeInputMemory,
"double")) sprintf(vecTypeInput,
"dvec2");
16451 if (!strcmp(floatTypeOutputMemory,
"half")) sprintf(vecTypeOutput,
"f16vec2");
16452 if (!strcmp(floatTypeOutputMemory,
"float")) sprintf(vecTypeOutput,
"vec2");
16453 if (!strcmp(floatTypeOutputMemory,
"double")) sprintf(vecTypeOutput,
"dvec2");
// VKFFT_BACKEND==1: float -> float2, double -> double2.
// NOTE(review): the "half" case still writes "f16vec2" here, the same string as the
// VKFFT_BACKEND==0 branch, while float/double change spelling - confirm that this
// name is intended for this backend.
16466#elif(VKFFT_BACKEND==1)
16467 if (!strcmp(floatType,
"half")) sprintf(vecType,
"f16vec2");
16468 if (!strcmp(floatType,
"float")) sprintf(vecType,
"float2");
16469 if (!strcmp(floatType,
"double")) sprintf(vecType,
"double2");
16470 if (!strcmp(floatTypeInputMemory,
"half")) sprintf(vecTypeInput,
"f16vec2");
16471 if (!strcmp(floatTypeInputMemory,
"float")) sprintf(vecTypeInput,
"float2");
16472 if (!strcmp(floatTypeInputMemory,
"double")) sprintf(vecTypeInput,
"double2");
16473 if (!strcmp(floatTypeOutputMemory,
"half")) sprintf(vecTypeOutput,
"f16vec2");
16474 if (!strcmp(floatTypeOutputMemory,
"float")) sprintf(vecTypeOutput,
"float2");
16475 if (!strcmp(floatTypeOutputMemory,
"double")) sprintf(vecTypeOutput,
"double2");
// VKFFT_BACKEND==2: same mapping as the VKFFT_BACKEND==1 branch
// (half -> f16vec2, float -> float2, double -> double2).
16488#elif(VKFFT_BACKEND==2)
16489 if (!strcmp(floatType,
"half")) sprintf(vecType,
"f16vec2");
16490 if (!strcmp(floatType,
"float")) sprintf(vecType,
"float2");
16491 if (!strcmp(floatType,
"double")) sprintf(vecType,
"double2");
16492 if (!strcmp(floatTypeInputMemory,
"half")) sprintf(vecTypeInput,
"f16vec2");
16493 if (!strcmp(floatTypeInputMemory,
"float")) sprintf(vecTypeInput,
"float2");
16494 if (!strcmp(floatTypeInputMemory,
"double")) sprintf(vecTypeInput,
"double2");
16495 if (!strcmp(floatTypeOutputMemory,
"half")) sprintf(vecTypeOutput,
"f16vec2");
16496 if (!strcmp(floatTypeOutputMemory,
"float")) sprintf(vecTypeOutput,
"float2");
16497 if (!strcmp(floatTypeOutputMemory,
"double")) sprintf(vecTypeOutput,
"double2");
// VKFFT_BACKEND==3: same mapping as the VKFFT_BACKEND==1 and ==2 branches.
16510#elif(VKFFT_BACKEND==3)
16511 if (!strcmp(floatType,
"half")) sprintf(vecType,
"f16vec2");
16512 if (!strcmp(floatType,
"float")) sprintf(vecType,
"float2");
16513 if (!strcmp(floatType,
"double")) sprintf(vecType,
"double2");
16514 if (!strcmp(floatTypeInputMemory,
"half")) sprintf(vecTypeInput,
"f16vec2");
16515 if (!strcmp(floatTypeInputMemory,
"float")) sprintf(vecTypeInput,
"float2");
16516 if (!strcmp(floatTypeInputMemory,
"double")) sprintf(vecTypeInput,
"double2");
16517 if (!strcmp(floatTypeOutputMemory,
"half")) sprintf(vecTypeOutput,
"f16vec2");
16518 if (!strcmp(floatTypeOutputMemory,
"float")) sprintf(vecTypeOutput,
"float2");
16519 if (!strcmp(floatTypeOutputMemory,
"double")) sprintf(vecTypeOutput,
"double2");
// Write the literal names "tshuffle", "inoutID" and "sdataID" into the sc fields
// of the same name.
// NOTE(review): presumably these are the identifier names used in the emitted
// kernel text (other format strings in this file spell out sdataID / inoutID
// literally) - confirm against the users of these fields.
16535 sprintf(sc->
tshuffle,
"tshuffle");
16538 sprintf(sc->
inoutID,
"inoutID");
16539 sprintf(sc->
sdataID,
"sdataID");
16558 res =
appendExtensions(sc, floatType, floatTypeInputMemory, floatTypeOutputMemory, floatTypeKernelMemory);
16573 if ((!sc->
LUT) && (!strcmp(floatType,
"double"))) {
16580 if (strcmp(floatType, floatTypeInputMemory)) {
16587 if (strcmp(floatType, floatTypeOutputMemory) && strcmp(floatTypeInputMemory, floatTypeOutputMemory)) {
// locType is forced to 1 when sc->axisSwapped is set and type is one of
// 0, 5, 6, 110, 120, 130, 140, 142 or 144; in every other case it is type unchanged.
16648 uint64_t locType = (((type == 0) || (type == 5) || (type == 6) || (type == 110) || (type == 120) || (type == 130) || (type == 140) || (type == 142) || (type == 144)) && (sc->
axisSwapped)) ? 1 : type;
16649#if(VKFFT_BACKEND==0)
16661#elif(VKFFT_BACKEND==1)
16662 sc->
tempLen = sprintf(sc->
tempStr,
"extern __shared__ float shared[];\n");
16677 sc->
tempLen = sprintf(sc->
tempStr,
"(%s* inputs, %s* outputs", floatTypeInputMemory, vecTypeOutput);
16682 sc->
tempLen = sprintf(sc->
tempStr,
"(%s* inputs, %s* outputs", vecTypeInput, floatTypeOutputMemory);
16687 sc->
tempLen = sprintf(sc->
tempStr,
"(%s* inputs, %s* outputs", floatTypeInputMemory, floatTypeOutputMemory);
16692 sc->
tempLen = sprintf(sc->
tempStr,
"(%s* inputs, %s* outputs", floatTypeInputMemory, floatTypeOutputMemory);
16697 sc->
tempLen = sprintf(sc->
tempStr,
"(%s* inputs, %s* outputs", floatTypeInputMemory, floatTypeOutputMemory);
16702 sc->
tempLen = sprintf(sc->
tempStr,
"(%s* inputs, %s* outputs", floatTypeInputMemory, floatTypeOutputMemory);
16707 sc->
tempLen = sprintf(sc->
tempStr,
"(%s* inputs, %s* outputs", floatTypeInputMemory, floatTypeOutputMemory);
16712 sc->
tempLen = sprintf(sc->
tempStr,
"(%s* inputs, %s* outputs", floatTypeInputMemory, floatTypeOutputMemory);
16717 sc->
tempLen = sprintf(sc->
tempStr,
"(%s* inputs, %s* outputs", floatTypeInputMemory, floatTypeOutputMemory);
16722 sc->
tempLen = sprintf(sc->
tempStr,
"(%s* inputs, %s* outputs", floatTypeInputMemory, floatTypeOutputMemory);
16727 sc->
tempLen = sprintf(sc->
tempStr,
"(%s* inputs, %s* outputs", floatTypeInputMemory, floatTypeOutputMemory);
16732 sc->
tempLen = sprintf(sc->
tempStr,
"(%s* inputs, %s* outputs", floatTypeInputMemory, floatTypeOutputMemory);
16737 sc->
tempLen = sprintf(sc->
tempStr,
"(%s* inputs, %s* outputs", floatTypeInputMemory, floatTypeOutputMemory);
16742 sc->
tempLen = sprintf(sc->
tempStr,
"(%s* inputs, %s* outputs", floatTypeInputMemory, floatTypeOutputMemory);
16747 sc->
tempLen = sprintf(sc->
tempStr,
"(%s* inputs, %s* outputs", vecTypeInput, vecTypeOutput);
16758 sc->
tempLen = sprintf(sc->
tempStr,
", %s* kernel_obj", vecType);
16766 sc->
tempLen = sprintf(sc->
tempStr,
", %s* twiddleLUT", vecType);
16774 sc->
tempLen = sprintf(sc->
tempStr,
", %s* BluesteinConvolutionKernel", vecType);
16782 sc->
tempLen = sprintf(sc->
tempStr,
", %s* BluesteinMultiplication", vecType);
16801#elif(VKFFT_BACKEND==2)
16802 sc->
tempLen = sprintf(sc->
tempStr,
"extern __shared__ float shared[];\n");
16817 sc->
tempLen = sprintf(sc->
tempStr,
"(%s* inputs, %s* outputs", floatTypeInputMemory, vecTypeOutput);
16822 sc->
tempLen = sprintf(sc->
tempStr,
"(%s* inputs, %s* outputs", vecTypeInput, floatTypeOutputMemory);
16827 sc->
tempLen = sprintf(sc->
tempStr,
"(%s* inputs, %s* outputs", floatTypeInputMemory, floatTypeOutputMemory);
16832 sc->
tempLen = sprintf(sc->
tempStr,
"(%s* inputs, %s* outputs", floatTypeInputMemory, floatTypeOutputMemory);
16837 sc->
tempLen = sprintf(sc->
tempStr,
"(%s* inputs, %s* outputs", floatTypeInputMemory, floatTypeOutputMemory);
16842 sc->
tempLen = sprintf(sc->
tempStr,
"(%s* inputs, %s* outputs", floatTypeInputMemory, floatTypeOutputMemory);
16847 sc->
tempLen = sprintf(sc->
tempStr,
"(%s* inputs, %s* outputs", floatTypeInputMemory, floatTypeOutputMemory);
16852 sc->
tempLen = sprintf(sc->
tempStr,
"(%s* inputs, %s* outputs", floatTypeInputMemory, floatTypeOutputMemory);
16857 sc->
tempLen = sprintf(sc->
tempStr,
"(%s* inputs, %s* outputs", floatTypeInputMemory, floatTypeOutputMemory);
16862 sc->
tempLen = sprintf(sc->
tempStr,
"(%s* inputs, %s* outputs", floatTypeInputMemory, floatTypeOutputMemory);
16867 sc->
tempLen = sprintf(sc->
tempStr,
"(%s* inputs, %s* outputs", floatTypeInputMemory, floatTypeOutputMemory);
16872 sc->
tempLen = sprintf(sc->
tempStr,
"(%s* inputs, %s* outputs", floatTypeInputMemory, floatTypeOutputMemory);
16877 sc->
tempLen = sprintf(sc->
tempStr,
"(%s* inputs, %s* outputs", floatTypeInputMemory, floatTypeOutputMemory);
16882 sc->
tempLen = sprintf(sc->
tempStr,
"(%s* inputs, %s* outputs", floatTypeInputMemory, floatTypeOutputMemory);
16887 sc->
tempLen = sprintf(sc->
tempStr,
"(%s* inputs, %s* outputs", vecTypeInput, vecTypeOutput);
16897 sc->
tempLen = sprintf(sc->
tempStr,
", %s* kernel_obj", vecType);
16905 sc->
tempLen = sprintf(sc->
tempStr,
", %s* twiddleLUT", vecType);
16913 sc->
tempLen = sprintf(sc->
tempStr,
", %s* BluesteinConvolutionKernel", vecType);
16921 sc->
tempLen = sprintf(sc->
tempStr,
", %s* BluesteinMultiplication", vecType);
16940#elif(VKFFT_BACKEND==3)
16950 sc->
tempLen = sprintf(sc->
tempStr,
"(__global %s* inputs, __global %s* outputs", floatTypeInputMemory, vecTypeOutput);
16955 sc->
tempLen = sprintf(sc->
tempStr,
"(__global %s* inputs, __global %s* outputs", vecTypeInput, floatTypeOutputMemory);
16960 sc->
tempLen = sprintf(sc->
tempStr,
"(__global %s* inputs, __global %s* outputs", floatTypeInputMemory, floatTypeOutputMemory);
16965 sc->
tempLen = sprintf(sc->
tempStr,
"(__global %s* inputs, __global %s* outputs", floatTypeInputMemory, floatTypeOutputMemory);
16970 sc->
tempLen = sprintf(sc->
tempStr,
"(__global %s* inputs, __global %s* outputs", floatTypeInputMemory, floatTypeOutputMemory);
16975 sc->
tempLen = sprintf(sc->
tempStr,
"(__global %s* inputs, __global %s* outputs", floatTypeInputMemory, floatTypeOutputMemory);
16980 sc->
tempLen = sprintf(sc->
tempStr,
"(__global %s* inputs, __global %s* outputs", floatTypeInputMemory, floatTypeOutputMemory);
16985 sc->
tempLen = sprintf(sc->
tempStr,
"(__global %s* inputs, __global %s* outputs", floatTypeInputMemory, floatTypeOutputMemory);
16990 sc->
tempLen = sprintf(sc->
tempStr,
"(__global %s* inputs, __global %s* outputs", floatTypeInputMemory, floatTypeOutputMemory);
16995 sc->
tempLen = sprintf(sc->
tempStr,
"(__global %s* inputs, __global %s* outputs", floatTypeInputMemory, floatTypeOutputMemory);
17000 sc->
tempLen = sprintf(sc->
tempStr,
"(__global %s* inputs, __global %s* outputs", floatTypeInputMemory, floatTypeOutputMemory);
17005 sc->
tempLen = sprintf(sc->
tempStr,
"(__global %s* inputs, __global %s* outputs", floatTypeInputMemory, floatTypeOutputMemory);
17010 sc->
tempLen = sprintf(sc->
tempStr,
"(__global %s* inputs, __global %s* outputs", floatTypeInputMemory, floatTypeOutputMemory);
17015 sc->
tempLen = sprintf(sc->
tempStr,
"(__global %s* inputs, __global %s* outputs", floatTypeInputMemory, floatTypeOutputMemory);
17020 sc->
tempLen = sprintf(sc->
tempStr,
"(__global %s* inputs, __global %s* outputs", vecTypeInput, vecTypeOutput);
17030 sc->
tempLen = sprintf(sc->
tempStr,
", __global %s* kernel_obj", vecType);
17038 sc->
tempLen = sprintf(sc->
tempStr,
", __global %s* twiddleLUT", vecType);
17046 sc->
tempLen = sprintf(sc->
tempStr,
", __global %s* BluesteinConvolutionKernel", vecType);
17054 sc->
tempLen = sprintf(sc->
tempStr,
", __global %s* BluesteinMultiplication", vecType);
17097 sc->
tempLen = sprintf(sc->
tempStr,
" for (%s coordinate=%" PRIu64
"; coordinate > 0; coordinate--){\n\
// Starting values for the per-stage loop that follows: stageSize begins at 1 and
// stageSizeSum (the running total the loop adds multiples of stageSize to) at 0.
17128 uint64_t stageSize = 1;
17129 uint64_t stageSizeSum = 0;
// The literal has more digits than a double keeps; it is rounded to the nearest double.
17130 double PI_const = 3.1415926535897932384626433832795;
// Sign of the angle follows sc->inverse: +PI_const when it is nonzero, -PI_const otherwise.
17131 double stageAngle = (sc->
inverse) ? PI_const : -PI_const;
17132 for (uint64_t i = 0; i < sc->
numStages; i++) {
17154 stageSizeSum += stageSize;
17157 stageSizeSum += stageSize * 2;
17160 stageSizeSum += stageSize * 2;
17163 stageSizeSum += stageSize * 4;
17166 stageSizeSum += stageSize * 6;
17169 stageSizeSum += stageSize * 3;
17172 stageSizeSum += stageSize * 10;
17175 stageSizeSum += stageSize * 12;
17250 stageAngle = PI_const;
17252 for (uint64_t i = 0; i < sc->
numStages; i++) {
17260 stageSizeSum += stageSize;
17263 stageSizeSum += stageSize * 2;
17266 stageSizeSum += stageSize * 2;
17269 stageSizeSum += stageSize * 4;
17272 stageSizeSum += stageSize * 6;
17275 stageSizeSum += stageSize * 3;
17278 stageSizeSum += stageSize * 10;
17281 stageSizeSum += stageSize * 12;
17683static inline VkFFTResult VkFFTGetRegistersPerThread(uint64_t* loc_multipliers, uint64_t* registers_per_thread_per_radix, uint64_t* registers_per_thread, uint64_t* min_registers_per_thread, uint64_t* isGoodSequence) {
17684 for (uint64_t i = 0; i < 14; i++) {
17685 registers_per_thread_per_radix[i] = 0;
17687 registers_per_thread[0] = 0;
17688 min_registers_per_thread[0] = -1;
17690 if (loc_multipliers[2] > 0) {
17691 if (loc_multipliers[3] > 0) {
17692 if (loc_multipliers[5] > 0) {
17693 if (loc_multipliers[7] > 0) {
17694 if (loc_multipliers[11] > 0) {
17695 if (loc_multipliers[13] > 0) {
17696 switch (loc_multipliers[2]) {
17698 registers_per_thread_per_radix[2] = 14;
17699 registers_per_thread_per_radix[3] = 15;
17702 registers_per_thread_per_radix[2] = 12;
17703 registers_per_thread_per_radix[3] = 12;
17706 registers_per_thread_per_radix[2] = 12;
17707 registers_per_thread_per_radix[3] = 12;
17710 registers_per_thread_per_radix[2] = 16;
17711 registers_per_thread_per_radix[3] = 12;
17714 registers_per_thread_per_radix[5] = 15;
17715 registers_per_thread_per_radix[7] = 14;
17716 registers_per_thread_per_radix[11] = 11;
17717 registers_per_thread_per_radix[13] = 13;
17720 switch (loc_multipliers[2]) {
17722 registers_per_thread_per_radix[2] = 14;
17723 registers_per_thread_per_radix[3] = 15;
17726 registers_per_thread_per_radix[2] = 12;
17727 registers_per_thread_per_radix[3] = 12;
17730 registers_per_thread_per_radix[2] = 12;
17731 registers_per_thread_per_radix[3] = 12;
17734 registers_per_thread_per_radix[2] = 16;
17735 registers_per_thread_per_radix[3] = 12;
17738 registers_per_thread_per_radix[5] = 15;
17739 registers_per_thread_per_radix[7] = 14;
17740 registers_per_thread_per_radix[11] = 11;
17741 registers_per_thread_per_radix[13] = 0;
17745 if (loc_multipliers[13] > 0) {
17746 switch (loc_multipliers[2]) {
17748 registers_per_thread_per_radix[2] = 14;
17749 registers_per_thread_per_radix[3] = 15;
17752 registers_per_thread_per_radix[2] = 12;
17753 registers_per_thread_per_radix[3] = 12;
17756 registers_per_thread_per_radix[2] = 12;
17757 registers_per_thread_per_radix[3] = 12;
17760 registers_per_thread_per_radix[2] = 16;
17761 registers_per_thread_per_radix[3] = 12;
17764 registers_per_thread_per_radix[5] = 15;
17765 registers_per_thread_per_radix[7] = 14;
17766 registers_per_thread_per_radix[11] = 0;
17767 registers_per_thread_per_radix[13] = 13;
17771 switch (loc_multipliers[2]) {
17773 registers_per_thread_per_radix[2] = 14;
17774 registers_per_thread_per_radix[3] = 15;
17778 registers_per_thread_per_radix[2] = 12;
17779 registers_per_thread_per_radix[3] = 12;
17782 registers_per_thread_per_radix[2] = 12;
17783 registers_per_thread_per_radix[3] = 12;
17786 registers_per_thread_per_radix[2] = 16;
17787 registers_per_thread_per_radix[3] = 12;
17790 registers_per_thread_per_radix[5] = 15;
17791 registers_per_thread_per_radix[7] = 14;
17792 registers_per_thread_per_radix[11] = 0;
17793 registers_per_thread_per_radix[13] = 0;
17798 if (loc_multipliers[11] > 0) {
17799 if (loc_multipliers[13] > 0) {
17800 switch (loc_multipliers[2]) {
17802 registers_per_thread_per_radix[2] = 10;
17803 registers_per_thread_per_radix[3] = 15;
17806 registers_per_thread_per_radix[2] = 12;
17807 registers_per_thread_per_radix[3] = 12;
17810 registers_per_thread_per_radix[2] = 12;
17811 registers_per_thread_per_radix[3] = 12;
17814 registers_per_thread_per_radix[5] = 10;
17815 registers_per_thread_per_radix[7] = 0;
17816 registers_per_thread_per_radix[11] = 11;
17817 registers_per_thread_per_radix[13] = 13;
17820 switch (loc_multipliers[2]) {
17822 registers_per_thread_per_radix[2] = 10;
17823 registers_per_thread_per_radix[3] = 15;
17826 registers_per_thread_per_radix[2] = 12;
17827 registers_per_thread_per_radix[3] = 12;
17830 registers_per_thread_per_radix[2] = 12;
17831 registers_per_thread_per_radix[3] = 12;
17834 registers_per_thread_per_radix[5] = 10;
17835 registers_per_thread_per_radix[7] = 0;
17836 registers_per_thread_per_radix[11] = 11;
17837 registers_per_thread_per_radix[13] = 0;
17841 if (loc_multipliers[13] > 0) {
17842 switch (loc_multipliers[2]) {
17844 registers_per_thread_per_radix[2] = 10;
17845 registers_per_thread_per_radix[3] = 15;
17848 registers_per_thread_per_radix[2] = 12;
17849 registers_per_thread_per_radix[3] = 12;
17852 registers_per_thread_per_radix[2] = 12;
17853 registers_per_thread_per_radix[3] = 12;
17856 registers_per_thread_per_radix[5] = 10;
17857 registers_per_thread_per_radix[7] = 0;
17858 registers_per_thread_per_radix[11] = 0;
17859 registers_per_thread_per_radix[13] = 13;
17862 switch (loc_multipliers[2]) {
17864 registers_per_thread_per_radix[2] = 6;
17865 registers_per_thread_per_radix[3] = 6;
17866 registers_per_thread_per_radix[5] = 5;
17869 registers_per_thread_per_radix[2] = 12;
17870 registers_per_thread_per_radix[3] = 12;
17871 registers_per_thread_per_radix[5] = 10;
17874 registers_per_thread_per_radix[2] = 12;
17875 registers_per_thread_per_radix[3] = 12;
17876 registers_per_thread_per_radix[5] = 10;
17879 registers_per_thread_per_radix[7] = 0;
17880 registers_per_thread_per_radix[11] = 0;
17881 registers_per_thread_per_radix[13] = 0;
17889 if (loc_multipliers[7] > 0) {
17890 if (loc_multipliers[11] > 0) {
17891 if (loc_multipliers[13] > 0) {
17892 switch (loc_multipliers[2]) {
17894 registers_per_thread_per_radix[2] = 22;
17895 registers_per_thread_per_radix[3] = 21;
17896 registers_per_thread_per_radix[5] = 0;
17897 registers_per_thread_per_radix[7] = 21;
17898 registers_per_thread_per_radix[11] = 22;
17899 registers_per_thread_per_radix[13] = 26;
17902 registers_per_thread_per_radix[2] = 12;
17903 registers_per_thread_per_radix[3] = 12;
17904 registers_per_thread_per_radix[5] = 0;
17905 registers_per_thread_per_radix[7] = 14;
17906 registers_per_thread_per_radix[11] = 11;
17907 registers_per_thread_per_radix[13] = 13;
17910 registers_per_thread_per_radix[2] = 12;
17911 registers_per_thread_per_radix[3] = 12;
17912 registers_per_thread_per_radix[5] = 0;
17913 registers_per_thread_per_radix[7] = 14;
17914 registers_per_thread_per_radix[11] = 11;
17915 registers_per_thread_per_radix[13] = 13;
17920 switch (loc_multipliers[2]) {
17922 registers_per_thread_per_radix[2] = 22;
17923 registers_per_thread_per_radix[3] = 21;
17924 registers_per_thread_per_radix[5] = 0;
17925 registers_per_thread_per_radix[7] = 21;
17926 registers_per_thread_per_radix[11] = 22;
17927 registers_per_thread_per_radix[13] = 0;
17930 registers_per_thread_per_radix[2] = 12;
17931 registers_per_thread_per_radix[3] = 12;
17932 registers_per_thread_per_radix[5] = 0;
17933 registers_per_thread_per_radix[7] = 14;
17934 registers_per_thread_per_radix[11] = 11;
17935 registers_per_thread_per_radix[13] = 0;
17938 registers_per_thread_per_radix[2] = 12;
17939 registers_per_thread_per_radix[3] = 12;
17940 registers_per_thread_per_radix[5] = 0;
17941 registers_per_thread_per_radix[7] = 14;
17942 registers_per_thread_per_radix[11] = 11;
17943 registers_per_thread_per_radix[13] = 0;
17949 if (loc_multipliers[13] > 0) {
17950 switch (loc_multipliers[2]) {
17952 registers_per_thread_per_radix[2] = 26;
17953 registers_per_thread_per_radix[3] = 21;
17954 registers_per_thread_per_radix[5] = 0;
17955 registers_per_thread_per_radix[7] = 21;
17956 registers_per_thread_per_radix[11] = 0;
17957 registers_per_thread_per_radix[13] = 26;
17960 registers_per_thread_per_radix[2] = 12;
17961 registers_per_thread_per_radix[3] = 12;
17962 registers_per_thread_per_radix[5] = 0;
17963 registers_per_thread_per_radix[7] = 14;
17964 registers_per_thread_per_radix[11] = 0;
17965 registers_per_thread_per_radix[13] = 13;
17968 registers_per_thread_per_radix[2] = 12;
17969 registers_per_thread_per_radix[3] = 12;
17970 registers_per_thread_per_radix[5] = 0;
17971 registers_per_thread_per_radix[7] = 14;
17972 registers_per_thread_per_radix[11] = 0;
17973 registers_per_thread_per_radix[13] = 13;
17978 switch (loc_multipliers[2]) {
17980 registers_per_thread_per_radix[2] = 6;
17981 registers_per_thread_per_radix[3] = 6;
17982 registers_per_thread_per_radix[5] = 0;
17983 registers_per_thread_per_radix[7] = 7;
17984 registers_per_thread_per_radix[11] = 0;
17985 registers_per_thread_per_radix[13] = 0;
17988 registers_per_thread_per_radix[2] = 6;
17989 registers_per_thread_per_radix[3] = 6;
17990 registers_per_thread_per_radix[5] = 0;
17991 registers_per_thread_per_radix[7] = 7;
17992 registers_per_thread_per_radix[11] = 0;
17993 registers_per_thread_per_radix[13] = 0;
17996 registers_per_thread_per_radix[2] = 8;
17997 registers_per_thread_per_radix[3] = 6;
17998 registers_per_thread_per_radix[5] = 0;
17999 registers_per_thread_per_radix[7] = 7;
18000 registers_per_thread_per_radix[11] = 0;
18001 registers_per_thread_per_radix[13] = 0;
18008 if (loc_multipliers[11] > 0) {
18009 if (loc_multipliers[13] > 0) {
18010 switch (loc_multipliers[2]) {
18012 registers_per_thread_per_radix[2] = 6;
18013 registers_per_thread_per_radix[3] = 6;
18014 registers_per_thread_per_radix[5] = 0;
18015 registers_per_thread_per_radix[7] = 0;
18016 registers_per_thread_per_radix[11] = 11;
18017 registers_per_thread_per_radix[13] = 13;
18020 registers_per_thread_per_radix[2] = 12;
18021 registers_per_thread_per_radix[3] = 12;
18022 registers_per_thread_per_radix[5] = 0;
18023 registers_per_thread_per_radix[7] = 0;
18024 registers_per_thread_per_radix[11] = 11;
18025 registers_per_thread_per_radix[13] = 13;
18028 registers_per_thread_per_radix[2] = 12;
18029 registers_per_thread_per_radix[3] = 12;
18030 registers_per_thread_per_radix[5] = 0;
18031 registers_per_thread_per_radix[7] = 0;
18032 registers_per_thread_per_radix[11] = 11;
18033 registers_per_thread_per_radix[13] = 13;
18038 switch (loc_multipliers[2]) {
18040 registers_per_thread_per_radix[2] = 6;
18041 registers_per_thread_per_radix[3] = 6;
18042 registers_per_thread_per_radix[5] = 0;
18043 registers_per_thread_per_radix[7] = 0;
18044 registers_per_thread_per_radix[11] = 11;
18045 registers_per_thread_per_radix[13] = 0;
18048 registers_per_thread_per_radix[2] = 12;
18049 registers_per_thread_per_radix[3] = 12;
18050 registers_per_thread_per_radix[5] = 0;
18051 registers_per_thread_per_radix[7] = 0;
18052 registers_per_thread_per_radix[11] = 11;
18053 registers_per_thread_per_radix[13] = 0;
18056 registers_per_thread_per_radix[2] = 12;
18057 registers_per_thread_per_radix[3] = 12;
18058 registers_per_thread_per_radix[5] = 0;
18059 registers_per_thread_per_radix[7] = 0;
18060 registers_per_thread_per_radix[11] = 11;
18061 registers_per_thread_per_radix[13] = 0;
18067 if (loc_multipliers[13] > 0) {
18068 switch (loc_multipliers[2]) {
18070 registers_per_thread_per_radix[2] = 6;
18071 registers_per_thread_per_radix[3] = 6;
18072 registers_per_thread_per_radix[5] = 0;
18073 registers_per_thread_per_radix[7] = 0;
18074 registers_per_thread_per_radix[11] = 0;
18075 registers_per_thread_per_radix[13] = 13;
18078 registers_per_thread_per_radix[2] = 12;
18079 registers_per_thread_per_radix[3] = 12;
18080 registers_per_thread_per_radix[5] = 0;
18081 registers_per_thread_per_radix[7] = 0;
18082 registers_per_thread_per_radix[11] = 0;
18083 registers_per_thread_per_radix[13] = 13;
18086 registers_per_thread_per_radix[2] = 12;
18087 registers_per_thread_per_radix[3] = 12;
18088 registers_per_thread_per_radix[5] = 0;
18089 registers_per_thread_per_radix[7] = 0;
18090 registers_per_thread_per_radix[11] = 0;
18091 registers_per_thread_per_radix[13] = 13;
18096 switch (loc_multipliers[2]) {
18098 registers_per_thread_per_radix[2] = 6;
18099 registers_per_thread_per_radix[3] = 6;
18100 registers_per_thread_per_radix[5] = 0;
18101 registers_per_thread_per_radix[7] = 0;
18102 registers_per_thread_per_radix[11] = 0;
18103 registers_per_thread_per_radix[13] = 0;
18106 registers_per_thread_per_radix[2] = 12;
18107 registers_per_thread_per_radix[3] = 12;
18108 registers_per_thread_per_radix[5] = 0;
18109 registers_per_thread_per_radix[7] = 0;
18110 registers_per_thread_per_radix[11] = 0;
18111 registers_per_thread_per_radix[13] = 0;
18114 registers_per_thread_per_radix[2] = 12;
18115 registers_per_thread_per_radix[3] = 12;
18116 registers_per_thread_per_radix[5] = 0;
18117 registers_per_thread_per_radix[7] = 0;
18118 registers_per_thread_per_radix[11] = 0;
18119 registers_per_thread_per_radix[13] = 0;
18128 if (loc_multipliers[5] > 0) {
18129 if (loc_multipliers[7] > 0) {
18130 if (loc_multipliers[11] > 0) {
18131 if (loc_multipliers[13] > 0) {
18132 switch (loc_multipliers[2]) {
18134 registers_per_thread_per_radix[2] = 10;
18135 registers_per_thread_per_radix[3] = 0;
18136 registers_per_thread_per_radix[5] = 10;
18137 registers_per_thread_per_radix[7] = 14;
18138 registers_per_thread_per_radix[11] = 11;
18139 registers_per_thread_per_radix[13] = 13;
18142 registers_per_thread_per_radix[2] = 10;
18143 registers_per_thread_per_radix[3] = 0;
18144 registers_per_thread_per_radix[5] = 10;
18145 registers_per_thread_per_radix[7] = 14;
18146 registers_per_thread_per_radix[11] = 11;
18147 registers_per_thread_per_radix[13] = 13;
18150 registers_per_thread_per_radix[2] = 8;
18151 registers_per_thread_per_radix[3] = 0;
18152 registers_per_thread_per_radix[5] = 10;
18153 registers_per_thread_per_radix[7] = 14;
18154 registers_per_thread_per_radix[11] = 11;
18155 registers_per_thread_per_radix[13] = 13;
18158 registers_per_thread_per_radix[2] = 16;
18159 registers_per_thread_per_radix[3] = 0;
18160 registers_per_thread_per_radix[5] = 10;
18161 registers_per_thread_per_radix[7] = 14;
18162 registers_per_thread_per_radix[11] = 11;
18163 registers_per_thread_per_radix[13] = 13;
18168 switch (loc_multipliers[2]) {
18170 registers_per_thread_per_radix[2] = 10;
18171 registers_per_thread_per_radix[3] = 0;
18172 registers_per_thread_per_radix[5] = 10;
18173 registers_per_thread_per_radix[7] = 14;
18174 registers_per_thread_per_radix[11] = 11;
18175 registers_per_thread_per_radix[13] = 0;
18178 registers_per_thread_per_radix[2] = 10;
18179 registers_per_thread_per_radix[3] = 0;
18180 registers_per_thread_per_radix[5] = 10;
18181 registers_per_thread_per_radix[7] = 14;
18182 registers_per_thread_per_radix[11] = 11;
18183 registers_per_thread_per_radix[13] = 0;
18186 registers_per_thread_per_radix[2] = 8;
18187 registers_per_thread_per_radix[3] = 0;
18188 registers_per_thread_per_radix[5] = 10;
18189 registers_per_thread_per_radix[7] = 14;
18190 registers_per_thread_per_radix[11] = 11;
18191 registers_per_thread_per_radix[13] = 0;
18194 registers_per_thread_per_radix[2] = 16;
18195 registers_per_thread_per_radix[3] = 0;
18196 registers_per_thread_per_radix[5] = 10;
18197 registers_per_thread_per_radix[7] = 14;
18198 registers_per_thread_per_radix[11] = 11;
18199 registers_per_thread_per_radix[13] = 0;
18205 if (loc_multipliers[13] > 0) {
18206 switch (loc_multipliers[2]) {
18208 registers_per_thread_per_radix[2] = 10;
18209 registers_per_thread_per_radix[3] = 0;
18210 registers_per_thread_per_radix[5] = 10;
18211 registers_per_thread_per_radix[7] = 14;
18212 registers_per_thread_per_radix[11] = 0;
18213 registers_per_thread_per_radix[13] = 13;
18216 registers_per_thread_per_radix[2] = 10;
18217 registers_per_thread_per_radix[3] = 0;
18218 registers_per_thread_per_radix[5] = 10;
18219 registers_per_thread_per_radix[7] = 14;
18220 registers_per_thread_per_radix[11] = 0;
18221 registers_per_thread_per_radix[13] = 13;
18224 registers_per_thread_per_radix[2] = 8;
18225 registers_per_thread_per_radix[3] = 0;
18226 registers_per_thread_per_radix[5] = 10;
18227 registers_per_thread_per_radix[7] = 14;
18228 registers_per_thread_per_radix[11] = 0;
18229 registers_per_thread_per_radix[13] = 13;
18232 registers_per_thread_per_radix[2] = 16;
18233 registers_per_thread_per_radix[3] = 0;
18234 registers_per_thread_per_radix[5] = 10;
18235 registers_per_thread_per_radix[7] = 14;
18236 registers_per_thread_per_radix[11] = 0;
18237 registers_per_thread_per_radix[13] = 13;
18242 switch (loc_multipliers[2]) {
18244 registers_per_thread_per_radix[2] = 10;
18245 registers_per_thread_per_radix[3] = 0;
18246 registers_per_thread_per_radix[5] = 10;
18247 registers_per_thread_per_radix[7] = 7;
18248 registers_per_thread_per_radix[11] = 0;
18249 registers_per_thread_per_radix[13] = 0;
18252 registers_per_thread_per_radix[2] = 10;
18253 registers_per_thread_per_radix[3] = 0;
18254 registers_per_thread_per_radix[5] = 10;
18255 registers_per_thread_per_radix[7] = 7;
18256 registers_per_thread_per_radix[11] = 0;
18257 registers_per_thread_per_radix[13] = 0;
18260 registers_per_thread_per_radix[2] = 8;
18261 registers_per_thread_per_radix[3] = 0;
18262 registers_per_thread_per_radix[5] = 10;
18263 registers_per_thread_per_radix[7] = 7;
18264 registers_per_thread_per_radix[11] = 0;
18265 registers_per_thread_per_radix[13] = 0;
18272 if (loc_multipliers[11] > 0) {
18273 if (loc_multipliers[13] > 0) {
18274 switch (loc_multipliers[2]) {
18276 registers_per_thread_per_radix[2] = 10;
18277 registers_per_thread_per_radix[3] = 0;
18278 registers_per_thread_per_radix[5] = 10;
18279 registers_per_thread_per_radix[7] = 0;
18280 registers_per_thread_per_radix[11] = 11;
18281 registers_per_thread_per_radix[13] = 13;
18284 registers_per_thread_per_radix[2] = 10;
18285 registers_per_thread_per_radix[3] = 0;
18286 registers_per_thread_per_radix[5] = 10;
18287 registers_per_thread_per_radix[7] = 0;
18288 registers_per_thread_per_radix[11] = 11;
18289 registers_per_thread_per_radix[13] = 13;
18292 registers_per_thread_per_radix[2] = 8;
18293 registers_per_thread_per_radix[3] = 0;
18294 registers_per_thread_per_radix[5] = 10;
18295 registers_per_thread_per_radix[7] = 0;
18296 registers_per_thread_per_radix[11] = 11;
18297 registers_per_thread_per_radix[13] = 13;
18302 switch (loc_multipliers[2]) {
18304 registers_per_thread_per_radix[2] = 10;
18305 registers_per_thread_per_radix[3] = 0;
18306 registers_per_thread_per_radix[5] = 10;
18307 registers_per_thread_per_radix[7] = 0;
18308 registers_per_thread_per_radix[11] = 11;
18309 registers_per_thread_per_radix[13] = 0;
18312 registers_per_thread_per_radix[2] = 10;
18313 registers_per_thread_per_radix[3] = 0;
18314 registers_per_thread_per_radix[5] = 10;
18315 registers_per_thread_per_radix[7] = 0;
18316 registers_per_thread_per_radix[11] = 11;
18317 registers_per_thread_per_radix[13] = 0;
18320 registers_per_thread_per_radix[2] = 8;
18321 registers_per_thread_per_radix[3] = 0;
18322 registers_per_thread_per_radix[5] = 10;
18323 registers_per_thread_per_radix[7] = 0;
18324 registers_per_thread_per_radix[11] = 11;
18325 registers_per_thread_per_radix[13] = 0;
18331 if (loc_multipliers[13] > 0) {
18332 switch (loc_multipliers[2]) {
18334 registers_per_thread_per_radix[2] = 10;
18335 registers_per_thread_per_radix[3] = 0;
18336 registers_per_thread_per_radix[5] = 10;
18337 registers_per_thread_per_radix[7] = 0;
18338 registers_per_thread_per_radix[11] = 0;
18339 registers_per_thread_per_radix[13] = 13;
18342 registers_per_thread_per_radix[2] = 10;
18343 registers_per_thread_per_radix[3] = 0;
18344 registers_per_thread_per_radix[5] = 10;
18345 registers_per_thread_per_radix[7] = 0;
18346 registers_per_thread_per_radix[11] = 0;
18347 registers_per_thread_per_radix[13] = 13;
18350 registers_per_thread_per_radix[2] = 8;
18351 registers_per_thread_per_radix[3] = 0;
18352 registers_per_thread_per_radix[5] = 10;
18353 registers_per_thread_per_radix[7] = 0;
18354 registers_per_thread_per_radix[11] = 0;
18355 registers_per_thread_per_radix[13] = 13;
18360 switch (loc_multipliers[2]) {
18362 registers_per_thread_per_radix[2] = 10;
18363 registers_per_thread_per_radix[3] = 0;
18364 registers_per_thread_per_radix[5] = 10;
18365 registers_per_thread_per_radix[7] = 0;
18366 registers_per_thread_per_radix[11] = 0;
18367 registers_per_thread_per_radix[13] = 0;
18370 registers_per_thread_per_radix[2] = 10;
18371 registers_per_thread_per_radix[3] = 0;
18372 registers_per_thread_per_radix[5] = 10;
18373 registers_per_thread_per_radix[7] = 0;
18374 registers_per_thread_per_radix[11] = 0;
18375 registers_per_thread_per_radix[13] = 0;
18378 registers_per_thread_per_radix[2] = 8;
18379 registers_per_thread_per_radix[3] = 0;
18380 registers_per_thread_per_radix[5] = 10;
18381 registers_per_thread_per_radix[7] = 0;
18382 registers_per_thread_per_radix[11] = 0;
18383 registers_per_thread_per_radix[13] = 0;
18392 if (loc_multipliers[7] > 0) {
18393 if (loc_multipliers[11] > 0) {
18394 if (loc_multipliers[13] > 0) {
18395 switch (loc_multipliers[2]) {
18397 registers_per_thread_per_radix[2] = 14;
18398 registers_per_thread_per_radix[3] = 0;
18399 registers_per_thread_per_radix[5] = 0;
18400 registers_per_thread_per_radix[7] = 14;
18401 registers_per_thread_per_radix[11] = 11;
18402 registers_per_thread_per_radix[13] = 13;
18405 registers_per_thread_per_radix[2] = 14;
18406 registers_per_thread_per_radix[3] = 0;
18407 registers_per_thread_per_radix[5] = 0;
18408 registers_per_thread_per_radix[7] = 14;
18409 registers_per_thread_per_radix[11] = 11;
18410 registers_per_thread_per_radix[13] = 13;
18413 registers_per_thread_per_radix[2] = 8;
18414 registers_per_thread_per_radix[3] = 0;
18415 registers_per_thread_per_radix[5] = 0;
18416 registers_per_thread_per_radix[7] = 14;
18417 registers_per_thread_per_radix[11] = 11;
18418 registers_per_thread_per_radix[13] = 13;
18421 registers_per_thread_per_radix[2] = 16;
18422 registers_per_thread_per_radix[3] = 0;
18423 registers_per_thread_per_radix[5] = 0;
18424 registers_per_thread_per_radix[7] = 14;
18425 registers_per_thread_per_radix[11] = 11;
18426 registers_per_thread_per_radix[13] = 13;
18431 switch (loc_multipliers[2]) {
18433 registers_per_thread_per_radix[2] = 14;
18434 registers_per_thread_per_radix[3] = 0;
18435 registers_per_thread_per_radix[5] = 0;
18436 registers_per_thread_per_radix[7] = 14;
18437 registers_per_thread_per_radix[11] = 11;
18438 registers_per_thread_per_radix[13] = 0;
18441 registers_per_thread_per_radix[2] = 14;
18442 registers_per_thread_per_radix[3] = 0;
18443 registers_per_thread_per_radix[5] = 0;
18444 registers_per_thread_per_radix[7] = 14;
18445 registers_per_thread_per_radix[11] = 11;
18446 registers_per_thread_per_radix[13] = 0;
18449 registers_per_thread_per_radix[2] = 8;
18450 registers_per_thread_per_radix[3] = 0;
18451 registers_per_thread_per_radix[5] = 0;
18452 registers_per_thread_per_radix[7] = 14;
18453 registers_per_thread_per_radix[11] = 11;
18454 registers_per_thread_per_radix[13] = 0;
18457 registers_per_thread_per_radix[2] = 16;
18458 registers_per_thread_per_radix[3] = 0;
18459 registers_per_thread_per_radix[5] = 0;
18460 registers_per_thread_per_radix[7] = 14;
18461 registers_per_thread_per_radix[11] = 11;
18462 registers_per_thread_per_radix[13] = 0;
18468 if (loc_multipliers[13] > 0) {
18469 switch (loc_multipliers[2]) {
18471 registers_per_thread_per_radix[2] = 14;
18472 registers_per_thread_per_radix[3] = 0;
18473 registers_per_thread_per_radix[5] = 0;
18474 registers_per_thread_per_radix[7] = 14;
18475 registers_per_thread_per_radix[11] = 0;
18476 registers_per_thread_per_radix[13] = 13;
18479 registers_per_thread_per_radix[2] = 14;
18480 registers_per_thread_per_radix[3] = 0;
18481 registers_per_thread_per_radix[5] = 0;
18482 registers_per_thread_per_radix[7] = 14;
18483 registers_per_thread_per_radix[11] = 0;
18484 registers_per_thread_per_radix[13] = 13;
18487 registers_per_thread_per_radix[2] = 8;
18488 registers_per_thread_per_radix[3] = 0;
18489 registers_per_thread_per_radix[5] = 0;
18490 registers_per_thread_per_radix[7] = 14;
18491 registers_per_thread_per_radix[11] = 0;
18492 registers_per_thread_per_radix[13] = 13;
18495 registers_per_thread_per_radix[2] = 16;
18496 registers_per_thread_per_radix[3] = 0;
18497 registers_per_thread_per_radix[5] = 0;
18498 registers_per_thread_per_radix[7] = 14;
18499 registers_per_thread_per_radix[11] = 0;
18500 registers_per_thread_per_radix[13] = 13;
18505 switch (loc_multipliers[2]) {
18507 registers_per_thread_per_radix[2] = 14;
18508 registers_per_thread_per_radix[3] = 0;
18509 registers_per_thread_per_radix[5] = 0;
18510 registers_per_thread_per_radix[7] = 14;
18511 registers_per_thread_per_radix[11] = 0;
18512 registers_per_thread_per_radix[13] = 0;
18515 registers_per_thread_per_radix[2] = 14;
18516 registers_per_thread_per_radix[3] = 0;
18517 registers_per_thread_per_radix[5] = 0;
18518 registers_per_thread_per_radix[7] = 14;
18519 registers_per_thread_per_radix[11] = 0;
18520 registers_per_thread_per_radix[13] = 0;
18523 registers_per_thread_per_radix[2] = 14;
18524 registers_per_thread_per_radix[3] = 0;
18525 registers_per_thread_per_radix[5] = 0;
18526 registers_per_thread_per_radix[7] = 14;
18527 registers_per_thread_per_radix[11] = 0;
18528 registers_per_thread_per_radix[13] = 0;
18531 registers_per_thread_per_radix[2] = 14;
18532 registers_per_thread_per_radix[3] = 0;
18533 registers_per_thread_per_radix[5] = 0;
18534 registers_per_thread_per_radix[7] = 14;
18535 registers_per_thread_per_radix[11] = 0;
18536 registers_per_thread_per_radix[13] = 0;
18543 if (loc_multipliers[11] > 0) {
18544 if (loc_multipliers[13] > 0) {
18545 switch (loc_multipliers[2]) {
18547 registers_per_thread_per_radix[2] = 22;
18548 registers_per_thread_per_radix[3] = 0;
18549 registers_per_thread_per_radix[5] = 0;
18550 registers_per_thread_per_radix[7] = 0;
18551 registers_per_thread_per_radix[11] = 22;
18552 registers_per_thread_per_radix[13] = 26;
18555 registers_per_thread_per_radix[2] = 22;
18556 registers_per_thread_per_radix[3] = 0;
18557 registers_per_thread_per_radix[5] = 0;
18558 registers_per_thread_per_radix[7] = 0;
18559 registers_per_thread_per_radix[11] = 22;
18560 registers_per_thread_per_radix[13] = 26;
18563 registers_per_thread_per_radix[2] = 8;
18564 registers_per_thread_per_radix[3] = 0;
18565 registers_per_thread_per_radix[5] = 0;
18566 registers_per_thread_per_radix[7] = 0;
18567 registers_per_thread_per_radix[11] = 11;
18568 registers_per_thread_per_radix[13] = 13;
18573 switch (loc_multipliers[2]) {
18575 registers_per_thread_per_radix[2] = 22;
18576 registers_per_thread_per_radix[3] = 0;
18577 registers_per_thread_per_radix[5] = 0;
18578 registers_per_thread_per_radix[7] = 0;
18579 registers_per_thread_per_radix[11] = 22;
18580 registers_per_thread_per_radix[13] = 0;
18583 registers_per_thread_per_radix[2] = 22;
18584 registers_per_thread_per_radix[3] = 0;
18585 registers_per_thread_per_radix[5] = 0;
18586 registers_per_thread_per_radix[7] = 0;
18587 registers_per_thread_per_radix[11] = 22;
18588 registers_per_thread_per_radix[13] = 0;
18591 registers_per_thread_per_radix[2] = 8;
18592 registers_per_thread_per_radix[3] = 0;
18593 registers_per_thread_per_radix[5] = 0;
18594 registers_per_thread_per_radix[7] = 0;
18595 registers_per_thread_per_radix[11] = 11;
18596 registers_per_thread_per_radix[13] = 0;
18599 registers_per_thread_per_radix[2] = 8;
18600 registers_per_thread_per_radix[3] = 0;
18601 registers_per_thread_per_radix[5] = 0;
18602 registers_per_thread_per_radix[7] = 0;
18603 registers_per_thread_per_radix[11] = 11;
18604 registers_per_thread_per_radix[13] = 0;
18610 if (loc_multipliers[13] > 0) {
18611 switch (loc_multipliers[2]) {
18613 registers_per_thread_per_radix[2] = 26;
18614 registers_per_thread_per_radix[3] = 0;
18615 registers_per_thread_per_radix[5] = 0;
18616 registers_per_thread_per_radix[7] = 0;
18617 registers_per_thread_per_radix[11] = 0;
18618 registers_per_thread_per_radix[13] = 26;
18621 registers_per_thread_per_radix[2] = 26;
18622 registers_per_thread_per_radix[3] = 0;
18623 registers_per_thread_per_radix[5] = 0;
18624 registers_per_thread_per_radix[7] = 0;
18625 registers_per_thread_per_radix[11] = 0;
18626 registers_per_thread_per_radix[13] = 26;
18629 registers_per_thread_per_radix[2] = 8;
18630 registers_per_thread_per_radix[3] = 0;
18631 registers_per_thread_per_radix[5] = 0;
18632 registers_per_thread_per_radix[7] = 0;
18633 registers_per_thread_per_radix[11] = 0;
18634 registers_per_thread_per_radix[13] = 13;
18639 registers_per_thread_per_radix[2] = (loc_multipliers[2] > 2) ? 8 : (uint64_t)pow(2, loc_multipliers[2]);
18640 registers_per_thread_per_radix[3] = 0;
18641 registers_per_thread_per_radix[5] = 0;
18642 registers_per_thread_per_radix[7] = 0;
18643 registers_per_thread_per_radix[11] = 0;
18644 registers_per_thread_per_radix[13] = 0;
18652 if (loc_multipliers[3] > 0) {
18653 if (loc_multipliers[5] > 0) {
18654 if (loc_multipliers[7] > 0) {
18655 if (loc_multipliers[11] > 0) {
18656 if (loc_multipliers[13] > 0) {
18657 registers_per_thread_per_radix[2] = 0;
18658 registers_per_thread_per_radix[3] = 15;
18659 registers_per_thread_per_radix[5] = 15;
18660 registers_per_thread_per_radix[7] = 21;
18661 registers_per_thread_per_radix[11] = 11;
18662 registers_per_thread_per_radix[13] = 13;
18665 registers_per_thread_per_radix[2] = 0;
18666 registers_per_thread_per_radix[3] = 15;
18667 registers_per_thread_per_radix[5] = 15;
18668 registers_per_thread_per_radix[7] = 21;
18669 registers_per_thread_per_radix[11] = 11;
18670 registers_per_thread_per_radix[13] = 0;
18674 if (loc_multipliers[13] > 0) {
18675 registers_per_thread_per_radix[2] = 0;
18676 registers_per_thread_per_radix[3] = 15;
18677 registers_per_thread_per_radix[5] = 15;
18678 registers_per_thread_per_radix[7] = 21;
18679 registers_per_thread_per_radix[11] = 0;
18680 registers_per_thread_per_radix[13] = 13;
18683 registers_per_thread_per_radix[2] = 0;
18684 registers_per_thread_per_radix[3] = 15;
18685 registers_per_thread_per_radix[5] = 15;
18686 registers_per_thread_per_radix[7] = 21;
18687 registers_per_thread_per_radix[11] = 0;
18688 registers_per_thread_per_radix[13] = 0;
18693 if (loc_multipliers[11] > 0) {
18694 if (loc_multipliers[13] > 0) {
18695 registers_per_thread_per_radix[2] = 0;
18696 registers_per_thread_per_radix[3] = 15;
18697 registers_per_thread_per_radix[5] = 15;
18698 registers_per_thread_per_radix[7] = 0;
18699 registers_per_thread_per_radix[11] = 11;
18700 registers_per_thread_per_radix[13] = 13;
18703 registers_per_thread_per_radix[2] = 0;
18704 registers_per_thread_per_radix[3] = 15;
18705 registers_per_thread_per_radix[5] = 15;
18706 registers_per_thread_per_radix[7] = 0;
18707 registers_per_thread_per_radix[11] = 11;
18708 registers_per_thread_per_radix[13] = 0;
18712 if (loc_multipliers[13] > 0) {
18713 registers_per_thread_per_radix[2] = 0;
18714 registers_per_thread_per_radix[3] = 15;
18715 registers_per_thread_per_radix[5] = 15;
18716 registers_per_thread_per_radix[7] = 0;
18717 registers_per_thread_per_radix[11] = 0;
18718 registers_per_thread_per_radix[13] = 13;
18721 registers_per_thread_per_radix[2] = 0;
18722 registers_per_thread_per_radix[3] = 15;
18723 registers_per_thread_per_radix[5] = 15;
18724 registers_per_thread_per_radix[7] = 0;
18725 registers_per_thread_per_radix[11] = 0;
18726 registers_per_thread_per_radix[13] = 0;
18733 if (loc_multipliers[7] > 0) {
18734 if (loc_multipliers[3] == 1) {
18735 if (loc_multipliers[11] > 0) {
18736 if (loc_multipliers[13] > 0) {
18737 registers_per_thread_per_radix[2] = 0;
18738 registers_per_thread_per_radix[3] = 21;
18739 registers_per_thread_per_radix[5] = 0;
18740 registers_per_thread_per_radix[7] = 21;
18741 registers_per_thread_per_radix[11] = 11;
18742 registers_per_thread_per_radix[13] = 13;
18745 registers_per_thread_per_radix[2] = 0;
18746 registers_per_thread_per_radix[3] = 21;
18747 registers_per_thread_per_radix[5] = 0;
18748 registers_per_thread_per_radix[7] = 21;
18749 registers_per_thread_per_radix[11] = 11;
18750 registers_per_thread_per_radix[13] = 0;
18754 if (loc_multipliers[13] > 0) {
18755 registers_per_thread_per_radix[2] = 0;
18756 registers_per_thread_per_radix[3] = 21;
18757 registers_per_thread_per_radix[5] = 0;
18758 registers_per_thread_per_radix[7] = 21;
18759 registers_per_thread_per_radix[11] = 0;
18760 registers_per_thread_per_radix[13] = 13;
18763 registers_per_thread_per_radix[2] = 0;
18764 registers_per_thread_per_radix[3] = 21;
18765 registers_per_thread_per_radix[5] = 0;
18766 registers_per_thread_per_radix[7] = 21;
18767 registers_per_thread_per_radix[11] = 0;
18768 registers_per_thread_per_radix[13] = 0;
18773 if (loc_multipliers[11] > 0) {
18774 if (loc_multipliers[13] > 0) {
18775 registers_per_thread_per_radix[2] = 0;
18776 registers_per_thread_per_radix[3] = 9;
18777 registers_per_thread_per_radix[5] = 0;
18778 registers_per_thread_per_radix[7] = 7;
18779 registers_per_thread_per_radix[11] = 11;
18780 registers_per_thread_per_radix[13] = 13;
18783 registers_per_thread_per_radix[2] = 0;
18784 registers_per_thread_per_radix[3] = 9;
18785 registers_per_thread_per_radix[5] = 0;
18786 registers_per_thread_per_radix[7] = 7;
18787 registers_per_thread_per_radix[11] = 11;
18788 registers_per_thread_per_radix[13] = 0;
18792 if (loc_multipliers[13] > 0) {
18793 registers_per_thread_per_radix[2] = 0;
18794 registers_per_thread_per_radix[3] = 9;
18795 registers_per_thread_per_radix[5] = 0;
18796 registers_per_thread_per_radix[7] = 7;
18797 registers_per_thread_per_radix[11] = 0;
18798 registers_per_thread_per_radix[13] = 13;
18801 registers_per_thread_per_radix[2] = 0;
18802 registers_per_thread_per_radix[3] = 9;
18803 registers_per_thread_per_radix[5] = 0;
18804 registers_per_thread_per_radix[7] = 7;
18805 registers_per_thread_per_radix[11] = 0;
18806 registers_per_thread_per_radix[13] = 0;
18812 if (loc_multipliers[3] == 1) {
18813 if (loc_multipliers[11] > 0) {
18814 if (loc_multipliers[13] > 0) {
18815 registers_per_thread_per_radix[2] = 0;
18816 registers_per_thread_per_radix[3] = 33;
18817 registers_per_thread_per_radix[5] = 0;
18818 registers_per_thread_per_radix[7] = 0;
18819 registers_per_thread_per_radix[11] = 33;
18820 registers_per_thread_per_radix[13] = 39;
18823 registers_per_thread_per_radix[2] = 0;
18824 registers_per_thread_per_radix[3] = 33;
18825 registers_per_thread_per_radix[5] = 0;
18826 registers_per_thread_per_radix[7] = 0;
18827 registers_per_thread_per_radix[11] = 33;
18828 registers_per_thread_per_radix[13] = 0;
18832 if (loc_multipliers[13] > 0) {
18833 registers_per_thread_per_radix[2] = 0;
18834 registers_per_thread_per_radix[3] = 39;
18835 registers_per_thread_per_radix[5] = 0;
18836 registers_per_thread_per_radix[7] = 0;
18837 registers_per_thread_per_radix[11] = 0;
18838 registers_per_thread_per_radix[13] = 39;
18841 registers_per_thread_per_radix[2] = 0;
18842 registers_per_thread_per_radix[3] = 3;
18843 registers_per_thread_per_radix[5] = 0;
18844 registers_per_thread_per_radix[7] = 0;
18845 registers_per_thread_per_radix[11] = 0;
18846 registers_per_thread_per_radix[13] = 0;
18851 if (loc_multipliers[11] > 0) {
18852 if (loc_multipliers[13] > 0) {
18853 registers_per_thread_per_radix[2] = 0;
18854 registers_per_thread_per_radix[3] = 9;
18855 registers_per_thread_per_radix[5] = 0;
18856 registers_per_thread_per_radix[7] = 0;
18857 registers_per_thread_per_radix[11] = 11;
18858 registers_per_thread_per_radix[13] = 13;
18861 registers_per_thread_per_radix[2] = 0;
18862 registers_per_thread_per_radix[3] = 9;
18863 registers_per_thread_per_radix[5] = 0;
18864 registers_per_thread_per_radix[7] = 0;
18865 registers_per_thread_per_radix[11] = 11;
18866 registers_per_thread_per_radix[13] = 0;
18870 if (loc_multipliers[13] > 0) {
18871 registers_per_thread_per_radix[2] = 0;
18872 registers_per_thread_per_radix[3] = 9;
18873 registers_per_thread_per_radix[5] = 0;
18874 registers_per_thread_per_radix[7] = 0;
18875 registers_per_thread_per_radix[11] = 0;
18876 registers_per_thread_per_radix[13] = 13;
18879 registers_per_thread_per_radix[2] = 0;
18880 registers_per_thread_per_radix[3] = 9;
18881 registers_per_thread_per_radix[5] = 0;
18882 registers_per_thread_per_radix[7] = 0;
18883 registers_per_thread_per_radix[11] = 0;
18884 registers_per_thread_per_radix[13] = 0;
18892 if (loc_multipliers[5] > 0) {
18893 if (loc_multipliers[7] > 0) {
18894 if (loc_multipliers[11] > 0) {
18895 if (loc_multipliers[13] > 0) {
18896 registers_per_thread_per_radix[2] = 0;
18897 registers_per_thread_per_radix[3] = 0;
18898 registers_per_thread_per_radix[5] = 5;
18899 registers_per_thread_per_radix[7] = 7;
18900 registers_per_thread_per_radix[11] = 11;
18901 registers_per_thread_per_radix[13] = 13;
18904 registers_per_thread_per_radix[2] = 0;
18905 registers_per_thread_per_radix[3] = 0;
18906 registers_per_thread_per_radix[5] = 5;
18907 registers_per_thread_per_radix[7] = 7;
18908 registers_per_thread_per_radix[11] = 11;
18909 registers_per_thread_per_radix[13] = 0;
18913 if (loc_multipliers[13] > 0) {
18914 registers_per_thread_per_radix[2] = 0;
18915 registers_per_thread_per_radix[3] = 0;
18916 registers_per_thread_per_radix[5] = 5;
18917 registers_per_thread_per_radix[7] = 7;
18918 registers_per_thread_per_radix[11] = 0;
18919 registers_per_thread_per_radix[13] = 13;
18922 registers_per_thread_per_radix[2] = 0;
18923 registers_per_thread_per_radix[3] = 0;
18924 registers_per_thread_per_radix[5] = 5;
18925 registers_per_thread_per_radix[7] = 7;
18926 registers_per_thread_per_radix[11] = 0;
18927 registers_per_thread_per_radix[13] = 0;
18932 if (loc_multipliers[11] > 0) {
18933 if (loc_multipliers[13] > 0) {
18934 registers_per_thread_per_radix[2] = 0;
18935 registers_per_thread_per_radix[3] = 0;
18936 registers_per_thread_per_radix[5] = 5;
18937 registers_per_thread_per_radix[7] = 0;
18938 registers_per_thread_per_radix[11] = 11;
18939 registers_per_thread_per_radix[13] = 13;
18942 registers_per_thread_per_radix[2] = 0;
18943 registers_per_thread_per_radix[3] = 0;
18944 registers_per_thread_per_radix[5] = 5;
18945 registers_per_thread_per_radix[7] = 0;
18946 registers_per_thread_per_radix[11] = 11;
18947 registers_per_thread_per_radix[13] = 0;
18951 if (loc_multipliers[13] > 0) {
18952 registers_per_thread_per_radix[2] = 0;
18953 registers_per_thread_per_radix[3] = 0;
18954 registers_per_thread_per_radix[5] = 5;
18955 registers_per_thread_per_radix[7] = 0;
18956 registers_per_thread_per_radix[11] = 0;
18957 registers_per_thread_per_radix[13] = 13;
18960 registers_per_thread_per_radix[2] = 0;
18961 registers_per_thread_per_radix[3] = 0;
18962 registers_per_thread_per_radix[5] = 5;
18963 registers_per_thread_per_radix[7] = 0;
18964 registers_per_thread_per_radix[11] = 0;
18965 registers_per_thread_per_radix[13] = 0;
18972 if (loc_multipliers[7] > 0) {
18973 if (loc_multipliers[11] > 0) {
18974 if (loc_multipliers[13] > 0) {
18975 registers_per_thread_per_radix[2] = 0;
18976 registers_per_thread_per_radix[3] = 0;
18977 registers_per_thread_per_radix[5] = 0;
18978 registers_per_thread_per_radix[7] = 7;
18979 registers_per_thread_per_radix[11] = 11;
18980 registers_per_thread_per_radix[13] = 13;
18983 registers_per_thread_per_radix[2] = 0;
18984 registers_per_thread_per_radix[3] = 0;
18985 registers_per_thread_per_radix[5] = 0;
18986 registers_per_thread_per_radix[7] = 7;
18987 registers_per_thread_per_radix[11] = 11;
18988 registers_per_thread_per_radix[13] = 0;
18992 if (loc_multipliers[13] > 0) {
18993 registers_per_thread_per_radix[2] = 0;
18994 registers_per_thread_per_radix[3] = 0;
18995 registers_per_thread_per_radix[5] = 0;
18996 registers_per_thread_per_radix[7] = 7;
18997 registers_per_thread_per_radix[11] = 0;
18998 registers_per_thread_per_radix[13] = 13;
19001 registers_per_thread_per_radix[2] = 0;
19002 registers_per_thread_per_radix[3] = 0;
19003 registers_per_thread_per_radix[5] = 0;
19004 registers_per_thread_per_radix[7] = 7;
19005 registers_per_thread_per_radix[11] = 0;
19006 registers_per_thread_per_radix[13] = 0;
19011 if (loc_multipliers[11] > 0) {
19012 if (loc_multipliers[13] > 0) {
19013 registers_per_thread_per_radix[2] = 0;
19014 registers_per_thread_per_radix[3] = 0;
19015 registers_per_thread_per_radix[5] = 0;
19016 registers_per_thread_per_radix[7] = 0;
19017 registers_per_thread_per_radix[11] = 11;
19018 registers_per_thread_per_radix[13] = 13;
19021 registers_per_thread_per_radix[2] = 0;
19022 registers_per_thread_per_radix[3] = 0;
19023 registers_per_thread_per_radix[5] = 0;
19024 registers_per_thread_per_radix[7] = 0;
19025 registers_per_thread_per_radix[11] = 11;
19026 registers_per_thread_per_radix[13] = 0;
19030 if (loc_multipliers[13] > 0) {
19031 registers_per_thread_per_radix[2] = 0;
19032 registers_per_thread_per_radix[3] = 0;
19033 registers_per_thread_per_radix[5] = 0;
19034 registers_per_thread_per_radix[7] = 0;
19035 registers_per_thread_per_radix[11] = 0;
19036 registers_per_thread_per_radix[13] = 13;
19047 for (uint64_t i = 0; i < 14; i++) {
19048 if ((registers_per_thread_per_radix[i] != 0) && (registers_per_thread_per_radix[i] < min_registers_per_thread[0])) min_registers_per_thread[0] = registers_per_thread_per_radix[i];
19049 if ((registers_per_thread_per_radix[i] != 0) && (registers_per_thread_per_radix[i] > registers_per_thread[0])) registers_per_thread[0] = registers_per_thread_per_radix[i];
19051 if ((registers_per_thread[0] > 10) || (registers_per_thread[0] >= 2 * min_registers_per_thread[0])) isGoodSequence[0] = 0;
19052 else isGoodSequence[0] = 1;
// Byte size of one complex element: two reals of the selected precision.
// NOTE(review): one branch uses double and two use float; the conditions that
// pick between them are not adjacent to these assignments — confirm which
// configuration flags drive each branch.
19059 uint64_t complexSize;
19061 complexSize = (2 *
sizeof(double));
19064 complexSize = (2 *
sizeof(float));
19066 complexSize = (2 *
sizeof(float));
// Starts at the shared-memory sequence limit; it is rescaled by registerBoost
// further down in this routine.
19068 uint64_t maxSingleSizeNonStrided = maxSequenceLengthSharedMemory;
19070 for (uint64_t i = 0; i < 3; i++) {
19089 if (axis_id != nonStridedAxisId) {
// Scratch counters; cleared before every candidate length and later handed to
// VkFFTGetRegistersPerThread.
19093 uint64_t multipliers[20] = { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 };
// Test tempSequence against every candidate divisor from 2 through 13.
19095 for (uint64_t i = 2; i < 14; i++) {
19096 if (tempSequence % i == 0) {
// NOTE(review): presumably tempSequence != 1 means a factor outside 2..13
// remains after the division loop — confirm against the loop body.
19102 if (tempSequence != 1) {
19104 if (axis_id != nonStridedAxisId) {
19110 uint64_t FFTSizeSelected = 0;
// First search variant: bump tempSequence by one until the divisibility pass
// leaves testSequence equal to 1.
19112 while (!FFTSizeSelected) {
19113 uint64_t testSequence = tempSequence;
19114 for (uint64_t i = 0; i < 20; i++) {
19115 multipliers[i] = 0;
19118 if (testSequence % i == 0) {
19124 if (testSequence == 1) FFTSizeSelected = 1;
19125 else tempSequence++;
// Second search variant: additionally requires VkFFTGetRegistersPerThread to
// report a good sequence before a length is accepted.
19129 while (!FFTSizeSelected) {
19130 if (axis_id == nonStridedAxisId) {
// Jump to the next power of two when the axis length is below 128, or when
// 0.75 * pow2 <= tempSequence and either pow2 fits in
// maxSequenceLengthSharedMemory or (2 * axis length - 1) already exceeds it.
19131 if ((FFTPlan->
actualFFTSizePerAxis[axis_id][axis_id] < 128) || ((((uint64_t)pow(2, (uint64_t)ceil(log2(tempSequence))) * 0.75) <= tempSequence) && (((uint64_t)pow(2, (uint64_t)ceil(log2(tempSequence))) <= maxSequenceLengthSharedMemory) || ((2 * FFTPlan->
actualFFTSizePerAxis[axis_id][axis_id] - 1) > maxSequenceLengthSharedMemory)))) tempSequence = (uint64_t)pow(2, (uint64_t)ceil(log2(tempSequence)));
// Same rule, compared against maxSequenceLengthSharedMemoryStrided_temp.
19135 if ((FFTPlan->
actualFFTSizePerAxis[axis_id][axis_id] < 128) || ((((uint64_t)pow(2, (uint64_t)ceil(log2(tempSequence))) * 0.75) <= tempSequence) && (((uint64_t)pow(2, (uint64_t)ceil(log2(tempSequence))) <= maxSequenceLengthSharedMemoryStrided_temp) || ((2 * FFTPlan->
actualFFTSizePerAxis[axis_id][axis_id] - 1) > maxSequenceLengthSharedMemoryStrided_temp)))) tempSequence = (uint64_t)pow(2, (uint64_t)ceil(log2(tempSequence)));
19137 uint64_t testSequence = tempSequence;
19138 for (uint64_t i = 0; i < 20; i++) {
19139 multipliers[i] = 0;
// Only divisors 2 through 7 are tried in this variant.
19141 for (uint64_t i = 2; i < 8; i++) {
19142 if (testSequence % i == 0) {
19148 if (testSequence != 1) tempSequence++;
19150 uint64_t registers_per_thread_per_radix[14];
19151 uint64_t registers_per_thread = 0;
// -1 converts to UINT64_MAX, so any real value compares smaller.
19152 uint64_t min_registers_per_thread = -1;
19153 uint64_t isGoodSequence = 0;
// NOTE(review): `®isters_per_thread` below looks like a mis-decoded
// `&registers_per_thread` (HTML entity `&reg`) — the text needs repairing
// before this compiles.
19154 res =
VkFFTGetRegistersPerThread(multipliers, registers_per_thread_per_radix, ®isters_per_thread, &min_registers_per_thread, &isGoodSequence);
19156 if (isGoodSequence) FFTSizeSelected = 1;
19157 else tempSequence++;
// The two search variants are repeated below with identical logic.
// NOTE(review): the condition separating this copy from the one above is not
// adjacent — confirm what distinguishes the two paths.
19168 uint64_t FFTSizeSelected = 0;
19170 while (!FFTSizeSelected) {
19171 uint64_t testSequence = tempSequence;
19172 for (uint64_t i = 0; i < 20; i++) {
19173 multipliers[i] = 0;
19176 if (testSequence % i == 0) {
19182 if (testSequence == 1) FFTSizeSelected = 1;
19183 else tempSequence++;
19187 while (!FFTSizeSelected) {
19188 if (axis_id == nonStridedAxisId) {
19189 if ((FFTPlan->
actualFFTSizePerAxis[axis_id][axis_id] < 128) || ((((uint64_t)pow(2, (uint64_t)ceil(log2(tempSequence))) * 0.75) <= tempSequence) && (((uint64_t)pow(2, (uint64_t)ceil(log2(tempSequence))) <= maxSequenceLengthSharedMemory) || ((2 * FFTPlan->
actualFFTSizePerAxis[axis_id][axis_id] - 1) > maxSequenceLengthSharedMemory)))) tempSequence = (uint64_t)pow(2, (uint64_t)ceil(log2(tempSequence)));
19193 if ((FFTPlan->
actualFFTSizePerAxis[axis_id][axis_id] < 128) || ((((uint64_t)pow(2, (uint64_t)ceil(log2(tempSequence))) * 0.75) <= tempSequence) && (((uint64_t)pow(2, (uint64_t)ceil(log2(tempSequence))) <= maxSequenceLengthSharedMemoryStrided_temp) || ((2 * FFTPlan->
actualFFTSizePerAxis[axis_id][axis_id] - 1) > maxSequenceLengthSharedMemoryStrided_temp)))) tempSequence = (uint64_t)pow(2, (uint64_t)ceil(log2(tempSequence)));
19195 uint64_t testSequence = tempSequence;
19196 for (uint64_t i = 0; i < 20; i++) {
19197 multipliers[i] = 0;
19199 for (uint64_t i = 2; i < 8; i++) {
19200 if (testSequence % i == 0) {
19206 if (testSequence != 1) tempSequence++;
19208 uint64_t registers_per_thread_per_radix[14];
19209 uint64_t registers_per_thread = 0;
19210 uint64_t min_registers_per_thread = -1;
19211 uint64_t isGoodSequence = 0;
19212 res =
VkFFTGetRegistersPerThread(multipliers, registers_per_thread_per_radix, ®isters_per_thread, &min_registers_per_thread, &isGoodSequence);
19214 if (isGoodSequence) FFTSizeSelected = 1;
19215 else tempSequence++;
19224 maxSingleSizeNonStrided = maxSequenceLengthSharedMemory;
// Estimate how many passes the axis needs and how far the per-pass limits can
// be stretched by registerBoost.
19248 uint64_t registerBoost = 1;
// With convolution enabled the strided limit is not multiplied by registerBoost.
19255 uint64_t maxSingleSizeStrided = (!app->
configuration.
performConvolution) ? maxSequenceLengthSharedMemoryStrided * registerBoost : maxSequenceLengthSharedMemoryStrided;
19256 uint64_t numPasses = 1;
19257 uint64_t numPassesHalfBandwidth = 1;
// temp = ceil(axis length / limit), using the non-strided limit on the
// non-strided axis and the strided limit otherwise.
19259 temp = (axis_id == nonStridedAxisId) ? (uint64_t)ceil(FFTPlan->
actualFFTSizePerAxis[axis_id][axis_id] / (
double)maxSingleSizeNonStrided) : (uint64_t)ceil(FFTPlan->
actualFFTSizePerAxis[axis_id][axis_id] / (
double)maxSingleSizeStrided);
// numPasses = ceil(log of the axis length in base maxSingleSizeStrided).
// NOTE(review): this assignment and the += below are presumably alternative
// branches; the selecting condition is not adjacent — confirm.
19270 numPasses = (uint64_t)ceil(log2(FFTPlan->
actualFFTSizePerAxis[axis_id][axis_id]) / log2(maxSingleSizeStrided));
// Adds ceil(log of temp in base maxSingleSizeStrided) to the pass count.
19272 numPasses += (uint64_t)ceil(log2(temp) / log2(maxSingleSizeStrided));
// registerBoost = ceil(axis length / capacity reachable in numPasses passes).
// On the non-strided axis, when Bluestein is used, four-step reordering is
// off, or there is one pass, that capacity counts one shared-memory-sized pass
// and numPasses - 1 strided passes; otherwise it counts numPasses strided passes.
19274 registerBoost = ((axis_id == nonStridedAxisId) && ((app->
useBluesteinFFT[axis_id]) || (!app->
configuration.
reorderFourStep) || (numPasses == 1))) ? (uint64_t)ceil(FFTPlan->
actualFFTSizePerAxis[axis_id][axis_id] / (
double)(pow(maxSequenceLengthSharedMemoryStrided, numPasses - 1) * maxSequenceLengthSharedMemory)) : (uint64_t)ceil(FFTPlan->
actualFFTSizePerAxis[axis_id][axis_id] / (
double)pow(maxSequenceLengthSharedMemoryStrided, numPasses));
19275 uint64_t canBoost = 0;
// Re-derive both limits with the register boost applied.
19287 maxSingleSizeNonStrided = maxSequenceLengthSharedMemory * registerBoost;
19288 maxSingleSizeStrided = maxSequenceLengthSharedMemoryStrided * registerBoost;
19289 uint64_t maxSingleSizeStridedHalfBandwidth = maxSingleSizeStrided;
// Recompute the chunk count, this time against the half-bandwidth strided limit.
19292 temp = (axis_id == nonStridedAxisId) ? (uint64_t)ceil(FFTPlan->
actualFFTSizePerAxis[axis_id][axis_id] / (
double)maxSingleSizeNonStrided) : (uint64_t)ceil(FFTPlan->
actualFFTSizePerAxis[axis_id][axis_id] / (
double)maxSingleSizeStridedHalfBandwidth);
// Divide temp by the strided limit up to five times, counting one pass per
// division; assigning i = 5 ends the loop as soon as temp reaches 1.
19296 for (uint64_t i = 0; i < 5; i++) {
19297 temp = (uint64_t)ceil(temp / (
double)maxSingleSizeStrided);
19298 numPassesHalfBandwidth++;
19299 if (temp == 1) i = 5;
// Adopt the half-bandwidth pass count only if it is strictly smaller;
// otherwise fall back to the regular strided limit.
19311 if (numPassesHalfBandwidth < numPasses) numPasses = numPassesHalfBandwidth;
19312 else maxSingleSizeStridedHalfBandwidth = maxSingleSizeStrided;
// locAxisSplit aliases FFTPlan->axisSplit[axis_id]; entry k receives the
// length handled by pass k.
19315 uint64_t* locAxisSplit = FFTPlan->
axisSplit[axis_id];
19316 if (numPasses == 1) {
// Two-pass case: choose locAxisSplit[0] from a list of candidates.
19319 if (numPasses == 2) {
// 8 raised to floor(log2(limit)) / 3 (integer division): a power of 8 that
// does not exceed the shared-memory limit.
19322 uint64_t maxPow8SharedMemory = (uint64_t)pow(8, ((uint64_t)log2(maxSequenceLengthSharedMemory)) / 3);
// Candidates: the power of 8, then the shared-memory limit, then that limit
// times a power of two up to registerBoost. Each is taken when the remaining
// quotient fits the strided limit.
// NOTE(review): the else/nesting that orders these tests is not adjacent —
// confirm the precedence.
19324 if (FFTPlan->
actualFFTSizePerAxis[axis_id][axis_id] / maxPow8SharedMemory <= maxSingleSizeStrided) {
19325 locAxisSplit[0] = maxPow8SharedMemory;
19328 if (FFTPlan->
actualFFTSizePerAxis[axis_id][axis_id] / maxSequenceLengthSharedMemory <= maxSingleSizeStrided) {
19329 locAxisSplit[0] = maxSequenceLengthSharedMemory;
19332 if (FFTPlan->
actualFFTSizePerAxis[axis_id][axis_id] / (maxSequenceLengthSharedMemory * registerBoost) < maxSingleSizeStridedHalfBandwidth) {
19333 for (uint64_t i = 1; i <= (uint64_t)log2(registerBoost); i++) {
19334 if (FFTPlan->
actualFFTSizePerAxis[axis_id][axis_id] / (maxSequenceLengthSharedMemory * (uint64_t)pow(2, i)) <= maxSingleSizeStrided) {
19335 locAxisSplit[0] = (maxSequenceLengthSharedMemory * (uint64_t)pow(2, i));
// Push i past the loop bound so the first power of two that fits is kept.
19336 i = (uint64_t)log2(registerBoost) + 1;
19341 locAxisSplit[0] = (maxSequenceLengthSharedMemory * registerBoost);
// Same selection built from the strided limit instead.
19347 uint64_t maxPow8Strided = (uint64_t)pow(8, ((uint64_t)log2(maxSingleSizeStrided)) / 3);
19349 if (FFTPlan->
actualFFTSizePerAxis[axis_id][axis_id] / maxPow8Strided <= maxSingleSizeStrided) {
19350 locAxisSplit[0] = maxPow8Strided;
19353 if (FFTPlan->
actualFFTSizePerAxis[axis_id][axis_id] / maxSingleSizeStrided < maxSingleSizeStridedHalfBandwidth) {
19354 locAxisSplit[0] = maxSingleSizeStrided;
19357 locAxisSplit[0] = maxSingleSizeStridedHalfBandwidth;
// Force the second pass to 64 when it is smaller, shrinking the first pass by
// 64 / locAxisSplit[1] (or by 64 outright when locAxisSplit[1] is 0, which
// avoids dividing by zero).
19362 if (locAxisSplit[1] < 64) {
19363 locAxisSplit[0] = (locAxisSplit[1] == 0) ? locAxisSplit[0] / (64) : locAxisSplit[0] / (64 / locAxisSplit[1]);
19364 locAxisSplit[1] = 64;
// Keep the larger of the two lengths in slot 0.
19366 if (locAxisSplit[1] > locAxisSplit[0]) {
19367 uint64_t swap = locAxisSplit[0];
19368 locAxisSplit[0] = locAxisSplit[1];
19369 locAxisSplit[1] = swap;
// Alternative search: walk candidates sqrtSequence - i downward from
// ceil(sqrt(axis length)), looking for a pair that respects both limits.
19373 uint64_t successSplit = 0;
19385 uint64_t sqrtSequence = (uint64_t)ceil(sqrt(FFTPlan->
actualFFTSizePerAxis[axis_id][axis_id]));
19386 for (uint64_t i = 0; i < sqrtSequence; i++) {
// The co-factor here must fit the shared-memory limit.
19388 if ((sqrtSequence - i <= maxSingleSizeStrided) && (FFTPlan->
actualFFTSizePerAxis[axis_id][axis_id] / (sqrtSequence - i) <= maxSequenceLengthSharedMemory)) {
19390 locAxisSplit[1] = sqrtSequence - i;
19398 uint64_t sqrtSequence = (uint64_t)ceil(sqrt(FFTPlan->
actualFFTSizePerAxis[axis_id][axis_id]));
19399 for (uint64_t i = 0; i < sqrtSequence; i++) {
// Same walk; the co-factor must fit the half-bandwidth strided limit instead.
19401 if ((sqrtSequence - i <= maxSingleSizeStrided) && (FFTPlan->
actualFFTSizePerAxis[axis_id][axis_id] / (sqrtSequence - i) <= maxSingleSizeStridedHalfBandwidth)) {
19403 locAxisSplit[1] = sqrtSequence - i;
19410 if (successSplit == 0)
// Three-pass case: choose locAxisSplit[0], then divide the remainder between
// locAxisSplit[1] and locAxisSplit[2].
19414 if (numPasses == 3) {
19416 uint64_t maxPow8Strided = (uint64_t)pow(8, ((uint64_t)log2(maxSingleSizeStrided)) / 3);
19419 uint64_t maxPow8SharedMemory = (uint64_t)pow(8, ((uint64_t)log2(maxSequenceLengthSharedMemory)) / 3);
// A first-pass candidate is accepted when the remaining quotient fits in the
// product of two later passes.
19420 if (FFTPlan->
actualFFTSizePerAxis[axis_id][axis_id] / maxPow8SharedMemory <= maxPow8Strided * maxPow8Strided)
19421 locAxisSplit[0] = maxPow8SharedMemory;
19423 if (FFTPlan->
actualFFTSizePerAxis[axis_id][axis_id] / maxSequenceLengthSharedMemory <= maxSingleSizeStrided * maxSingleSizeStrided)
19424 locAxisSplit[0] = maxSequenceLengthSharedMemory;
19426 if (FFTPlan->
actualFFTSizePerAxis[axis_id][axis_id] / (maxSequenceLengthSharedMemory * registerBoost) <= maxSingleSizeStrided * maxSingleSizeStrided) {
// Take the smallest power-of-two multiple of the shared-memory limit that
// fits; i is pushed past the bound to stop at the first hit.
19427 for (uint64_t i = 0; i <= (uint64_t)log2(registerBoost); i++) {
19428 if (FFTPlan->
actualFFTSizePerAxis[axis_id][axis_id] / (maxSequenceLengthSharedMemory * (uint64_t)pow(2, i)) <= maxSingleSizeStrided * maxSingleSizeStrided) {
19429 locAxisSplit[0] = (maxSequenceLengthSharedMemory * (uint64_t)pow(2, i));
19430 i = (uint64_t)log2(registerBoost) + 1;
19435 locAxisSplit[0] = (maxSequenceLengthSharedMemory * registerBoost);
// Variant based on maxSingleSizeStrided128, which is defined outside the
// lines adjacent to this code.
19449 uint64_t maxPow8_128 = (uint64_t)pow(8, ((uint64_t)log2(maxSingleSizeStrided128)) / 3);
19451 if (FFTPlan->
actualFFTSizePerAxis[axis_id][axis_id] / maxPow8_128 <= maxPow8Strided * maxSingleSizeStrided)
19452 locAxisSplit[0] = maxPow8_128;
// Try 2x and 4x that power of 8, provided the result stays within
// maxSingleSizeStrided128.
19456 if ((FFTPlan->
actualFFTSizePerAxis[axis_id][axis_id] / (maxPow8_128 * 2) <= maxPow8Strided * maxSingleSizeStrided) && (maxPow8_128 * 2 <= maxSingleSizeStrided128)) {
19457 locAxisSplit[0] = maxPow8_128 * 2;
19460 if ((FFTPlan->
actualFFTSizePerAxis[axis_id][axis_id] / (maxPow8_128 * 4) <= maxPow8Strided * maxSingleSizeStrided) && (maxPow8_128 * 4 <= maxSingleSizeStrided128)) {
19461 locAxisSplit[0] = maxPow8_128 * 4;
19464 if (FFTPlan->
actualFFTSizePerAxis[axis_id][axis_id] / maxSingleSizeStrided <= maxSingleSizeStrided * maxSingleSizeStrided) {
19465 for (uint64_t i = 0; i <= (uint64_t)log2(maxSingleSizeStrided / maxSingleSizeStrided128); i++) {
19466 if (FFTPlan->
actualFFTSizePerAxis[axis_id][axis_id] / (maxSingleSizeStrided128 * (uint64_t)pow(2, i)) <= maxSingleSizeStrided * maxSingleSizeStrided) {
19467 locAxisSplit[0] = (maxSingleSizeStrided128 * (uint64_t)pow(2, i));
19468 i = (uint64_t)log2(maxSingleSizeStrided / maxSingleSizeStrided128) + 1;
19473 locAxisSplit[0] = maxSingleSizeStridedHalfBandwidth;
// Second pass: the strided power of 8, then the strided limit, then the
// half-bandwidth limit. The third pass receives whatever quotient is left.
19478 if (FFTPlan->
actualFFTSizePerAxis[axis_id][axis_id] / locAxisSplit[0] / maxPow8Strided <= maxSingleSizeStrided) {
19479 locAxisSplit[1] = maxPow8Strided;
19480 locAxisSplit[2] = FFTPlan->
actualFFTSizePerAxis[axis_id][axis_id] / locAxisSplit[1] / locAxisSplit[0];
19483 if (FFTPlan->
actualFFTSizePerAxis[axis_id][axis_id] / locAxisSplit[0] / maxSingleSizeStrided <= maxSingleSizeStrided) {
19484 locAxisSplit[1] = maxSingleSizeStrided;
19485 locAxisSplit[2] = FFTPlan->
actualFFTSizePerAxis[axis_id][axis_id] / locAxisSplit[1] / locAxisSplit[0];
19488 locAxisSplit[1] = maxSingleSizeStridedHalfBandwidth;
19489 locAxisSplit[2] = FFTPlan->
actualFFTSizePerAxis[axis_id][axis_id] / locAxisSplit[1] / locAxisSplit[0];
// Force the third pass to 64 when smaller, shrinking the second pass to
// compensate (the == 0 arm avoids dividing by zero).
19492 if (locAxisSplit[2] < 64) {
19493 locAxisSplit[1] = (locAxisSplit[2] == 0) ? locAxisSplit[1] / (64) : locAxisSplit[1] / (64 / locAxisSplit[2]);
19494 locAxisSplit[2] = 64;
// Keep slot 1 at least as large as slot 2.
19496 if (locAxisSplit[2] > locAxisSplit[1]) {
19497 uint64_t swap = locAxisSplit[1];
19498 locAxisSplit[1] = locAxisSplit[2];
19499 locAxisSplit[2] = swap;
// Alternative exact-divisor search: the first factor walks down from the
// shared-memory limit, the second walks down from the ceil-sqrt of the
// remainder.
19503 uint64_t successSplit = 0;
19505 for (uint64_t i = 0; i < maxSequenceLengthSharedMemory; i++) {
19506 if (FFTPlan->
actualFFTSizePerAxis[axis_id][axis_id] % (maxSequenceLengthSharedMemory - i) == 0) {
19507 uint64_t sqrt3Sequence = (uint64_t)ceil(sqrt(FFTPlan->
actualFFTSizePerAxis[axis_id][axis_id] / (maxSequenceLengthSharedMemory - i)));
19508 for (uint64_t j = 0; j < sqrt3Sequence; j++) {
19509 if ((FFTPlan->
actualFFTSizePerAxis[axis_id][axis_id] / (maxSequenceLengthSharedMemory - i)) % (sqrt3Sequence - j) == 0) {
19510 if (((maxSequenceLengthSharedMemory - i) <= maxSequenceLengthSharedMemory) && (sqrt3Sequence - j <= maxSingleSizeStrided) && (FFTPlan->
actualFFTSizePerAxis[axis_id][axis_id] / (maxSequenceLengthSharedMemory - i) / (sqrt3Sequence - j) <= maxSingleSizeStrided)) {
19511 locAxisSplit[0] = (maxSequenceLengthSharedMemory - i);
19512 locAxisSplit[1] = sqrt3Sequence - j;
19513 locAxisSplit[2] = FFTPlan->
actualFFTSizePerAxis[axis_id][axis_id] / (maxSequenceLengthSharedMemory - i) / (sqrt3Sequence - j);
// Push the outer index to its bound so the outer loop stops as well.
19514 i = maxSequenceLengthSharedMemory;
// Second alternative: start from the ceil cube root and ceil square root
// instead of the shared-memory limit.
19524 uint64_t sqrt3Sequence = (uint64_t)ceil(pow(FFTPlan->
actualFFTSizePerAxis[axis_id][axis_id], 1.0 / 3.0));
19525 for (uint64_t i = 0; i < sqrt3Sequence; i++) {
19527 uint64_t sqrt2Sequence = (uint64_t)ceil(sqrt(FFTPlan->
actualFFTSizePerAxis[axis_id][axis_id] / (sqrt3Sequence - i)));
19528 for (uint64_t j = 0; j < sqrt2Sequence; j++) {
19529 if ((FFTPlan->
actualFFTSizePerAxis[axis_id][axis_id] / (sqrt3Sequence - i)) % (sqrt2Sequence - j) == 0) {
19530 if ((sqrt3Sequence - i <= maxSingleSizeStrided) && (sqrt2Sequence - j <= maxSingleSizeStrided) && (FFTPlan->
actualFFTSizePerAxis[axis_id][axis_id] / (sqrt3Sequence - i) / (sqrt2Sequence - j) <= maxSingleSizeStridedHalfBandwidth)) {
19531 locAxisSplit[0] = FFTPlan->
actualFFTSizePerAxis[axis_id][axis_id] / (sqrt3Sequence - i) / (sqrt2Sequence - j);
19532 locAxisSplit[1] = sqrt3Sequence - i;
19533 locAxisSplit[2] = sqrt2Sequence - j;
19543 if (successSplit == 0)
19547 if (numPasses > 3) {
// Three reordering sweeps: whenever slot 0 is not divisible by 2 (then 4,
// then 8) but some slot i is, swap that entry into slot 0.
19560 for (uint64_t i = 0; i < numPasses; i++) {
19561 if ((locAxisSplit[0] % 2 != 0) && (locAxisSplit[i] % 2 == 0)) {
19562 uint64_t swap = locAxisSplit[0];
19563 locAxisSplit[0] = locAxisSplit[i];
19564 locAxisSplit[i] = swap;
19567 for (uint64_t i = 0; i < numPasses; i++) {
19568 if ((locAxisSplit[0] % 4 != 0) && (locAxisSplit[i] % 4 == 0)) {
19569 uint64_t swap = locAxisSplit[0];
19570 locAxisSplit[0] = locAxisSplit[i];
19571 locAxisSplit[i] = swap;
19574 for (uint64_t i = 0; i < numPasses; i++) {
19575 if ((locAxisSplit[0] % 8 != 0) && (locAxisSplit[i] % 8 == 0)) {
19576 uint64_t swap = locAxisSplit[0];
19577 locAxisSplit[0] = locAxisSplit[i];
19578 locAxisSplit[i] = swap;
// For every pass k: count the factors of its length, query the register
// layout, then regroup radix-2 factors and rescale the register counts.
19583 for (uint64_t k = 0; k < numPasses; k++) {
19584 tempSequence = locAxisSplit[k];
// loc_multipliers[i] counts occurrences of factor i (i in 2..13).
19585 uint64_t loc_multipliers[20] = { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 };
19586 for (uint64_t i = 2; i < 14; i++) {
19587 if (tempSequence % i == 0) {
19589 loc_multipliers[i]++;
19593 uint64_t registers_per_thread_per_radix[14];
19594 uint64_t registers_per_thread = 0;
// -1 converts to UINT64_MAX, so any real value compares smaller.
19595 uint64_t min_registers_per_thread = -1;
19596 uint64_t isGoodSequence = 0;
// NOTE(review): `®isters_per_thread` below looks like a mis-decoded
// `&registers_per_thread` (HTML entity `&reg`) — the text needs repairing
// before this compiles.
19597 res =
VkFFTGetRegistersPerThread(loc_multipliers, registers_per_thread_per_radix, ®isters_per_thread, &min_registers_per_thread, &isGoodSequence);
// Radix 8 and radix 4 inherit the register count reported for radix 2.
19599 registers_per_thread_per_radix[8] = registers_per_thread_per_radix[2];
19600 registers_per_thread_per_radix[4] = registers_per_thread_per_radix[2];
// With a 4x boost the register count must be a multiple of 4; otherwise every
// count is doubled.
19601 if ((registerBoost == 4) && (registers_per_thread % 4 != 0)) {
19602 registers_per_thread *= 2;
19603 for (uint64_t i = 2; i < 14; i++) {
19604 registers_per_thread_per_radix[i] *= 2;
19606 min_registers_per_thread *= 2;
// Group radix-2 factors three at a time into radix-8 stages, leaving the
// remainder as radix 2.
19608 if (registers_per_thread_per_radix[8] % 8 == 0) {
19609 loc_multipliers[8] = loc_multipliers[2] / 3;
19610 loc_multipliers[2] = loc_multipliers[2] - loc_multipliers[8] * 3;
// Group the remaining radix-2 factors in pairs into radix-4 stages.
19612 if (registers_per_thread_per_radix[4] % 4 == 0) {
19613 loc_multipliers[4] = loc_multipliers[2] / 2;
19614 loc_multipliers[2] = loc_multipliers[2] - loc_multipliers[4] * 2;
// A 2x boost with no radix-2 stage left: split one radix 4 back into two
// radix-2 stages.
19616 if ((registerBoost == 2) && (loc_multipliers[2] == 0)) {
19617 if (loc_multipliers[4] > 0) {
19618 loc_multipliers[4]--;
19619 loc_multipliers[2] = 2;
// NOTE(review): presumably the alternative arm — split one radix 8 into a
// radix 4 plus a radix 2; the `else` itself is not adjacent, confirm.
19622 loc_multipliers[8]--;
19623 loc_multipliers[4]++;
19624 loc_multipliers[2]++;
// A 4x boost with no radix-4 stage: split one radix 8 into radix 4 + radix 2.
19627 if ((registerBoost == 4) && (loc_multipliers[4] == 0)) {
19628 loc_multipliers[8]--;
19629 loc_multipliers[4]++;
19630 loc_multipliers[2]++;
// Grow scaleRegistersNum by factors in 2..13 until
// maxBatchCoalesced * length / (min registers * boost * scale) no longer
// exceeds maxThreadsNum.
19635 uint64_t scaleRegistersNum = 1;
19636 while ((maxBatchCoalesced * locAxisSplit[k] / (min_registers_per_thread * registerBoost * scaleRegistersNum)) > app->
configuration.
maxThreadsNum) {
19637 for (uint64_t i = 2; i < 14; i++) {
19638 if (locAxisSplit[k] / (min_registers_per_thread * registerBoost * scaleRegistersNum) % i == 0) {
19639 scaleRegistersNum *= i;
19644 min_registers_per_thread *= scaleRegistersNum;
// For the overall count, find the first multiplier >= scaleRegistersNum that
// evenly divides the corresponding quotient.
19645 uint64_t temp_scaleRegistersNum = scaleRegistersNum;
19646 while ((maxBatchCoalesced * locAxisSplit[k] / (registers_per_thread * registerBoost)) % temp_scaleRegistersNum != 0) temp_scaleRegistersNum++;
19647 registers_per_thread *= temp_scaleRegistersNum;
// Apply the same scaling search to every radix that is in use.
19648 for (uint64_t i = 2; i < 14; i++) {
19649 if (registers_per_thread_per_radix[i] != 0) {
19650 temp_scaleRegistersNum = scaleRegistersNum;
19651 while ((maxBatchCoalesced * locAxisSplit[k] / (registers_per_thread_per_radix[i] * registerBoost)) % temp_scaleRegistersNum != 0) temp_scaleRegistersNum++;
19652 registers_per_thread_per_radix[i] *= temp_scaleRegistersNum;
// Restore min <= max after the independent scalings.
19656 if (min_registers_per_thread > registers_per_thread) {
19657 uint64_t temp = min_registers_per_thread;
19658 min_registers_per_thread = registers_per_thread;
19659 registers_per_thread = temp;
// Widen the [min, max] range so it covers every non-zero per-radix count.
19661 for (uint64_t i = 2; i < 14; i++) {
19662 if (registers_per_thread_per_radix[i] > registers_per_thread) {
19663 registers_per_thread = registers_per_thread_per_radix[i];
19665 if ((registers_per_thread_per_radix[i] > 0) && (registers_per_thread_per_radix[i] < min_registers_per_thread)) {
19666 min_registers_per_thread = registers_per_thread_per_radix[i];
19674 for (uint64_t i = 2; i < 14; i++) {
// Reserve one stage for the register boost: prefer a stage whose radix equals
// registerBoost, otherwise take the largest radix still available.
19679 uint64_t tempRegisterBoost = registerBoost;
19680 uint64_t switchRegisterBoost = 0;
19681 if (tempRegisterBoost > 1) {
19682 if (loc_multipliers[tempRegisterBoost] > 0) {
19683 loc_multipliers[tempRegisterBoost]--;
19684 switchRegisterBoost = tempRegisterBoost;
// Scan radices from 14 down to 2 (loc_multipliers has 20 entries, so index 14
// is in range).
19687 for (uint64_t i = 14; i > 1; i--) {
19688 if (loc_multipliers[i] > 0) {
19689 loc_multipliers[i]--;
19690 switchRegisterBoost = i;
19696 for (uint64_t i = 14; i > 1; i--) {
19697 if (loc_multipliers[i] > 0) {
19699 loc_multipliers[i]--;
19705 if (switchRegisterBoost > 0) {
19710 if (min_registers_per_thread != registers_per_thread) {
// Configure a helper 1D plan (LUT enabled, sizes 1 along dimensions 1 and 2)
// and size the buffer that will hold the phase vectors.
19726 double double_PI = 3.1415926535897932384626433832795;
19731 kernelPreparationConfiguration.
FFTdim = 1;
19733 kernelPreparationConfiguration.
size[1] = 1;
19734 kernelPreparationConfiguration.
size[2] = 1;
19736 kernelPreparationConfiguration.
useLUT = 1;
19749#if(VKFFT_BACKEND==0)
19756#elif(VKFFT_BACKEND==3)
// Backend 3 copies the platform and context handles from the parent
// application's configuration.
19757 kernelPreparationConfiguration.platform = app->
configuration.platform;
19758 kernelPreparationConfiguration.context = app->
configuration.context;
// Two floats per element over size[0] * size[1] * size[2] elements ...
19761 uint64_t bufferSize = (uint64_t)
sizeof(
float) * 2 * kernelPreparationConfiguration.
size[0] * kernelPreparationConfiguration.
size[1] * kernelPreparationConfiguration.
size[2];
// ... scaled by sizeof(double) / sizeof(float) when double precision is on.
19762 if (kernelPreparationConfiguration.
doublePrecision) bufferSize *=
sizeof(double) /
sizeof(
float);
19768 resFFT =
initializeVkFFT(&kernelPreparationApplication, kernelPreparationConfiguration);
// Each backend declares `res` with its own status type, initialised to that
// backend's success value.
19771#if(VKFFT_BACKEND==0)
19772 VkResult res = VK_SUCCESS;
19783#elif(VKFFT_BACKEND==1)
19784 cudaError_t res = cudaSuccess;
// Device allocation for this axis's Bluestein buffer.
19785 res = cudaMalloc((
void**)&app->
bufferBluestein[axis_id], bufferSize);
19795#elif(VKFFT_BACKEND==2)
19796 hipError_t res = hipSuccess;
19807#elif(VKFFT_BACKEND==3)
19808 cl_int res = CL_SUCCESS;
// Host staging buffer for the phase vectors; it is released with free() on
// the later exit paths.
19822 void* phaseVectors = malloc(bufferSize);
19823 if (!phaseVectors) {
// Double-precision fill. Element i is the interleaved pair (cos, -sin) of
// angle = pi * ((i * i) mod (2 * n)) / n, with n = phaseVectorsNonZeroSize;
// elements at or beyond n are zero.
// NOTE(review): the loop header that defines i and its range is not adjacent
// to these statements — confirm the bound.
19832 double* phaseVectors_cast = (
double*)phaseVectors;
19834 uint64_t rm = (i * i) % (2 * phaseVectorsNonZeroSize);
19835 double angle = double_PI * rm / phaseVectorsNonZeroSize;
19836 phaseVectors_cast[2 * i] = (i < phaseVectorsNonZeroSize) ? (
double)cos(angle) : 0;
19837 phaseVectors_cast[2 * i + 1] = (i < phaseVectorsNonZeroSize) ? (
double)-sin(angle) : 0;
// Mirror entries 1..n-1 to the end of the buffer: element (axis length - i)
// receives a copy of element i.
19839 for (uint64_t i = 1; i < phaseVectorsNonZeroSize; i++) {
19840 phaseVectors_cast[2 * (FFTPlan->
actualFFTSizePerAxis[axis_id][axis_id] - i)] = phaseVectors_cast[2 * i];
19841 phaseVectors_cast[2 * (FFTPlan->
actualFFTSizePerAxis[axis_id][axis_id] - i) + 1] = phaseVectors_cast[2 * i + 1];
// Single-precision fill: same values, with the angle still computed in double
// and narrowed to float on store.
19845 float* phaseVectors_cast = (
float*)phaseVectors;
19847 uint64_t rm = (i * i) % (2 * phaseVectorsNonZeroSize);
19848 double angle = double_PI * rm / phaseVectorsNonZeroSize;
19849 phaseVectors_cast[2 * i] = (i < phaseVectorsNonZeroSize) ? (
float)cos(angle) : 0;
19850 phaseVectors_cast[2 * i + 1] = (i < phaseVectorsNonZeroSize) ? (
float)-sin(angle) : 0;
19852 for (uint64_t i = 1; i < phaseVectorsNonZeroSize; i++) {
19853 phaseVectors_cast[2 * (FFTPlan->
actualFFTSizePerAxis[axis_id][axis_id] - i)] = phaseVectors_cast[2 * i];
19854 phaseVectors_cast[2 * (FFTPlan->
actualFFTSizePerAxis[axis_id][axis_id] - i) + 1] = phaseVectors_cast[2 * i + 1];
// Upload the host table into app->bufferBluestein[axis_id] using the active
// backend. On the visible failure checks the host buffer is freed.
// NOTE(review): the statements that follow each free() (presumably an error
// return) are not adjacent — confirm.
19857#if(VKFFT_BACKEND==0)
19860 free(phaseVectors);
19864#elif(VKFFT_BACKEND==1)
19865 res = cudaMemcpy(app->
bufferBluestein[axis_id], phaseVectors, bufferSize, cudaMemcpyHostToDevice);
19866 if (res != cudaSuccess) {
19867 free(phaseVectors);
19871#elif(VKFFT_BACKEND==2)
19872 res = hipMemcpy(app->
bufferBluestein[axis_id], phaseVectors, bufferSize, hipMemcpyHostToDevice);
19873 if (res != hipSuccess) {
19874 free(phaseVectors);
19878#elif(VKFFT_BACKEND==3)
// CL_TRUE requests a blocking write.
19879 res = clEnqueueWriteBuffer(commandQueue, app->
bufferBluestein[axis_id], CL_TRUE, 0, bufferSize, phaseVectors, 0, NULL, NULL);
19880 if (res != CL_SUCCESS) {
19881 free(phaseVectors);
// Run the helper plan once over the uploaded data.
// NOTE(review): the second VkFFTAppend argument (-1 here) presumably selects
// the transform direction — confirm against VkFFTAppend's definition.
19886#if(VKFFT_BACKEND==0)
// Backend 0: allocate one primary command buffer, record it for one-time
// submission, append the FFT, end recording and submit.
19888 VkCommandBufferAllocateInfo commandBufferAllocateInfo = { VK_STRUCTURE_TYPE_COMMAND_BUFFER_ALLOCATE_INFO };
19890 commandBufferAllocateInfo.level = VK_COMMAND_BUFFER_LEVEL_PRIMARY;
19891 commandBufferAllocateInfo.commandBufferCount = 1;
19892 VkCommandBuffer commandBuffer = {};
19893 res = vkAllocateCommandBuffers(kernelPreparationApplication.
configuration.
device[0], &commandBufferAllocateInfo, &commandBuffer);
19895 free(phaseVectors);
19899 VkCommandBufferBeginInfo commandBufferBeginInfo = { VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO };
19900 commandBufferBeginInfo.flags = VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT;
19901 res = vkBeginCommandBuffer(commandBuffer, &commandBufferBeginInfo);
19903 free(phaseVectors);
19912 resFFT =
VkFFTAppend(&kernelPreparationApplication, -1, &launchParams);
19914 free(phaseVectors);
19918 res = vkEndCommandBuffer(commandBuffer);
19920 free(phaseVectors);
19924 VkSubmitInfo submitInfo = { VK_STRUCTURE_TYPE_SUBMIT_INFO };
19925 submitInfo.commandBufferCount = 1;
19926 submitInfo.pCommandBuffers = &commandBuffer;
19929 free(phaseVectors);
19935 free(phaseVectors);
19941 free(phaseVectors);
19947#elif(VKFFT_BACKEND==1)
// Backend 1: append the FFT, then wait for the device to finish.
19951 resFFT =
VkFFTAppend(&kernelPreparationApplication, -1, &launchParams);
19953 free(phaseVectors);
19957 res = cudaDeviceSynchronize();
19958 if (res != cudaSuccess) {
19959 free(phaseVectors);
19963#elif(VKFFT_BACKEND==2)
// Backend 2: same sequence with the hip* calls.
19967 resFFT =
VkFFTAppend(&kernelPreparationApplication, -1, &launchParams);
19969 free(phaseVectors);
19973 res = hipDeviceSynchronize();
19974 if (res != hipSuccess) {
19975 free(phaseVectors);
19979#elif(VKFFT_BACKEND==3)
// Backend 3: the launch parameters carry a pointer to the command queue;
// clFinish waits for the queued work.
19981 launchParams.commandQueue = &commandQueue;
19984 resFFT =
VkFFTAppend(&kernelPreparationApplication, -1, &launchParams);
19986 free(phaseVectors);
19990 res = clFinish(commandQueue);
19991 if (res != CL_SUCCESS) {
19992 free(phaseVectors);
// Second table: identical to the first except the second component stores
// +sin(angle) instead of -sin(angle). Double-precision fill first.
// NOTE(review): the loop header that defines i is not adjacent to these
// statements — confirm the bound.
19999 double* phaseVectors_cast = (
double*)phaseVectors;
20001 uint64_t rm = (i * i) % (2 * phaseVectorsNonZeroSize);
20002 double angle = double_PI * rm / phaseVectorsNonZeroSize;
20003 phaseVectors_cast[2 * i] = (i < phaseVectorsNonZeroSize) ? (
double)cos(angle) : 0;
20004 phaseVectors_cast[2 * i + 1] = (i < phaseVectorsNonZeroSize) ? (
double)sin(angle) : 0;
// Mirror entries 1..n-1 to the end of the buffer (element axis length - i
// copies element i).
20006 for (uint64_t i = 1; i < phaseVectorsNonZeroSize; i++) {
20007 phaseVectors_cast[2 * (FFTPlan->
actualFFTSizePerAxis[axis_id][axis_id] - i)] = phaseVectors_cast[2 * i];
20008 phaseVectors_cast[2 * (FFTPlan->
actualFFTSizePerAxis[axis_id][axis_id] - i) + 1] = phaseVectors_cast[2 * i + 1];
// Single-precision fill: angle computed in double, narrowed to float on store.
20012 float* phaseVectors_cast = (
float*)phaseVectors;
20014 uint64_t rm = (i * i) % (2 * phaseVectorsNonZeroSize);
20015 double angle = double_PI * rm / phaseVectorsNonZeroSize;
20016 phaseVectors_cast[2 * i] = (i < phaseVectorsNonZeroSize) ? (
float)cos(angle) : 0;
20017 phaseVectors_cast[2 * i + 1] = (i < phaseVectorsNonZeroSize) ? (
float)sin(angle) : 0;
20019 for (uint64_t i = 1; i < phaseVectorsNonZeroSize; i++) {
20020 phaseVectors_cast[2 * (FFTPlan->
actualFFTSizePerAxis[axis_id][axis_id] - i)] = phaseVectors_cast[2 * i];
20021 phaseVectors_cast[2 * (FFTPlan->
actualFFTSizePerAxis[axis_id][axis_id] - i) + 1] = phaseVectors_cast[2 * i + 1];
// Upload the second table into app->bufferBluestein[axis_id] using the active
// backend; the visible failure checks free the host buffer.
20024#if(VKFFT_BACKEND==0)
20027 free(phaseVectors);
20031#elif(VKFFT_BACKEND==1)
20032 res = cudaMemcpy(app->
bufferBluestein[axis_id], phaseVectors, bufferSize, cudaMemcpyHostToDevice);
20033 if (res != cudaSuccess) {
20034 free(phaseVectors);
20038#elif(VKFFT_BACKEND==2)
20039 res = hipMemcpy(app->
bufferBluestein[axis_id], phaseVectors, bufferSize, hipMemcpyHostToDevice);
20040 if (res != hipSuccess) {
20041 free(phaseVectors);
20045#elif(VKFFT_BACKEND==3)
20046 res = clEnqueueWriteBuffer(commandQueue, app->
bufferBluestein[axis_id], CL_TRUE, 0, bufferSize, phaseVectors, 0, NULL, NULL);
20047 if (res != CL_SUCCESS) {
20048 free(phaseVectors);
// Two helper-plan launches appear per backend below: one with VkFFTAppend
// argument -1 and one with argument 1.
// NOTE(review): presumably these are the two transform directions and sit in
// alternative branches; the selecting conditions are not adjacent — confirm.
20053#if(VKFFT_BACKEND==0)
// Backend 0, launch with -1: allocate, begin, append, end, submit.
20055 VkCommandBufferAllocateInfo commandBufferAllocateInfo = { VK_STRUCTURE_TYPE_COMMAND_BUFFER_ALLOCATE_INFO };
20057 commandBufferAllocateInfo.level = VK_COMMAND_BUFFER_LEVEL_PRIMARY;
20058 commandBufferAllocateInfo.commandBufferCount = 1;
20059 VkCommandBuffer commandBuffer = {};
20060 res = vkAllocateCommandBuffers(kernelPreparationApplication.
configuration.
device[0], &commandBufferAllocateInfo, &commandBuffer);
20062 free(phaseVectors);
20066 VkCommandBufferBeginInfo commandBufferBeginInfo = { VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO };
20067 commandBufferBeginInfo.flags = VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT;
20068 res = vkBeginCommandBuffer(commandBuffer, &commandBufferBeginInfo);
20070 free(phaseVectors);
20079 resFFT =
VkFFTAppend(&kernelPreparationApplication, -1, &launchParams);
20081 free(phaseVectors);
20085 res = vkEndCommandBuffer(commandBuffer);
20087 free(phaseVectors);
20091 VkSubmitInfo submitInfo = { VK_STRUCTURE_TYPE_SUBMIT_INFO };
20092 submitInfo.commandBufferCount = 1;
20093 submitInfo.pCommandBuffers = &commandBuffer;
20096 free(phaseVectors);
20102 free(phaseVectors);
20108 free(phaseVectors);
// Backend 0, launch with 1: same command-buffer sequence.
20115 VkCommandBufferAllocateInfo commandBufferAllocateInfo = { VK_STRUCTURE_TYPE_COMMAND_BUFFER_ALLOCATE_INFO };
20117 commandBufferAllocateInfo.level = VK_COMMAND_BUFFER_LEVEL_PRIMARY;
20118 commandBufferAllocateInfo.commandBufferCount = 1;
20119 VkCommandBuffer commandBuffer = {};
20120 res = vkAllocateCommandBuffers(kernelPreparationApplication.
configuration.
device[0], &commandBufferAllocateInfo, &commandBuffer);
20122 free(phaseVectors);
20126 VkCommandBufferBeginInfo commandBufferBeginInfo = { VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO };
20127 commandBufferBeginInfo.flags = VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT;
20128 res = vkBeginCommandBuffer(commandBuffer, &commandBufferBeginInfo);
20130 free(phaseVectors);
20139 resFFT =
VkFFTAppend(&kernelPreparationApplication, 1, &launchParams);
20141 free(phaseVectors);
20145 res = vkEndCommandBuffer(commandBuffer);
20147 free(phaseVectors);
20151 VkSubmitInfo submitInfo = { VK_STRUCTURE_TYPE_SUBMIT_INFO };
20152 submitInfo.commandBufferCount = 1;
20153 submitInfo.pCommandBuffers = &commandBuffer;
20156 free(phaseVectors);
20162 free(phaseVectors);
20168 free(phaseVectors);
20174#elif(VKFFT_BACKEND==1)
// Backend 1: each launch is followed by a device-wide synchronize.
20179 resFFT =
VkFFTAppend(&kernelPreparationApplication, -1, &launchParams);
20181 free(phaseVectors);
20185 res = cudaDeviceSynchronize();
20186 if (res != cudaSuccess) {
20187 free(phaseVectors);
20194 resFFT =
VkFFTAppend(&kernelPreparationApplication, 1, &launchParams);
20196 free(phaseVectors);
20200 res = cudaDeviceSynchronize();
20201 if (res != cudaSuccess) {
20202 free(phaseVectors);
20207#elif(VKFFT_BACKEND==2)
// Backend 2: same sequence with the hip* calls.
20212 resFFT =
VkFFTAppend(&kernelPreparationApplication, -1, &launchParams);
20214 free(phaseVectors);
20218 res = hipDeviceSynchronize();
20219 if (res != hipSuccess) {
20220 free(phaseVectors);
20227 resFFT =
VkFFTAppend(&kernelPreparationApplication, 1, &launchParams);
20229 free(phaseVectors);
20233 res = hipDeviceSynchronize();
20234 if (res != hipSuccess) {
20235 free(phaseVectors);
20240#elif(VKFFT_BACKEND==3)
// Backend 3: launches go through the command queue and are awaited with
// clFinish.
20242 launchParams.commandQueue = &commandQueue;
20246 resFFT =
VkFFTAppend(&kernelPreparationApplication, -1, &launchParams);
20248 free(phaseVectors);
20252 res = clFinish(commandQueue);
20253 if (res != CL_SUCCESS) {
20254 free(phaseVectors);
20261 resFFT =
VkFFTAppend(&kernelPreparationApplication, 1, &launchParams);
20263 free(phaseVectors);
20267 res = clFinish(commandQueue);
20268 if (res != CL_SUCCESS) {
20269 free(phaseVectors);
// Success path: release the host staging buffer, then backend-specific
// cleanup (backend 3 releases its command queue).
20275 free(phaseVectors);
20276#if(VKFFT_BACKEND==0)
20278#elif(VKFFT_BACKEND==3)
20279 res = clReleaseCommandQueue(commandQueue);
21061#if(VKFFT_BACKEND==0)
21062 VkResult res = VK_SUCCESS;
21063#elif(VKFFT_BACKEND==1)
21064 cudaError_t res = cudaSuccess;
21065#elif(VKFFT_BACKEND==2)
21066 hipError_t res = hipSuccess;
21067#elif(VKFFT_BACKEND==3)
21068 cl_int res = CL_SUCCESS;
21074 axis->specializationConstants.numAxisUploads = FFTPlan->
numAxisUploads[0];
21075 uint64_t complexSize;
21077 complexSize = (2 *
sizeof(double));
21080 complexSize = (2 *
sizeof(float));
21082 complexSize = (2 *
sizeof(float));
21083 axis->specializationConstants.complexSize = complexSize;
21084 axis->specializationConstants.supportAxis = 0;
21089 axis->specializationConstants.dispatchZactualFFTSize = 1;
21092 double double_PI = 3.1415926535897932384626433832795;
21095 double* tempLUT = (
double*)malloc(axis->bufferLUTSize);
21102 tempLUT[2 * i] = (double)cos(angle);
21103 tempLUT[2 * i + 1] = (double)sin(angle);
21105 axis->referenceLUT = 0;
21108#if(VKFFT_BACKEND==0)
21112 axis->referenceLUT = 1;
21115#if(VKFFT_BACKEND==0)
21116 resFFT =
allocateFFTBuffer(app, &axis->bufferLUT, &axis->bufferLUTDeviceMemory, VK_BUFFER_USAGE_STORAGE_BUFFER_BIT | VK_BUFFER_USAGE_TRANSFER_SRC_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT, VK_MEMORY_HEAP_DEVICE_LOCAL_BIT, axis->bufferLUTSize);
21130#elif(VKFFT_BACKEND==1)
21131 res = cudaMalloc((
void**)&axis->bufferLUT, axis->bufferLUTSize);
21132 if (res != cudaSuccess) {
21138 res = cudaMemcpy(axis->bufferLUT, tempLUT, axis->bufferLUTSize, cudaMemcpyHostToDevice);
21139 if (res != cudaSuccess) {
21145#elif(VKFFT_BACKEND==2)
21146 res = hipMalloc((
void**)&axis->bufferLUT, axis->bufferLUTSize);
21147 if (res != hipSuccess) {
21153 res = hipMemcpy(axis->bufferLUT, tempLUT, axis->bufferLUTSize, hipMemcpyHostToDevice);
21154 if (res != hipSuccess) {
21160#elif(VKFFT_BACKEND==3)
21161 axis->bufferLUT = clCreateBuffer(app->
configuration.context[0], CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, axis->bufferLUTSize, tempLUT, &res);
21162 if (res != CL_SUCCESS) {
21175 float* tempLUT = (
float*)malloc(axis->bufferLUTSize);
21182 tempLUT[2 * i] = (float)cos(angle);
21183 tempLUT[2 * i + 1] = (float)sin(angle);
21185 axis->referenceLUT = 0;
21188#if(VKFFT_BACKEND==0)
21192 axis->referenceLUT = 1;
21195#if(VKFFT_BACKEND==0)
21196 resFFT =
allocateFFTBuffer(app, &axis->bufferLUT, &axis->bufferLUTDeviceMemory, VK_BUFFER_USAGE_STORAGE_BUFFER_BIT | VK_BUFFER_USAGE_TRANSFER_SRC_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT, VK_MEMORY_HEAP_DEVICE_LOCAL_BIT, axis->bufferLUTSize);
21210#elif(VKFFT_BACKEND==1)
21211 res = cudaMalloc((
void**)&axis->bufferLUT, axis->bufferLUTSize);
21212 if (res != cudaSuccess) {
21218 res = cudaMemcpy(axis->bufferLUT, tempLUT, axis->bufferLUTSize, cudaMemcpyHostToDevice);
21219 if (res != cudaSuccess) {
21225#elif(VKFFT_BACKEND==2)
21226 res = hipMalloc((
void**)&axis->bufferLUT, axis->bufferLUTSize);
21227 if (res != hipSuccess) {
21233 res = hipMemcpy(axis->bufferLUT, tempLUT, axis->bufferLUTSize, hipMemcpyHostToDevice);
21234 if (res != hipSuccess) {
21240#elif(VKFFT_BACKEND==3)
21241 axis->bufferLUT = clCreateBuffer(app->
configuration.context[0], CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, axis->bufferLUTSize, tempLUT, &res);
21242 if (res != CL_SUCCESS) {
21255 uint64_t* axisStride = axis->specializationConstants.inputStride;
21256 uint64_t* usedStride = 0;
21269 axisStride[0] = usedStride[0];
21270 axisStride[1] = usedStride[1];
21271 axisStride[2] = usedStride[2];
21272 axisStride[3] = usedStride[3];
21273 axisStride[4] = usedStride[4];
21275 axisStride = axis->specializationConstants.outputStride;
21276 usedStride = axis->specializationConstants.inputStride;
21278 axisStride[0] = usedStride[0];
21279 axisStride[1] = usedStride[1];
21280 axisStride[2] = usedStride[2];
21281 axisStride[3] = usedStride[3];
21282 axisStride[4] = usedStride[4];
21284 axis->specializationConstants.inverse = inverse;
21286 uint64_t storageComplexSize;
21288 storageComplexSize = (2 *
sizeof(double));
21291 storageComplexSize = (2 * 2);
21293 storageComplexSize = (2 *
sizeof(float));
21295 uint64_t initPageSize = -1;
21296 uint64_t locBufferNum = 1;
21297 uint64_t locBufferSize = 0;
21311 uint64_t axis_id = 0;
21312 uint64_t axis_upload_id = 0;
21315 uint64_t totalSize = 0;
21316 uint64_t locPageSize = initPageSize;
21319 ((axis_id == app->
firstAxis) && (!inverse))
21322 uint64_t totalSize = 0;
21323 uint64_t locPageSize = initPageSize;
21332 axis->specializationConstants.inputBufferBlockSize = (locBufferNum == 1) ? locBufferSize : (uint64_t)ceil(locPageSize / (
double)storageComplexSize);
21333 axis->specializationConstants.inputBufferBlockNum = (locBufferNum == 1) ? 1 : (uint64_t)ceil(totalSize / (
double)(axis->specializationConstants.inputBufferBlockSize * storageComplexSize));
21339 uint64_t totalSize = 0;
21340 uint64_t locPageSize = initPageSize;
21349 axis->specializationConstants.inputBufferBlockSize = (locBufferNum == 1) ? locBufferSize : (uint64_t)ceil(locPageSize / (
double)storageComplexSize);
21350 axis->specializationConstants.inputBufferBlockNum = (locBufferNum == 1) ? 1 : (uint64_t)ceil(totalSize / (
double)(axis->specializationConstants.inputBufferBlockSize * storageComplexSize));
21355 uint64_t totalSize = 0;
21356 uint64_t locPageSize = initPageSize;
21366 axis->specializationConstants.inputBufferBlockSize = (locBufferNum == 1) ? locBufferSize : (uint64_t)ceil(locPageSize / (
double)storageComplexSize);
21367 axis->specializationConstants.inputBufferBlockNum = (locBufferNum == 1) ? 1 : (uint64_t)ceil(totalSize / (
double)(axis->specializationConstants.inputBufferBlockSize * storageComplexSize));
21375 ((axis_id == app->
firstAxis) && (inverse))
21380 ((axis_id == app->
firstAxis) && (inverse))
21387 uint64_t totalSize = 0;
21388 uint64_t locPageSize = initPageSize;
// Elements per bound input block: locBufferSize when a single buffer is used, otherwise locPageSize divided by the storage size of one complex element.
21397 axis->specializationConstants.inputBufferBlockSize = (locBufferNum == 1) ? locBufferSize : (uint64_t)ceil(locPageSize / (
double)storageComplexSize);
// Number of input blocks. Stored in inputBufferBlockNum (not inputBufferBlockSize): the previous code overwrote the block size computed above with the block count and left inputBufferBlockNum unset, although it is read later for numBuffersBound[0].
21398 axis->specializationConstants.inputBufferBlockNum = (locBufferNum == 1) ? 1 : (uint64_t)ceil(totalSize / (
double)(axis->specializationConstants.inputBufferBlockSize * storageComplexSize));
21403 uint64_t totalSize = 0;
21404 uint64_t locPageSize = initPageSize;
// Elements per bound input block: locBufferSize when a single buffer is used, otherwise locPageSize divided by the storage size of one complex element.
21414 axis->specializationConstants.inputBufferBlockSize = (locBufferNum == 1) ? locBufferSize : (uint64_t)ceil(locPageSize / (
double)storageComplexSize);
// Number of input blocks. Stored in inputBufferBlockNum (not inputBufferBlockSize): the previous code overwrote the block size computed above with the block count and left inputBufferBlockNum unset, although it is read later for numBuffersBound[0].
21415 axis->specializationConstants.inputBufferBlockNum = (locBufferNum == 1) ? 1 : (uint64_t)ceil(totalSize / (
double)(axis->specializationConstants.inputBufferBlockSize * storageComplexSize));
21423 locBufferSize = -1;
21427 uint64_t totalSize = 0;
21428 uint64_t locPageSize = initPageSize;
21437 axis->specializationConstants.outputBufferBlockSize = (locBufferNum == 1) ? locBufferSize : (uint64_t)ceil(locPageSize / (
double)storageComplexSize);
21438 axis->specializationConstants.outputBufferBlockNum = (locBufferNum == 1) ? 1 : (uint64_t)ceil(totalSize / (
double)(axis->specializationConstants.outputBufferBlockSize * storageComplexSize));
21443 uint64_t totalSize = 0;
21444 uint64_t locPageSize = initPageSize;
21454 axis->specializationConstants.outputBufferBlockSize = (locBufferNum == 1) ? locBufferSize : (uint64_t)ceil(locPageSize / (
double)storageComplexSize);
21455 axis->specializationConstants.outputBufferBlockNum = (locBufferNum == 1) ? 1 : (uint64_t)ceil(totalSize / (
double)(axis->specializationConstants.outputBufferBlockSize * storageComplexSize));
21462 ((axis_id == app->
firstAxis) && (inverse))
21467 ((axis_id == app->
firstAxis) && (inverse))
21474 uint64_t totalSize = 0;
21475 uint64_t locPageSize = initPageSize;
21484 axis->specializationConstants.outputBufferBlockSize = (locBufferNum == 1) ? locBufferSize : (uint64_t)ceil(locPageSize / (
double)storageComplexSize);
21485 axis->specializationConstants.outputBufferBlockNum = (locBufferNum == 1) ? 1 : (uint64_t)ceil(totalSize / (
double)(axis->specializationConstants.outputBufferBlockSize * storageComplexSize));
21490 uint64_t totalSize = 0;
21491 uint64_t locPageSize = initPageSize;
21501 axis->specializationConstants.outputBufferBlockSize = (locBufferNum == 1) ? locBufferSize : (uint64_t)ceil(locPageSize / (
double)storageComplexSize);
21502 axis->specializationConstants.outputBufferBlockNum = (locBufferNum == 1) ? 1 : (uint64_t)ceil(totalSize / (
double)(axis->specializationConstants.outputBufferBlockSize * storageComplexSize));
21509 if (axis->specializationConstants.inputBufferBlockNum == 0) axis->specializationConstants.inputBufferBlockNum = 1;
21510 if (axis->specializationConstants.outputBufferBlockNum == 0) axis->specializationConstants.outputBufferBlockNum = 1;
21513 uint64_t totalSize = 0;
21514 uint64_t locPageSize = initPageSize;
21521 axis->specializationConstants.kernelBlockSize = (uint64_t)ceil(locPageSize / (
double)storageComplexSize);
21522 axis->specializationConstants.kernelBlockNum = (uint64_t)ceil(totalSize / (
double)(axis->specializationConstants.kernelBlockSize * storageComplexSize));
21524 if (axis->specializationConstants.kernelBlockNum == 0) axis->specializationConstants.kernelBlockNum = 1;
21527 axis->specializationConstants.kernelBlockSize = 0;
21528 axis->specializationConstants.kernelBlockNum = 0;
21530 axis->numBindings = 2;
21531 axis->specializationConstants.numBuffersBound[0] = axis->specializationConstants.inputBufferBlockNum;
21532 axis->specializationConstants.numBuffersBound[1] = axis->specializationConstants.outputBufferBlockNum;
21533 axis->specializationConstants.numBuffersBound[2] = 0;
21534 axis->specializationConstants.numBuffersBound[3] = 0;
21536#if(VKFFT_BACKEND==0)
21537 VkDescriptorPoolSize descriptorPoolSize = { VK_DESCRIPTOR_TYPE_STORAGE_BUFFER };
21538 descriptorPoolSize.descriptorCount = (uint32_t)(axis->specializationConstants.numBuffersBound[0] + axis->specializationConstants.numBuffersBound[1]);
21541 axis->specializationConstants.numBuffersBound[axis->numBindings] = axis->specializationConstants.kernelBlockNum;
21542#if(VKFFT_BACKEND==0)
21543 descriptorPoolSize.descriptorCount += (uint32_t)axis->specializationConstants.kernelBlockNum;
21545 axis->numBindings++;
21549 axis->specializationConstants.numBuffersBound[axis->numBindings] = 1;
21550#if(VKFFT_BACKEND==0)
21551 descriptorPoolSize.descriptorCount++;
21553 axis->numBindings++;
21555#if(VKFFT_BACKEND==0)
21556 VkDescriptorPoolCreateInfo descriptorPoolCreateInfo = { VK_STRUCTURE_TYPE_DESCRIPTOR_POOL_CREATE_INFO };
21557 descriptorPoolCreateInfo.poolSizeCount = 1;
21558 descriptorPoolCreateInfo.pPoolSizes = &descriptorPoolSize;
21559 descriptorPoolCreateInfo.maxSets = 1;
21560 res = vkCreateDescriptorPool(app->
configuration.
device[0], &descriptorPoolCreateInfo, 0, &axis->descriptorPool);
21561 if (res != VK_SUCCESS) {
21565 const VkDescriptorType descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER;
21566 VkDescriptorSetLayoutBinding* descriptorSetLayoutBindings;
21567 descriptorSetLayoutBindings = (VkDescriptorSetLayoutBinding*)malloc(axis->numBindings *
sizeof(VkDescriptorSetLayoutBinding));
21568 if (!descriptorSetLayoutBindings) {
21572 for (uint64_t i = 0; i < axis->numBindings; ++i) {
21573 descriptorSetLayoutBindings[i].binding = (uint32_t)i;
21574 descriptorSetLayoutBindings[i].descriptorType = descriptorType;
21575 descriptorSetLayoutBindings[i].descriptorCount = (uint32_t)axis->specializationConstants.numBuffersBound[i];
21576 descriptorSetLayoutBindings[i].stageFlags = VK_SHADER_STAGE_COMPUTE_BIT;
21579 VkDescriptorSetLayoutCreateInfo descriptorSetLayoutCreateInfo = { VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_CREATE_INFO };
21580 descriptorSetLayoutCreateInfo.bindingCount = (uint32_t)axis->numBindings;
21581 descriptorSetLayoutCreateInfo.pBindings = descriptorSetLayoutBindings;
21583 res = vkCreateDescriptorSetLayout(app->
configuration.
device[0], &descriptorSetLayoutCreateInfo, 0, &axis->descriptorSetLayout);
21584 if (res != VK_SUCCESS) {
21588 free(descriptorSetLayoutBindings);
21589 descriptorSetLayoutBindings = 0;
21590 VkDescriptorSetAllocateInfo descriptorSetAllocateInfo = { VK_STRUCTURE_TYPE_DESCRIPTOR_SET_ALLOCATE_INFO };
21591 descriptorSetAllocateInfo.descriptorPool = axis->descriptorPool;
21592 descriptorSetAllocateInfo.descriptorSetCount = 1;
21593 descriptorSetAllocateInfo.pSetLayouts = &axis->descriptorSetLayout;
21594 res = vkAllocateDescriptorSets(app->
configuration.
device[0], &descriptorSetAllocateInfo, &axis->descriptorSet);
21595 if (res != VK_SUCCESS) {
21611#if(VKFFT_BACKEND==0)
21612 VkPipelineLayoutCreateInfo pipelineLayoutCreateInfo = { VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO };
21613 pipelineLayoutCreateInfo.setLayoutCount = 1;
21614 pipelineLayoutCreateInfo.pSetLayouts = &axis->descriptorSetLayout;
21616 VkPushConstantRange pushConstantRange = { VK_SHADER_STAGE_COMPUTE_BIT };
21617 pushConstantRange.offset = 0;
21620 pipelineLayoutCreateInfo.pushConstantRangeCount = 1;
21621 pipelineLayoutCreateInfo.pPushConstantRanges = &pushConstantRange;
21623 res = vkCreatePipelineLayout(app->
configuration.
device[0], &pipelineLayoutCreateInfo, 0, &axis->pipelineLayout);
21624 if (res != VK_SUCCESS) {
21629 axis->axisBlock[0] = 128;
21631 axis->axisBlock[1] = 1;
21632 axis->axisBlock[2] = 1;
21638 else axis->specializationConstants.performWorkGroupShift[0] = 0;
21640 else axis->specializationConstants.performWorkGroupShift[1] = 0;
21642 else axis->specializationConstants.performWorkGroupShift[2] = 0;
21644 axis->specializationConstants.localSize[0] = axis->axisBlock[0];
21645 axis->specializationConstants.localSize[1] = axis->axisBlock[1];
21646 axis->specializationConstants.localSize[2] = axis->axisBlock[2];
21663 axis->specializationConstants.axis_id = 0;
21664 axis->specializationConstants.axis_upload_id = 0;
21666 for (uint64_t i = 0; i < 3; i++) {
21679 axis->specializationConstants.zeropad[0] = 0;
21686 axis->specializationConstants.zeropad[1] = 0;
21695 axis->specializationConstants.zeropad[0] = 0;
21702 axis->specializationConstants.zeropad[1] = 0;
21705 axis->specializationConstants.convolutionStep = 1;
21708 axis->specializationConstants.convolutionStep = 0;
21709 char floatTypeInputMemory[10];
21710 char floatTypeOutputMemory[10];
21711 char floatTypeKernelMemory[10];
21712 char floatType[10];
21713 axis->specializationConstants.unroll = 1;
21716 sprintf(floatType,
"double");
21717 sprintf(floatTypeInputMemory,
"double");
21718 sprintf(floatTypeOutputMemory,
"double");
21719 sprintf(floatTypeKernelMemory,
"double");
21725 sprintf(floatType,
"float");
21728 sprintf(floatTypeInputMemory,
"float");
21729 sprintf(floatTypeOutputMemory,
"float");
21730 sprintf(floatTypeKernelMemory,
"float");
21733 sprintf(floatTypeInputMemory,
"half");
21734 sprintf(floatTypeOutputMemory,
"half");
21735 sprintf(floatTypeKernelMemory,
"half");
21741 sprintf(floatType,
"double");
21742 sprintf(floatTypeInputMemory,
"float");
21743 sprintf(floatTypeOutputMemory,
"float");
21744 sprintf(floatTypeKernelMemory,
"float");
21747 sprintf(floatType,
"float");
21748 sprintf(floatTypeInputMemory,
"float");
21749 sprintf(floatTypeOutputMemory,
"float");
21750 sprintf(floatTypeKernelMemory,
"float");
21754 char uintType[20] =
"";
21756#if(VKFFT_BACKEND==0)
21757 sprintf(uintType,
"uint");
21758#elif(VKFFT_BACKEND==1)
21759 sprintf(uintType,
"unsigned int");
21760#elif(VKFFT_BACKEND==2)
21761 sprintf(uintType,
"unsigned int");
21762#elif(VKFFT_BACKEND==3)
21763 sprintf(uintType,
"unsigned int");
21767#if(VKFFT_BACKEND==0)
21768 sprintf(uintType,
"uint64_t");
21769#elif(VKFFT_BACKEND==1)
21770 sprintf(uintType,
"unsigned long long");
21771#elif(VKFFT_BACKEND==2)
21772 sprintf(uintType,
"unsigned long long");
21773#elif(VKFFT_BACKEND==3)
21774 sprintf(uintType,
"unsigned long");
21783 char* code0 = axis->specializationConstants.code0;
21788 resFFT =
shaderGenVkFFT_R2C_decomposition(code0, &axis->specializationConstants, floatType, floatTypeInputMemory, floatTypeOutputMemory, floatTypeKernelMemory, uintType, type);
21794#if(VKFFT_BACKEND==0)
21795 const glslang_resource_t default_resource = {
21901 glslang_target_client_version_t client_version = (app->
configuration.
halfPrecision) ? GLSLANG_TARGET_VULKAN_1_1 : GLSLANG_TARGET_VULKAN_1_0;
21902 glslang_target_language_version_t target_language_version = (app->
configuration.
halfPrecision) ? GLSLANG_TARGET_SPV_1_3 : GLSLANG_TARGET_SPV_1_0;
21903 const glslang_input_t input =
21905 GLSLANG_SOURCE_GLSL,
21906 GLSLANG_STAGE_COMPUTE,
21907 GLSLANG_CLIENT_VULKAN,
21909 GLSLANG_TARGET_SPV,
21910 target_language_version,
21913 GLSLANG_NO_PROFILE,
21916 GLSLANG_MSG_DEFAULT_BIT,
21920 glslang_shader_t* shader = glslang_shader_create(&input);
21922 if (!glslang_shader_preprocess(shader, &input))
21924 err = glslang_shader_get_info_log(shader);
21925 printf(
"%s\n", code0);
21926 printf(
"%s\nVkFFT shader type: %" PRIu64
"\n", err, type);
21927 glslang_shader_delete(shader);
21935 if (!glslang_shader_parse(shader, &input))
21937 err = glslang_shader_get_info_log(shader);
21938 printf(
"%s\n", code0);
21939 printf(
"%s\nVkFFT shader type: %" PRIu64
"\n", err, type);
21940 glslang_shader_delete(shader);
21947 glslang_program_t* program = glslang_program_create();
21948 glslang_program_add_shader(program, shader);
21949 if (!glslang_program_link(program, GLSLANG_MSG_SPV_RULES_BIT | GLSLANG_MSG_VULKAN_RULES_BIT))
21951 err = glslang_program_get_info_log(program);
21952 printf(
"%s\n", code0);
21953 printf(
"%s\nVkFFT shader type: %" PRIu64
"\n", err, type);
21954 glslang_shader_delete(shader);
21955 glslang_program_delete(program);
21978 glslang_shader_delete(shader);
21979 VkPipelineShaderStageCreateInfo pipelineShaderStageCreateInfo = { VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO };
21980 VkComputePipelineCreateInfo computePipelineCreateInfo = { VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO };
21981 pipelineShaderStageCreateInfo.stage = VK_SHADER_STAGE_COMPUTE_BIT;
21982 VkShaderModuleCreateInfo createInfo = { VK_STRUCTURE_TYPE_SHADER_MODULE_CREATE_INFO };
21986 res = vkCreateShaderModule(app->
configuration.
device[0], &createInfo, 0, &pipelineShaderStageCreateInfo.module);
21987 if (res != VK_SUCCESS) {
21988 glslang_program_delete(program);
21994 pipelineShaderStageCreateInfo.pName =
"main";
21995 pipelineShaderStageCreateInfo.pSpecializationInfo = 0;
21996 computePipelineCreateInfo.stage = pipelineShaderStageCreateInfo;
21997 computePipelineCreateInfo.layout = axis->pipelineLayout;
21998 res = vkCreateComputePipelines(app->
configuration.
device[0], VK_NULL_HANDLE, 1, &computePipelineCreateInfo, 0, &axis->pipeline);
21999 if (res != VK_SUCCESS) {
22003 vkDestroyShaderModule(app->
configuration.
device[0], pipelineShaderStageCreateInfo.module, 0);
22004 glslang_program_delete(program);
22005#elif(VKFFT_BACKEND==1)
22007 nvrtcResult result = nvrtcCreateProgram(&prog,
22015 if (result != NVRTC_SUCCESS) {
22016 printf(
"nvrtcCreateProgram error: %s\n", nvrtcGetErrorString(result));
22025 result = nvrtcCompileProgram(prog,
22028 if (result != NVRTC_SUCCESS) {
22029 printf(
"nvrtcCompileProgram error: %s\n", nvrtcGetErrorString(result));
22030 char* log = (
char*)malloc(
sizeof(
char) * 1000000);
22038 nvrtcGetProgramLog(prog, log);
22039 printf(
"%s\n", log);
22042 printf(
"%s\n", code0);
22050 result = nvrtcGetPTXSize(prog, &ptxSize);
22051 if (result != NVRTC_SUCCESS) {
22052 printf(
"nvrtcGetPTXSize error: %s\n", nvrtcGetErrorString(result));
22058 char* ptx = (
char*)malloc(ptxSize);
22065 result = nvrtcGetPTX(prog, ptx);
22066 if (result != NVRTC_SUCCESS) {
22067 printf(
"nvrtcGetPTX error: %s\n", nvrtcGetErrorString(result));
22075 result = nvrtcDestroyProgram(&prog);
22076 if (result != NVRTC_SUCCESS) {
22077 printf(
"nvrtcDestroyProgram error: %s\n", nvrtcGetErrorString(result));
22086 CUresult result2 = cuModuleLoadDataEx(&axis->VkFFTModule, ptx, 0, 0, 0);
22088 if (result2 != CUDA_SUCCESS) {
22089 printf(
"cuModuleLoadDataEx error: %d\n", result2);
22097 result2 = cuModuleGetFunction(&axis->VkFFTKernel, axis->VkFFTModule,
"VkFFT_main_R2C");
22098 if (result2 != CUDA_SUCCESS) {
22099 printf(
"cuModuleGetFunction error: %d\n", result2);
22108 result2 = cuFuncSetAttribute(axis->VkFFTKernel, CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, (
int)axis->specializationConstants.usedSharedMemory);
22109 if (result2 != CUDA_SUCCESS) {
22110 printf(
"cuFuncSetAttribute error: %d\n", result2);
22120 result2 = cuModuleGetGlobal(&axis->consts_addr, &size, axis->VkFFTModule,
"consts");
22121 if (result2 != CUDA_SUCCESS) {
22122 printf(
"cuModuleGetGlobal error: %d\n", result2);
22132#elif(VKFFT_BACKEND==2)
22133 hiprtcProgram prog;
22138 enum hiprtcResult result = hiprtcCreateProgram(&prog,
22144 if (result != HIPRTC_SUCCESS) {
22145 printf(
"hiprtcCreateProgram error: %s\n", hiprtcGetErrorString(result));
22152 result = hiprtcAddNameExpression(prog,
"&consts");
22153 if (result != HIPRTC_SUCCESS) {
22154 printf(
"hiprtcAddNameExpression error: %s\n", hiprtcGetErrorString(result));
22161 result = hiprtcCompileProgram(prog,
22164 if (result != HIPRTC_SUCCESS) {
22165 printf(
"hiprtcCompileProgram error: %s\n", hiprtcGetErrorString(result));
22166 char* log = (
char*)malloc(
sizeof(
char) * 100000);
22174 hiprtcGetProgramLog(prog, log);
22175 printf(
"%s\n", log);
22178 printf(
"%s\n", code0);
22186 result = hiprtcGetCodeSize(prog, &codeSize);
22187 if (result != HIPRTC_SUCCESS) {
22188 printf(
"hiprtcGetCodeSize error: %s\n", hiprtcGetErrorString(result));
22194 char* code = (
char*)malloc(codeSize);
22201 result = hiprtcGetCode(prog, code);
22202 if (result != HIPRTC_SUCCESS) {
22203 printf(
"hiprtcGetCode error: %s\n", hiprtcGetErrorString(result));
22213 result = hiprtcDestroyProgram(&prog);
22214 if (result != HIPRTC_SUCCESS) {
22215 printf(
"hiprtcDestroyProgram error: %s\n", hiprtcGetErrorString(result));
22223 hipError_t result2 = hipModuleLoadDataEx(&axis->VkFFTModule, code, 0, 0, 0);
22225 if (result2 != hipSuccess) {
22226 printf(
"hipModuleLoadDataEx error: %d\n", result2);
22234 result2 = hipModuleGetFunction(&axis->VkFFTKernel, axis->VkFFTModule,
"VkFFT_main_R2C");
22235 if (result2 != hipSuccess) {
22236 printf(
"hipModuleGetFunction error: %d\n", result2);
22245 result2 = hipFuncSetAttribute(axis->VkFFTKernel, hipFuncAttributeMaxDynamicSharedMemorySize, (
int)axis->specializationConstants.usedSharedMemory);
22247 if (result2 != hipSuccess) {
22248 printf(
"hipFuncSetAttribute error: %d\n", result2);
22258 result2 = hipModuleGetGlobal(&axis->consts_addr, &size, axis->VkFFTModule,
"consts");
22259 if (result2 != hipSuccess) {
22260 printf(
"hipModuleGetGlobal error: %d\n", result2);
22271#elif(VKFFT_BACKEND==3)
22272 size_t codelen = strlen(code0);
22273 axis->program = clCreateProgramWithSource(app->
configuration.context[0], 1, (
const char**)&code0, &codelen, &res);
22274 if (res != CL_SUCCESS) {
22281 if (res != CL_SUCCESS) {
22283 clGetProgramBuildInfo(axis->program, app->
configuration.
device[0], CL_PROGRAM_BUILD_LOG, 0, 0, &log_size);
22284 char* log = (
char*)malloc(log_size);
22292 clGetProgramBuildInfo(axis->program, app->
configuration.
device[0], CL_PROGRAM_BUILD_LOG, log_size, log, 0);
22293 printf(
"%s\n", log);
22296 printf(
"%s\n", code0);
22303 axis->kernel = clCreateKernel(axis->program,
"VkFFT_main_R2C", &res);
22304 if (res != CL_SUCCESS) {
22314 axis->specializationConstants.code0 = 0;
22322#if(VKFFT_BACKEND==0)
22323 VkResult res = VK_SUCCESS;
22324#elif(VKFFT_BACKEND==1)
22325 cudaError_t res = cudaSuccess;
22326#elif(VKFFT_BACKEND==2)
22327 hipError_t res = hipSuccess;
22328#elif(VKFFT_BACKEND==3)
22329 cl_int res = CL_SUCCESS;
22333 axis->specializationConstants.sourceFFTSize = app->
configuration.
size[axis_id];
22346 axis->specializationConstants.numAxisUploads = FFTPlan->
numAxisUploads[axis_id];
22347 uint64_t complexSize;
22349 complexSize = (2 *
sizeof(double));
22352 complexSize = (2 *
sizeof(float));
22354 complexSize = (2 *
sizeof(float));
22355 axis->specializationConstants.complexSize = complexSize;
22356 axis->specializationConstants.supportAxis = 0;
22366 axis->specializationConstants.stageStartSize = 1;
22367 for (uint64_t i = 0; i < axis_upload_id; i++)
22368 axis->specializationConstants.stageStartSize *= FFTPlan->
axisSplit[axis_id][i];
22373 if (axis_id == 0) {
22375 axis->specializationConstants.fft_dim_x = axis->specializationConstants.stageStartSize;
22381 axis->specializationConstants.useBluesteinFFT = 1;
22385 axis->specializationConstants.actualInverse = inverse;
22386 axis->specializationConstants.inverse = !inverse;
22390 axis->specializationConstants.actualInverse = inverse;
22391 axis->specializationConstants.inverse = 1;
22394 axis->specializationConstants.actualInverse = inverse;
22395 axis->specializationConstants.inverse = inverse;
22399 axis->specializationConstants.actualInverse = inverse;
22400 axis->specializationConstants.inverse = reverseBluesteinMultiUpload;
22402 axis->specializationConstants.inverseBluestein = !inverse;
22406 axis->specializationConstants.inverseBluestein = 1;
22409 axis->specializationConstants.inverseBluestein = inverse;
22413 axis->specializationConstants.reverseBluesteinMultiUpload = reverseBluesteinMultiUpload;
22417 if ((axis_id == 0) && ((FFTPlan->
numAxisUploads[axis_id] == 1) || ((axis_upload_id == 0) && (!axis->specializationConstants.reorderFourStep)))) {
22418 maxSequenceLengthSharedMemory *= axis->specializationConstants.registerBoost;
22419 maxSequenceLengthSharedMemoryPow2 = (uint64_t)pow(2, (uint64_t)log2(maxSequenceLengthSharedMemory));
22422 maxSingleSizeStrided *= axis->specializationConstants.registerBoost;
22423 maxSingleSizeStridedPow2 = (uint64_t)pow(2, (uint64_t)log2(maxSingleSizeStrided));
22427 axis->specializationConstants.performR2CmultiUpload = FFTPlan->
multiUploadR2C;
22429 axis->specializationConstants.performDCT = 2;
22441#if(VKFFT_BACKEND==0)
22452#elif(VKFFT_BACKEND==1)
22459 if (res != cudaSuccess) {
22463#elif(VKFFT_BACKEND==2)
22470 if (res != hipSuccess) {
22474#elif(VKFFT_BACKEND==3)
22481 if (res != CL_SUCCESS) {
22489 double double_PI = 3.1415926535897932384626433832795;
22490 uint64_t dimMult = 1;
22491 uint64_t maxStageSum = 0;
22492 for (uint64_t i = 0; i < axis->specializationConstants.numStages; i++) {
22493 switch (axis->specializationConstants.stageRadix[i]) {
22495 maxStageSum += dimMult;
22498 maxStageSum += dimMult * 2;
22501 maxStageSum += dimMult * 2;
22504 maxStageSum += dimMult * 4;
22507 maxStageSum += dimMult * 6;
22510 maxStageSum += dimMult * 3;
22513 maxStageSum += dimMult * 10;
22516 maxStageSum += dimMult * 12;
22519 dimMult *= axis->specializationConstants.stageRadix[i];
22521 axis->specializationConstants.maxStageSumLUT = maxStageSum;
22524 if (axis_upload_id > 0) {
22526 axis->specializationConstants.startDCT3LUT = (maxStageSum + axis->specializationConstants.stageStartSize * axis->specializationConstants.fftDim);
22527 axis->bufferLUTSize = (maxStageSum + axis->specializationConstants.stageStartSize * axis->specializationConstants.fftDim + (app->
configuration.
size[axis_id] / 2 + 2)) * 2 *
sizeof(
double);
22531 axis->specializationConstants.startDCT3LUT = (maxStageSum + axis->specializationConstants.stageStartSize * axis->specializationConstants.fftDim);
22532 axis->specializationConstants.startDCT4LUT = (axis->specializationConstants.startDCT3LUT + (app->
configuration.
size[axis_id] / 4 + 2));
22533 axis->bufferLUTSize = (maxStageSum + axis->specializationConstants.stageStartSize * axis->specializationConstants.fftDim + (app->
configuration.
size[axis_id] / 4 + 2) + app->
configuration.
size[axis_id] / 2) * 2 *
sizeof(
double);
22536 axis->bufferLUTSize = (maxStageSum + axis->specializationConstants.stageStartSize * axis->specializationConstants.fftDim) * 2 *
sizeof(
double);
22541 axis->specializationConstants.startDCT3LUT = (maxStageSum);
22542 axis->bufferLUTSize = (maxStageSum + (app->
configuration.
size[axis_id] / 2 + 2)) * 2 *
sizeof(
double);
22546 axis->specializationConstants.startDCT3LUT = (maxStageSum);
22547 axis->specializationConstants.startDCT4LUT = (axis->specializationConstants.startDCT3LUT + (app->
configuration.
size[axis_id] / 4 + 2));
22552 axis->bufferLUTSize = (maxStageSum) * 2 *
sizeof(
double);
22555 double* tempLUT = (
double*)malloc(axis->bufferLUTSize);
22560 uint64_t localStageSize = 1;
22561 uint64_t localStageSum = 0;
22562 for (uint64_t i = 0; i < axis->specializationConstants.numStages; i++) {
22563 if ((axis->specializationConstants.stageRadix[i] & (axis->specializationConstants.stageRadix[i] - 1)) == 0) {
22564 for (uint64_t k = 0; k < log2(axis->specializationConstants.stageRadix[i]); k++) {
22565 for (uint64_t j = 0; j < localStageSize; j++) {
22566 tempLUT[2 * (j + localStageSum)] = cos(j * double_PI / localStageSize / pow(2, k));
22567 tempLUT[2 * (j + localStageSum) + 1] = sin(j * double_PI / localStageSize / pow(2, k));
22569 localStageSum += localStageSize;
22571 localStageSize *= axis->specializationConstants.stageRadix[i];
22574 for (uint64_t k = (axis->specializationConstants.stageRadix[i] - 1); k > 0; k--) {
22575 for (uint64_t j = 0; j < localStageSize; j++) {
22576 tempLUT[2 * (j + localStageSum)] = cos(j * 2.0 * k / axis->specializationConstants.stageRadix[i] * double_PI / localStageSize);
22577 tempLUT[2 * (j + localStageSum) + 1] = sin(j * 2.0 * k / axis->specializationConstants.stageRadix[i] * double_PI / localStageSize);
22579 localStageSum += localStageSize;
22581 localStageSize *= axis->specializationConstants.stageRadix[i];
22585 if (axis_upload_id > 0) {
22586 for (uint64_t i = 0; i < axis->specializationConstants.stageStartSize; i++) {
22587 for (uint64_t j = 0; j < axis->specializationConstants.fftDim; j++) {
22588 double angle = 2 * double_PI * ((i * j) / (double)(axis->specializationConstants.stageStartSize * axis->specializationConstants.fftDim));
22589 tempLUT[maxStageSum * 2 + 2 * (i + j * axis->specializationConstants.stageStartSize)] = cos(angle);
22590 tempLUT[maxStageSum * 2 + 2 * (i + j * axis->specializationConstants.stageStartSize) + 1] = sin(angle);
22596 double angle = (double_PI / 2.0 / (double)(app->
configuration.
size[axis_id])) * j;
22597 tempLUT[2 * axis->specializationConstants.startDCT3LUT + 2 * j] = cos(angle);
22598 tempLUT[2 * axis->specializationConstants.startDCT3LUT + 2 * j + 1] = sin(angle);
22603 double angle = (double_PI / 2.0 / (double)(app->
configuration.
size[axis_id] / 2)) * j;
22604 tempLUT[2 * axis->specializationConstants.startDCT3LUT + 2 * j] = cos(angle);
22605 tempLUT[2 * axis->specializationConstants.startDCT3LUT + 2 * j + 1] = sin(angle);
22608 double angle = (-double_PI / 8.0 / (double)(app->
configuration.
size[axis_id] / 2)) * (2 * j + 1);
22609 tempLUT[2 * axis->specializationConstants.startDCT4LUT + 2 * j] = cos(angle);
22610 tempLUT[2 * axis->specializationConstants.startDCT4LUT + 2 * j + 1] = sin(angle);
22613 axis->referenceLUT = 0;
22614 if (reverseBluesteinMultiUpload == 1) {
22615 axis->bufferLUT = FFTPlan->
axes[axis_id][axis_upload_id].
bufferLUT;
22616#if(VKFFT_BACKEND==0)
22620 axis->referenceLUT = 1;
22625#if(VKFFT_BACKEND==0)
22629 axis->referenceLUT = 1;
22633 axis->bufferLUT = FFTPlan->
axes[0][axis_upload_id].
bufferLUT;
22634#if(VKFFT_BACKEND==0)
22638 axis->referenceLUT = 1;
22642 axis->bufferLUT = FFTPlan->
axes[1][axis_upload_id].
bufferLUT;
22643#if(VKFFT_BACKEND==0)
22647 axis->referenceLUT = 1;
22650#if(VKFFT_BACKEND==0)
22651 resFFT =
allocateFFTBuffer(app, &axis->bufferLUT, &axis->bufferLUTDeviceMemory, VK_BUFFER_USAGE_STORAGE_BUFFER_BIT | VK_BUFFER_USAGE_TRANSFER_SRC_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT, VK_MEMORY_HEAP_DEVICE_LOCAL_BIT, axis->bufferLUTSize);
22665#elif(VKFFT_BACKEND==1)
22666 res = cudaMalloc((
void**)&axis->bufferLUT, axis->bufferLUTSize);
22667 if (res != cudaSuccess) {
22673 res = cudaMemcpy(axis->bufferLUT, tempLUT, axis->bufferLUTSize, cudaMemcpyHostToDevice);
22674 if (res != cudaSuccess) {
22680#elif(VKFFT_BACKEND==2)
22681 res = hipMalloc((
void**)&axis->bufferLUT, axis->bufferLUTSize);
22682 if (res != hipSuccess) {
22688 res = hipMemcpy(axis->bufferLUT, tempLUT, axis->bufferLUTSize, hipMemcpyHostToDevice);
22689 if (res != hipSuccess) {
22695#elif(VKFFT_BACKEND==3)
22696 axis->bufferLUT = clCreateBuffer(app->
configuration.context[0], CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, axis->bufferLUTSize, tempLUT, &res);
22697 if (res != CL_SUCCESS) {
22712 if (axis_upload_id > 0) {
22714 axis->specializationConstants.startDCT3LUT = (maxStageSum + axis->specializationConstants.stageStartSize * axis->specializationConstants.fftDim);
22715 axis->bufferLUTSize = (maxStageSum + axis->specializationConstants.stageStartSize * axis->specializationConstants.fftDim + (app->
configuration.
size[axis_id] / 2 + 2)) * 2 *
sizeof(
float);
// LUT layout for this case (in complex entries): [stage twiddles: maxStageSum][stageStartSize * fftDim entries][DCT3 region: size/4 + 2][DCT4 region: size/2].
22719 axis->specializationConstants.startDCT3LUT = (maxStageSum + axis->specializationConstants.stageStartSize * axis->specializationConstants.fftDim);
// The DCT4 region starts right after the DCT3 region, whose length in the allocation below is size[axis_id] / 4 + 2.
// The offset previously used fftDim / 4 + 2, which disagrees with that allocation and with the double-precision path.
22720 axis->specializationConstants.startDCT4LUT = (axis->specializationConstants.startDCT3LUT + (app->configuration.size[axis_id] / 4 + 2));
22721 axis->bufferLUTSize = (maxStageSum + axis->specializationConstants.stageStartSize * axis->specializationConstants.fftDim + (app->
configuration.
size[axis_id] / 4 + 2) + app->
configuration.
size[axis_id] / 2) * 2 *
sizeof(
float);
22724 axis->bufferLUTSize = (maxStageSum + axis->specializationConstants.stageStartSize * axis->specializationConstants.fftDim) * 2 *
sizeof(
float);
22729 axis->specializationConstants.startDCT3LUT = (maxStageSum);
22730 axis->bufferLUTSize = (maxStageSum + (app->
configuration.
size[axis_id] / 2 + 2)) * 2 *
sizeof(
float);
22734 axis->specializationConstants.startDCT3LUT = (maxStageSum);
22735 axis->specializationConstants.startDCT4LUT = (axis->specializationConstants.startDCT3LUT + (app->
configuration.
size[axis_id] / 4 + 2));
22739 axis->bufferLUTSize = (maxStageSum) * 2 *
sizeof(
float);
22742 float* tempLUT = (
float*)malloc(axis->bufferLUTSize);
22747 uint64_t localStageSize = 1;
22748 uint64_t localStageSum = 0;
22749 for (uint64_t i = 0; i < axis->specializationConstants.numStages; i++) {
22750 if ((axis->specializationConstants.stageRadix[i] & (axis->specializationConstants.stageRadix[i] - 1)) == 0) {
22751 for (uint64_t k = 0; k < log2(axis->specializationConstants.stageRadix[i]); k++) {
22752 for (uint64_t j = 0; j < localStageSize; j++) {
22753 tempLUT[2 * (j + localStageSum)] = (
float)cos(j * double_PI / localStageSize / pow(2, k));
22754 tempLUT[2 * (j + localStageSum) + 1] = (
float)sin(j * double_PI / localStageSize / pow(2, k));
22756 localStageSum += localStageSize;
22758 localStageSize *= axis->specializationConstants.stageRadix[i];
22761 for (uint64_t k = (axis->specializationConstants.stageRadix[i] - 1); k > 0; k--) {
22762 for (uint64_t j = 0; j < localStageSize; j++) {
22763 tempLUT[2 * (j + localStageSum)] = (
float)cos(j * 2.0 * k / axis->specializationConstants.stageRadix[i] * double_PI / localStageSize);
22764 tempLUT[2 * (j + localStageSum) + 1] = (
float)sin(j * 2.0 * k / axis->specializationConstants.stageRadix[i] * double_PI / localStageSize);
22766 localStageSum += localStageSize;
22768 localStageSize *= axis->specializationConstants.stageRadix[i];
22772 if (axis_upload_id > 0) {
22773 for (uint64_t i = 0; i < axis->specializationConstants.stageStartSize; i++) {
22774 for (uint64_t j = 0; j < axis->specializationConstants.fftDim; j++) {
22775 double angle = 2 * double_PI * ((i * j) / (double)(axis->specializationConstants.stageStartSize * axis->specializationConstants.fftDim));
22776 tempLUT[maxStageSum * 2 + 2 * (i + j * axis->specializationConstants.stageStartSize)] = (
float)cos(angle);
22777 tempLUT[maxStageSum * 2 + 2 * (i + j * axis->specializationConstants.stageStartSize) + 1] = (
float)sin(angle);
22783 double angle = (double_PI / 2.0 / (double)(app->
configuration.
size[axis_id])) * j;
22784 tempLUT[2 * axis->specializationConstants.startDCT3LUT + 2 * j] = (float)cos(angle);
22785 tempLUT[2 * axis->specializationConstants.startDCT3LUT + 2 * j + 1] = (float)sin(angle);
22790 double angle = (double_PI / 2.0 / (double)(app->
configuration.
size[axis_id] / 2)) * j;
22791 tempLUT[2 * axis->specializationConstants.startDCT3LUT + 2 * j] = (float)cos(angle);
22792 tempLUT[2 * axis->specializationConstants.startDCT3LUT + 2 * j + 1] = (float)sin(angle);
22795 double angle = (-double_PI / 8.0 / (double)(app->
configuration.
size[axis_id] / 2)) * (2 * j + 1);
22796 tempLUT[2 * axis->specializationConstants.startDCT4LUT + 2 * j] = (float)cos(angle);
22797 tempLUT[2 * axis->specializationConstants.startDCT4LUT + 2 * j + 1] = (float)sin(angle);
22800 axis->referenceLUT = 0;
22801 if (reverseBluesteinMultiUpload == 1) {
22802 axis->bufferLUT = FFTPlan->
axes[axis_id][axis_upload_id].
bufferLUT;
22803#if(VKFFT_BACKEND==0)
22807 axis->referenceLUT = 1;
22812#if(VKFFT_BACKEND==0)
22816 axis->referenceLUT = 1;
22820 axis->bufferLUT = FFTPlan->
axes[0][axis_upload_id].
bufferLUT;
22821#if(VKFFT_BACKEND==0)
22825 axis->referenceLUT = 1;
22829 axis->bufferLUT = FFTPlan->
axes[1][axis_upload_id].
bufferLUT;
22830#if(VKFFT_BACKEND==0)
22834 axis->referenceLUT = 1;
22837#if(VKFFT_BACKEND==0)
22838 resFFT =
allocateFFTBuffer(app, &axis->bufferLUT, &axis->bufferLUTDeviceMemory, VK_BUFFER_USAGE_STORAGE_BUFFER_BIT | VK_BUFFER_USAGE_TRANSFER_SRC_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT, VK_MEMORY_HEAP_DEVICE_LOCAL_BIT, axis->bufferLUTSize);
22852#elif(VKFFT_BACKEND==1)
22853 res = cudaMalloc((
void**)&axis->bufferLUT, axis->bufferLUTSize);
22854 if (res != cudaSuccess) {
22860 res = cudaMemcpy(axis->bufferLUT, tempLUT, axis->bufferLUTSize, cudaMemcpyHostToDevice);
22861 if (res != cudaSuccess) {
22867#elif(VKFFT_BACKEND==2)
22868 res = hipMalloc((
void**)&axis->bufferLUT, axis->bufferLUTSize);
22869 if (res != hipSuccess) {
22875 res = hipMemcpy(axis->bufferLUT, tempLUT, axis->bufferLUTSize, hipMemcpyHostToDevice);
22876 if (res != hipSuccess) {
22882#elif(VKFFT_BACKEND==3)
22883 axis->bufferLUT = clCreateBuffer(app->
configuration.context[0], CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, axis->bufferLUTSize, tempLUT, &res);
22884 if (res != CL_SUCCESS) {
22902 uint64_t* axisStride = axis->specializationConstants.inputStride;
22909 if (axis_id == 0) {
22910 axisStride[1] = usedStride[0];
22911 axisStride[2] = usedStride[1];
22915 axisStride[1] = usedStride[0];
22916 axisStride[2] = usedStride[1];
22920 axisStride[1] = usedStride[1];
22921 axisStride[2] = usedStride[0];
22924 axisStride[3] = usedStride[2];
22930 if (axis_id == 0) {
22949 if ((!inverse) && (axis_id == 0) && (axis_upload_id == FFTPlan->
numAxisUploads[axis_id] - 1) && (reverseBluesteinMultiUpload == 0) && (axis->specializationConstants.performR2C) && (!(app->
configuration.
isInputFormatted))) {
22950 axisStride[1] *= 2;
22951 axisStride[2] *= 2;
22952 axisStride[3] *= 2;
22953 axisStride[4] *= 2;
22955 if ((FFTPlan->
multiUploadR2C) && (!inverse) && (axis_id == 0) && (axis_upload_id == FFTPlan->
numAxisUploads[axis_id] - 1) && (reverseBluesteinMultiUpload == 0)) {
22956 for (uint64_t i = 1; i < 5; i++) {
22957 axisStride[i] /= 2;
22960 axisStride = axis->specializationConstants.outputStride;
22968 if (axis_id == 0) {
22969 axisStride[1] = usedStride[0];
22970 axisStride[2] = usedStride[1];
22974 axisStride[1] = usedStride[0];
22975 axisStride[2] = usedStride[1];
22979 axisStride[1] = usedStride[1];
22980 axisStride[2] = usedStride[0];
22983 axisStride[3] = usedStride[2];
22989 if (axis_id == 0) {
23009 axisStride[1] *= 2;
23010 axisStride[2] *= 2;
23011 axisStride[3] *= 2;
23012 axisStride[4] *= 2;
23015 for (uint64_t i = 1; i < 5; i++) {
23016 axisStride[i] /= 2;
23028 uint64_t storageComplexSize;
23030 storageComplexSize = (2 *
sizeof(double));
23033 storageComplexSize = (2 * 2);
23035 storageComplexSize = (2 *
sizeof(float));
23037 uint64_t initPageSize = -1;
23038 uint64_t locBufferNum = 1;
23039 uint64_t locBufferSize = -1;
23067 ((axis_id == app->
firstAxis) && (!inverse))
23070 uint64_t totalSize = 0;
23071 uint64_t locPageSize = initPageSize;
23080 axis->specializationConstants.inputBufferBlockSize = (locBufferNum == 1) ? locBufferSize : (uint64_t)ceil(locPageSize / (
double)storageComplexSize);
23081 axis->specializationConstants.inputBufferBlockNum = (locBufferNum == 1) ? 1 : (uint64_t)ceil(totalSize / (
double)(axis->specializationConstants.inputBufferBlockSize * storageComplexSize));
23087 uint64_t totalSize = 0;
23088 uint64_t locPageSize = initPageSize;
23097 axis->specializationConstants.inputBufferBlockSize = (locBufferNum == 1) ? locBufferSize : (uint64_t)ceil(locPageSize / (
double)storageComplexSize);
23098 axis->specializationConstants.inputBufferBlockNum = (locBufferNum == 1) ? 1 : (uint64_t)ceil(totalSize / (
double)(axis->specializationConstants.inputBufferBlockSize * storageComplexSize));
23103 uint64_t totalSize = 0;
23104 uint64_t locPageSize = initPageSize;
23106 if (((axis->specializationConstants.reorderFourStep == 1) && (axis_upload_id > 0)) || (app->
useBluesteinFFT[axis_id] && (reverseBluesteinMultiUpload == 0) && (axis_upload_id == FFTPlan->
numAxisUploads[axis_id] - 1))) {
23141 axis->specializationConstants.inputBufferBlockSize = (locBufferNum == 1) ? locBufferSize : (uint64_t)ceil(locPageSize / (
double)storageComplexSize);
23142 axis->specializationConstants.inputBufferBlockNum = (locBufferNum == 1) ? 1 : (uint64_t)ceil(totalSize / (
double)(axis->specializationConstants.inputBufferBlockSize * storageComplexSize));
23149 locBufferSize = -1;
23151 ((axis_id == app->
firstAxis) && (inverse))
23156 ((axis_id == app->
firstAxis) && (inverse))
23163 uint64_t totalSize = 0;
23164 uint64_t locPageSize = initPageSize;
23173 axis->specializationConstants.outputBufferBlockSize = (locBufferNum == 1) ? locBufferSize : (uint64_t)ceil(locPageSize / (
double)storageComplexSize);
23174 axis->specializationConstants.outputBufferBlockNum = (locBufferNum == 1) ? 1 : (uint64_t)ceil(totalSize / (
double)(axis->specializationConstants.outputBufferBlockSize * storageComplexSize));
23179 uint64_t totalSize = 0;
23180 uint64_t locPageSize = initPageSize;
23182 if (((axis->specializationConstants.reorderFourStep == 1) && (axis_upload_id == 1)) || (app->
useBluesteinFFT[axis_id] && (!((axis_upload_id == FFTPlan->
numAxisUploads[axis_id] - 1) && (axis->specializationConstants.reverseBluesteinMultiUpload == 1))))) {
23213 axis->specializationConstants.outputBufferBlockSize = (locBufferNum == 1) ? locBufferSize : (uint64_t)ceil(locPageSize / (
double)storageComplexSize);
23214 axis->specializationConstants.outputBufferBlockNum = (locBufferNum == 1) ? 1 : (uint64_t)ceil(totalSize / (
double)(axis->specializationConstants.outputBufferBlockSize * storageComplexSize));
23218 if (axis->specializationConstants.inputBufferBlockNum == 0) axis->specializationConstants.inputBufferBlockNum = 1;
23219 if (axis->specializationConstants.outputBufferBlockNum == 0) axis->specializationConstants.outputBufferBlockNum = 1;
23221 uint64_t totalSize = 0;
23222 uint64_t locPageSize = initPageSize;
23231 axis->specializationConstants.kernelBlockSize = (locBufferNum == 1) ? locBufferSize : (uint64_t)ceil(locPageSize / (
double)storageComplexSize);
23232 axis->specializationConstants.kernelBlockNum = (locBufferNum == 1) ? 1 : (uint64_t)ceil(totalSize / (
double)(axis->specializationConstants.kernelBlockSize * storageComplexSize));
23234 if (axis->specializationConstants.kernelBlockNum == 0) axis->specializationConstants.kernelBlockNum = 1;
23237 axis->specializationConstants.kernelBlockSize = 0;
23238 axis->specializationConstants.kernelBlockNum = 0;
23240 axis->numBindings = 2;
23241 axis->specializationConstants.numBuffersBound[0] = axis->specializationConstants.inputBufferBlockNum;
23242 axis->specializationConstants.numBuffersBound[1] = axis->specializationConstants.outputBufferBlockNum;
23243 axis->specializationConstants.numBuffersBound[2] = 0;
23244 axis->specializationConstants.numBuffersBound[3] = 0;
23245#if(VKFFT_BACKEND==0)
23246 VkDescriptorPoolSize descriptorPoolSize = { VK_DESCRIPTOR_TYPE_STORAGE_BUFFER };
23247 descriptorPoolSize.descriptorCount = (uint32_t)(axis->specializationConstants.inputBufferBlockNum + axis->specializationConstants.outputBufferBlockNum);
23249 axis->specializationConstants.convolutionBindingID = -1;
23251 axis->specializationConstants.convolutionBindingID = axis->numBindings;
23252 axis->specializationConstants.numBuffersBound[axis->numBindings] = axis->specializationConstants.kernelBlockNum;
23253#if(VKFFT_BACKEND==0)
23254 descriptorPoolSize.descriptorCount += (uint32_t)axis->specializationConstants.kernelBlockNum;
23256 axis->numBindings++;
23259 axis->specializationConstants.convolutionBindingID = axis->numBindings;
23260 axis->specializationConstants.numBuffersBound[axis->numBindings] = axis->specializationConstants.kernelBlockNum;
23261#if(VKFFT_BACKEND==0)
23262 descriptorPoolSize.descriptorCount += (uint32_t)axis->specializationConstants.kernelBlockNum;
23264 axis->numBindings++;
23267 axis->specializationConstants.convolutionBindingID = axis->numBindings;
23268 axis->specializationConstants.numBuffersBound[axis->numBindings] = axis->specializationConstants.kernelBlockNum;
23269#if(VKFFT_BACKEND==0)
23270 descriptorPoolSize.descriptorCount += (uint32_t)axis->specializationConstants.kernelBlockNum;
23272 axis->numBindings++;
23275 axis->specializationConstants.LUTBindingID = axis->numBindings;
23276 axis->specializationConstants.numBuffersBound[axis->numBindings] = 1;
23277#if(VKFFT_BACKEND==0)
23278 descriptorPoolSize.descriptorCount++;
23280 axis->numBindings++;
23283 if (axis->specializationConstants.inverseBluestein)
23287 axis->specializationConstants.BluesteinConvolutionBindingID = axis->numBindings;
23288 axis->specializationConstants.numBuffersBound[axis->numBindings] = 1;
23289#if(VKFFT_BACKEND==0)
23290 descriptorPoolSize.descriptorCount++;
23292 axis->numBindings++;
23296 axis->specializationConstants.BluesteinMultiplicationBindingID = axis->numBindings;
23297 axis->specializationConstants.numBuffersBound[axis->numBindings] = 1;
23298#if(VKFFT_BACKEND==0)
23299 descriptorPoolSize.descriptorCount++;
23301 axis->numBindings++;
23303#if(VKFFT_BACKEND==0)
23304 VkDescriptorPoolCreateInfo descriptorPoolCreateInfo = { VK_STRUCTURE_TYPE_DESCRIPTOR_POOL_CREATE_INFO };
23305 descriptorPoolCreateInfo.poolSizeCount = 1;
23306 descriptorPoolCreateInfo.pPoolSizes = &descriptorPoolSize;
23307 descriptorPoolCreateInfo.maxSets = 1;
23308 res = vkCreateDescriptorPool(app->
configuration.
device[0], &descriptorPoolCreateInfo, 0, &axis->descriptorPool);
23309 if (res != VK_SUCCESS) {
23313 const VkDescriptorType descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER;
23314 VkDescriptorSetLayoutBinding* descriptorSetLayoutBindings;
23315 descriptorSetLayoutBindings = (VkDescriptorSetLayoutBinding*)malloc(axis->numBindings *
sizeof(VkDescriptorSetLayoutBinding));
23316 if (!descriptorSetLayoutBindings) {
23320 for (uint64_t i = 0; i < axis->numBindings; ++i) {
23321 descriptorSetLayoutBindings[i].binding = (uint32_t)i;
23322 descriptorSetLayoutBindings[i].descriptorType = descriptorType;
23323 descriptorSetLayoutBindings[i].descriptorCount = (uint32_t)axis->specializationConstants.numBuffersBound[i];
23324 descriptorSetLayoutBindings[i].stageFlags = VK_SHADER_STAGE_COMPUTE_BIT;
23327 VkDescriptorSetLayoutCreateInfo descriptorSetLayoutCreateInfo = { VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_CREATE_INFO };
23328 descriptorSetLayoutCreateInfo.bindingCount = (uint32_t)axis->numBindings;
23329 descriptorSetLayoutCreateInfo.pBindings = descriptorSetLayoutBindings;
23331 res = vkCreateDescriptorSetLayout(app->
configuration.
device[0], &descriptorSetLayoutCreateInfo, 0, &axis->descriptorSetLayout);
23332 if (res != VK_SUCCESS) {
23336 free(descriptorSetLayoutBindings);
23337 descriptorSetLayoutBindings = 0;
23338 VkDescriptorSetAllocateInfo descriptorSetAllocateInfo = { VK_STRUCTURE_TYPE_DESCRIPTOR_SET_ALLOCATE_INFO };
23339 descriptorSetAllocateInfo.descriptorPool = axis->descriptorPool;
23340 descriptorSetAllocateInfo.descriptorSetCount = 1;
23341 descriptorSetAllocateInfo.pSetLayouts = &axis->descriptorSetLayout;
23342 res = vkAllocateDescriptorSets(app->
configuration.
device[0], &descriptorSetAllocateInfo, &axis->descriptorSet);
23343 if (res != VK_SUCCESS) {
23359#if(VKFFT_BACKEND==0)
23360 VkPipelineLayoutCreateInfo pipelineLayoutCreateInfo = { VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO };
23361 pipelineLayoutCreateInfo.setLayoutCount = 1;
23362 pipelineLayoutCreateInfo.pSetLayouts = &axis->descriptorSetLayout;
23364 VkPushConstantRange pushConstantRange = { VK_SHADER_STAGE_COMPUTE_BIT };
23365 pushConstantRange.offset = 0;
23368 pipelineLayoutCreateInfo.pushConstantRangeCount = 1;
23369 pipelineLayoutCreateInfo.pPushConstantRanges = &pushConstantRange;
23371 res = vkCreatePipelineLayout(app->
configuration.
device[0], &pipelineLayoutCreateInfo, 0, &axis->pipelineLayout);
23372 if (res != VK_SUCCESS) {
23378 axis->groupedBatch = maxBatchCoalesced;
23394 if (((FFTPlan->
numAxisUploads[axis_id] == 1) && (axis_id == 0)) || ((axis_id == 0) && (!axis->specializationConstants.reorderFourStep) && (axis_upload_id == 0))) {
23395 axis->groupedBatch = (maxSequenceLengthSharedMemoryPow2 / axis->specializationConstants.fftDim > axis->groupedBatch) ? maxSequenceLengthSharedMemoryPow2 / axis->specializationConstants.fftDim : axis->groupedBatch;
23398 axis->groupedBatch = (maxSingleSizeStridedPow2 / axis->specializationConstants.fftDim > 1) ? maxSingleSizeStridedPow2 / axis->specializationConstants.fftDim * axis->groupedBatch : axis->groupedBatch;
23403 if ((FFTPlan->
numAxisUploads[axis_id] == 2) && (axis_upload_id == 0) && (axis->specializationConstants.fftDim * maxBatchCoalesced <= maxSequenceLengthSharedMemory)) {
23404 axis->groupedBatch = (uint64_t)ceil(axis->groupedBatch / 2.0);
23407 if ((FFTPlan->
numAxisUploads[axis_id] == 3) && (axis_upload_id == 0) && (axis->specializationConstants.fftDim < maxSequenceLengthSharedMemory / (2 * complexSize))) {
23408 axis->groupedBatch = (uint64_t)ceil(axis->groupedBatch / 2.0);
23410 if (axis->groupedBatch < maxBatchCoalesced) axis->groupedBatch = maxBatchCoalesced;
23411 axis->groupedBatch = (axis->groupedBatch / maxBatchCoalesced) * maxBatchCoalesced;
23413 if (!((axis_id == 0) && (FFTPlan->
numAxisUploads[axis_id] == 1)) && !((axis_id == 0) && (axis_upload_id == 0) && (!axis->specializationConstants.reorderFourStep)) && (axis->specializationConstants.fftDim > maxSingleSizeStrided)) {
23414 axis->groupedBatch = (uint64_t)ceil(axis->groupedBatch / 2.0);
23418 axis->groupedBatch = (uint64_t)ceil(axis->groupedBatch / 2.0);
23420 if (axis->groupedBatch > 2 * maxBatchCoalesced) axis->groupedBatch = (axis->groupedBatch / (2 * maxBatchCoalesced)) * (2 * maxBatchCoalesced);
23421 if (axis->groupedBatch > 4 * maxBatchCoalesced) axis->groupedBatch = (axis->groupedBatch / (4 * maxBatchCoalesced)) * (2 * maxBatchCoalesced);
23422 uint64_t maxThreadNum = maxSequenceLengthSharedMemory / (axis->specializationConstants.min_registers_per_thread * axis->specializationConstants.registerBoost);
23424 axis->specializationConstants.axisSwapped = 0;
23425 uint64_t r2cmult = (axis->specializationConstants.mergeSequencesR2C) ? 2 : 1;
23426 if (axis_id == 0) {
23428 if (axis_upload_id == 0) {
23429 axis->axisBlock[0] = (axis->specializationConstants.fftDim / axis->specializationConstants.min_registers_per_thread / axis->specializationConstants.registerBoost > 1) ? axis->specializationConstants.fftDim / axis->specializationConstants.min_registers_per_thread / axis->specializationConstants.registerBoost : 1;
23430 if (axis->axisBlock[0] > maxThreadNum) axis->axisBlock[0] = maxThreadNum;
23432 if (axis->specializationConstants.reorderFourStep && (FFTPlan->
numAxisUploads[axis_id] > 1))
23433 axis->axisBlock[1] = axis->groupedBatch;
23438 uint64_t currentAxisBlock1 = axis->axisBlock[1];
23439 for (uint64_t i = currentAxisBlock1; i < 2 * currentAxisBlock1; i++) {
23441 if (i * axis->specializationConstants.fftDim * complexSize <= app->configuration.sharedMemorySize) axis->axisBlock[1] = i;
23442 i = 2 * currentAxisBlock1;
23446 if ((FFTPlan->
numAxisUploads[0] > 1) && ((uint64_t)ceil(FFTPlan->
actualFFTSizePerAxis[axis_id][0] / axis->specializationConstants.fftDim) < axis->axisBlock[1])) axis->axisBlock[1] = (uint64_t)ceil(FFTPlan->
actualFFTSizePerAxis[axis_id][0] / axis->specializationConstants.fftDim);
23447 if ((axis->specializationConstants.mergeSequencesR2C != 0) && (axis->specializationConstants.fftDim * axis->axisBlock[1] >= maxSequenceLengthSharedMemory)) {
23448 axis->specializationConstants.mergeSequencesR2C = 0;
23467 if (axis->axisBlock[0] * axis->axisBlock[1] > maxThreadNum) {
23468 for (uint64_t i = 1; i <= axis->axisBlock[1]; i++) {
23469 if ((axis->axisBlock[1] / i) * axis->axisBlock[0] <= maxThreadNum)
23471 axis->axisBlock[1] /= i;
23472 i = axis->axisBlock[1] + 1;
23477 while ((axis->axisBlock[1] * (axis->specializationConstants.fftDim / axis->specializationConstants.registerBoost)) > maxSequenceLengthSharedMemory) axis->axisBlock[1] /= 2;
23479#if (VKFFT_BACKEND==0)
23480 if (((axis->specializationConstants.fftDim & (axis->specializationConstants.fftDim - 1)) != 0)) {
23481 uint64_t temp = axis->axisBlock[1];
23482 axis->axisBlock[1] = axis->axisBlock[0];
23483 axis->axisBlock[0] = temp;
23484 axis->specializationConstants.axisSwapped = 1;
23487 uint64_t temp = axis->axisBlock[1];
23488 axis->axisBlock[1] = axis->axisBlock[0];
23489 axis->axisBlock[0] = temp;
23490 axis->specializationConstants.axisSwapped = 1;
23493 axis->axisBlock[2] = 1;
23494 axis->axisBlock[3] = axis->specializationConstants.fftDim;
23497 axis->axisBlock[1] = (axis->specializationConstants.fftDim / axis->specializationConstants.min_registers_per_thread / axis->specializationConstants.registerBoost > 1) ? axis->specializationConstants.fftDim / axis->specializationConstants.min_registers_per_thread / axis->specializationConstants.registerBoost : 1;
23499 if (scale > 1) axis->groupedBatch *= scale;
23500 axis->axisBlock[0] = (axis->specializationConstants.stageStartSize > axis->groupedBatch) ? axis->groupedBatch : axis->specializationConstants.stageStartSize;
23502 if (axis->axisBlock[0] * axis->axisBlock[1] > maxThreadNum) {
23503 for (uint64_t i = 1; i <= axis->axisBlock[0]; i++) {
23504 if ((axis->axisBlock[0] / i) * axis->axisBlock[1] <= maxThreadNum)
23506 axis->axisBlock[0] /= i;
23507 i = axis->axisBlock[0] + 1;
23512 axis->axisBlock[2] = 1;
23513 axis->axisBlock[3] = axis->specializationConstants.fftDim;
23517 if (axis_id == 1) {
23519 axis->axisBlock[1] = (axis->specializationConstants.fftDim / axis->specializationConstants.min_registers_per_thread / axis->specializationConstants.registerBoost > 1) ? axis->specializationConstants.fftDim / axis->specializationConstants.min_registers_per_thread / axis->specializationConstants.registerBoost : 1;
23523 if (axis->axisBlock[0] * axis->axisBlock[1] > maxThreadNum) {
23524 for (uint64_t i = 1; i <= axis->axisBlock[0]; i++) {
23525 if ((axis->axisBlock[0] / i) * axis->axisBlock[1] <= maxThreadNum)
23527 axis->axisBlock[0] /= i;
23528 i = axis->axisBlock[0] + 1;
23533 axis->axisBlock[2] = 1;
23534 axis->axisBlock[3] = axis->specializationConstants.fftDim;
23537 if (axis_id == 2) {
23538 axis->axisBlock[1] = (axis->specializationConstants.fftDim / axis->specializationConstants.min_registers_per_thread / axis->specializationConstants.registerBoost > 1) ? axis->specializationConstants.fftDim / axis->specializationConstants.min_registers_per_thread / axis->specializationConstants.registerBoost : 1;
23543 if (axis->axisBlock[0] * axis->axisBlock[1] > maxThreadNum) {
23544 for (uint64_t i = 1; i <= axis->axisBlock[0]; i++) {
23545 if ((axis->axisBlock[0] / i) * axis->axisBlock[1] <= maxThreadNum)
23547 axis->axisBlock[0] /= i;
23548 i = axis->axisBlock[0] + 1;
23553 axis->axisBlock[2] = 1;
23554 axis->axisBlock[3] = axis->specializationConstants.fftDim;
23569 axis->specializationConstants.localSize[0] = axis->axisBlock[0];
23570 axis->specializationConstants.localSize[1] = axis->axisBlock[1];
23571 axis->specializationConstants.localSize[2] = axis->axisBlock[2];
23580 axis->specializationConstants.normalize = (reverseBluesteinMultiUpload) ? 1 : app->
configuration.
normalize;
23584 axis->specializationConstants.axis_id = axis_id;
23585 axis->specializationConstants.axis_upload_id = axis_upload_id;
23587 for (uint64_t i = 0; i < 3; i++) {
23593 if (axis->specializationConstants.useBluesteinFFT && (axis_upload_id == FFTPlan->
numAxisUploads[axis_id] - 1) && ((reverseBluesteinMultiUpload == 0) || (FFTPlan->
numAxisUploads[axis_id] == 1))) {
23594 axis->specializationConstants.zeropadBluestein[0] = 1;
23595 axis->specializationConstants.fft_zeropad_Bluestein_left_read[axis_id] = app->
configuration.
size[axis_id];
23596 if (FFTPlan->
multiUploadR2C) axis->specializationConstants.fft_zeropad_Bluestein_left_read[axis_id] /= 2;
23597 if (app->
configuration.
performDCT == 1) axis->specializationConstants.fft_zeropad_Bluestein_left_read[axis_id] = 2 * axis->specializationConstants.fft_zeropad_Bluestein_left_read[axis_id]-2;
23599 axis->specializationConstants.fft_zeropad_Bluestein_right_read[axis_id] = FFTPlan->
actualFFTSizePerAxis[axis_id][axis_id];
23601 if (axis->specializationConstants.useBluesteinFFT && (axis_upload_id == FFTPlan->
numAxisUploads[axis_id] - 1) && ((reverseBluesteinMultiUpload == 1) || (FFTPlan->
numAxisUploads[axis_id] == 1))) {
23602 axis->specializationConstants.zeropadBluestein[1] = 1;
23603 axis->specializationConstants.fft_zeropad_Bluestein_left_write[axis_id] = app->
configuration.
size[axis_id];
23604 if (FFTPlan->
multiUploadR2C) axis->specializationConstants.fft_zeropad_Bluestein_left_write[axis_id] /= 2;
23605 if (app->
configuration.
performDCT == 1) axis->specializationConstants.fft_zeropad_Bluestein_left_write[axis_id] = 2 * axis->specializationConstants.fft_zeropad_Bluestein_left_write[axis_id] - 2;
23607 axis->specializationConstants.fft_zeropad_Bluestein_right_write[axis_id] = FFTPlan->
actualFFTSizePerAxis[axis_id][axis_id];
23610 if ((app->
configuration.
frequencyZeroPadding) && (((!axis->specializationConstants.reorderFourStep) && (axis_upload_id == 0)) || ((axis->specializationConstants.reorderFourStep) && (axis_upload_id == FFTPlan->
numAxisUploads[axis_id] - 1)))) {
23616 axis->specializationConstants.zeropad[0] = 0;
23617 if ((!app->
configuration.
frequencyZeroPadding) && (((!axis->specializationConstants.reorderFourStep) && (axis_upload_id == FFTPlan->
numAxisUploads[axis_id] - 1)) || ((axis->specializationConstants.reorderFourStep) && (axis_upload_id == 0)))) {
23623 axis->specializationConstants.zeropad[1] = 0;
23632 axis->specializationConstants.zeropad[0] = 0;
23639 axis->specializationConstants.zeropad[1] = 0;
23642 axis->specializationConstants.convolutionStep = 1;
23645 axis->specializationConstants.convolutionStep = 0;
23647 axis->specializationConstants.BluesteinConvolutionStep = 1;
23649 axis->specializationConstants.BluesteinConvolutionStep = 0;
23652 axis->specializationConstants.BluesteinPreMultiplication = 1;
23654 axis->specializationConstants.BluesteinPreMultiplication = 0;
23656 axis->specializationConstants.BluesteinPostMultiplication = 1;
23658 axis->specializationConstants.BluesteinPostMultiplication = 0;
23664 if (axis_id == 0) {
23665 if (axis_upload_id == 0)
23666 tempSize[0] = FFTPlan->
actualFFTSizePerAxis[axis_id][0] / axis->specializationConstants.fftDim / axis->axisBlock[1];
23668 tempSize[0] = FFTPlan->
actualFFTSizePerAxis[axis_id][0] / axis->specializationConstants.fftDim / axis->axisBlock[0];
23669 if ((FFTPlan->
actualPerformR2CPerAxis[axis_id] == 1) && (axis->specializationConstants.mergeSequencesR2C)) tempSize[1] = (uint64_t)ceil(tempSize[1] / 2.0);
23675 else axis->specializationConstants.performWorkGroupShift[0] = 0;
23677 else axis->specializationConstants.performWorkGroupShift[1] = 0;
23679 else axis->specializationConstants.performWorkGroupShift[2] = 0;
23681 if (axis_id == 1) {
23691 else axis->specializationConstants.performWorkGroupShift[0] = 0;
23693 else axis->specializationConstants.performWorkGroupShift[1] = 0;
23695 else axis->specializationConstants.performWorkGroupShift[2] = 0;
23698 if (axis_id == 2) {
23707 else axis->specializationConstants.performWorkGroupShift[0] = 0;
23709 else axis->specializationConstants.performWorkGroupShift[1] = 0;
23711 else axis->specializationConstants.performWorkGroupShift[2] = 0;
23715 char floatTypeInputMemory[10];
23716 char floatTypeOutputMemory[10];
23717 char floatTypeKernelMemory[10];
23718 char floatType[10];
23719 axis->specializationConstants.unroll = 1;
23722 sprintf(floatType,
"double");
23723 sprintf(floatTypeInputMemory,
"double");
23724 sprintf(floatTypeOutputMemory,
"double");
23725 sprintf(floatTypeKernelMemory,
"double");
23731 sprintf(floatType,
"float");
23734 sprintf(floatTypeKernelMemory,
"float");
23735 if ((axis_id == app->
firstAxis) && (axis_upload_id == FFTPlan->
numAxisUploads[axis_id] - 1) && (!axis->specializationConstants.actualInverse))
23736 sprintf(floatTypeInputMemory,
"half");
23738 sprintf(floatTypeInputMemory,
"float");
23739 if ((axis_id == app->
firstAxis) && (((!axis->specializationConstants.reorderFourStep) && (axis_upload_id == FFTPlan->
numAxisUploads[axis_id] - 1)) || ((axis->specializationConstants.reorderFourStep) && (axis_upload_id == 0))) && (axis->specializationConstants.actualInverse))
23740 sprintf(floatTypeOutputMemory,
"half");
23742 sprintf(floatTypeOutputMemory,
"float");
23745 sprintf(floatTypeInputMemory,
"half");
23746 sprintf(floatTypeOutputMemory,
"half");
23747 sprintf(floatTypeKernelMemory,
"half");
23753 sprintf(floatType,
"double");
23754 sprintf(floatTypeInputMemory,
"float");
23755 sprintf(floatTypeOutputMemory,
"float");
23756 sprintf(floatTypeKernelMemory,
"float");
23759 sprintf(floatType,
"float");
23760 sprintf(floatTypeInputMemory,
"float");
23761 sprintf(floatTypeOutputMemory,
"float");
23762 sprintf(floatTypeKernelMemory,
"float");
23766 char uintType[20] =
"";
23768#if(VKFFT_BACKEND==0)
23769 sprintf(uintType,
"uint");
23770#elif(VKFFT_BACKEND==1)
23771 sprintf(uintType,
"unsigned int");
23772#elif(VKFFT_BACKEND==2)
23773 sprintf(uintType,
"unsigned int");
23774#elif(VKFFT_BACKEND==3)
23775 sprintf(uintType,
"unsigned int");
23779#if(VKFFT_BACKEND==0)
23780 sprintf(uintType,
"uint64_t");
23781#elif(VKFFT_BACKEND==1)
23782 sprintf(uintType,
"unsigned long long");
23783#elif(VKFFT_BACKEND==2)
23784 sprintf(uintType,
"unsigned long long");
23785#elif(VKFFT_BACKEND==3)
23786 sprintf(uintType,
"unsigned long");
23792 if ((axis_id == 0) && (axis_upload_id == 0)) type = 0;
23793 if (axis_id != 0) type = 1;
23794 if ((axis_id == 0) && (axis_upload_id > 0)) type = 2;
23796 if ((axis_id == 0) && (!axis->specializationConstants.actualInverse) && (FFTPlan->
actualPerformR2CPerAxis[axis_id])) type = 5;
23797 if ((axis_id == 0) && (axis->specializationConstants.actualInverse) && (FFTPlan->
actualPerformR2CPerAxis[axis_id])) type = 6;
23808#if(VKFFT_BACKEND==0)
23810#elif(VKFFT_BACKEND==1)
23811 axis->specializationConstants.cacheShuffle = 0;
23812#elif(VKFFT_BACKEND==2)
23813 axis->specializationConstants.cacheShuffle = 0;
23814#elif(VKFFT_BACKEND==3)
23815 axis->specializationConstants.cacheShuffle = 0;
23821 char* code0 = axis->specializationConstants.code0;
23826 resFFT =
shaderGenVkFFT(code0, &axis->specializationConstants, floatType, floatTypeInputMemory, floatTypeOutputMemory, floatTypeKernelMemory, uintType, type);
23832#if(VKFFT_BACKEND==0)
23833 const glslang_resource_t default_resource = {
23939 glslang_target_client_version_t client_version = (app->
configuration.
halfPrecision) ? GLSLANG_TARGET_VULKAN_1_1 : GLSLANG_TARGET_VULKAN_1_0;
23940 glslang_target_language_version_t target_language_version = (app->
configuration.
halfPrecision) ? GLSLANG_TARGET_SPV_1_3 : GLSLANG_TARGET_SPV_1_0;
23941 const glslang_input_t input =
23943 GLSLANG_SOURCE_GLSL,
23944 GLSLANG_STAGE_COMPUTE,
23945 GLSLANG_CLIENT_VULKAN,
23947 GLSLANG_TARGET_SPV,
23948 target_language_version,
23951 GLSLANG_NO_PROFILE,
23954 GLSLANG_MSG_DEFAULT_BIT,
23958 glslang_shader_t* shader = glslang_shader_create(&input);
23960 if (!glslang_shader_preprocess(shader, &input))
23962 err = glslang_shader_get_info_log(shader);
23963 printf(
"%s\n", code0);
23964 printf(
"%s\nVkFFT shader type: %" PRIu64
"\n", err, type);
23965 glslang_shader_delete(shader);
23973 if (!glslang_shader_parse(shader, &input))
23975 err = glslang_shader_get_info_log(shader);
23976 printf(
"%s\n", code0);
23977 printf(
"%s\nVkFFT shader type: %" PRIu64
"\n", err, type);
23978 glslang_shader_delete(shader);
23985 glslang_program_t* program = glslang_program_create();
23986 glslang_program_add_shader(program, shader);
23987 if (!glslang_program_link(program, GLSLANG_MSG_SPV_RULES_BIT | GLSLANG_MSG_VULKAN_RULES_BIT))
23989 err = glslang_program_get_info_log(program);
23990 printf(
"%s\n", code0);
23991 printf(
"%s\nVkFFT shader type: %" PRIu64
"\n", err, type);
23992 glslang_shader_delete(shader);
23993 glslang_program_delete(program);
24016 glslang_shader_delete(shader);
24017 VkPipelineShaderStageCreateInfo pipelineShaderStageCreateInfo = { VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO };
24018 VkComputePipelineCreateInfo computePipelineCreateInfo = { VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO };
24019 pipelineShaderStageCreateInfo.stage = VK_SHADER_STAGE_COMPUTE_BIT;
24020 VkShaderModuleCreateInfo createInfo = { VK_STRUCTURE_TYPE_SHADER_MODULE_CREATE_INFO };
24024 res = vkCreateShaderModule(app->
configuration.
device[0], &createInfo, 0, &pipelineShaderStageCreateInfo.module);
24025 if (res != VK_SUCCESS) {
24026 glslang_program_delete(program);
24032 pipelineShaderStageCreateInfo.pName =
"main";
24033 pipelineShaderStageCreateInfo.pSpecializationInfo = 0;
24034 computePipelineCreateInfo.stage = pipelineShaderStageCreateInfo;
24035 computePipelineCreateInfo.layout = axis->pipelineLayout;
24036 res = vkCreateComputePipelines(app->
configuration.
device[0], VK_NULL_HANDLE, 1, &computePipelineCreateInfo, 0, &axis->pipeline);
24037 if (res != VK_SUCCESS) {
24041 vkDestroyShaderModule(app->
configuration.
device[0], pipelineShaderStageCreateInfo.module, 0);
24042 glslang_program_delete(program);
24043#elif(VKFFT_BACKEND==1)
24045 nvrtcResult result = nvrtcCreateProgram(&prog,
24053 if (result != NVRTC_SUCCESS) {
24054 printf(
"nvrtcCreateProgram error: %s\n", nvrtcGetErrorString(result));
24063 result = nvrtcCompileProgram(prog,
24066 if (result != NVRTC_SUCCESS) {
24067 printf(
"nvrtcCompileProgram error: %s\n", nvrtcGetErrorString(result));
24068 char* log = (
char*)malloc(
sizeof(
char) * 1000000);
24076 nvrtcGetProgramLog(prog, log);
24077 printf(
"%s\n", log);
24080 printf(
"%s\n", code0);
24088 result = nvrtcGetPTXSize(prog, &ptxSize);
24089 if (result != NVRTC_SUCCESS) {
24090 printf(
"nvrtcGetPTXSize error: %s\n", nvrtcGetErrorString(result));
24096 char* ptx = (
char*)malloc(ptxSize);
24103 result = nvrtcGetPTX(prog, ptx);
24104 if (result != NVRTC_SUCCESS) {
24105 printf(
"nvrtcGetPTX error: %s\n", nvrtcGetErrorString(result));
24113 result = nvrtcDestroyProgram(&prog);
24114 if (result != NVRTC_SUCCESS) {
24115 printf(
"nvrtcDestroyProgram error: %s\n", nvrtcGetErrorString(result));
24124 CUresult result2 = cuModuleLoadDataEx(&axis->VkFFTModule, ptx, 0, 0, 0);
24126 if (result2 != CUDA_SUCCESS) {
24127 printf(
"cuModuleLoadDataEx error: %d\n", result2);
24135 result2 = cuModuleGetFunction(&axis->VkFFTKernel, axis->VkFFTModule,
"VkFFT_main");
24136 if (result2 != CUDA_SUCCESS) {
24137 printf(
"cuModuleGetFunction error: %d\n", result2);
24146 result2 = cuFuncSetAttribute(axis->VkFFTKernel, CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, (
int)axis->specializationConstants.usedSharedMemory);
24147 if (result2 != CUDA_SUCCESS) {
24148 printf(
"cuFuncSetAttribute error: %d\n", result2);
24158 result2 = cuModuleGetGlobal(&axis->consts_addr, &size, axis->VkFFTModule,
"consts");
24159 if (result2 != CUDA_SUCCESS) {
24160 printf(
"cuModuleGetGlobal error: %d\n", result2);
24170#elif(VKFFT_BACKEND==2)
24171 hiprtcProgram prog;
24176 enum hiprtcResult result = hiprtcCreateProgram(&prog,
24182 if (result != HIPRTC_SUCCESS) {
24183 printf(
"hiprtcCreateProgram error: %s\n", hiprtcGetErrorString(result));
24190 result = hiprtcAddNameExpression(prog,
"&consts");
24191 if (result != HIPRTC_SUCCESS) {
24192 printf(
"hiprtcAddNameExpression error: %s\n", hiprtcGetErrorString(result));
24199 result = hiprtcCompileProgram(prog,
24202 if (result != HIPRTC_SUCCESS) {
24203 printf(
"hiprtcCompileProgram error: %s\n", hiprtcGetErrorString(result));
24204 char* log = (
char*)malloc(
sizeof(
char) * 100000);
24212 hiprtcGetProgramLog(prog, log);
24213 printf(
"%s\n", log);
24216 printf(
"%s\n", code0);
24224 result = hiprtcGetCodeSize(prog, &codeSize);
24225 if (result != HIPRTC_SUCCESS) {
24226 printf(
"hiprtcGetCodeSize error: %s\n", hiprtcGetErrorString(result));
24232 char* code = (
char*)malloc(codeSize);
24239 result = hiprtcGetCode(prog, code);
24240 if (result != HIPRTC_SUCCESS) {
24241 printf(
"hiprtcGetCode error: %s\n", hiprtcGetErrorString(result));
24251 result = hiprtcDestroyProgram(&prog);
24252 if (result != HIPRTC_SUCCESS) {
24253 printf(
"hiprtcDestroyProgram error: %s\n", hiprtcGetErrorString(result));
24261 hipError_t result2 = hipModuleLoadDataEx(&axis->VkFFTModule, code, 0, 0, 0);
24263 if (result2 != hipSuccess) {
24264 printf(
"hipModuleLoadDataEx error: %d\n", result2);
24272 result2 = hipModuleGetFunction(&axis->VkFFTKernel, axis->VkFFTModule,
"VkFFT_main");
24273 if (result2 != hipSuccess) {
24274 printf(
"hipModuleGetFunction error: %d\n", result2);
24283 result2 = hipFuncSetAttribute(axis->VkFFTKernel, hipFuncAttributeMaxDynamicSharedMemorySize, (
int)axis->specializationConstants.usedSharedMemory);
24285 if (result2 != hipSuccess) {
24286 printf(
"hipFuncSetAttribute error: %d\n", result2);
24296 result2 = hipModuleGetGlobal(&axis->consts_addr, &size, axis->VkFFTModule,
"consts");
24297 if (result2 != hipSuccess) {
24298 printf(
"hipModuleGetGlobal error: %d\n", result2);
24309#elif(VKFFT_BACKEND==3)
24310 size_t codelen = strlen(code0);
24311 axis->program = clCreateProgramWithSource(app->
configuration.context[0], 1, (
const char**)&code0, &codelen, &res);
24312 if (res != CL_SUCCESS) {
24319 if (res != CL_SUCCESS) {
24321 clGetProgramBuildInfo(axis->program, app->
configuration.
device[0], CL_PROGRAM_BUILD_LOG, 0, 0, &log_size);
24322 char* log = (
char*)malloc(log_size);
24330 clGetProgramBuildInfo(axis->program, app->
configuration.
device[0], CL_PROGRAM_BUILD_LOG, log_size, log, 0);
24331 printf(
"%s\n", log);
24334 printf(
"%s\n", code0);
24341 axis->kernel = clCreateKernel(axis->program,
"VkFFT_main", &res);
24342 if (res != CL_SUCCESS) {
24352 axis->specializationConstants.code0 = 0;
24355 if (axis->specializationConstants.axisSwapped) {
24356 uint64_t temp = axis->axisBlock[1];
24357 axis->axisBlock[1] = axis->axisBlock[0];
24358 axis->axisBlock[0] = temp;
24359 axis->specializationConstants.axisSwapped = 0;
24370#if(VKFFT_BACKEND==0)
24373 int resGlslangInitialize = glslang_initialize_process();
24383 if (inputLaunchConfiguration.
device == 0) {
24388 if (inputLaunchConfiguration.
queue == 0) {
24398 if (inputLaunchConfiguration.
fence == 0) {
24404 VkPhysicalDeviceProperties physicalDeviceProperties = { 0 };
24417 switch (physicalDeviceProperties.vendorID) {
24457#elif(VKFFT_BACKEND==1)
24458 CUresult res = CUDA_SUCCESS;
24459 cudaError_t res_t = cudaSuccess;
24460 if (inputLaunchConfiguration.
device == 0) {
24465 if (inputLaunchConfiguration.num_streams != 0) app->
configuration.num_streams = inputLaunchConfiguration.num_streams;
24466 if (inputLaunchConfiguration.stream != 0) app->
configuration.stream = inputLaunchConfiguration.stream;
24469 res = cuDeviceGetAttribute(&value, CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK, app->
configuration.
device[0]);
24470 if (res != CUDA_SUCCESS) {
24475 res = cuDeviceGetAttribute(&value, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X, app->
configuration.
device[0]);
24476 if (res != CUDA_SUCCESS) {
24481 res = cuDeviceGetAttribute(&value, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y, app->
configuration.
device[0]);
24482 if (res != CUDA_SUCCESS) {
24487 res = cuDeviceGetAttribute(&value, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z, app->
configuration.
device[0]);
24488 if (res != CUDA_SUCCESS) {
24493 res = cuDeviceGetAttribute(&value, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X, app->
configuration.
device[0]);
24494 if (res != CUDA_SUCCESS) {
24499 res = cuDeviceGetAttribute(&value, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y, app->
configuration.
device[0]);
24500 if (res != CUDA_SUCCESS) {
24505 res = cuDeviceGetAttribute(&value, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z, app->
configuration.
device[0]);
24506 if (res != CUDA_SUCCESS) {
24511 res = cuDeviceGetAttribute(&value, CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK, app->
configuration.
device[0]);
24512 if (res != CUDA_SUCCESS) {
24517 res = cuDeviceGetAttribute(&value, CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK_OPTIN, app->
configuration.
device[0]);
24518 if (res != CUDA_SUCCESS) {
24523 res = cuDeviceGetAttribute(&value, CU_DEVICE_ATTRIBUTE_WARP_SIZE, app->
configuration.
device[0]);
24524 if (res != CUDA_SUCCESS) {
24536 for (uint64_t i = 0; i < app->
configuration.num_streams; i++) {
24537 res_t = cudaEventCreate(&app->
configuration.stream_event[i]);
/* cudaEventCreate reports through res_t (cudaError_t); res is the CUresult of an earlier cuDeviceGetAttribute call, so testing res would never catch an event-creation failure. */
24538 if (res_t != cudaSuccess) {
24551#elif(VKFFT_BACKEND==2)
24552 hipError_t res = hipSuccess;
24553 if (inputLaunchConfiguration.
device == 0) {
24558 if (inputLaunchConfiguration.num_streams != 0) app->
configuration.num_streams = inputLaunchConfiguration.num_streams;
24559 if (inputLaunchConfiguration.stream != 0) app->
configuration.stream = inputLaunchConfiguration.stream;
24562 res = hipDeviceGetAttribute(&value, hipDeviceAttributeMaxThreadsPerBlock, app->
configuration.
device[0]);
24563 if (res != hipSuccess) {
24568 res = hipDeviceGetAttribute(&value, hipDeviceAttributeMaxGridDimX, app->
configuration.
device[0]);
24569 if (res != hipSuccess) {
24574 res = hipDeviceGetAttribute(&value, hipDeviceAttributeMaxGridDimY, app->
configuration.
device[0]);
24575 if (res != hipSuccess) {
24580 res = hipDeviceGetAttribute(&value, hipDeviceAttributeMaxGridDimZ, app->
configuration.
device[0]);
24581 if (res != hipSuccess) {
24586 res = hipDeviceGetAttribute(&value, hipDeviceAttributeMaxBlockDimX, app->
configuration.
device[0]);
24587 if (res != hipSuccess) {
24592 res = hipDeviceGetAttribute(&value, hipDeviceAttributeMaxBlockDimY, app->
configuration.
device[0]);
24593 if (res != hipSuccess) {
24598 res = hipDeviceGetAttribute(&value, hipDeviceAttributeMaxBlockDimZ, app->
configuration.
device[0]);
24599 if (res != hipSuccess) {
24604 res = hipDeviceGetAttribute(&value, hipDeviceAttributeMaxSharedMemoryPerBlock, app->
configuration.
device[0]);
24605 if (res != hipSuccess) {
24612 res = hipDeviceGetAttribute(&value, hipDeviceAttributeWarpSize, app->
configuration.
device[0]);
24613 if (res != hipSuccess) {
24625 for (uint64_t i = 0; i < app->
configuration.num_streams; i++) {
24627 if (res != hipSuccess) {
24639#elif(VKFFT_BACKEND==3)
24641 if (inputLaunchConfiguration.
device == 0) {
24646 if (inputLaunchConfiguration.context == 0) {
24650 app->
configuration.context = inputLaunchConfiguration.context;
24651 if (inputLaunchConfiguration.platform == 0) {
24655 app->
configuration.platform = inputLaunchConfiguration.platform;
24657 size_t value_int64;
24658 cl_uint value_cl_uint;
24659 res = clGetDeviceInfo(app->
configuration.
device[0], CL_DEVICE_VENDOR_ID,
sizeof(cl_int), &vendorID, 0);
24664 res = clGetDeviceInfo(app->
configuration.
device[0], CL_DEVICE_MAX_WORK_GROUP_SIZE,
sizeof(
size_t), &value_int64, 0);
24670 res = clGetDeviceInfo(app->
configuration.
device[0], CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS,
sizeof(cl_uint), &value_cl_uint, 0);
24675 size_t* dims = (
size_t*)malloc(
sizeof(
size_t) * value_cl_uint);
24677 res = clGetDeviceInfo(app->
configuration.
device[0], CL_DEVICE_MAX_WORK_ITEM_SIZES,
sizeof(
size_t) * value_cl_uint, dims, 0);
24696 cl_ulong sharedMemorySize;
24697 res = clGetDeviceInfo(app->
configuration.
device[0], CL_DEVICE_LOCAL_MEM_SIZE,
sizeof(cl_ulong), &sharedMemorySize, 0);
24704 switch (vendorID) {
24746 if (inputLaunchConfiguration.
FFTdim == 0) {
24751 if (inputLaunchConfiguration.
size[0] == 0) {
24784 for (uint64_t i = 1; i < 3; i++) {
24785 if (inputLaunchConfiguration.
size[i] == 0)
24812#if(VKFFT_BACKEND==0)
24813 if (inputLaunchConfiguration.
bufferSize == 0) {
24834#if(VKFFT_BACKEND==0)
24865#if(VKFFT_BACKEND==0)
24892#if(VKFFT_BACKEND==0)
24918#if(VKFFT_BACKEND==0)
24919 if (inputLaunchConfiguration.
kernelSize == 0) {
24943 uint64_t checkBufferSizeFor64BitAddressing = 0;
24952 if (checkBufferSizeFor64BitAddressing >= (uint64_t)pow((uint64_t)2, (uint64_t)34)) app->
configuration.
useUint64 = 1;
24953 checkBufferSizeFor64BitAddressing = 0;
24958 if (checkBufferSizeFor64BitAddressing >= (uint64_t)pow((uint64_t)2, (uint64_t)34)) app->
configuration.
useUint64 = 1;
24960 checkBufferSizeFor64BitAddressing = 0;
24965 if (checkBufferSizeFor64BitAddressing >= (uint64_t)pow((uint64_t)2, (uint64_t)34)) app->
configuration.
useUint64 = 1;
24967 checkBufferSizeFor64BitAddressing = 0;
24972 if (checkBufferSizeFor64BitAddressing >= (uint64_t)pow((uint64_t)2, (uint64_t)34)) app->
configuration.
useUint64 = 1;
24985 if (inputLaunchConfiguration.
performR2C != 0) {
24988 if (inputLaunchConfiguration.
performDCT != 0) {
25102#if(VKFFT_BACKEND==0)
25215#if(VKFFT_BACKEND==0)
25217 glslang_finalize_process();
25553#if(VKFFT_BACKEND==0)
25555 VkMemoryBarrier memory_barrier = {
25556 VK_STRUCTURE_TYPE_MEMORY_BARRIER,
25558 VK_ACCESS_SHADER_WRITE_BIT,
25559 VK_ACCESS_SHADER_READ_BIT,
25562#elif(VKFFT_BACKEND==1)
25564#elif(VKFFT_BACKEND==2)
25566#elif(VKFFT_BACKEND==3)
25567 app->
configuration.commandQueue = launchParams->commandQueue;
25569 uint64_t localSize0[3];
25574 if (inverse == 1) {
25588 if (inverse != 1) {
25596#if(VKFFT_BACKEND==0)
25598 vkCmdBindDescriptorSets(app->
configuration.
commandBuffer[0], VK_PIPELINE_BIND_POINT_COMPUTE, axis->pipelineLayout, 0, 1, &axis->descriptorSet, 0, 0);
25600 uint64_t dispatchBlock[3];
25608 dispatchBlock[0] = (uint64_t)ceil((uint64_t)ceil(app->
localFFTPlan->
actualFFTSizePerAxis[0][0] / axis->specializationConstants.fftDim / (
double)axis->axisBlock[1]));
25622 if (axis->specializationConstants.mergeSequencesR2C == 1) dispatchBlock[1] = (uint64_t)ceil(dispatchBlock[1] / 2.0);
25637#if(VKFFT_BACKEND==0)
25639 vkCmdBindDescriptorSets(app->
configuration.
commandBuffer[0], VK_PIPELINE_BIND_POINT_COMPUTE, axis->pipelineLayout, 0, 1, &axis->descriptorSet, 0, 0);
25641 uint64_t dispatchBlock[3];
25649 dispatchBlock[0] = (uint64_t)ceil((uint64_t)ceil(app->
localFFTPlan->
actualFFTSizePerAxis[0][0] / axis->specializationConstants.fftDim / (
double)axis->axisBlock[1]));
25663 if (axis->specializationConstants.mergeSequencesR2C == 1) dispatchBlock[1] = (uint64_t)ceil(dispatchBlock[1] / 2.0);
25679#if(VKFFT_BACKEND==0)
25681 vkCmdBindDescriptorSets(app->
configuration.
commandBuffer[0], VK_PIPELINE_BIND_POINT_COMPUTE, axis->pipelineLayout, 0, 1, &axis->descriptorSet, 0, 0);
25683 uint64_t dispatchBlock[3];
25686 dispatchBlock[1] = 1;
25708#if(VKFFT_BACKEND==0)
25710 vkCmdBindDescriptorSets(app->
configuration.
commandBuffer[0], VK_PIPELINE_BIND_POINT_COMPUTE, axis->pipelineLayout, 0, 1, &axis->descriptorSet, 0, 0);
25712 uint64_t dispatchBlock[3];
25713 dispatchBlock[0] = (uint64_t)ceil(localSize0[1] / (
double)axis->axisBlock[0] * app->
localFFTPlan->
actualFFTSizePerAxis[1][1] / (
double)axis->specializationConstants.fftDim);
25714 dispatchBlock[1] = 1;
25732#if(VKFFT_BACKEND==0)
25734 vkCmdBindDescriptorSets(app->
configuration.
commandBuffer[0], VK_PIPELINE_BIND_POINT_COMPUTE, axis->pipelineLayout, 0, 1, &axis->descriptorSet, 0, 0);
25736 uint64_t dispatchBlock[3];
25738 dispatchBlock[0] = (uint64_t)ceil(localSize0[1] / (
double)axis->axisBlock[0] * app->
localFFTPlan->
actualFFTSizePerAxis[1][1] / (
double)axis->specializationConstants.fftDim);
25739 dispatchBlock[1] = 1;
25755#if(VKFFT_BACKEND==0)
25757 vkCmdBindDescriptorSets(app->
configuration.
commandBuffer[0], VK_PIPELINE_BIND_POINT_COMPUTE, axis->pipelineLayout, 0, 1, &axis->descriptorSet, 0, 0);
25759 uint64_t dispatchBlock[3];
25760 dispatchBlock[0] = (uint64_t)ceil(localSize0[1] / (
double)axis->axisBlock[0] * app->
localFFTPlan->
actualFFTSizePerAxis[1][1] / (
double)axis->specializationConstants.fftDim);
25761 dispatchBlock[1] = 1;
25787#if(VKFFT_BACKEND==0)
25789 vkCmdBindDescriptorSets(app->
configuration.
commandBuffer[0], VK_PIPELINE_BIND_POINT_COMPUTE, axis->pipelineLayout, 0, 1, &axis->descriptorSet, 0, 0);
25791 uint64_t dispatchBlock[3];
25792 dispatchBlock[0] = (uint64_t)ceil(localSize0[2] / (
double)axis->axisBlock[0] * app->
localFFTPlan->
actualFFTSizePerAxis[2][2] / (
double)axis->specializationConstants.fftDim);
25793 dispatchBlock[1] = 1;
25809#if(VKFFT_BACKEND==0)
25811 vkCmdBindDescriptorSets(app->
configuration.
commandBuffer[0], VK_PIPELINE_BIND_POINT_COMPUTE, axis->pipelineLayout, 0, 1, &axis->descriptorSet, 0, 0);
25813 uint64_t dispatchBlock[3];
25814 dispatchBlock[0] = (uint64_t)ceil(localSize0[2] / (
double)axis->axisBlock[0] * app->
localFFTPlan->
actualFFTSizePerAxis[2][2] / (
double)axis->specializationConstants.fftDim);
25815 dispatchBlock[1] = 1;
25829#if(VKFFT_BACKEND==0)
25831 vkCmdBindDescriptorSets(app->
configuration.
commandBuffer[0], VK_PIPELINE_BIND_POINT_COMPUTE, axis->pipelineLayout, 0, 1, &axis->descriptorSet, 0, 0);
25833 uint64_t dispatchBlock[3];
25834 dispatchBlock[0] = (uint64_t)ceil(localSize0[2] / (
double)axis->axisBlock[0] * app->
localFFTPlan->
actualFFTSizePerAxis[2][2] / (
double)axis->specializationConstants.fftDim);
25835 dispatchBlock[1] = 1;
25862#if(VKFFT_BACKEND==0)
25864 vkCmdBindDescriptorSets(app->
configuration.
commandBuffer[0], VK_PIPELINE_BIND_POINT_COMPUTE, axis->pipelineLayout, 0, 1, &axis->descriptorSet, 0, 0);
25866 uint64_t dispatchBlock[3];
25868 dispatchBlock[1] = 1;
25884#if(VKFFT_BACKEND==0)
25886 vkCmdBindDescriptorSets(app->
configuration.
commandBuffer[0], VK_PIPELINE_BIND_POINT_COMPUTE, axis->pipelineLayout, 0, 1, &axis->descriptorSet, 0, 0);
25888 uint64_t dispatchBlock[3];
25890 dispatchBlock[1] = 1;
25910#if(VKFFT_BACKEND==0)
25912 vkCmdBindDescriptorSets(app->
configuration.
commandBuffer[0], VK_PIPELINE_BIND_POINT_COMPUTE, axis->pipelineLayout, 0, 1, &axis->descriptorSet, 0, 0);
25914 uint64_t dispatchBlock[3];
25916 dispatchBlock[1] = 1;
25932#if(VKFFT_BACKEND==0)
25934 vkCmdBindDescriptorSets(app->
configuration.
commandBuffer[0], VK_PIPELINE_BIND_POINT_COMPUTE, axis->pipelineLayout, 0, 1, &axis->descriptorSet, 0, 0);
25936 uint64_t dispatchBlock[3];
25958 if (axis->specializationConstants.mergeSequencesR2C == 1) dispatchBlock[1] = (uint64_t)ceil(dispatchBlock[1] / 2.0);
25974#if(VKFFT_BACKEND==0)
25976 vkCmdBindDescriptorSets(app->
configuration.
commandBuffer[0], VK_PIPELINE_BIND_POINT_COMPUTE, axis->pipelineLayout, 0, 1, &axis->descriptorSet, 0, 0);
25978 uint64_t dispatchBlock[3];
25980 dispatchBlock[1] = 1;
25993 if (inverse == 1) {
26004#if(VKFFT_BACKEND==0)
26006 vkCmdBindDescriptorSets(app->
configuration.
commandBuffer[0], VK_PIPELINE_BIND_POINT_COMPUTE, axis->pipelineLayout, 0, 1, &axis->descriptorSet, 0, 0);
26008 uint64_t dispatchBlock[3];
26010 dispatchBlock[1] = 1;
26035#if(VKFFT_BACKEND==0)
26037 vkCmdBindDescriptorSets(app->
configuration.
commandBuffer[0], VK_PIPELINE_BIND_POINT_COMPUTE, axis->pipelineLayout, 0, 1, &axis->descriptorSet, 0, 0);
26039 uint64_t dispatchBlock[3];
26041 dispatchBlock[1] = 1;
26063#if(VKFFT_BACKEND==0)
26065 vkCmdBindDescriptorSets(app->
configuration.
commandBuffer[0], VK_PIPELINE_BIND_POINT_COMPUTE, axis->pipelineLayout, 0, 1, &axis->descriptorSet, 0, 0);
26067 uint64_t dispatchBlock[3];
26070 dispatchBlock[1] = 1;
26085#if(VKFFT_BACKEND==0)
26087 vkCmdBindDescriptorSets(app->
configuration.
commandBuffer[0], VK_PIPELINE_BIND_POINT_COMPUTE, axis->pipelineLayout, 0, 1, &axis->descriptorSet, 0, 0);
26089 uint64_t dispatchBlock[3];
26111 if (axis->specializationConstants.mergeSequencesR2C == 1) dispatchBlock[1] = (uint64_t)ceil(dispatchBlock[1] / 2.0);
26127#if(VKFFT_BACKEND==0)
26129 vkCmdBindDescriptorSets(app->
configuration.
commandBuffer[0], VK_PIPELINE_BIND_POINT_COMPUTE, axis->pipelineLayout, 0, 1, &axis->descriptorSet, 0, 0);
26131 uint64_t dispatchBlock[3];
26153 if (axis->specializationConstants.mergeSequencesR2C == 1) dispatchBlock[1] = (uint64_t)ceil(dispatchBlock[1] / 2.0);