55 float a11,
float a12,
float a13,
float a21,
float a22,
float a23,
float a31,
float a32,
float a33,
56 float &u11,
float &u12,
float &u13,
float &u21,
float &u22,
float &u23,
float &u31,
float &u32,
float &u33,
62 float &v11,
float &v12,
float &v13,
float &v21,
float &v22,
float &v23,
float &v31,
float &v32,
float &v33
64__device__ __forceinline__
66 float a11,
float a12,
float a13,
float a21,
float a22,
float a23,
float a31,
float a32,
float a33,
67 float& u11,
float& u12,
float& u13,
float& u21,
float& u22,
float& u23,
float& u31,
float& u32,
float& u33,
73 float& v11,
float& v12,
float& v13,
float& v21,
float& v22,
float& v23,
float& v31,
float& v32,
float& v33
77 un Sa11, Sa21, Sa31, Sa12, Sa22, Sa32, Sa13, Sa23, Sa33;
78 un Su11, Su21, Su31, Su12, Su22, Su32, Su13, Su23, Su33;
79 un Sv11, Sv21, Sv31, Sv12, Sv22, Sv32, Sv13, Sv23, Sv33;
81 un Stmp1, Stmp2, Stmp3, Stmp4, Stmp5;
82 un Ss11, Ss21, Ss31, Ss22, Ss32, Ss33;
83 un Sqvs, Sqvvx, Sqvvy, Sqvvz;
85 Sa11.
f = a11; Sa12.
f = a12; Sa13.
f = a13;
86 Sa21.
f = a21; Sa22.
f = a22; Sa23.
f = a23;
87 Sa31.
f = a31; Sa32.
f = a32; Sa33.
f = a33;
93 Ss11.
f = Sa11.
f*Sa11.
f;
94 Stmp1.
f = Sa21.
f*Sa21.
f;
96 Stmp1.
f = Sa31.
f*Sa31.
f;
99 Ss21.
f = Sa12.
f*Sa11.
f;
100 Stmp1.
f = Sa22.
f*Sa21.
f;
102 Stmp1.
f = Sa32.
f*Sa31.
f;
105 Ss31.
f = Sa13.
f*Sa11.
f;
106 Stmp1.
f = Sa23.
f*Sa21.
f;
108 Stmp1.
f = Sa33.
f*Sa31.
f;
111 Ss22.
f = Sa12.
f*Sa12.
f;
112 Stmp1.
f = Sa22.
f*Sa22.
f;
114 Stmp1.
f = Sa32.
f*Sa32.
f;
117 Ss32.
f = Sa13.
f*Sa12.
f;
118 Stmp1.
f = Sa23.
f*Sa22.
f;
120 Stmp1.
f = Sa33.
f*Sa32.
f;
123 Ss33.
f = Sa13.
f*Sa13.
f;
124 Stmp1.
f = Sa23.
f*Sa23.
f;
126 Stmp1.
f = Sa33.
f*Sa33.
f;
129 Sqvs.
f = 1.f; Sqvvx.
f = 0.f; Sqvvy.
f = 0.f; Sqvvz.
f = 0.f;
134 for (
int i = 0; i < 4; i++)
136 Ssh.
f = Ss21.
f * 0.5f;
139 Stmp2.
f = Ssh.
f*Ssh.
f;
142 Sch.
ui = Stmp1.
ui&Stmp5.
ui;
143 Stmp2.
ui = ~Stmp1.ui&
gone;
144 Sch.
ui = Sch.
ui | Stmp2.
ui;
146 Stmp1.
f = Ssh.
f*Ssh.
f;
147 Stmp2.
f = Sch.
f*Sch.
f;
151 Ssh.
f = Stmp4.
f*Ssh.
f;
152 Sch.
f = Stmp4.
f*Sch.
f;
154 Stmp1.
ui = (Stmp2.
f <= Stmp1.
f) ? 0xffffffff : 0;
157 Ssh.
ui = ~Stmp1.ui&Ssh.
ui;
158 Ssh.
ui = Ssh.
ui | Stmp2.
ui;
160 Sch.
ui = ~Stmp1.ui&Sch.
ui;
161 Sch.
ui = Sch.
ui | Stmp2.
ui;
163 Stmp1.
f = Ssh.
f * Ssh.
f;
164 Stmp2.
f = Sch.
f * Sch.
f;
166 Ss.
f = Sch.
f * Ssh.
f;
169#ifdef DEBUG_JACOBI_CONJUGATE
170 printf(
"GPU s %.20g, c %.20g, sh %.20g, ch %.20g\n", Ss.
f, Sc.
f, Ssh.
f, Sch.
f);
177 Ss33.
f = Ss33.
f * Stmp3.
f;
178 Ss31.
f = Ss31.
f * Stmp3.
f;
179 Ss32.
f = Ss32.
f * Stmp3.
f;
180 Ss33.
f = Ss33.
f * Stmp3.
f;
182 Stmp1.
f = Ss.
f * Ss31.
f;
183 Stmp2.
f = Ss.
f * Ss32.
f;
184 Ss31.
f = Sc.
f * Ss31.
f;
185 Ss32.
f = Sc.
f * Ss32.
f;
190 Stmp1.
f = Ss22.
f*Stmp2.
f;
191 Stmp3.
f = Ss11.
f*Stmp2.
f;
193 Ss11.
f = Ss11.
f*Stmp4.
f;
194 Ss22.
f = Ss22.
f*Stmp4.
f;
199 Ss21.
f = Ss21.
f*Stmp4.
f;
201 Stmp2.
f = Stmp2.
f*Stmp4.
f;
202 Stmp5.
f = Stmp5.
f*Stmp4.
f;
207#ifdef DEBUG_JACOBI_CONJUGATE
208 printf(
"%.20g\n", Ss11.
f);
209 printf(
"%.20g %.20g\n", Ss21.
f, Ss22.
f);
210 printf(
"%.20g %.20g %.20g\n", Ss31.
f, Ss32.
f, Ss33.
f);
217 Stmp1.
f = Ssh.
f*Sqvvx.
f;
218 Stmp2.
f = Ssh.
f*Sqvvy.
f;
219 Stmp3.
f = Ssh.
f*Sqvvz.
f;
220 Ssh.
f = Ssh.
f*Sqvs.
f;
222 Sqvs.
f = Sch.
f*Sqvs.
f;
223 Sqvvx.
f = Sch.
f*Sqvvx.
f;
224 Sqvvy.
f = Sch.
f*Sqvvy.
f;
225 Sqvvz.
f = Sch.
f*Sqvvz.
f;
232#ifdef DEBUG_JACOBI_CONJUGATE
233 printf(
"GPU q %.20g %.20g %.20g %.20g\n", Sqvvx.
f, Sqvvy.
f, Sqvvz.
f, Sqvs.
f);
239 Ssh.
f = Ss32.
f * 0.5f;
242 Stmp2.
f = Ssh.
f * Ssh.
f;
245 Sch.
ui = Stmp1.
ui&Stmp5.
ui;
246 Stmp2.
ui = ~Stmp1.ui&
gone;
247 Sch.
ui = Sch.
ui | Stmp2.
ui;
249 Stmp1.
f = Ssh.
f * Ssh.
f;
250 Stmp2.
f = Sch.
f * Sch.
f;
254 Ssh.
f = Stmp4.
f * Ssh.
f;
255 Sch.
f = Stmp4.
f * Sch.
f;
257 Stmp1.
ui = (Stmp2.
f <= Stmp1.
f) ? 0xffffffff : 0;
260 Ssh.
ui = ~Stmp1.ui&Ssh.
ui;
261 Ssh.
ui = Ssh.
ui | Stmp2.
ui;
263 Sch.
ui = ~Stmp1.ui&Sch.
ui;
264 Sch.
ui = Sch.
ui | Stmp2.
ui;
266 Stmp1.
f = Ssh.
f * Ssh.
f;
267 Stmp2.
f = Sch.
f * Sch.
f;
272#ifdef DEBUG_JACOBI_CONJUGATE
273 printf(
"GPU s %.20g, c %.20g, sh %.20g, ch %.20g\n", Ss.
f, Sc.
f, Ssh.
f, Sch.
f);
281 Ss11.
f = Ss11.
f * Stmp3.
f;
282 Ss21.
f = Ss21.
f * Stmp3.
f;
283 Ss31.
f = Ss31.
f * Stmp3.
f;
284 Ss11.
f = Ss11.
f * Stmp3.
f;
286 Stmp1.
f = Ss.
f*Ss21.
f;
287 Stmp2.
f = Ss.
f*Ss31.
f;
288 Ss21.
f = Sc.
f*Ss21.
f;
289 Ss31.
f = Sc.
f*Ss31.
f;
294 Stmp1.
f = Ss33.
f*Stmp2.
f;
295 Stmp3.
f = Ss22.
f*Stmp2.
f;
296 Stmp4.
f = Sc.
f * Sc.
f;
297 Ss22.
f = Ss22.
f * Stmp4.
f;
298 Ss33.
f = Ss33.
f * Stmp4.
f;
303 Ss32.
f = Ss32.
f*Stmp4.
f;
305 Stmp2.
f = Stmp2.
f*Stmp4.
f;
306 Stmp5.
f = Stmp5.
f*Stmp4.
f;
311#ifdef DEBUG_JACOBI_CONJUGATE
312 printf(
"%.20g\n", Ss11.
f);
313 printf(
"%.20g %.20g\n", Ss21.
f, Ss22.
f);
314 printf(
"%.20g %.20g %.20g\n", Ss31.
f, Ss32.
f, Ss33.
f);
321 Stmp1.
f = Ssh.
f*Sqvvx.
f;
322 Stmp2.
f = Ssh.
f*Sqvvy.
f;
323 Stmp3.
f = Ssh.
f*Sqvvz.
f;
324 Ssh.
f = Ssh.
f*Sqvs.
f;
326 Sqvs.
f = Sch.
f*Sqvs.
f;
327 Sqvvx.
f = Sch.
f*Sqvvx.
f;
328 Sqvvy.
f = Sch.
f*Sqvvy.
f;
329 Sqvvz.
f = Sch.
f*Sqvvz.
f;
336#ifdef DEBUG_JACOBI_CONJUGATE
337 printf(
"GPU q %.20g %.20g %.20g %.20g\n", Sqvvx.
f, Sqvvy.
f, Sqvvz.
f, Sqvs.
f);
344 Ssh.
f = Ss31.
f * 0.5f;
347 Stmp2.
f = Ssh.
f*Ssh.
f;
350 Sch.
ui = Stmp1.
ui&Stmp5.
ui;
351 Stmp2.
ui = ~Stmp1.ui&
gone;
352 Sch.
ui = Sch.
ui | Stmp2.
ui;
354 Stmp1.
f = Ssh.
f*Ssh.
f;
355 Stmp2.
f = Sch.
f*Sch.
f;
359 Ssh.
f = Stmp4.
f*Ssh.
f;
360 Sch.
f = Stmp4.
f*Sch.
f;
362 Stmp1.
ui = (Stmp2.
f <= Stmp1.
f) ? 0xffffffff : 0;
365 Ssh.
ui = ~Stmp1.ui&Ssh.
ui;
366 Ssh.
ui = Ssh.
ui | Stmp2.
ui;
368 Sch.
ui = ~Stmp1.ui&Sch.
ui;
369 Sch.
ui = Sch.
ui | Stmp2.
ui;
371 Stmp1.
f = Ssh.
f*Ssh.
f;
372 Stmp2.
f = Sch.
f*Sch.
f;
377#ifdef DEBUG_JACOBI_CONJUGATE
378 printf(
"GPU s %.20g, c %.20g, sh %.20g, ch %.20g\n", Ss.
f, Sc.
f, Ssh.
f, Sch.
f);
386 Ss22.
f = Ss22.
f * Stmp3.
f;
387 Ss32.
f = Ss32.
f * Stmp3.
f;
388 Ss21.
f = Ss21.
f * Stmp3.
f;
389 Ss22.
f = Ss22.
f * Stmp3.
f;
391 Stmp1.
f = Ss.
f*Ss32.
f;
392 Stmp2.
f = Ss.
f*Ss21.
f;
393 Ss32.
f = Sc.
f*Ss32.
f;
394 Ss21.
f = Sc.
f*Ss21.
f;
399 Stmp1.
f = Ss11.
f*Stmp2.
f;
400 Stmp3.
f = Ss33.
f*Stmp2.
f;
402 Ss33.
f = Ss33.
f*Stmp4.
f;
403 Ss11.
f = Ss11.
f*Stmp4.
f;
408 Ss31.
f = Ss31.
f*Stmp4.
f;
410 Stmp2.
f = Stmp2.
f*Stmp4.
f;
411 Stmp5.
f = Stmp5.
f*Stmp4.
f;
416#ifdef DEBUG_JACOBI_CONJUGATE
417 printf(
"%.20g\n", Ss11.
f);
418 printf(
"%.20g %.20g\n", Ss21.
f, Ss22.
f);
419 printf(
"%.20g %.20g %.20g\n", Ss31.
f, Ss32.
f, Ss33.
f);
426 Stmp1.
f = Ssh.
f*Sqvvx.
f;
427 Stmp2.
f = Ssh.
f*Sqvvy.
f;
428 Stmp3.
f = Ssh.
f*Sqvvz.
f;
429 Ssh.
f = Ssh.
f*Sqvs.
f;
431 Sqvs.
f = Sch.
f*Sqvs.
f;
432 Sqvvx.
f = Sch.
f*Sqvvx.
f;
433 Sqvvy.
f = Sch.
f*Sqvvy.
f;
434 Sqvvz.
f = Sch.
f*Sqvvz.
f;
447 Stmp2.
f = Sqvs.
f*Sqvs.
f;
448 Stmp1.
f = Sqvvx.
f*Sqvvx.
f;
450 Stmp1.
f = Sqvvy.
f*Sqvvy.
f;
452 Stmp1.
f = Sqvvz.
f*Sqvvz.
f;
456 Stmp4.
f = Stmp1.
f*0.5f;
457 Stmp3.
f = Stmp1.
f*Stmp4.
f;
458 Stmp3.
f = Stmp1.
f*Stmp3.
f;
459 Stmp3.
f = Stmp2.
f*Stmp3.
f;
463 Sqvs.
f = Sqvs.
f*Stmp1.
f;
464 Sqvvx.
f = Sqvvx.
f*Stmp1.
f;
465 Sqvvy.
f = Sqvvy.
f*Stmp1.
f;
466 Sqvvz.
f = Sqvvz.
f*Stmp1.
f;
472 Stmp1.
f = Sqvvx.
f*Sqvvx.
f;
473 Stmp2.
f = Sqvvy.
f*Sqvvy.
f;
474 Stmp3.
f = Sqvvz.
f*Sqvvz.
f;
475 Sv11.
f = Sqvs.
f*Sqvs.
f;
487 Sv32.
f = Sqvs.
f*Stmp1.
f;
488 Sv13.
f = Sqvs.
f*Stmp2.
f;
489 Sv21.
f = Sqvs.
f*Stmp3.
f;
490 Stmp1.
f = Sqvvy.
f*Stmp1.
f;
491 Stmp2.
f = Sqvvz.
f*Stmp2.
f;
492 Stmp3.
f = Sqvvx.
f*Stmp3.
f;
506 Sa12.
f = Sv12.
f*Sa11.
f;
507 Sa13.
f = Sv13.
f*Sa11.
f;
508 Sa11.
f = Sv11.
f*Sa11.
f;
509 Stmp1.
f = Sv21.
f*Stmp2.
f;
511 Stmp1.
f = Sv31.
f*Stmp3.
f;
513 Stmp1.
f = Sv22.
f*Stmp2.
f;
515 Stmp1.
f = Sv32.
f*Stmp3.
f;
517 Stmp1.
f = Sv23.
f*Stmp2.
f;
519 Stmp1.
f = Sv33.
f*Stmp3.
f;
524 Sa22.
f = Sv12.
f*Sa21.
f;
525 Sa23.
f = Sv13.
f*Sa21.
f;
526 Sa21.
f = Sv11.
f*Sa21.
f;
527 Stmp1.
f = Sv21.
f*Stmp2.
f;
529 Stmp1.
f = Sv31.
f*Stmp3.
f;
531 Stmp1.
f = Sv22.
f*Stmp2.
f;
533 Stmp1.
f = Sv32.
f*Stmp3.
f;
535 Stmp1.
f = Sv23.
f*Stmp2.
f;
537 Stmp1.
f = Sv33.
f*Stmp3.
f;
542 Sa32.
f = Sv12.
f*Sa31.
f;
543 Sa33.
f = Sv13.
f*Sa31.
f;
544 Sa31.
f = Sv11.
f*Sa31.
f;
545 Stmp1.
f = Sv21.
f*Stmp2.
f;
547 Stmp1.
f = Sv31.
f*Stmp3.
f;
549 Stmp1.
f = Sv22.
f*Stmp2.
f;
551 Stmp1.
f = Sv32.
f*Stmp3.
f;
553 Stmp1.
f = Sv23.
f*Stmp2.
f;
555 Stmp1.
f = Sv33.
f*Stmp3.
f;
562 Stmp1.
f = Sa11.
f*Sa11.
f;
563 Stmp4.
f = Sa21.
f*Sa21.
f;
565 Stmp4.
f = Sa31.
f*Sa31.
f;
568 Stmp2.
f = Sa12.
f*Sa12.
f;
569 Stmp4.
f = Sa22.
f*Sa22.
f;
571 Stmp4.
f = Sa32.
f*Sa32.
f;
574 Stmp3.
f = Sa13.
f*Sa13.
f;
575 Stmp4.
f = Sa23.
f*Sa23.
f;
577 Stmp4.
f = Sa33.
f*Sa33.
f;
581 Stmp4.
ui = (Stmp1.
f < Stmp2.
f) ? 0xffffffff : 0;
582 Stmp5.
ui = Sa11.
ui^Sa12.
ui;
583 Stmp5.
ui = Stmp5.
ui&Stmp4.
ui;
584 Sa11.
ui = Sa11.
ui^Stmp5.
ui;
585 Sa12.
ui = Sa12.
ui^Stmp5.
ui;
587 Stmp5.
ui = Sa21.
ui^Sa22.
ui;
588 Stmp5.
ui = Stmp5.
ui&Stmp4.
ui;
589 Sa21.
ui = Sa21.
ui^Stmp5.
ui;
590 Sa22.
ui = Sa22.
ui^Stmp5.
ui;
592 Stmp5.
ui = Sa31.
ui^Sa32.
ui;
593 Stmp5.
ui = Stmp5.
ui&Stmp4.
ui;
594 Sa31.
ui = Sa31.
ui^Stmp5.
ui;
595 Sa32.
ui = Sa32.
ui^Stmp5.
ui;
597 Stmp5.
ui = Sv11.
ui^Sv12.
ui;
598 Stmp5.
ui = Stmp5.
ui&Stmp4.
ui;
599 Sv11.
ui = Sv11.
ui^Stmp5.
ui;
600 Sv12.
ui = Sv12.
ui^Stmp5.
ui;
602 Stmp5.
ui = Sv21.
ui^Sv22.
ui;
603 Stmp5.
ui = Stmp5.
ui&Stmp4.
ui;
604 Sv21.
ui = Sv21.
ui^Stmp5.
ui;
605 Sv22.
ui = Sv22.
ui^Stmp5.
ui;
607 Stmp5.
ui = Sv31.
ui^Sv32.
ui;
608 Stmp5.
ui = Stmp5.
ui&Stmp4.
ui;
609 Sv31.
ui = Sv31.
ui^Stmp5.
ui;
610 Sv32.
ui = Sv32.
ui^Stmp5.
ui;
612 Stmp5.
ui = Stmp1.
ui^Stmp2.
ui;
613 Stmp5.
ui = Stmp5.
ui&Stmp4.
ui;
614 Stmp1.
ui = Stmp1.
ui^Stmp5.
ui;
615 Stmp2.
ui = Stmp2.
ui^Stmp5.
ui;
620 Stmp5.
ui = Stmp5.
ui&Stmp4.
ui;
624 Sa12.
f = Sa12.
f*Stmp4.
f;
625 Sa22.
f = Sa22.
f*Stmp4.
f;
626 Sa32.
f = Sa32.
f*Stmp4.
f;
628 Sv12.
f = Sv12.
f*Stmp4.
f;
629 Sv22.
f = Sv22.
f*Stmp4.
f;
630 Sv32.
f = Sv32.
f*Stmp4.
f;
634 Stmp4.
ui = (Stmp1.
f < Stmp3.
f) ? 0xffffffff : 0;
635 Stmp5.
ui = Sa11.
ui^Sa13.
ui;
636 Stmp5.
ui = Stmp5.
ui&Stmp4.
ui;
637 Sa11.
ui = Sa11.
ui^Stmp5.
ui;
638 Sa13.
ui = Sa13.
ui^Stmp5.
ui;
640 Stmp5.
ui = Sa21.
ui^Sa23.
ui;
641 Stmp5.
ui = Stmp5.
ui&Stmp4.
ui;
642 Sa21.
ui = Sa21.
ui^Stmp5.
ui;
643 Sa23.
ui = Sa23.
ui^Stmp5.
ui;
645 Stmp5.
ui = Sa31.
ui^Sa33.
ui;
646 Stmp5.
ui = Stmp5.
ui&Stmp4.
ui;
647 Sa31.
ui = Sa31.
ui^Stmp5.
ui;
648 Sa33.
ui = Sa33.
ui^Stmp5.
ui;
650 Stmp5.
ui = Sv11.
ui^Sv13.
ui;
651 Stmp5.
ui = Stmp5.
ui&Stmp4.
ui;
652 Sv11.
ui = Sv11.
ui^Stmp5.
ui;
653 Sv13.
ui = Sv13.
ui^Stmp5.
ui;
655 Stmp5.
ui = Sv21.
ui^Sv23.
ui;
656 Stmp5.
ui = Stmp5.
ui&Stmp4.
ui;
657 Sv21.
ui = Sv21.
ui^Stmp5.
ui;
658 Sv23.
ui = Sv23.
ui^Stmp5.
ui;
660 Stmp5.
ui = Sv31.
ui^Sv33.
ui;
661 Stmp5.
ui = Stmp5.
ui&Stmp4.
ui;
662 Sv31.
ui = Sv31.
ui^Stmp5.
ui;
663 Sv33.
ui = Sv33.
ui^Stmp5.
ui;
665 Stmp5.
ui = Stmp1.
ui^Stmp3.
ui;
666 Stmp5.
ui = Stmp5.
ui&Stmp4.
ui;
667 Stmp1.
ui = Stmp1.
ui^Stmp5.
ui;
668 Stmp3.
ui = Stmp3.
ui^Stmp5.
ui;
673 Stmp5.
ui = Stmp5.
ui&Stmp4.
ui;
677 Sa11.
f = Sa11.
f*Stmp4.
f;
678 Sa21.
f = Sa21.
f*Stmp4.
f;
679 Sa31.
f = Sa31.
f*Stmp4.
f;
681 Sv11.
f = Sv11.
f*Stmp4.
f;
682 Sv21.
f = Sv21.
f*Stmp4.
f;
683 Sv31.
f = Sv31.
f*Stmp4.
f;
687 Stmp4.
ui = (Stmp2.
f < Stmp3.
f) ? 0xffffffff : 0;
688 Stmp5.
ui = Sa12.
ui^Sa13.
ui;
689 Stmp5.
ui = Stmp5.
ui&Stmp4.
ui;
690 Sa12.
ui = Sa12.
ui^Stmp5.
ui;
691 Sa13.
ui = Sa13.
ui^Stmp5.
ui;
693 Stmp5.
ui = Sa22.
ui^Sa23.
ui;
694 Stmp5.
ui = Stmp5.
ui&Stmp4.
ui;
695 Sa22.
ui = Sa22.
ui^Stmp5.
ui;
696 Sa23.
ui = Sa23.
ui^Stmp5.
ui;
698 Stmp5.
ui = Sa32.
ui^Sa33.
ui;
699 Stmp5.
ui = Stmp5.
ui&Stmp4.
ui;
700 Sa32.
ui = Sa32.
ui^Stmp5.
ui;
701 Sa33.
ui = Sa33.
ui^Stmp5.
ui;
703 Stmp5.
ui = Sv12.
ui^Sv13.
ui;
704 Stmp5.
ui = Stmp5.
ui&Stmp4.
ui;
705 Sv12.
ui = Sv12.
ui^Stmp5.
ui;
706 Sv13.
ui = Sv13.
ui^Stmp5.
ui;
708 Stmp5.
ui = Sv22.
ui^Sv23.
ui;
709 Stmp5.
ui = Stmp5.
ui&Stmp4.
ui;
710 Sv22.
ui = Sv22.
ui^Stmp5.
ui;
711 Sv23.
ui = Sv23.
ui^Stmp5.
ui;
713 Stmp5.
ui = Sv32.
ui^Sv33.
ui;
714 Stmp5.
ui = Stmp5.
ui&Stmp4.
ui;
715 Sv32.
ui = Sv32.
ui^Stmp5.
ui;
716 Sv33.
ui = Sv33.
ui^Stmp5.
ui;
718 Stmp5.
ui = Stmp2.
ui^Stmp3.
ui;
719 Stmp5.
ui = Stmp5.
ui&Stmp4.
ui;
720 Stmp2.
ui = Stmp2.
ui^Stmp5.
ui;
721 Stmp3.
ui = Stmp3.
ui^Stmp5.
ui;
726 Stmp5.
ui = Stmp5.
ui&Stmp4.
ui;
730 Sa13.
f = Sa13.
f*Stmp4.
f;
731 Sa23.
f = Sa23.
f*Stmp4.
f;
732 Sa33.
f = Sa33.
f*Stmp4.
f;
734 Sv13.
f = Sv13.
f*Stmp4.
f;
735 Sv23.
f = Sv23.
f*Stmp4.
f;
736 Sv33.
f = Sv33.
f*Stmp4.
f;
742 Su11.
f = 1.f; Su12.
f = 0.f; Su13.
f = 0.f;
743 Su21.
f = 0.f; Su22.
f = 1.f; Su23.
f = 0.f;
744 Su31.
f = 0.f; Su32.
f = 0.f; Su33.
f = 1.f;
746 Ssh.
f = Sa21.
f*Sa21.
f;
752 Sch.
f =
max(Sch.
f, Sa11.
f);
754 Stmp5.
ui = (Sa11.
f >= Stmp5.
f) ? 0xffffffff : 0;
756 Stmp1.
f = Sch.
f*Sch.
f;
757 Stmp2.
f = Ssh.
f*Ssh.
f;
761 Stmp4.
f = Stmp1.
f*0.5f;
762 Stmp3.
f = Stmp1.
f*Stmp4.
f;
763 Stmp3.
f = Stmp1.
f*Stmp3.
f;
764 Stmp3.
f = Stmp2.
f*Stmp3.
f;
767 Stmp1.
f = Stmp1.
f*Stmp2.
f;
771 Stmp1.
ui = ~Stmp5.ui&Ssh.
ui;
772 Stmp2.
ui = ~Stmp5.ui&Sch.
ui;
775 Sch.
ui = Sch.
ui | Stmp1.
ui;
776 Ssh.
ui = Ssh.
ui | Stmp2.
ui;
778 Stmp1.
f = Sch.
f*Sch.
f;
779 Stmp2.
f = Ssh.
f*Ssh.
f;
783 Stmp4.
f = Stmp1.
f*0.5f;
784 Stmp3.
f = Stmp1.
f*Stmp4.
f;
785 Stmp3.
f = Stmp1.
f*Stmp3.
f;
786 Stmp3.
f = Stmp2.
f*Stmp3.
f;
790 Sch.
f = Sch.
f*Stmp1.
f;
791 Ssh.
f = Ssh.
f*Stmp1.
f;
803 Stmp1.
f = Ss.
f*Sa11.
f;
804 Stmp2.
f = Ss.
f*Sa21.
f;
805 Sa11.
f = Sc.
f*Sa11.
f;
806 Sa21.
f = Sc.
f*Sa21.
f;
810 Stmp1.
f = Ss.
f*Sa12.
f;
811 Stmp2.
f = Ss.
f*Sa22.
f;
812 Sa12.
f = Sc.
f*Sa12.
f;
813 Sa22.
f = Sc.
f*Sa22.
f;
817 Stmp1.
f = Ss.
f*Sa13.
f;
818 Stmp2.
f = Ss.
f*Sa23.
f;
819 Sa13.
f = Sc.
f*Sa13.
f;
820 Sa23.
f = Sc.
f*Sa23.
f;
828 Stmp1.
f = Ss.
f*Su11.
f;
829 Stmp2.
f = Ss.
f*Su12.
f;
830 Su11.
f = Sc.
f*Su11.
f;
831 Su12.
f = Sc.
f*Su12.
f;
835 Stmp1.
f = Ss.
f*Su21.
f;
836 Stmp2.
f = Ss.
f*Su22.
f;
837 Su21.
f = Sc.
f*Su21.
f;
838 Su22.
f = Sc.
f*Su22.
f;
842 Stmp1.
f = Ss.
f*Su31.
f;
843 Stmp2.
f = Ss.
f*Su32.
f;
844 Su31.
f = Sc.
f*Su31.
f;
845 Su32.
f = Sc.
f*Su32.
f;
851 Ssh.
f = Sa31.
f*Sa31.
f;
857 Sch.
f =
max(Sch.
f, Sa11.
f);
859 Stmp5.
ui = (Sa11.
f >= Stmp5.
f) ? 0xffffffff : 0;
861 Stmp1.
f = Sch.
f*Sch.
f;
862 Stmp2.
f = Ssh.
f*Ssh.
f;
866 Stmp4.
f = Stmp1.
f*0.5;
867 Stmp3.
f = Stmp1.
f*Stmp4.
f;
868 Stmp3.
f = Stmp1.
f*Stmp3.
f;
869 Stmp3.
f = Stmp2.
f*Stmp3.
f;
872 Stmp1.
f = Stmp1.
f*Stmp2.
f;
876 Stmp1.
ui = ~Stmp5.ui&Ssh.
ui;
877 Stmp2.
ui = ~Stmp5.ui&Sch.
ui;
880 Sch.
ui = Sch.
ui | Stmp1.
ui;
881 Ssh.
ui = Ssh.
ui | Stmp2.
ui;
883 Stmp1.
f = Sch.
f*Sch.
f;
884 Stmp2.
f = Ssh.
f*Ssh.
f;
888 Stmp4.
f = Stmp1.
f*0.5f;
889 Stmp3.
f = Stmp1.
f*Stmp4.
f;
890 Stmp3.
f = Stmp1.
f*Stmp3.
f;
891 Stmp3.
f = Stmp2.
f*Stmp3.
f;
895 Sch.
f = Sch.
f*Stmp1.
f;
896 Ssh.
f = Ssh.
f*Stmp1.
f;
908 Stmp1.
f = Ss.
f*Sa11.
f;
909 Stmp2.
f = Ss.
f*Sa31.
f;
910 Sa11.
f = Sc.
f*Sa11.
f;
911 Sa31.
f = Sc.
f*Sa31.
f;
915 Stmp1.
f = Ss.
f*Sa12.
f;
916 Stmp2.
f = Ss.
f*Sa32.
f;
917 Sa12.
f = Sc.
f*Sa12.
f;
918 Sa32.
f = Sc.
f*Sa32.
f;
922 Stmp1.
f = Ss.
f*Sa13.
f;
923 Stmp2.
f = Ss.
f*Sa33.
f;
924 Sa13.
f = Sc.
f*Sa13.
f;
925 Sa33.
f = Sc.
f*Sa33.
f;
933 Stmp1.
f = Ss.
f*Su11.
f;
934 Stmp2.
f = Ss.
f*Su13.
f;
935 Su11.
f = Sc.
f*Su11.
f;
936 Su13.
f = Sc.
f*Su13.
f;
940 Stmp1.
f = Ss.
f*Su21.
f;
941 Stmp2.
f = Ss.
f*Su23.
f;
942 Su21.
f = Sc.
f*Su21.
f;
943 Su23.
f = Sc.
f*Su23.
f;
947 Stmp1.
f = Ss.
f*Su31.
f;
948 Stmp2.
f = Ss.
f*Su33.
f;
949 Su31.
f = Sc.
f*Su31.
f;
950 Su33.
f = Sc.
f*Su33.
f;
956 Ssh.
f = Sa32.
f*Sa32.
f;
962 Sch.
f =
max(Sch.
f, Sa22.
f);
964 Stmp5.
ui = (Sa22.
f >= Stmp5.
f) ? 0xffffffff : 0;
966 Stmp1.
f = Sch.
f*Sch.
f;
967 Stmp2.
f = Ssh.
f*Ssh.
f;
971 Stmp4.
f = Stmp1.
f*0.5f;
972 Stmp3.
f = Stmp1.
f*Stmp4.
f;
973 Stmp3.
f = Stmp1.
f*Stmp3.
f;
974 Stmp3.
f = Stmp2.
f*Stmp3.
f;
977 Stmp1.
f = Stmp1.
f*Stmp2.
f;
981 Stmp1.
ui = ~Stmp5.ui&Ssh.
ui;
982 Stmp2.
ui = ~Stmp5.ui&Sch.
ui;
985 Sch.
ui = Sch.
ui | Stmp1.
ui;
986 Ssh.
ui = Ssh.
ui | Stmp2.
ui;
988 Stmp1.
f = Sch.
f*Sch.
f;
989 Stmp2.
f = Ssh.
f*Ssh.
f;
993 Stmp4.
f = Stmp1.
f*0.5f;
994 Stmp3.
f = Stmp1.
f*Stmp4.
f;
995 Stmp3.
f = Stmp1.
f*Stmp3.
f;
996 Stmp3.
f = Stmp2.
f*Stmp3.
f;
1000 Sch.
f = Sch.
f*Stmp1.
f;
1001 Ssh.
f = Ssh.
f*Stmp1.
f;
1013 Stmp1.
f = Ss.
f*Sa21.
f;
1014 Stmp2.
f = Ss.
f*Sa31.
f;
1015 Sa21.
f = Sc.
f*Sa21.
f;
1016 Sa31.
f = Sc.
f*Sa31.
f;
1020 Stmp1.
f = Ss.
f*Sa22.
f;
1021 Stmp2.
f = Ss.
f*Sa32.
f;
1022 Sa22.
f = Sc.
f*Sa22.
f;
1023 Sa32.
f = Sc.
f*Sa32.
f;
1027 Stmp1.
f = Ss.
f*Sa23.
f;
1028 Stmp2.
f = Ss.
f*Sa33.
f;
1029 Sa23.
f = Sc.
f*Sa23.
f;
1030 Sa33.
f = Sc.
f*Sa33.
f;
1038 Stmp1.
f = Ss.
f*Su12.
f;
1039 Stmp2.
f = Ss.
f*Su13.
f;
1040 Su12.
f = Sc.
f*Su12.
f;
1041 Su13.
f = Sc.
f*Su13.
f;
1045 Stmp1.
f = Ss.
f*Su22.
f;
1046 Stmp2.
f = Ss.
f*Su23.
f;
1047 Su22.
f = Sc.
f*Su22.
f;
1048 Su23.
f = Sc.
f*Su23.
f;
1052 Stmp1.
f = Ss.
f*Su32.
f;
1053 Stmp2.
f = Ss.
f*Su33.
f;
1054 Su32.
f = Sc.
f*Su32.
f;
1055 Su33.
f = Sc.
f*Su33.
f;
1059 v11 = Sv11.
f; v12 = Sv12.
f; v13 = Sv13.
f;
1060 v21 = Sv21.
f; v22 = Sv22.
f; v23 = Sv23.
f;
1061 v31 = Sv31.
f; v32 = Sv32.
f; v33 = Sv33.
f;
1063 u11 = Su11.
f; u12 = Su12.
f; u13 = Su13.
f;
1064 u21 = Su21.
f; u22 = Su22.
f; u23 = Su23.
f;
1065 u31 = Su31.
f; u32 = Su32.
f; u33 = Su33.
f;
__host__ __forceinline__ void svd(float a11, float a12, float a13, float a21, float a22, float a23, float a31, float a32, float a33, float &u11, float &u12, float &u13, float &u21, float &u22, float &u23, float &u31, float &u32, float &u33, float &s11, float &s22, float &s33, float &v11, float &v12, float &v13, float &v21, float &v22, float &v23, float &v31, float &v32, float &v33)