PeriDyno 1.0.0
Loading...
Searching...
No Matches
svd3_cuda.h
Go to the documentation of this file.
1/**************************************************************************
2**
3** svd3
4**
5** Quick singular value decomposition as described by:
6** A. McAdams, A. Selle, R. Tamstorf, J. Teran and E. Sifakis,
7** Computing the Singular Value Decomposition of 3x3 Matrix
8** with minimal branching and elementary floating point operations,
9** University of Wisconsin - Madison technical report TR1690, May 2011
10**
11** Identical GPU version
12** Implementated by: Kui Wu
13** kwu@cs.utah.edu
14**
15** May 2018
16**
17** Modified to support both CPU and GPU, implemented by Xiaowei He
18** October, 2023
19**
20**************************************************************************/
21
22#ifndef SVD3_CUDA2_H
23#define SVD3_CUDA2_H
24
25#ifdef __NVCC__
26#include <cuda.h>
27#endif
28#include "math.h" // CUDA math library
29
// Bit patterns of float constants, consumed through the 'un' union below
// (the Jacobi step selects them with bitwise masks instead of branches).
#define gone 1065353216						/* 1.0f as IEEE-754 bits */
#define gsine_pi_over_eight 1053028117		/* sin(pi/8) as IEEE-754 bits */
#define gcosine_pi_over_eight 1064076127	/* cos(pi/8) as IEEE-754 bits */
#define gone_half 0.5f
#define gsmall_number 1.e-12f
#define gtiny_number 1.e-20f
#define gfour_gamma_squared 5.8284273147583007813f	/* ~= 3 + 2*sqrt(2); Jacobi rotation threshold (TR1690) */

// Type-punning helper: lets the SVD code mix float arithmetic with
// bitwise select/mask tricks on the same 32-bit value.
union un { float f; unsigned int ui; };

#ifndef __NVCC__
// Host fallbacks for the CUDA round-to-nearest float intrinsics.
// Arguments are parenthesized so expression arguments expand safely,
// and the math stays in single precision to mirror the device path.
#define max(x, y) ((x) > (y) ? (x) : (y))
#define fadd_rn(x, y) ((x) + (y))
#define fsub_rn(x, y) ((x) - (y))
#define frsqrt_rn(x) (1.0f / sqrtf(x))
#else
#define max(x, y) ((x) > (y) ? (x) : (y))
#define fadd_rn(x, y) __fadd_rn(x, y)
#define fsub_rn(x, y) __fsub_rn(x, y)
#define frsqrt_rn(x) __frsqrt_rn(x)
#endif // __NVCC__
51
/**
 * Branchless 3x3 singular value decomposition: A = U * diag(s11,s22,s33) * V^T,
 * after McAdams et al., "Computing the Singular Value Decomposition of 3x3
 * matrices with minimal branching and elementary floating point operations"
 * (UW-Madison TR1690, 2011).
 *
 * Inputs  a11..a33 : entries of A, row-major.
 * Outputs u11..u33 : entries of the rotation U.
 *         s11,s22,s33 : diagonal of S, sorted by decreasing magnitude.
 *             (The off-diagonal S outputs are intentionally commented out of
 *             the interface; S is diagonal by construction.)
 *         v11..v33 : entries of the rotation V.
 *
 * Method: the eigenproblem of A^T*A is solved with 4 sweeps of cyclic Jacobi
 * rotations accumulated as a quaternion, then A*V is reduced to U*S with three
 * Givens rotations (QR). All branches are emulated with bitwise masks on the
 * 'un' float/uint union, so the code runs divergence-free on the GPU.
 *
 * NOTE(review): s33 may come out negative when det(A) < 0 so that U and V stay
 * proper rotations -- confirm against callers' expectations.
 */
#ifndef __NVCC__
// NOTE(review): __host__ / __forceinline__ are CUDA keywords; a pure host
// compiler needs them defined elsewhere in the project -- verify.
__host__ __forceinline__
void svd(
	float a11, float a12, float a13, float a21, float a22, float a23, float a31, float a32, float a33, // input A
	float &u11, float &u12, float &u13, float &u21, float &u22, float &u23, float &u31, float &u32, float &u33, // output U
	float &s11,
	//float &s12, float &s13, float &s21,
	float &s22,
	//float &s23, float &s31, float &s32,
	float &s33, // output S
	float &v11, float &v12, float &v13, float &v21, float &v22, float &v23, float &v31, float &v32, float &v33 // output V
#else
__device__ __forceinline__
void svd(
	float a11, float a12, float a13, float a21, float a22, float a23, float a31, float a32, float a33, // input A
	float& u11, float& u12, float& u13, float& u21, float& u22, float& u23, float& u31, float& u32, float& u33, // output U
	float& s11,
	//float &s12, float &s13, float &s21,
	float& s22,
	//float &s23, float &s31, float &s32,
	float& s33, // output S
	float& v11, float& v12, float& v13, float& v21, float& v22, float& v23, float& v31, float& v32, float& v33 // output V
#endif
)
{
	// Working registers; .f / .ui views of the same 32 bits.
	un Sa11, Sa21, Sa31, Sa12, Sa22, Sa32, Sa13, Sa23, Sa33;
	un Su11, Su21, Su31, Su12, Su22, Su32, Su13, Su23, Su33;
	un Sv11, Sv21, Sv31, Sv12, Sv22, Sv32, Sv13, Sv23, Sv33;
	un Sc, Ss, Sch, Ssh;
	un Stmp1, Stmp2, Stmp3, Stmp4, Stmp5;
	un Ss11, Ss21, Ss31, Ss22, Ss32, Ss33;
	un Sqvs, Sqvvx, Sqvvy, Sqvvz;

	Sa11.f = a11; Sa12.f = a12; Sa13.f = a13;
	Sa21.f = a21; Sa22.f = a22; Sa23.f = a23;
	Sa31.f = a31; Sa32.f = a32; Sa33.f = a33;

	//###########################################################
	// Compute normal equations matrix (S = A^T * A, symmetric)
	//###########################################################

	Ss11.f = Sa11.f*Sa11.f;
	Stmp1.f = Sa21.f*Sa21.f;
	Ss11.f = fadd_rn(Stmp1.f, Ss11.f);
	Stmp1.f = Sa31.f*Sa31.f;
	Ss11.f = fadd_rn(Stmp1.f, Ss11.f);

	Ss21.f = Sa12.f*Sa11.f;
	Stmp1.f = Sa22.f*Sa21.f;
	Ss21.f = fadd_rn(Stmp1.f, Ss21.f);
	Stmp1.f = Sa32.f*Sa31.f;
	Ss21.f = fadd_rn(Stmp1.f, Ss21.f);

	Ss31.f = Sa13.f*Sa11.f;
	Stmp1.f = Sa23.f*Sa21.f;
	Ss31.f = fadd_rn(Stmp1.f, Ss31.f);
	Stmp1.f = Sa33.f*Sa31.f;
	Ss31.f = fadd_rn(Stmp1.f, Ss31.f);

	Ss22.f = Sa12.f*Sa12.f;
	Stmp1.f = Sa22.f*Sa22.f;
	Ss22.f = fadd_rn(Stmp1.f, Ss22.f);
	Stmp1.f = Sa32.f*Sa32.f;
	Ss22.f = fadd_rn(Stmp1.f, Ss22.f);

	Ss32.f = Sa13.f*Sa12.f;
	Stmp1.f = Sa23.f*Sa22.f;
	Ss32.f = fadd_rn(Stmp1.f, Ss32.f);
	Stmp1.f = Sa33.f*Sa32.f;
	Ss32.f = fadd_rn(Stmp1.f, Ss32.f);

	Ss33.f = Sa13.f*Sa13.f;
	Stmp1.f = Sa23.f*Sa23.f;
	Ss33.f = fadd_rn(Stmp1.f, Ss33.f);
	Stmp1.f = Sa33.f*Sa33.f;
	Ss33.f = fadd_rn(Stmp1.f, Ss33.f);

	// Cumulative rotation V starts as the identity quaternion.
	Sqvs.f = 1.f; Sqvvx.f = 0.f; Sqvvy.f = 0.f; Sqvvz.f = 0.f;

	//###########################################################
	// Solve symmetric eigenproblem using Jacobi iteration
	// (4 fixed sweeps; each sweep annihilates the (2,1), (3,2)
	// and (3,1) off-diagonal entries with approximate Givens
	// rotations selected via bit masks instead of branches)
	//###########################################################
	for (int i = 0; i < 4; i++)
	{
		// --- rotation in the (1,2) plane ---
		Ssh.f = Ss21.f * 0.5f;
		Stmp5.f = fsub_rn(Ss11.f, Ss22.f);

		// mask = (sh^2 >= tiny): if the off-diagonal is negligible,
		// fall back to the identity rotation (ch = 1, sh = 0)
		Stmp2.f = Ssh.f*Ssh.f;
		Stmp1.ui = (Stmp2.f >= gtiny_number) ? 0xffffffff : 0;
		Ssh.ui = Stmp1.ui&Ssh.ui;
		Sch.ui = Stmp1.ui&Stmp5.ui;
		Stmp2.ui = ~Stmp1.ui&gone;
		Sch.ui = Sch.ui | Stmp2.ui;

		Stmp1.f = Ssh.f*Ssh.f;
		Stmp2.f = Sch.f*Sch.f;
		Stmp3.f = fadd_rn(Stmp1.f, Stmp2.f);
		Stmp4.f = frsqrt_rn(Stmp3.f);

		Ssh.f = Stmp4.f*Ssh.f;
		Sch.f = Stmp4.f*Sch.f;
		// Clamp to the fixed pi/8 rotation when the approximate angle
		// would exceed it (TR1690's gamma test).
		Stmp1.f = gfour_gamma_squared*Stmp1.f;
		Stmp1.ui = (Stmp2.f <= Stmp1.f) ? 0xffffffff : 0;

		Stmp2.ui = gsine_pi_over_eight&Stmp1.ui;
		Ssh.ui = ~Stmp1.ui&Ssh.ui;
		Ssh.ui = Ssh.ui | Stmp2.ui;
		Stmp2.ui = gcosine_pi_over_eight&Stmp1.ui;
		Sch.ui = ~Stmp1.ui&Sch.ui;
		Sch.ui = Sch.ui | Stmp2.ui;

		// c = ch^2 - sh^2, s = 2*sh*ch (half-angle -> full-angle)
		Stmp1.f = Ssh.f * Ssh.f;
		Stmp2.f = Sch.f * Sch.f;
		Sc.f = fsub_rn(Stmp2.f, Stmp1.f);
		Ss.f = Sch.f * Ssh.f;
		Ss.f = fadd_rn(Ss.f, Ss.f);

#ifdef DEBUG_JACOBI_CONJUGATE
		printf("GPU s %.20g, c %.20g, sh %.20g, ch %.20g\n", Ss.f, Sc.f, Ssh.f, Sch.f);
#endif
		//###########################################################
		// Perform the actual Givens conjugation
		//###########################################################

		// note: the untouched diagonal entry is deliberately multiplied
		// by Stmp3 twice (total factor Stmp3^2), matching the reference
		// implementation's scaling of the conjugated matrix
		Stmp3.f = fadd_rn(Stmp1.f, Stmp2.f);
		Ss33.f = Ss33.f * Stmp3.f;
		Ss31.f = Ss31.f * Stmp3.f;
		Ss32.f = Ss32.f * Stmp3.f;
		Ss33.f = Ss33.f * Stmp3.f;

		Stmp1.f = Ss.f * Ss31.f;
		Stmp2.f = Ss.f * Ss32.f;
		Ss31.f = Sc.f * Ss31.f;
		Ss32.f = Sc.f * Ss32.f;
		Ss31.f = fadd_rn(Stmp2.f, Ss31.f);
		Ss32.f = fsub_rn(Ss32.f, Stmp1.f);

		Stmp2.f = Ss.f*Ss.f;
		Stmp1.f = Ss22.f*Stmp2.f;
		Stmp3.f = Ss11.f*Stmp2.f;
		Stmp4.f = Sc.f*Sc.f;
		Ss11.f = Ss11.f*Stmp4.f;
		Ss22.f = Ss22.f*Stmp4.f;
		Ss11.f = fadd_rn(Ss11.f, Stmp1.f);
		Ss22.f = fadd_rn(Ss22.f, Stmp3.f);
		Stmp4.f = fsub_rn(Stmp4.f, Stmp2.f);
		Stmp2.f = fadd_rn(Ss21.f, Ss21.f);
		Ss21.f = Ss21.f*Stmp4.f;
		Stmp4.f = Sc.f*Ss.f;
		Stmp2.f = Stmp2.f*Stmp4.f;
		Stmp5.f = Stmp5.f*Stmp4.f;
		Ss11.f = fadd_rn(Ss11.f, Stmp2.f);
		Ss21.f = fsub_rn(Ss21.f, Stmp5.f);
		Ss22.f = fsub_rn(Ss22.f, Stmp2.f);

#ifdef DEBUG_JACOBI_CONJUGATE
		printf("%.20g\n", Ss11.f);
		printf("%.20g %.20g\n", Ss21.f, Ss22.f);
		printf("%.20g %.20g %.20g\n", Ss31.f, Ss32.f, Ss33.f);
#endif

		//###########################################################
		// Compute the cumulative rotation, in quaternion form
		//###########################################################

		Stmp1.f = Ssh.f*Sqvvx.f;
		Stmp2.f = Ssh.f*Sqvvy.f;
		Stmp3.f = Ssh.f*Sqvvz.f;
		Ssh.f = Ssh.f*Sqvs.f;

		Sqvs.f = Sch.f*Sqvs.f;
		Sqvvx.f = Sch.f*Sqvvx.f;
		Sqvvy.f = Sch.f*Sqvvy.f;
		Sqvvz.f = Sch.f*Sqvvz.f;

		// quaternion product with (ch, 0, 0, sh) -- rotation about z
		Sqvvz.f = fadd_rn(Sqvvz.f, Ssh.f);
		Sqvs.f = fsub_rn(Sqvs.f, Stmp3.f);
		Sqvvx.f = fadd_rn(Sqvvx.f, Stmp2.f);
		Sqvvy.f = fsub_rn(Sqvvy.f, Stmp1.f);

#ifdef DEBUG_JACOBI_CONJUGATE
		printf("GPU q %.20g %.20g %.20g %.20g\n", Sqvvx.f, Sqvvy.f, Sqvvz.f, Sqvs.f);
#endif

		//###########################################################
		// (1->3) -- rotation in the (2,3) plane
		//###########################################################

		Ssh.f = Ss32.f * 0.5f;
		Stmp5.f = fsub_rn(Ss22.f, Ss33.f);

		Stmp2.f = Ssh.f * Ssh.f;
		Stmp1.ui = (Stmp2.f >= gtiny_number) ? 0xffffffff : 0;
		Ssh.ui = Stmp1.ui&Ssh.ui;
		Sch.ui = Stmp1.ui&Stmp5.ui;
		Stmp2.ui = ~Stmp1.ui&gone;
		Sch.ui = Sch.ui | Stmp2.ui;

		Stmp1.f = Ssh.f * Ssh.f;
		Stmp2.f = Sch.f * Sch.f;
		Stmp3.f = fadd_rn(Stmp1.f, Stmp2.f);
		Stmp4.f = frsqrt_rn(Stmp3.f);

		Ssh.f = Stmp4.f * Ssh.f;
		Sch.f = Stmp4.f * Sch.f;
		Stmp1.f = gfour_gamma_squared * Stmp1.f;
		Stmp1.ui = (Stmp2.f <= Stmp1.f) ? 0xffffffff : 0;

		Stmp2.ui = gsine_pi_over_eight&Stmp1.ui;
		Ssh.ui = ~Stmp1.ui&Ssh.ui;
		Ssh.ui = Ssh.ui | Stmp2.ui;
		Stmp2.ui = gcosine_pi_over_eight&Stmp1.ui;
		Sch.ui = ~Stmp1.ui&Sch.ui;
		Sch.ui = Sch.ui | Stmp2.ui;

		Stmp1.f = Ssh.f * Ssh.f;
		Stmp2.f = Sch.f * Sch.f;
		Sc.f = fsub_rn(Stmp2.f, Stmp1.f);
		Ss.f = Sch.f*Ssh.f;
		Ss.f = fadd_rn(Ss.f, Ss.f);

#ifdef DEBUG_JACOBI_CONJUGATE
		printf("GPU s %.20g, c %.20g, sh %.20g, ch %.20g\n", Ss.f, Sc.f, Ssh.f, Sch.f);
#endif

		//###########################################################
		// Perform the actual Givens conjugation
		//###########################################################

		Stmp3.f = fadd_rn(Stmp1.f, Stmp2.f);
		Ss11.f = Ss11.f * Stmp3.f;
		Ss21.f = Ss21.f * Stmp3.f;
		Ss31.f = Ss31.f * Stmp3.f;
		Ss11.f = Ss11.f * Stmp3.f;

		Stmp1.f = Ss.f*Ss21.f;
		Stmp2.f = Ss.f*Ss31.f;
		Ss21.f = Sc.f*Ss21.f;
		Ss31.f = Sc.f*Ss31.f;
		Ss21.f = fadd_rn(Stmp2.f, Ss21.f);
		Ss31.f = fsub_rn(Ss31.f, Stmp1.f);

		Stmp2.f = Ss.f*Ss.f;
		Stmp1.f = Ss33.f*Stmp2.f;
		Stmp3.f = Ss22.f*Stmp2.f;
		Stmp4.f = Sc.f * Sc.f;
		Ss22.f = Ss22.f * Stmp4.f;
		Ss33.f = Ss33.f * Stmp4.f;
		Ss22.f = fadd_rn(Ss22.f, Stmp1.f);
		Ss33.f = fadd_rn(Ss33.f, Stmp3.f);
		Stmp4.f = fsub_rn(Stmp4.f, Stmp2.f);
		Stmp2.f = fadd_rn(Ss32.f, Ss32.f);
		Ss32.f = Ss32.f*Stmp4.f;
		Stmp4.f = Sc.f*Ss.f;
		Stmp2.f = Stmp2.f*Stmp4.f;
		Stmp5.f = Stmp5.f*Stmp4.f;
		Ss22.f = fadd_rn(Ss22.f, Stmp2.f);
		Ss32.f = fsub_rn(Ss32.f, Stmp5.f);
		Ss33.f = fsub_rn(Ss33.f, Stmp2.f);

#ifdef DEBUG_JACOBI_CONJUGATE
		printf("%.20g\n", Ss11.f);
		printf("%.20g %.20g\n", Ss21.f, Ss22.f);
		printf("%.20g %.20g %.20g\n", Ss31.f, Ss32.f, Ss33.f);
#endif

		//###########################################################
		// Compute the cumulative rotation, in quaternion form
		//###########################################################

		Stmp1.f = Ssh.f*Sqvvx.f;
		Stmp2.f = Ssh.f*Sqvvy.f;
		Stmp3.f = Ssh.f*Sqvvz.f;
		Ssh.f = Ssh.f*Sqvs.f;

		Sqvs.f = Sch.f*Sqvs.f;
		Sqvvx.f = Sch.f*Sqvvx.f;
		Sqvvy.f = Sch.f*Sqvvy.f;
		Sqvvz.f = Sch.f*Sqvvz.f;

		// quaternion product with (ch, sh, 0, 0) -- rotation about x
		Sqvvx.f = fadd_rn(Sqvvx.f, Ssh.f);
		Sqvs.f = fsub_rn(Sqvs.f, Stmp1.f);
		Sqvvy.f = fadd_rn(Sqvvy.f, Stmp3.f);
		Sqvvz.f = fsub_rn(Sqvvz.f, Stmp2.f);

#ifdef DEBUG_JACOBI_CONJUGATE
		printf("GPU q %.20g %.20g %.20g %.20g\n", Sqvvx.f, Sqvvy.f, Sqvvz.f, Sqvs.f);
#endif
#if 1
		//###########################################################
		// 1 -> 2 -- rotation in the (3,1) plane
		//###########################################################

		Ssh.f = Ss31.f * 0.5f;
		Stmp5.f = fsub_rn(Ss33.f, Ss11.f);

		Stmp2.f = Ssh.f*Ssh.f;
		Stmp1.ui = (Stmp2.f >= gtiny_number) ? 0xffffffff : 0;
		Ssh.ui = Stmp1.ui&Ssh.ui;
		Sch.ui = Stmp1.ui&Stmp5.ui;
		Stmp2.ui = ~Stmp1.ui&gone;
		Sch.ui = Sch.ui | Stmp2.ui;

		Stmp1.f = Ssh.f*Ssh.f;
		Stmp2.f = Sch.f*Sch.f;
		Stmp3.f = fadd_rn(Stmp1.f, Stmp2.f);
		Stmp4.f = frsqrt_rn(Stmp3.f);

		Ssh.f = Stmp4.f*Ssh.f;
		Sch.f = Stmp4.f*Sch.f;
		Stmp1.f = gfour_gamma_squared*Stmp1.f;
		Stmp1.ui = (Stmp2.f <= Stmp1.f) ? 0xffffffff : 0;

		Stmp2.ui = gsine_pi_over_eight&Stmp1.ui;
		Ssh.ui = ~Stmp1.ui&Ssh.ui;
		Ssh.ui = Ssh.ui | Stmp2.ui;
		Stmp2.ui = gcosine_pi_over_eight&Stmp1.ui;
		Sch.ui = ~Stmp1.ui&Sch.ui;
		Sch.ui = Sch.ui | Stmp2.ui;

		Stmp1.f = Ssh.f*Ssh.f;
		Stmp2.f = Sch.f*Sch.f;
		Sc.f = fsub_rn(Stmp2.f, Stmp1.f);
		Ss.f = Sch.f*Ssh.f;
		Ss.f = fadd_rn(Ss.f, Ss.f);

#ifdef DEBUG_JACOBI_CONJUGATE
		printf("GPU s %.20g, c %.20g, sh %.20g, ch %.20g\n", Ss.f, Sc.f, Ssh.f, Sch.f);
#endif

		//###########################################################
		// Perform the actual Givens conjugation
		//###########################################################

		Stmp3.f = fadd_rn(Stmp1.f, Stmp2.f);
		Ss22.f = Ss22.f * Stmp3.f;
		Ss32.f = Ss32.f * Stmp3.f;
		Ss21.f = Ss21.f * Stmp3.f;
		Ss22.f = Ss22.f * Stmp3.f;

		Stmp1.f = Ss.f*Ss32.f;
		Stmp2.f = Ss.f*Ss21.f;
		Ss32.f = Sc.f*Ss32.f;
		Ss21.f = Sc.f*Ss21.f;
		Ss32.f = fadd_rn(Stmp2.f, Ss32.f);
		Ss21.f = fsub_rn(Ss21.f, Stmp1.f);

		Stmp2.f = Ss.f*Ss.f;
		Stmp1.f = Ss11.f*Stmp2.f;
		Stmp3.f = Ss33.f*Stmp2.f;
		Stmp4.f = Sc.f*Sc.f;
		Ss33.f = Ss33.f*Stmp4.f;
		Ss11.f = Ss11.f*Stmp4.f;
		Ss33.f = fadd_rn(Ss33.f, Stmp1.f);
		Ss11.f = fadd_rn(Ss11.f, Stmp3.f);
		Stmp4.f = fsub_rn(Stmp4.f, Stmp2.f);
		Stmp2.f = fadd_rn(Ss31.f, Ss31.f);
		Ss31.f = Ss31.f*Stmp4.f;
		Stmp4.f = Sc.f*Ss.f;
		Stmp2.f = Stmp2.f*Stmp4.f;
		Stmp5.f = Stmp5.f*Stmp4.f;
		Ss33.f = fadd_rn(Ss33.f, Stmp2.f);
		Ss31.f = fsub_rn(Ss31.f, Stmp5.f);
		Ss11.f = fsub_rn(Ss11.f, Stmp2.f);

#ifdef DEBUG_JACOBI_CONJUGATE
		printf("%.20g\n", Ss11.f);
		printf("%.20g %.20g\n", Ss21.f, Ss22.f);
		printf("%.20g %.20g %.20g\n", Ss31.f, Ss32.f, Ss33.f);
#endif

		//###########################################################
		// Compute the cumulative rotation, in quaternion form
		//###########################################################

		Stmp1.f = Ssh.f*Sqvvx.f;
		Stmp2.f = Ssh.f*Sqvvy.f;
		Stmp3.f = Ssh.f*Sqvvz.f;
		Ssh.f = Ssh.f*Sqvs.f;

		Sqvs.f = Sch.f*Sqvs.f;
		Sqvvx.f = Sch.f*Sqvvx.f;
		Sqvvy.f = Sch.f*Sqvvy.f;
		Sqvvz.f = Sch.f*Sqvvz.f;

		// quaternion product with (ch, 0, sh, 0) -- rotation about y
		Sqvvy.f = fadd_rn(Sqvvy.f, Ssh.f);
		Sqvs.f = fsub_rn(Sqvs.f, Stmp2.f);
		Sqvvz.f = fadd_rn(Sqvvz.f, Stmp1.f);
		Sqvvx.f = fsub_rn(Sqvvx.f, Stmp3.f);
#endif
	}

	//###########################################################
	// Normalize quaternion for matrix V
	//###########################################################

	Stmp2.f = Sqvs.f*Sqvs.f;
	Stmp1.f = Sqvvx.f*Sqvvx.f;
	Stmp2.f = fadd_rn(Stmp1.f, Stmp2.f);
	Stmp1.f = Sqvvy.f*Sqvvy.f;
	Stmp2.f = fadd_rn(Stmp1.f, Stmp2.f);
	Stmp1.f = Sqvvz.f*Sqvvz.f;
	Stmp2.f = fadd_rn(Stmp1.f, Stmp2.f);

	// One Newton-Raphson step refines the approximate rsqrt:
	// x' = x*(1.5 - 0.5*v*x^2), written with explicit temporaries.
	Stmp1.f = frsqrt_rn(Stmp2.f);
	Stmp4.f = Stmp1.f*0.5f;
	Stmp3.f = Stmp1.f*Stmp4.f;
	Stmp3.f = Stmp1.f*Stmp3.f;
	Stmp3.f = Stmp2.f*Stmp3.f;
	Stmp1.f = fadd_rn(Stmp1.f, Stmp4.f);
	Stmp1.f = fsub_rn(Stmp1.f, Stmp3.f);

	Sqvs.f = Sqvs.f*Stmp1.f;
	Sqvvx.f = Sqvvx.f*Stmp1.f;
	Sqvvy.f = Sqvvy.f*Stmp1.f;
	Sqvvz.f = Sqvvz.f*Stmp1.f;

	//###########################################################
	// Transform quaternion to matrix V
	//###########################################################

	Stmp1.f = Sqvvx.f*Sqvvx.f;
	Stmp2.f = Sqvvy.f*Sqvvy.f;
	Stmp3.f = Sqvvz.f*Sqvvz.f;
	Sv11.f = Sqvs.f*Sqvs.f;
	Sv22.f = fsub_rn(Sv11.f, Stmp1.f);
	Sv33.f = fsub_rn(Sv22.f, Stmp2.f);
	Sv33.f = fadd_rn(Sv33.f, Stmp3.f);
	Sv22.f = fadd_rn(Sv22.f, Stmp2.f);
	Sv22.f = fsub_rn(Sv22.f, Stmp3.f);
	Sv11.f = fadd_rn(Sv11.f, Stmp1.f);
	Sv11.f = fsub_rn(Sv11.f, Stmp2.f);
	Sv11.f = fsub_rn(Sv11.f, Stmp3.f);
	Stmp1.f = fadd_rn(Sqvvx.f, Sqvvx.f);
	Stmp2.f = fadd_rn(Sqvvy.f, Sqvvy.f);
	Stmp3.f = fadd_rn(Sqvvz.f, Sqvvz.f);
	Sv32.f = Sqvs.f*Stmp1.f;
	Sv13.f = Sqvs.f*Stmp2.f;
	Sv21.f = Sqvs.f*Stmp3.f;
	Stmp1.f = Sqvvy.f*Stmp1.f;
	Stmp2.f = Sqvvz.f*Stmp2.f;
	Stmp3.f = Sqvvx.f*Stmp3.f;
	Sv12.f = fsub_rn(Stmp1.f, Sv21.f);
	Sv23.f = fsub_rn(Stmp2.f, Sv32.f);
	Sv31.f = fsub_rn(Stmp3.f, Sv13.f);
	Sv21.f = fadd_rn(Stmp1.f, Sv21.f);
	Sv32.f = fadd_rn(Stmp2.f, Sv32.f);
	Sv13.f = fadd_rn(Stmp3.f, Sv13.f);

	//###########################################################
	// Multiply (from the right) with V
	//###########################################################

	Stmp2.f = Sa12.f;
	Stmp3.f = Sa13.f;
	Sa12.f = Sv12.f*Sa11.f;
	Sa13.f = Sv13.f*Sa11.f;
	Sa11.f = Sv11.f*Sa11.f;
	Stmp1.f = Sv21.f*Stmp2.f;
	Sa11.f = fadd_rn(Sa11.f, Stmp1.f);
	Stmp1.f = Sv31.f*Stmp3.f;
	Sa11.f = fadd_rn(Sa11.f, Stmp1.f);
	Stmp1.f = Sv22.f*Stmp2.f;
	Sa12.f = fadd_rn(Sa12.f, Stmp1.f);
	Stmp1.f = Sv32.f*Stmp3.f;
	Sa12.f = fadd_rn(Sa12.f, Stmp1.f);
	Stmp1.f = Sv23.f*Stmp2.f;
	Sa13.f = fadd_rn(Sa13.f, Stmp1.f);
	Stmp1.f = Sv33.f*Stmp3.f;
	Sa13.f = fadd_rn(Sa13.f, Stmp1.f);

	Stmp2.f = Sa22.f;
	Stmp3.f = Sa23.f;
	Sa22.f = Sv12.f*Sa21.f;
	Sa23.f = Sv13.f*Sa21.f;
	Sa21.f = Sv11.f*Sa21.f;
	Stmp1.f = Sv21.f*Stmp2.f;
	Sa21.f = fadd_rn(Sa21.f, Stmp1.f);
	Stmp1.f = Sv31.f*Stmp3.f;
	Sa21.f = fadd_rn(Sa21.f, Stmp1.f);
	Stmp1.f = Sv22.f*Stmp2.f;
	Sa22.f = fadd_rn(Sa22.f, Stmp1.f);
	Stmp1.f = Sv32.f*Stmp3.f;
	Sa22.f = fadd_rn(Sa22.f, Stmp1.f);
	Stmp1.f = Sv23.f*Stmp2.f;
	Sa23.f = fadd_rn(Sa23.f, Stmp1.f);
	Stmp1.f = Sv33.f*Stmp3.f;
	Sa23.f = fadd_rn(Sa23.f, Stmp1.f);

	Stmp2.f = Sa32.f;
	Stmp3.f = Sa33.f;
	Sa32.f = Sv12.f*Sa31.f;
	Sa33.f = Sv13.f*Sa31.f;
	Sa31.f = Sv11.f*Sa31.f;
	Stmp1.f = Sv21.f*Stmp2.f;
	Sa31.f = fadd_rn(Sa31.f, Stmp1.f);
	Stmp1.f = Sv31.f*Stmp3.f;
	Sa31.f = fadd_rn(Sa31.f, Stmp1.f);
	Stmp1.f = Sv22.f*Stmp2.f;
	Sa32.f = fadd_rn(Sa32.f, Stmp1.f);
	Stmp1.f = Sv32.f*Stmp3.f;
	Sa32.f = fadd_rn(Sa32.f, Stmp1.f);
	Stmp1.f = Sv23.f*Stmp2.f;
	Sa33.f = fadd_rn(Sa33.f, Stmp1.f);
	Stmp1.f = Sv33.f*Stmp3.f;
	Sa33.f = fadd_rn(Sa33.f, Stmp1.f);

	//###########################################################
	// Permute columns such that the singular values are sorted
	//###########################################################

	// squared column norms = squared singular values
	Stmp1.f = Sa11.f*Sa11.f;
	Stmp4.f = Sa21.f*Sa21.f;
	Stmp1.f = fadd_rn(Stmp1.f, Stmp4.f);
	Stmp4.f = Sa31.f*Sa31.f;
	Stmp1.f = fadd_rn(Stmp1.f, Stmp4.f);

	Stmp2.f = Sa12.f*Sa12.f;
	Stmp4.f = Sa22.f*Sa22.f;
	Stmp2.f = fadd_rn(Stmp2.f, Stmp4.f);
	Stmp4.f = Sa32.f*Sa32.f;
	Stmp2.f = fadd_rn(Stmp2.f, Stmp4.f);

	Stmp3.f = Sa13.f*Sa13.f;
	Stmp4.f = Sa23.f*Sa23.f;
	Stmp3.f = fadd_rn(Stmp3.f, Stmp4.f);
	Stmp4.f = Sa33.f*Sa33.f;
	Stmp3.f = fadd_rn(Stmp3.f, Stmp4.f);

	// Swap columns 1-2 if necessary (branchless XOR-swap under mask)
	Stmp4.ui = (Stmp1.f < Stmp2.f) ? 0xffffffff : 0;
	Stmp5.ui = Sa11.ui^Sa12.ui;
	Stmp5.ui = Stmp5.ui&Stmp4.ui;
	Sa11.ui = Sa11.ui^Stmp5.ui;
	Sa12.ui = Sa12.ui^Stmp5.ui;

	Stmp5.ui = Sa21.ui^Sa22.ui;
	Stmp5.ui = Stmp5.ui&Stmp4.ui;
	Sa21.ui = Sa21.ui^Stmp5.ui;
	Sa22.ui = Sa22.ui^Stmp5.ui;

	Stmp5.ui = Sa31.ui^Sa32.ui;
	Stmp5.ui = Stmp5.ui&Stmp4.ui;
	Sa31.ui = Sa31.ui^Stmp5.ui;
	Sa32.ui = Sa32.ui^Stmp5.ui;

	Stmp5.ui = Sv11.ui^Sv12.ui;
	Stmp5.ui = Stmp5.ui&Stmp4.ui;
	Sv11.ui = Sv11.ui^Stmp5.ui;
	Sv12.ui = Sv12.ui^Stmp5.ui;

	Stmp5.ui = Sv21.ui^Sv22.ui;
	Stmp5.ui = Stmp5.ui&Stmp4.ui;
	Sv21.ui = Sv21.ui^Stmp5.ui;
	Sv22.ui = Sv22.ui^Stmp5.ui;

	Stmp5.ui = Sv31.ui^Sv32.ui;
	Stmp5.ui = Stmp5.ui&Stmp4.ui;
	Sv31.ui = Sv31.ui^Stmp5.ui;
	Sv32.ui = Sv32.ui^Stmp5.ui;

	Stmp5.ui = Stmp1.ui^Stmp2.ui;
	Stmp5.ui = Stmp5.ui&Stmp4.ui;
	Stmp1.ui = Stmp1.ui^Stmp5.ui;
	Stmp2.ui = Stmp2.ui^Stmp5.ui;

	// If columns 1-2 have been swapped, negate 2nd column of A and V so that V is still a rotation

	Stmp5.f = -2.f;
	Stmp5.ui = Stmp5.ui&Stmp4.ui;
	Stmp4.f = 1.f;
	Stmp4.f = fadd_rn(Stmp4.f, Stmp5.f);

	Sa12.f = Sa12.f*Stmp4.f;
	Sa22.f = Sa22.f*Stmp4.f;
	Sa32.f = Sa32.f*Stmp4.f;

	Sv12.f = Sv12.f*Stmp4.f;
	Sv22.f = Sv22.f*Stmp4.f;
	Sv32.f = Sv32.f*Stmp4.f;

	// Swap columns 1-3 if necessary

	Stmp4.ui = (Stmp1.f < Stmp3.f) ? 0xffffffff : 0;
	Stmp5.ui = Sa11.ui^Sa13.ui;
	Stmp5.ui = Stmp5.ui&Stmp4.ui;
	Sa11.ui = Sa11.ui^Stmp5.ui;
	Sa13.ui = Sa13.ui^Stmp5.ui;

	Stmp5.ui = Sa21.ui^Sa23.ui;
	Stmp5.ui = Stmp5.ui&Stmp4.ui;
	Sa21.ui = Sa21.ui^Stmp5.ui;
	Sa23.ui = Sa23.ui^Stmp5.ui;

	Stmp5.ui = Sa31.ui^Sa33.ui;
	Stmp5.ui = Stmp5.ui&Stmp4.ui;
	Sa31.ui = Sa31.ui^Stmp5.ui;
	Sa33.ui = Sa33.ui^Stmp5.ui;

	Stmp5.ui = Sv11.ui^Sv13.ui;
	Stmp5.ui = Stmp5.ui&Stmp4.ui;
	Sv11.ui = Sv11.ui^Stmp5.ui;
	Sv13.ui = Sv13.ui^Stmp5.ui;

	Stmp5.ui = Sv21.ui^Sv23.ui;
	Stmp5.ui = Stmp5.ui&Stmp4.ui;
	Sv21.ui = Sv21.ui^Stmp5.ui;
	Sv23.ui = Sv23.ui^Stmp5.ui;

	Stmp5.ui = Sv31.ui^Sv33.ui;
	Stmp5.ui = Stmp5.ui&Stmp4.ui;
	Sv31.ui = Sv31.ui^Stmp5.ui;
	Sv33.ui = Sv33.ui^Stmp5.ui;

	Stmp5.ui = Stmp1.ui^Stmp3.ui;
	Stmp5.ui = Stmp5.ui&Stmp4.ui;
	Stmp1.ui = Stmp1.ui^Stmp5.ui;
	Stmp3.ui = Stmp3.ui^Stmp5.ui;

	// If columns 1-3 have been swapped, negate 1st column of A and V so that V is still a rotation

	Stmp5.f = -2.f;
	Stmp5.ui = Stmp5.ui&Stmp4.ui;
	Stmp4.f = 1.f;
	Stmp4.f = fadd_rn(Stmp4.f, Stmp5.f);

	Sa11.f = Sa11.f*Stmp4.f;
	Sa21.f = Sa21.f*Stmp4.f;
	Sa31.f = Sa31.f*Stmp4.f;

	Sv11.f = Sv11.f*Stmp4.f;
	Sv21.f = Sv21.f*Stmp4.f;
	Sv31.f = Sv31.f*Stmp4.f;

	// Swap columns 2-3 if necessary

	Stmp4.ui = (Stmp2.f < Stmp3.f) ? 0xffffffff : 0;
	Stmp5.ui = Sa12.ui^Sa13.ui;
	Stmp5.ui = Stmp5.ui&Stmp4.ui;
	Sa12.ui = Sa12.ui^Stmp5.ui;
	Sa13.ui = Sa13.ui^Stmp5.ui;

	Stmp5.ui = Sa22.ui^Sa23.ui;
	Stmp5.ui = Stmp5.ui&Stmp4.ui;
	Sa22.ui = Sa22.ui^Stmp5.ui;
	Sa23.ui = Sa23.ui^Stmp5.ui;

	Stmp5.ui = Sa32.ui^Sa33.ui;
	Stmp5.ui = Stmp5.ui&Stmp4.ui;
	Sa32.ui = Sa32.ui^Stmp5.ui;
	Sa33.ui = Sa33.ui^Stmp5.ui;

	Stmp5.ui = Sv12.ui^Sv13.ui;
	Stmp5.ui = Stmp5.ui&Stmp4.ui;
	Sv12.ui = Sv12.ui^Stmp5.ui;
	Sv13.ui = Sv13.ui^Stmp5.ui;

	Stmp5.ui = Sv22.ui^Sv23.ui;
	Stmp5.ui = Stmp5.ui&Stmp4.ui;
	Sv22.ui = Sv22.ui^Stmp5.ui;
	Sv23.ui = Sv23.ui^Stmp5.ui;

	Stmp5.ui = Sv32.ui^Sv33.ui;
	Stmp5.ui = Stmp5.ui&Stmp4.ui;
	Sv32.ui = Sv32.ui^Stmp5.ui;
	Sv33.ui = Sv33.ui^Stmp5.ui;

	Stmp5.ui = Stmp2.ui^Stmp3.ui;
	Stmp5.ui = Stmp5.ui&Stmp4.ui;
	Stmp2.ui = Stmp2.ui^Stmp5.ui;
	Stmp3.ui = Stmp3.ui^Stmp5.ui;

	// If columns 2-3 have been swapped, negate 3rd column of A and V so that V is still a rotation

	Stmp5.f = -2.f;
	Stmp5.ui = Stmp5.ui&Stmp4.ui;
	Stmp4.f = 1.f;
	Stmp4.f = fadd_rn(Stmp4.f, Stmp5.f);

	Sa13.f = Sa13.f*Stmp4.f;
	Sa23.f = Sa23.f*Stmp4.f;
	Sa33.f = Sa33.f*Stmp4.f;

	Sv13.f = Sv13.f*Stmp4.f;
	Sv23.f = Sv23.f*Stmp4.f;
	Sv33.f = Sv33.f*Stmp4.f;

	//###########################################################
	// Construct QR factorization of A*V (=U*D) using Givens rotations
	//###########################################################

	Su11.f = 1.f; Su12.f = 0.f; Su13.f = 0.f;
	Su21.f = 0.f; Su22.f = 1.f; Su23.f = 0.f;
	Su31.f = 0.f; Su32.f = 0.f; Su33.f = 1.f;

	// First Givens rotation: annihilate entry (2,1)
	Ssh.f = Sa21.f*Sa21.f;
	Ssh.ui = (Ssh.f >= gsmall_number) ? 0xffffffff : 0;
	Ssh.ui = Ssh.ui&Sa21.ui;

	Stmp5.f = 0.f;
	Sch.f = fsub_rn(Stmp5.f, Sa11.f);
	Sch.f = max(Sch.f, Sa11.f);            // ch = |a11|
	Sch.f = max(Sch.f, gsmall_number);     // guard against a zero column
	Stmp5.ui = (Sa11.f >= Stmp5.f) ? 0xffffffff : 0;

	// Newton-refined rsqrt gives 1/sqrt(ch^2 + sh^2); multiplying by
	// (ch^2 + sh^2) yields sqrt(ch^2 + sh^2), added into ch (half-angle).
	Stmp1.f = Sch.f*Sch.f;
	Stmp2.f = Ssh.f*Ssh.f;
	Stmp2.f = fadd_rn(Stmp1.f, Stmp2.f);
	Stmp1.f = frsqrt_rn(Stmp2.f);

	Stmp4.f = Stmp1.f*0.5f;
	Stmp3.f = Stmp1.f*Stmp4.f;
	Stmp3.f = Stmp1.f*Stmp3.f;
	Stmp3.f = Stmp2.f*Stmp3.f;
	Stmp1.f = fadd_rn(Stmp1.f, Stmp4.f);
	Stmp1.f = fsub_rn(Stmp1.f, Stmp3.f);
	Stmp1.f = Stmp1.f*Stmp2.f;

	Sch.f = fadd_rn(Sch.f, Stmp1.f);

	// If a11 < 0, swap sh and ch (selects the correct rotation sign).
	Stmp1.ui = ~Stmp5.ui&Ssh.ui;
	Stmp2.ui = ~Stmp5.ui&Sch.ui;
	Sch.ui = Stmp5.ui&Sch.ui;
	Ssh.ui = Stmp5.ui&Ssh.ui;
	Sch.ui = Sch.ui | Stmp1.ui;
	Ssh.ui = Ssh.ui | Stmp2.ui;

	// Normalize (ch, sh)
	Stmp1.f = Sch.f*Sch.f;
	Stmp2.f = Ssh.f*Ssh.f;
	Stmp2.f = fadd_rn(Stmp1.f, Stmp2.f);
	Stmp1.f = frsqrt_rn(Stmp2.f);

	Stmp4.f = Stmp1.f*0.5f;
	Stmp3.f = Stmp1.f*Stmp4.f;
	Stmp3.f = Stmp1.f*Stmp3.f;
	Stmp3.f = Stmp2.f*Stmp3.f;
	Stmp1.f = fadd_rn(Stmp1.f, Stmp4.f);
	Stmp1.f = fsub_rn(Stmp1.f, Stmp3.f);

	Sch.f = Sch.f*Stmp1.f;
	Ssh.f = Ssh.f*Stmp1.f;

	Sc.f = Sch.f*Sch.f;
	Ss.f = Ssh.f*Ssh.f;
	Sc.f = fsub_rn(Sc.f, Ss.f);
	Ss.f = Ssh.f*Sch.f;
	Ss.f = fadd_rn(Ss.f, Ss.f);

	//###########################################################
	// Rotate matrix A
	//###########################################################

	Stmp1.f = Ss.f*Sa11.f;
	Stmp2.f = Ss.f*Sa21.f;
	Sa11.f = Sc.f*Sa11.f;
	Sa21.f = Sc.f*Sa21.f;
	Sa11.f = fadd_rn(Sa11.f, Stmp2.f);
	Sa21.f = fsub_rn(Sa21.f, Stmp1.f);

	Stmp1.f = Ss.f*Sa12.f;
	Stmp2.f = Ss.f*Sa22.f;
	Sa12.f = Sc.f*Sa12.f;
	Sa22.f = Sc.f*Sa22.f;
	Sa12.f = fadd_rn(Sa12.f, Stmp2.f);
	Sa22.f = fsub_rn(Sa22.f, Stmp1.f);

	Stmp1.f = Ss.f*Sa13.f;
	Stmp2.f = Ss.f*Sa23.f;
	Sa13.f = Sc.f*Sa13.f;
	Sa23.f = Sc.f*Sa23.f;
	Sa13.f = fadd_rn(Sa13.f, Stmp2.f);
	Sa23.f = fsub_rn(Sa23.f, Stmp1.f);

	//###########################################################
	// Update matrix U
	//###########################################################

	Stmp1.f = Ss.f*Su11.f;
	Stmp2.f = Ss.f*Su12.f;
	Su11.f = Sc.f*Su11.f;
	Su12.f = Sc.f*Su12.f;
	Su11.f = fadd_rn(Su11.f, Stmp2.f);
	Su12.f = fsub_rn(Su12.f, Stmp1.f);

	Stmp1.f = Ss.f*Su21.f;
	Stmp2.f = Ss.f*Su22.f;
	Su21.f = Sc.f*Su21.f;
	Su22.f = Sc.f*Su22.f;
	Su21.f = fadd_rn(Su21.f, Stmp2.f);
	Su22.f = fsub_rn(Su22.f, Stmp1.f);

	Stmp1.f = Ss.f*Su31.f;
	Stmp2.f = Ss.f*Su32.f;
	Su31.f = Sc.f*Su31.f;
	Su32.f = Sc.f*Su32.f;
	Su31.f = fadd_rn(Su31.f, Stmp2.f);
	Su32.f = fsub_rn(Su32.f, Stmp1.f);

	// Second Givens rotation: annihilate entry (3,1)

	Ssh.f = Sa31.f*Sa31.f;
	Ssh.ui = (Ssh.f >= gsmall_number) ? 0xffffffff : 0;
	Ssh.ui = Ssh.ui&Sa31.ui;

	Stmp5.f = 0.f;
	Sch.f = fsub_rn(Stmp5.f, Sa11.f);
	Sch.f = max(Sch.f, Sa11.f);
	Sch.f = max(Sch.f, gsmall_number);
	Stmp5.ui = (Sa11.f >= Stmp5.f) ? 0xffffffff : 0;

	Stmp1.f = Sch.f*Sch.f;
	Stmp2.f = Ssh.f*Ssh.f;
	Stmp2.f = fadd_rn(Stmp1.f, Stmp2.f);
	Stmp1.f = frsqrt_rn(Stmp2.f);

	Stmp4.f = Stmp1.f*0.5f;	// was 0.5 (double literal) -- keep single precision like the sibling sites
	Stmp3.f = Stmp1.f*Stmp4.f;
	Stmp3.f = Stmp1.f*Stmp3.f;
	Stmp3.f = Stmp2.f*Stmp3.f;
	Stmp1.f = fadd_rn(Stmp1.f, Stmp4.f);
	Stmp1.f = fsub_rn(Stmp1.f, Stmp3.f);
	Stmp1.f = Stmp1.f*Stmp2.f;

	Sch.f = fadd_rn(Sch.f, Stmp1.f);

	Stmp1.ui = ~Stmp5.ui&Ssh.ui;
	Stmp2.ui = ~Stmp5.ui&Sch.ui;
	Sch.ui = Stmp5.ui&Sch.ui;
	Ssh.ui = Stmp5.ui&Ssh.ui;
	Sch.ui = Sch.ui | Stmp1.ui;
	Ssh.ui = Ssh.ui | Stmp2.ui;

	Stmp1.f = Sch.f*Sch.f;
	Stmp2.f = Ssh.f*Ssh.f;
	Stmp2.f = fadd_rn(Stmp1.f, Stmp2.f);
	Stmp1.f = frsqrt_rn(Stmp2.f);

	Stmp4.f = Stmp1.f*0.5f;
	Stmp3.f = Stmp1.f*Stmp4.f;
	Stmp3.f = Stmp1.f*Stmp3.f;
	Stmp3.f = Stmp2.f*Stmp3.f;
	Stmp1.f = fadd_rn(Stmp1.f, Stmp4.f);
	Stmp1.f = fsub_rn(Stmp1.f, Stmp3.f);

	Sch.f = Sch.f*Stmp1.f;
	Ssh.f = Ssh.f*Stmp1.f;

	Sc.f = Sch.f*Sch.f;
	Ss.f = Ssh.f*Ssh.f;
	Sc.f = fsub_rn(Sc.f, Ss.f);
	Ss.f = Ssh.f*Sch.f;
	Ss.f = fadd_rn(Ss.f, Ss.f);

	//###########################################################
	// Rotate matrix A
	//###########################################################

	Stmp1.f = Ss.f*Sa11.f;
	Stmp2.f = Ss.f*Sa31.f;
	Sa11.f = Sc.f*Sa11.f;
	Sa31.f = Sc.f*Sa31.f;
	Sa11.f = fadd_rn(Sa11.f, Stmp2.f);
	Sa31.f = fsub_rn(Sa31.f, Stmp1.f);

	Stmp1.f = Ss.f*Sa12.f;
	Stmp2.f = Ss.f*Sa32.f;
	Sa12.f = Sc.f*Sa12.f;
	Sa32.f = Sc.f*Sa32.f;
	Sa12.f = fadd_rn(Sa12.f, Stmp2.f);
	Sa32.f = fsub_rn(Sa32.f, Stmp1.f);

	Stmp1.f = Ss.f*Sa13.f;
	Stmp2.f = Ss.f*Sa33.f;
	Sa13.f = Sc.f*Sa13.f;
	Sa33.f = Sc.f*Sa33.f;
	Sa13.f = fadd_rn(Sa13.f, Stmp2.f);
	Sa33.f = fsub_rn(Sa33.f, Stmp1.f);

	//###########################################################
	// Update matrix U
	//###########################################################

	Stmp1.f = Ss.f*Su11.f;
	Stmp2.f = Ss.f*Su13.f;
	Su11.f = Sc.f*Su11.f;
	Su13.f = Sc.f*Su13.f;
	Su11.f = fadd_rn(Su11.f, Stmp2.f);
	Su13.f = fsub_rn(Su13.f, Stmp1.f);

	Stmp1.f = Ss.f*Su21.f;
	Stmp2.f = Ss.f*Su23.f;
	Su21.f = Sc.f*Su21.f;
	Su23.f = Sc.f*Su23.f;
	Su21.f = fadd_rn(Su21.f, Stmp2.f);
	Su23.f = fsub_rn(Su23.f, Stmp1.f);

	Stmp1.f = Ss.f*Su31.f;
	Stmp2.f = Ss.f*Su33.f;
	Su31.f = Sc.f*Su31.f;
	Su33.f = Sc.f*Su33.f;
	Su31.f = fadd_rn(Su31.f, Stmp2.f);
	Su33.f = fsub_rn(Su33.f, Stmp1.f);

	// Third Givens Rotation: annihilate entry (3,2)

	Ssh.f = Sa32.f*Sa32.f;
	Ssh.ui = (Ssh.f >= gsmall_number) ? 0xffffffff : 0;
	Ssh.ui = Ssh.ui&Sa32.ui;

	Stmp5.f = 0.f;
	Sch.f = fsub_rn(Stmp5.f, Sa22.f);
	Sch.f = max(Sch.f, Sa22.f);
	Sch.f = max(Sch.f, gsmall_number);
	Stmp5.ui = (Sa22.f >= Stmp5.f) ? 0xffffffff : 0;

	Stmp1.f = Sch.f*Sch.f;
	Stmp2.f = Ssh.f*Ssh.f;
	Stmp2.f = fadd_rn(Stmp1.f, Stmp2.f);
	Stmp1.f = frsqrt_rn(Stmp2.f);

	Stmp4.f = Stmp1.f*0.5f;
	Stmp3.f = Stmp1.f*Stmp4.f;
	Stmp3.f = Stmp1.f*Stmp3.f;
	Stmp3.f = Stmp2.f*Stmp3.f;
	Stmp1.f = fadd_rn(Stmp1.f, Stmp4.f);
	Stmp1.f = fsub_rn(Stmp1.f, Stmp3.f);
	Stmp1.f = Stmp1.f*Stmp2.f;

	Sch.f = fadd_rn(Sch.f, Stmp1.f);

	Stmp1.ui = ~Stmp5.ui&Ssh.ui;
	Stmp2.ui = ~Stmp5.ui&Sch.ui;
	Sch.ui = Stmp5.ui&Sch.ui;
	Ssh.ui = Stmp5.ui&Ssh.ui;
	Sch.ui = Sch.ui | Stmp1.ui;
	Ssh.ui = Ssh.ui | Stmp2.ui;

	Stmp1.f = Sch.f*Sch.f;
	Stmp2.f = Ssh.f*Ssh.f;
	Stmp2.f = fadd_rn(Stmp1.f, Stmp2.f);
	Stmp1.f = frsqrt_rn(Stmp2.f);

	Stmp4.f = Stmp1.f*0.5f;
	Stmp3.f = Stmp1.f*Stmp4.f;
	Stmp3.f = Stmp1.f*Stmp3.f;
	Stmp3.f = Stmp2.f*Stmp3.f;
	Stmp1.f = fadd_rn(Stmp1.f, Stmp4.f);
	Stmp1.f = fsub_rn(Stmp1.f, Stmp3.f);

	Sch.f = Sch.f*Stmp1.f;
	Ssh.f = Ssh.f*Stmp1.f;

	Sc.f = Sch.f*Sch.f;
	Ss.f = Ssh.f*Ssh.f;
	Sc.f = fsub_rn(Sc.f, Ss.f);
	Ss.f = Ssh.f*Sch.f;
	Ss.f = fadd_rn(Ss.f, Ss.f);

	//###########################################################
	// Rotate matrix A
	//###########################################################

	Stmp1.f = Ss.f*Sa21.f;
	Stmp2.f = Ss.f*Sa31.f;
	Sa21.f = Sc.f*Sa21.f;
	Sa31.f = Sc.f*Sa31.f;
	Sa21.f = fadd_rn(Sa21.f, Stmp2.f);
	Sa31.f = fsub_rn(Sa31.f, Stmp1.f);

	Stmp1.f = Ss.f*Sa22.f;
	Stmp2.f = Ss.f*Sa32.f;
	Sa22.f = Sc.f*Sa22.f;
	Sa32.f = Sc.f*Sa32.f;
	Sa22.f = fadd_rn(Sa22.f, Stmp2.f);
	Sa32.f = fsub_rn(Sa32.f, Stmp1.f);

	Stmp1.f = Ss.f*Sa23.f;
	Stmp2.f = Ss.f*Sa33.f;
	Sa23.f = Sc.f*Sa23.f;
	Sa33.f = Sc.f*Sa33.f;
	Sa23.f = fadd_rn(Sa23.f, Stmp2.f);
	Sa33.f = fsub_rn(Sa33.f, Stmp1.f);

	//###########################################################
	// Update matrix U
	//###########################################################

	Stmp1.f = Ss.f*Su12.f;
	Stmp2.f = Ss.f*Su13.f;
	Su12.f = Sc.f*Su12.f;
	Su13.f = Sc.f*Su13.f;
	Su12.f = fadd_rn(Su12.f, Stmp2.f);
	Su13.f = fsub_rn(Su13.f, Stmp1.f);

	Stmp1.f = Ss.f*Su22.f;
	Stmp2.f = Ss.f*Su23.f;
	Su22.f = Sc.f*Su22.f;
	Su23.f = Sc.f*Su23.f;
	Su22.f = fadd_rn(Su22.f, Stmp2.f);
	Su23.f = fsub_rn(Su23.f, Stmp1.f);

	Stmp1.f = Ss.f*Su32.f;
	Stmp2.f = Ss.f*Su33.f;
	Su32.f = Sc.f*Su32.f;
	Su33.f = Sc.f*Su33.f;
	Su32.f = fadd_rn(Su32.f, Stmp2.f);
	Su33.f = fsub_rn(Su33.f, Stmp1.f);

	// Write results back to the caller's references.
	v11 = Sv11.f; v12 = Sv12.f; v13 = Sv13.f;
	v21 = Sv21.f; v22 = Sv22.f; v23 = Sv23.f;
	v31 = Sv31.f; v32 = Sv32.f; v33 = Sv33.f;

	u11 = Su11.f; u12 = Su12.f; u13 = Su13.f;
	u21 = Su21.f; u22 = Su22.f; u23 = Su23.f;
	u31 = Su31.f; u32 = Su32.f; u33 = Su33.f;

	// A has been reduced to the diagonal matrix S.
	s11 = Sa11.f;
	//s12 = Sa12.f; s13 = Sa13.f; s21 = Sa21.f;
	s22 = Sa22.f;
	//s23 = Sa23.f; s31 = Sa31.f; s32 = Sa32.f;
	s33 = Sa33.f;
}
1073#endif
#define frsqrt_rn(x)
Definition svd3_cuda.h:44
#define gone
Definition svd3_cuda.h:30
#define gsmall_number
Definition svd3_cuda.h:34
#define gcosine_pi_over_eight
Definition svd3_cuda.h:32
__host__ __forceinline__ void svd(float a11, float a12, float a13, float a21, float a22, float a23, float a31, float a32, float a33, float &u11, float &u12, float &u13, float &u21, float &u22, float &u23, float &u31, float &u32, float &u33, float &s11, float &s22, float &s33, float &v11, float &v12, float &v13, float &v21, float &v22, float &v23, float &v31, float &v32, float &v33)
Definition svd3_cuda.h:54
#define gsine_pi_over_eight
Definition svd3_cuda.h:31
#define fadd_rn(x, y)
Definition svd3_cuda.h:42
#define gfour_gamma_squared
Definition svd3_cuda.h:36
#define max(x, y)
Definition svd3_cuda.h:41
#define fsub_rn(x, y)
Definition svd3_cuda.h:43
#define gtiny_number
Definition svd3_cuda.h:35
Definition svd3_cuda.h:38
unsigned int ui
Definition svd3_cuda.h:38
float f
Definition svd3_cuda.h:38