00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025 #include <stdio.h>
00026 #include <stdlib.h>
00027
00028 #define VMDQCPUSESSE 1
00029
00030 #if defined(VMDUSEAVX512)
00031 #define VMDQCPUSEAVX512 1
00032 #endif
00033
00034 #define VMDQCPUSETHRPOOL 1
00035
00036 #if VMDQCPUSESSE && defined(__SSE2__)
00037 #include <emmintrin.h>
00038 #endif
00039 #if VMDQCPUSEAVX2 && defined(__AVX__) && defined(__AVX2__)
00040 #include <immintrin.h>
00041 #endif
00042 #if VMDQCPUSEAVX512 && defined(__AVX512F__)
00043 #include <immintrin.h>
00044 #endif
00045 #if (defined(VMDQCPUSEVSX) && defined(__VSX__))
00046 #if defined(__GNUC__) && defined(__VEC__)
00047 #include <altivec.h>
00048 #endif
00049 #endif
00050
00051 #include <math.h>
00052 #include "Measure.h"
00053 #include "AtomSel.h"
00054 #include "utilities.h"
00055 #include "ResizeArray.h"
00056 #include "MoleculeList.h"
00057 #include "Inform.h"
00058 #include "Timestep.h"
00059 #include "CUDAAccel.h"
00060 #include "CUDAMeasureQCP.h"
00061 #include "VMDApp.h"
00062 #include "WKFThreads.h"
00063 #include "WKFUtils.h"
00064
00065 #if VMDQCPUSEAVX512 && defined(__AVX512F__)
00066
00067 static double hadd8_m512d(__m512d sum8) {
00068
00069
00070
00071
00072
00073 return 0.0;
00074 }
00075
00076
00077
00078 static double InnerProductSOA_avx512(double *A,
00079 float *crdx1, float *crdy1, float *crdz1,
00080 float *crdx2, float *crdy2, float *crdz2,
00081 const int cnt, const float *weight) {
00082 __m512d va0 = _mm512_set1_pd(0.0);
00083 __m512d va1 = _mm512_set1_pd(0.0);
00084 __m512d va2 = _mm512_set1_pd(0.0);
00085 __m512d va3 = _mm512_set1_pd(0.0);
00086 __m512d va4 = _mm512_set1_pd(0.0);
00087 __m512d va5 = _mm512_set1_pd(0.0);
00088 __m512d va6 = _mm512_set1_pd(0.0);
00089 __m512d va7 = _mm512_set1_pd(0.0);
00090 __m512d va8 = _mm512_set1_pd(0.0);
00091 __m512d vG1 = _mm512_set1_pd(0.0);
00092 __m512d vG2 = _mm512_set1_pd(0.0);
00093
00094 if (weight != NULL) {
00095 for (int i=0; i<cnt; i+=8) {
00096 __m256 xa8f = _mm256_load_ps(crdx1 + i);
00097 __m256 ya8f = _mm256_load_ps(crdy1 + i);
00098 __m256 za8f = _mm256_load_ps(crdz1 + i);
00099
00100 __m512d xa8 = _mm512_cvtps_pd(xa8f);
00101 __m512d ya8 = _mm512_cvtps_pd(ya8f);
00102 __m512d za8 = _mm512_cvtps_pd(za8f);
00103
00104 __m512d gatmp = _mm512_mul_pd(xa8, xa8);
00105 gatmp = _mm512_fmadd_pd(ya8, ya8, gatmp);
00106 gatmp = _mm512_fmadd_pd(za8, za8, gatmp);
00107
00108 __m256 xb8f = _mm256_load_ps(crdx2 + i);
00109 __m256 yb8f = _mm256_load_ps(crdy2 + i);
00110 __m256 zb8f = _mm256_load_ps(crdz2 + i);
00111
00112 __m512d xb8 = _mm512_cvtps_pd(xb8f);
00113 __m512d yb8 = _mm512_cvtps_pd(yb8f);
00114 __m512d zb8 = _mm512_cvtps_pd(zb8f);
00115
00116 __m512d gbtmp = _mm512_mul_pd(xb8, xb8);
00117 gbtmp = _mm512_fmadd_pd(yb8, yb8, gbtmp);
00118 gbtmp = _mm512_fmadd_pd(zb8, zb8, gbtmp);
00119
00120 __m256 w8f = _mm256_load_ps(weight + i);
00121 __m512d w8 = _mm512_cvtps_pd(w8f);
00122
00123 vG1 = _mm512_fmadd_pd(w8, gatmp, vG1);
00124 vG2 = _mm512_fmadd_pd(w8, gbtmp, vG2);
00125
00126 va0 = _mm512_fmadd_pd(xa8, xb8, va0);
00127 va1 = _mm512_fmadd_pd(xa8, yb8, va1);
00128 va2 = _mm512_fmadd_pd(xa8, zb8, va2);
00129
00130 va3 = _mm512_fmadd_pd(ya8, xb8, va3);
00131 va4 = _mm512_fmadd_pd(ya8, yb8, va4);
00132 va5 = _mm512_fmadd_pd(ya8, zb8, va5);
00133
00134 va6 = _mm512_fmadd_pd(za8, xb8, va6);
00135 va7 = _mm512_fmadd_pd(za8, yb8, va7);
00136 va8 = _mm512_fmadd_pd(za8, zb8, va8);
00137 }
00138 } else {
00139 for (int i=0; i<cnt; i+=8) {
00140 __m256 xa8f = _mm256_load_ps(crdx1 + i);
00141 __m256 ya8f = _mm256_load_ps(crdy1 + i);
00142 __m256 za8f = _mm256_load_ps(crdz1 + i);
00143
00144 __m512d xa8 = _mm512_cvtps_pd(xa8f);
00145 __m512d ya8 = _mm512_cvtps_pd(ya8f);
00146 __m512d za8 = _mm512_cvtps_pd(za8f);
00147
00148 vG1 = _mm512_fmadd_pd(xa8, xa8, vG1);
00149 vG1 = _mm512_fmadd_pd(ya8, ya8, vG1);
00150 vG1 = _mm512_fmadd_pd(za8, za8, vG1);
00151
00152 __m256 xb8f = _mm256_load_ps(crdx2 + i);
00153 __m256 yb8f = _mm256_load_ps(crdy2 + i);
00154 __m256 zb8f = _mm256_load_ps(crdz2 + i);
00155
00156 __m512d xb8 = _mm512_cvtps_pd(xb8f);
00157 __m512d yb8 = _mm512_cvtps_pd(yb8f);
00158 __m512d zb8 = _mm512_cvtps_pd(zb8f);
00159
00160 vG2 = _mm512_fmadd_pd(xb8, xb8, vG2);
00161 vG2 = _mm512_fmadd_pd(yb8, yb8, vG2);
00162 vG2 = _mm512_fmadd_pd(zb8, zb8, vG2);
00163
00164 va0 = _mm512_fmadd_pd(xa8, xb8, va0);
00165 va1 = _mm512_fmadd_pd(xa8, yb8, va1);
00166 va2 = _mm512_fmadd_pd(xa8, zb8, va2);
00167
00168 va3 = _mm512_fmadd_pd(ya8, xb8, va3);
00169 va4 = _mm512_fmadd_pd(ya8, yb8, va4);
00170 va5 = _mm512_fmadd_pd(ya8, zb8, va5);
00171
00172 va6 = _mm512_fmadd_pd(za8, xb8, va6);
00173 va7 = _mm512_fmadd_pd(za8, yb8, va7);
00174 va8 = _mm512_fmadd_pd(za8, zb8, va8);
00175 }
00176 }
00177
00178 A[0] = hadd8_m512d(va0);
00179 A[1] = hadd8_m512d(va1);
00180 A[2] = hadd8_m512d(va2);
00181 A[3] = hadd8_m512d(va3);
00182 A[4] = hadd8_m512d(va4);
00183 A[5] = hadd8_m512d(va5);
00184 A[6] = hadd8_m512d(va6);
00185 A[7] = hadd8_m512d(va7);
00186 A[8] = hadd8_m512d(va8);
00187
00188 double G1 = hadd8_m512d(vG1);
00189 double G2 = hadd8_m512d(vG2);
00190
00191 return (G1 + G2) * 0.5;
00192 }
00193
00194 #endif
00195
00196 #if VMDQCPUSEAVX2 && defined(__AVX__) && defined(__AVX2__)
00197
00198 static double hadd4_m256d(__m256d sum4) {
00199 __m256d tmp = sum4;
00200 __m256d hsum4 = _mm256_add_pd(tmp, _mm256_permute2f128_pd(tmp, tmp, 0x1));
00201 __m128d hsum2 = _mm256_castpd256_pd128(hsum4);
00202 __m128d sum2 = _mm_hadd_pd(hsum2, hsum2);
00203 return _mm_cvtsd_f64(sum2);
00204 }
00205
00206
00207
00208 static double InnerProductSOA_avx2(double *A,
00209 float *crdx1, float *crdy1, float *crdz1,
00210 float *crdx2, float *crdy2, float *crdz2,
00211 const int cnt, const float *weight) {
00212 __m256d va0 = _mm256_set1_pd(0.0);
00213 __m256d va1 = _mm256_set1_pd(0.0);
00214 __m256d va2 = _mm256_set1_pd(0.0);
00215 __m256d va3 = _mm256_set1_pd(0.0);
00216 __m256d va4 = _mm256_set1_pd(0.0);
00217 __m256d va5 = _mm256_set1_pd(0.0);
00218 __m256d va6 = _mm256_set1_pd(0.0);
00219 __m256d va7 = _mm256_set1_pd(0.0);
00220 __m256d va8 = _mm256_set1_pd(0.0);
00221 __m256d vG1 = _mm256_set1_pd(0.0);
00222 __m256d vG2 = _mm256_set1_pd(0.0);
00223
00224 if (weight != NULL) {
00225 for (int i=0; i<cnt; i+=4) {
00226 __m128 xa4f = _mm_load_ps(crdx1 + i);
00227 __m128 ya4f = _mm_load_ps(crdy1 + i);
00228 __m128 za4f = _mm_load_ps(crdz1 + i);
00229
00230 __m256d xa4 = _mm256_cvtps_pd(xa4f);
00231 __m256d ya4 = _mm256_cvtps_pd(ya4f);
00232 __m256d za4 = _mm256_cvtps_pd(za4f);
00233
00234 __m256d gatmp = _mm256_mul_pd(xa4, xa4);
00235 gatmp = _mm256_fmadd_pd(ya4, ya4, gatmp);
00236 gatmp = _mm256_fmadd_pd(za4, za4, gatmp);
00237
00238 __m128 xb4f = _mm_load_ps(crdx2 + i);
00239 __m128 yb4f = _mm_load_ps(crdy2 + i);
00240 __m128 zb4f = _mm_load_ps(crdz2 + i);
00241
00242 __m256d xb4 = _mm256_cvtps_pd(xb4f);
00243 __m256d yb4 = _mm256_cvtps_pd(yb4f);
00244 __m256d zb4 = _mm256_cvtps_pd(zb4f);
00245
00246 __m256d gbtmp = _mm256_mul_pd(xb4, xb4);
00247 gbtmp = _mm256_fmadd_pd(yb4, yb4, gbtmp);
00248 gbtmp = _mm256_fmadd_pd(zb4, zb4, gbtmp);
00249
00250 __m128 w4f = _mm_load_ps(weight + i);
00251 __m256d w4 = _mm256_cvtps_pd(w4f);
00252
00253 vG1 = _mm256_fmadd_pd(w4, gatmp, vG1);
00254 vG2 = _mm256_fmadd_pd(w4, gbtmp, vG2);
00255
00256 va0 = _mm256_fmadd_pd(xa4, xb4, va0);
00257 va1 = _mm256_fmadd_pd(xa4, yb4, va1);
00258 va2 = _mm256_fmadd_pd(xa4, zb4, va2);
00259
00260 va3 = _mm256_fmadd_pd(ya4, xb4, va3);
00261 va4 = _mm256_fmadd_pd(ya4, yb4, va4);
00262 va5 = _mm256_fmadd_pd(ya4, zb4, va5);
00263
00264 va6 = _mm256_fmadd_pd(za4, xb4, va6);
00265 va7 = _mm256_fmadd_pd(za4, yb4, va7);
00266 va8 = _mm256_fmadd_pd(za4, zb4, va8);
00267 }
00268 } else {
00269 for (int i=0; i<cnt; i+=4) {
00270 __m128 xa4f = _mm_load_ps(crdx1 + i);
00271 __m128 ya4f = _mm_load_ps(crdy1 + i);
00272 __m128 za4f = _mm_load_ps(crdz1 + i);
00273
00274 __m256d xa4 = _mm256_cvtps_pd(xa4f);
00275 __m256d ya4 = _mm256_cvtps_pd(ya4f);
00276 __m256d za4 = _mm256_cvtps_pd(za4f);
00277
00278 vG1 = _mm256_fmadd_pd(xa4, xa4, vG1);
00279 vG1 = _mm256_fmadd_pd(ya4, ya4, vG1);
00280 vG1 = _mm256_fmadd_pd(za4, za4, vG1);
00281
00282 __m128 xb4f = _mm_load_ps(crdx2 + i);
00283 __m128 yb4f = _mm_load_ps(crdy2 + i);
00284 __m128 zb4f = _mm_load_ps(crdz2 + i);
00285
00286 __m256d xb4 = _mm256_cvtps_pd(xb4f);
00287 __m256d yb4 = _mm256_cvtps_pd(yb4f);
00288 __m256d zb4 = _mm256_cvtps_pd(zb4f);
00289
00290 vG2 = _mm256_fmadd_pd(xb4, xb4, vG2);
00291 vG2 = _mm256_fmadd_pd(yb4, yb4, vG2);
00292 vG2 = _mm256_fmadd_pd(zb4, zb4, vG2);
00293
00294 va0 = _mm256_fmadd_pd(xa4, xb4, va0);
00295 va1 = _mm256_fmadd_pd(xa4, yb4, va1);
00296 va2 = _mm256_fmadd_pd(xa4, zb4, va2);
00297
00298 va3 = _mm256_fmadd_pd(ya4, xb4, va3);
00299 va4 = _mm256_fmadd_pd(ya4, yb4, va4);
00300 va5 = _mm256_fmadd_pd(ya4, zb4, va5);
00301
00302 va6 = _mm256_fmadd_pd(za4, xb4, va6);
00303 va7 = _mm256_fmadd_pd(za4, yb4, va7);
00304 va8 = _mm256_fmadd_pd(za4, zb4, va8);
00305 }
00306 }
00307
00308 A[0] = hadd4_m256d(va0);
00309 A[1] = hadd4_m256d(va1);
00310 A[2] = hadd4_m256d(va2);
00311 A[3] = hadd4_m256d(va3);
00312 A[4] = hadd4_m256d(va4);
00313 A[5] = hadd4_m256d(va5);
00314 A[6] = hadd4_m256d(va6);
00315 A[7] = hadd4_m256d(va7);
00316 A[8] = hadd4_m256d(va8);
00317
00318 double G1 = hadd4_m256d(vG1);
00319 double G2 = hadd4_m256d(vG2);
00320
00321 return (G1 + G2) * 0.5;
00322 }
00323
00324 #endif
00325
00326
00327
00328 static double InnerProductSOA(double *A,
00329 float *crdx1, float *crdy1, float *crdz1,
00330 float *crdx2, float *crdy2, float *crdz2,
00331 const int cnt, const float *weight) {
00332 double G1=0.0, G2 = 0.0;
00333 memset(A, 0, sizeof(double) * 9);
00334
00335 double x1, x2, y1, y2, z1, z2;
00336 double a0, a1, a2, a3, a4, a5, a6, a7, a8;
00337 a0=a1=a2=a3=a4=a5=a6=a7=a8=0.0;
00338 if (weight != NULL) {
00339 for (int i=0; i<cnt; i++) {
00340 double w = weight[i];
00341 x1 = crdx1[i];
00342 y1 = crdy1[i];
00343 z1 = crdz1[i];
00344
00345 G1 += w * (x1*x1 + y1*y1 + z1*z1);
00346
00347 x2 = crdx2[i];
00348 y2 = crdy2[i];
00349 z2 = crdz2[i];
00350
00351 G2 += w * (x2*x2 + y2*y2 + z2*z2);
00352
00353 a0 += x1 * x2;
00354 a1 += x1 * y2;
00355 a2 += x1 * z2;
00356
00357 a3 += y1 * x2;
00358 a4 += y1 * y2;
00359 a5 += y1 * z2;
00360
00361 a6 += z1 * x2;
00362 a7 += z1 * y2;
00363 a8 += z1 * z2;
00364 }
00365 } else {
00366 for (int i=0; i<cnt; i++) {
00367 x1 = crdx1[i];
00368 y1 = crdy1[i];
00369 z1 = crdz1[i];
00370
00371 G1 += x1*x1 + y1*y1 + z1*z1;
00372
00373 x2 = crdx2[i];
00374 y2 = crdy2[i];
00375 z2 = crdz2[i];
00376
00377 G2 += x2*x2 + y2*y2 + z2*z2;
00378
00379 a0 += x1 * x2;
00380 a1 += x1 * y2;
00381 a2 += x1 * z2;
00382
00383 a3 += y1 * x2;
00384 a4 += y1 * y2;
00385 a5 += y1 * z2;
00386
00387 a6 += z1 * x2;
00388 a7 += z1 * y2;
00389 a8 += z1 * z2;
00390 }
00391 }
00392
00393 A[0] = a0;
00394 A[1] = a1;
00395 A[2] = a2;
00396
00397 A[3] = a3;
00398 A[4] = a4;
00399 A[5] = a5;
00400
00401 A[6] = a6;
00402 A[7] = a7;
00403 A[8] = a8;
00404
00405 return (G1 + G2) * 0.5;
00406 }
00407
00408
00409
00410
00411
00412 #if defined(__PGIC__) && defined(_OPENACC)
00413
00414 #if 0
00415 static void vecadd_acc(void) {
00416 printf("****** OpenACC test vecadd_acc()...\n");
00417
00418
00419 int n = 10000;
00420
00421
00422 double *restrict a;
00423 double *restrict b;
00424
00425
00426 double *restrict c;
00427
00428
00429 size_t bytes = n*sizeof(double);
00430
00431
00432 a = (double*)malloc(bytes);
00433 b = (double*)malloc(bytes);
00434 c = (double*)malloc(bytes);
00435
00436
00437 int i;
00438 for (i=0; i<n; i++) {
00439 a[i] = sin(i)*sin(i);
00440 b[i] = cos(i)*cos(i);
00441 }
00442
00443
00444 #pragma acc kernels copyin(a[0:n],b[0:n]), copyout(c[0:n])
00445 for (i=0; i<n; i++) {
00446 c[i] = a[i] + b[i];
00447 }
00448
00449
00450 double sum = 0.0;
00451 for(i=0; i<n; i++) {
00452 sum += c[i];
00453 }
00454 sum = sum/n;
00455 printf("****** final result: %f *******\n", sum);
00456
00457
00458 free(a);
00459 free(b);
00460 free(c);
00461
00462 printf("****** OpenACC test vecadd_acc() done.\n");
00463 }
00464 #endif
00465
00466
00467
00468
00469
00470
00471 #define LOOP1D 1
00472
00473 #if defined(LOOP1D)
00474 #if defined(__PGIC__) && defined(_OPENACC)
00475 #pragma acc routine seq
00476 #endif
00477 static void acc_idx2sub_tril(long N, long ind, long *J, long *I) {
00478 long i, j;
00479 i = long(floor((2*N+1 - sqrt((2*N+1)*(2*N+1) - 8*ind)) / 2));
00480 j = ind - N*i + i*(i-1)/2 + i;
00481
00482 *I = i;
00483 *J = j;
00484 }
00485 #endif
00486
00487 static void rmsdmat_qcp_acc(int cnt, int padcnt, int framecrdsz,
00488 int framecount,
00489 #if defined(LOOP1D)
00490 const float * restrict crds,
00491 #else
00492 const float * crds,
00493 #endif
00494
00495 float * rmsdmat) {
00496 printf("OpenACC rmsdmat_qcp_acc()...\n");
00497 printf("ACC cnt: %d padcnt: %d\n", cnt, padcnt);
00498
00499 printf("Copying input arrays to accelerators...\n");
00500 long totalsz = 3L * framecrdsz * framecount;
00501 printf("ACC copysz: %ld (3 * %d * %d)\n", totalsz, framecrdsz, framecount);
00502
00503 long matcnt = framecount * framecount;
00504 printf("ACC matcnt: %ld\n", matcnt);
00505
00506 printf("Running OpenACC kernels...\n");
00507 #if defined(LOOP1D)
00508 long i, j, k;
00509 #pragma acc kernels copyin(crds[0:totalsz]), copy(rmsdmat[0:matcnt])
00510 for (k=0; k<(framecount*(framecount-1))/2; k++) {
00511 acc_idx2sub_tril(long(framecount-1), k, &i, &j);
00512 long x1addr = j * 3L * framecrdsz;
00513 {
00514 #else
00515 long i, j;
00516 #pragma acc kernels copyin(crds[0:totalsz]), copy(rmsdmat[0:matcnt])
00517 for (j=0; j<framecount; j++) {
00518 long x1addr = j * 3L * framecrdsz;
00519
00520 for (i=0; i<j; i++) {
00521 #endif
00522
00523 long x2addr = i * 3L * framecrdsz;
00524
00525 double G1=0.0, G2=0.0;
00526
00527 double a0, a1, a2, a3, a4, a5, a6, a7, a8;
00528 a0=a1=a2=a3=a4=a5=a6=a7=a8=0.0;
00529 #if 0
00530 if (weight != NULL) {
00531 double x1, x2, y1, y2, z1, z2;
00532 #pragma acc loop
00533 for (long l=0; l<cnt; l++) {
00534 double w = weight[l];
00535 x1 = crds[l + x1addr];
00536 y1 = crds[l + x1addr + framecrdsz];
00537 z1 = crds[l + x1addr + framecrdsz*2];
00538
00539 G1 += w * (x1*x1 + y1*y1 + z1*z1);
00540
00541 x2 = crds[l + x2addr];
00542 y2 = crds[l + x2addr + framecrdsz];
00543 z2 = crds[l + x2addr + framecrdsz*2];
00544
00545 G2 += w * (x2*x2 + y2*y2 + z2*z2);
00546
00547 a0 += x1 * x2;
00548 a1 += x1 * y2;
00549 a2 += x1 * z2;
00550
00551 a3 += y1 * x2;
00552 a4 += y1 * y2;
00553 a5 += y1 * z2;
00554
00555 a6 += z1 * x2;
00556 a7 += z1 * y2;
00557 a8 += z1 * z2;
00558 }
00559 } else {
00560 #endif
00561 double x1, x2, y1, y2, z1, z2;
00562 #pragma acc loop vector(256)
00563
00564 for (long l=0; l<cnt; l++) {
00565 x1 = crds[l + x1addr];
00566 y1 = crds[l + x1addr + framecrdsz];
00567 z1 = crds[l + x1addr + framecrdsz*2];
00568
00569 G1 += x1*x1 + y1*y1 + z1*z1;
00570
00571 x2 = crds[l + x2addr];
00572 y2 = crds[l + x2addr + framecrdsz];
00573 z2 = crds[l + x2addr + framecrdsz*2];
00574
00575 G2 += x2*x2 + y2*y2 + z2*z2;
00576
00577 a0 += x1 * x2;
00578 a1 += x1 * y2;
00579 a2 += x1 * z2;
00580
00581 a3 += y1 * x2;
00582 a4 += y1 * y2;
00583 a5 += y1 * z2;
00584
00585 a6 += z1 * x2;
00586 a7 += z1 * y2;
00587 a8 += z1 * z2;
00588 }
00589 #if 0
00590 }
00591 #endif
00592
00593 double A[9];
00594 A[0] = a0;
00595 A[1] = a1;
00596 A[2] = a2;
00597
00598 A[3] = a3;
00599 A[4] = a4;
00600 A[5] = a5;
00601
00602 A[6] = a6;
00603 A[7] = a7;
00604 A[8] = a8;
00605
00606 double E0 = (G1 + G2) * 0.5;
00607
00608
00609 float rmsd;
00610 FastCalcRMSDAndRotation(NULL, A, &rmsd, E0, cnt, -1);
00611 #if defined(LOOP1D)
00612 rmsdmat[k]=rmsd;
00613 #else
00614 rmsdmat[j*framecount + i]=rmsd;
00615 #endif
00616 }
00617 }
00618
00619 printf("ACC done.\n");
00620 }
00621
00622 #endif
00623
00624
00625 #if 0
00626 static double InnerProductAOS(double *A, double *coords1, double *coords2,
00627 const int cnt, const double *weight) {
00628 double G1=0.0, G2=0.0;
00629 memset(A, 0, sizeof(double) * 9);
00630
00631 long i;
00632 double x1, x2, y1, y2, z1, z2;
00633 if (weight != NULL) {
00634 for (i=0; i<cnt; i++) {
00635 double w = weight[i];
00636 long idx = i*3;
00637 x1 = coords1[idx ];
00638 y1 = coords1[idx+1];
00639 z1 = coords1[idx+2];
00640
00641 G1 += w * (x1*x1 + y1*y1 + z1*z1);
00642
00643 x2 = coords2[idx ];
00644 y2 = coords2[idx+1];
00645 z2 = coords2[idx+2];
00646
00647 G2 += w * (x2*x2 + y2*y2 + z2*z2);
00648
00649 A[0] += (x1 * x2);
00650 A[1] += (x1 * y2);
00651 A[2] += (x1 * z2);
00652
00653 A[3] += (y1 * x2);
00654 A[4] += (y1 * y2);
00655 A[5] += (y1 * z2);
00656
00657 A[6] += (z1 * x2);
00658 A[7] += (z1 * y2);
00659 A[8] += (z1 * z2);
00660 }
00661 } else {
00662 for (i=0; i<cnt; i++) {
00663 long idx = i*3;
00664 x1 = coords1[idx ];
00665 y1 = coords1[idx+1];
00666 z1 = coords1[idx+2];
00667
00668 G1 += x1*x1 + y1*y1 + z1*z1;
00669
00670 x2 = coords2[idx ];
00671 y2 = coords2[idx+1];
00672 z2 = coords2[idx+2];
00673
00674 G2 += x2*x2 + y2*y2 + z2*z2;
00675
00676 A[0] += (x1 * x2);
00677 A[1] += (x1 * y2);
00678 A[2] += (x1 * z2);
00679
00680 A[3] += (y1 * x2);
00681 A[4] += (y1 * y2);
00682 A[5] += (y1 * z2);
00683
00684 A[6] += (z1 * x2);
00685 A[7] += (z1 * y2);
00686 A[8] += (z1 * z2);
00687 }
00688 }
00689
00690 return (G1 + G2) * 0.5;
00691 }
00692 #endif
00693
00694
00695 void com_soa(int cnt,
00696 float *&soax, float *&soay, float *&soaz,
00697 double &comx, double &comy, double &comz,
00698 const float *weight) {
00699 comx=comy=comz=0.0;
00700
00701 if (weight != NULL) {
00702 double wsum = 0.0;
00703
00704
00705 int i;
00706 for (i=0; i<cnt; i++) {
00707 double w = weight[i];
00708 wsum += w;
00709
00710 comx += soax[i] * w;
00711 comy += soay[i] * w;
00712 comz += soaz[i] * w;
00713 }
00714 double wsumnorm = 1.0 / wsum;
00715 comx *= wsumnorm;
00716 comy *= wsumnorm;
00717 comz *= wsumnorm;
00718 } else {
00719
00720 int i;
00721 for (i=0; i<cnt; i++) {
00722 comx += soax[i];
00723 comy += soay[i];
00724 comz += soaz[i];
00725 }
00726 double avenorm = 1.0 / ((double) cnt);
00727 comx *= avenorm;
00728 comy *= avenorm;
00729 comz *= avenorm;
00730 }
00731 }
00732
00733
00734
00735 int center_convert_soa(const AtomSel *sel, int num, const float *framepos,
00736 const float *weight,
00737 float *&soax, float *&soay, float *&soaz) {
00738
00739 int cnt = sel->selected;
00740 soax = (float *) calloc(1, (cnt + 16)*sizeof(float));
00741 soay = (float *) calloc(1, (cnt + 16)*sizeof(float));
00742 soaz = (float *) calloc(1, (cnt + 16)*sizeof(float));
00743
00744 int selind = sel->firstsel;
00745 double comx=0.0, comy=0.0, comz=0.0;
00746
00747 int i;
00748 for (i=0; i<cnt; i++) {
00749
00750
00751 while (!sel->on[selind])
00752 selind++;
00753
00754
00755 long addr = 3*selind;
00756 float tx = framepos[addr ];
00757 float ty = framepos[addr + 1];
00758 float tz = framepos[addr + 2];
00759
00760 comx += tx;
00761 comy += ty;
00762 comz += tz;
00763
00764 soax[i] = tx;
00765 soay[i] = ty;
00766 soaz[i] = tz;
00767
00768 selind++;
00769 }
00770
00771 double avenorm = 1.0 / ((double) cnt);
00772 comx *= avenorm;
00773 comy *= avenorm;
00774 comz *= avenorm;
00775
00776 #if 0
00777 printf("center_convert_soa(): structure com: %g %g %g\n", comx, comy, comz);
00778 #endif
00779
00780
00781 for (i=0; i<cnt; i++) {
00782 soax[i] -= float(comx);
00783 soay[i] -= float(comy);
00784 soaz[i] -= float(comz);
00785 }
00786
00787 #if 0
00788
00789 com_soa(cnt, soax, soay, soaz, comx, comy, comz, weight);
00790 printf("center_convert_soa(): centered com: %lg %lg %lg\n", comx, comy, comz);
00791 #endif
00792
00793 return 0;
00794 }
00795
00796
00797 int center_convert_single_soa(const AtomSel *sel, int num,
00798 const float *framepos,
00799 const float *weight,
00800 float *soax, float *soay, float *soaz) {
00801
00802 int cnt = sel->selected;
00803 int selind = sel->firstsel;
00804 double comx=0.0, comy=0.0, comz=0.0;
00805
00806 int i;
00807 for (i=0; i<cnt; i++) {
00808
00809
00810 while (!sel->on[selind])
00811 selind++;
00812
00813
00814 long addr = 3*selind;
00815 float tx = framepos[addr ];
00816 float ty = framepos[addr + 1];
00817 float tz = framepos[addr + 2];
00818
00819 comx += tx;
00820 comy += ty;
00821 comz += tz;
00822
00823 soax[i] = tx;
00824 soay[i] = ty;
00825 soaz[i] = tz;
00826
00827 selind++;
00828 }
00829
00830 double avenorm = 1.0 / ((double) cnt);
00831 comx *= avenorm;
00832 comy *= avenorm;
00833 comz *= avenorm;
00834
00835
00836 for (i=0; i<cnt; i++) {
00837 soax[i] -= float(comx);
00838 soay[i] -= float(comy);
00839 soaz[i] -= float(comz);
00840 }
00841
00842 return 0;
00843 }
00844
00845
00846 int measure_rmsd_qcp(VMDApp *app,
00847 const AtomSel *sel1, const AtomSel *sel2,
00848 int num, const float *framepos1, const float *framepos2,
00849 float *weight, float *rmsd) {
00850 if (!sel1 || !sel2) return MEASURE_ERR_NOSEL;
00851 if (sel1->selected < 1 || sel2->selected < 1) return MEASURE_ERR_NOSEL;
00852 if (!weight || !rmsd) return MEASURE_ERR_NOWEIGHT;
00853
00854
00855 if (sel1->selected != sel2->selected) return MEASURE_ERR_MISMATCHEDCNT;
00856
00857 #if 0
00858
00859
00860
00861 int sel_flg;
00862 if (num == sel1->num_atoms) {
00863 sel_flg = 1;
00864 } else {
00865 sel_flg = 0;
00866 }
00867 #endif
00868
00869
00870
00871
00872 float *sel1x, *sel1y, *sel1z, *sel2x, *sel2y, *sel2z;
00873 center_convert_soa(sel1, num, framepos1, weight, sel1x, sel1y, sel1z);
00874 center_convert_soa(sel2, num, framepos2, weight, sel2x, sel2y, sel2z);
00875
00876
00877 double E0 = 0;
00878 double A[9];
00879 E0 = InnerProductSOA(A,
00880 sel1x, sel1y, sel1z,
00881 sel2x, sel2y, sel2z,
00882 sel1->selected, NULL );
00883
00884 #if 0
00885 printf("QCP inner product results:\n");
00886 printf(" E0: %g\n", E0);
00887 int i;
00888 for (i=0; i<9; i+=3)
00889 printf("A[%d-%d]: %g %g %g\n", i, i+2, A[i], A[i+1], A[i+2]);
00890 printf("\n");
00891 #endif
00892
00893
00894 FastCalcRMSDAndRotation(NULL, A, rmsd, E0, sel1->selected, -1);
00895
00896 free(sel1x);
00897 free(sel1y);
00898 free(sel1z);
00899
00900 free(sel2x);
00901 free(sel2y);
00902 free(sel2z);
00903
00904 return MEASURE_NOERR;
00905 }
00906
00907
00908 #if 0
00909
00910 static int sub2idx_tril(long N, long i, long j, long *ind) {
00911
00912 *ind = j + N*i - i*(i-1)/2;
00913 return 0;
00914 }
00915 #endif
00916
00917
00918 static int idx2sub_tril(long N, long ind, long *J, long *I) {
00919 long i, j;
00920
00921 if (ind > (N*(N+1)/2)) {
00922 return -1;
00923 }
00924
00925
00926 double tmp2np1 = 2*N+1;
00927 i = long(floor((tmp2np1 - sqrt(tmp2np1*tmp2np1 - 8.0*ind)) / 2));
00928
00929 j = ind - N*i + i*(i-1)/2 + i;
00930
00931 *I = i;
00932 *J = j+1;
00933
00934 return 0;
00935 }
00936
00937
00938 typedef struct {
00939 const AtomSel *sel;
00940 int first;
00941 int last;
00942 int step;
00943 float *rmsdmat;
00944 int padcnt;
00945 int framecrdsz;
00946 float *crds;
00947 #if (VMDQCPUSEAVX512 && defined(__AVX512F__))
00948 int useavx512;
00949 #endif
00950 #if (VMDQCPUSESSE && defined(__SSE2__)) || (VMDQCPUSEAVX2 && defined(__AVX__) && defined(__AVX2__))
00951 int useavx2;
00952 #endif
00953 #if (VMDQCPUSEVSX && defined(__VEC__))
00954 int usevsx;
00955 #endif
00956 } qcprmsdthreadparms;
00957
00958
00959 static void * measure_rmsdmat_qcp_thread(void *voidparms) {
00960 int threadid;
00961 qcprmsdthreadparms *parms = NULL;
00962 #if defined(VMDQCPUSETHRPOOL)
00963 wkf_threadpool_worker_getdata(voidparms, (void **) &parms);
00964 wkf_threadpool_worker_getid(voidparms, &threadid, NULL);
00965 #else
00966 wkf_threadlaunch_getdata(voidparms, (void **) &parms);
00967 wkf_threadlaunch_getid(voidparms, &threadid, NULL);
00968 #endif
00969
00970
00971
00972
00973 const AtomSel *sel = parms->sel;
00974 float *rmsdmat = parms->rmsdmat;
00975
00976
00977
00978
00979 int framecrdsz = parms->framecrdsz;
00980 float *crds = parms->crds;
00981 int first = parms->first;
00982 int last = parms->last;
00983 int step = parms->step;
00984 #if VMDQCPUSEAVX2 && defined(__AVX__) && defined(__AVX2__)
00985 int useavx2 = parms->useavx2;
00986 #endif
00987 #if (VMDQCPUSEAVX512 && defined(__AVX512F__))
00988 int useavx512 = parms->useavx512;
00989 #endif
00990 #if (VMDQCPUSEVSX && defined(__VEC__))
00991 int usevsx = parms->usevsx;
00992 #endif
00993
00994 #if 0
00995 printf("qcpthread[%d] running... %s\n", threadid,
00996 #if (VMDQCPUSEAVX512 && defined(__AVX512F__))
00997 (useavx512) ? "(AVX512)" : "(C++)");
00998 #elif (VMDQCPUSESSE && defined(__SSE2__)) || (VMDQCPUSEAVX2 && defined(__AVX__) && defined(__AVX2__))
00999 (useavx2) ? "(AVX2)" : "(C++)");
01000 #elif (VMDQCPUSEVSX && defined(__VEC__))
01001 (useavsx) ? "(VSX)" : "(C++)");
01002 #else
01003 "(C++)");
01004 #endif
01005 #endif
01006
01007 int framecount = (last - first + 1) / step;
01008
01009 wkf_tasktile_t tile;
01010 #if defined(VMDQCPUSETHRPOOL)
01011 while (wkf_threadpool_next_tile(voidparms, 1, &tile) != WKF_SCHED_DONE) {
01012 #else
01013 while (wkf_threadlaunch_next_tile(voidparms, 8, &tile) != WKF_SCHED_DONE) {
01014 #endif
01015 long idx;
01016
01017 for (idx=tile.start; idx<tile.end; idx++) {
01018 long i, j;
01019
01020
01021
01022 if (idx2sub_tril(framecount-1, idx, &i, &j)) {
01023 printf("qcpthread[%d]: work idx %ld out of triangle!\n", threadid, idx);
01024 break;
01025 }
01026
01027
01028 double A[9];
01029 double E0 = 0;
01030
01031 float *xj = crds + (j * 3 * framecrdsz);
01032 float *yj = xj + framecrdsz;
01033 float *zj = xj + framecrdsz*2;
01034
01035 float *xi = crds + (i * 3 * framecrdsz);
01036 float *yi = xi + framecrdsz;
01037 float *zi = xi + framecrdsz*2;
01038
01039 #if VMDQCPUSEAVX512 && defined(__AVX512F__)
01040 if (useavx512) {
01041 E0 = InnerProductSOA_avx512(A, xj, yj, zj, xi, yi, zi,
01042 sel->selected, NULL );
01043 } else
01044 #endif
01045 #if VMDQCPUSEAVX2 && defined(__AVX__) && defined(__AVX2__)
01046 if (useavx2) {
01047 E0 = InnerProductSOA_avx2(A, xj, yj, zj, xi, yi, zi,
01048 sel->selected, NULL );
01049 } else
01050 #endif
01051 E0 = InnerProductSOA(A, xj, yj, zj, xi, yi, zi,
01052 sel->selected, NULL );
01053
01054
01055 FastCalcRMSDAndRotation(NULL, A, &rmsdmat[j*framecount + i],
01056 E0, sel->selected, -1);
01057
01058
01059 rmsdmat[i*framecount + j] = rmsdmat[j*framecount + i];
01060 }
01061 }
01062
01063 return NULL;
01064 }
01065
01066
01067
01068 int measure_rmsdmat_qcp(VMDApp *app,
01069 const AtomSel *sel, MoleculeList *mlist,
01070 int num, float *weight,
01071 int first, int last, int step,
01072 float *rmsdmat) {
01073 if (!sel) return MEASURE_ERR_NOSEL;
01074 if (sel->selected < 1) return MEASURE_ERR_NOSEL;
01075
01076
01077 Molecule *mymol = mlist->mol_from_id(sel->molid());
01078 int maxframes = mymol->numframes();
01079
01080
01081 if (last == -1)
01082 last = maxframes-1;
01083
01084 if (maxframes == 0 || first < 0 || first > last ||
01085 last >= maxframes || step <= 0)
01086 return MEASURE_ERR_BADFRAMERANGE;
01087
01088
01089 #if (VMDQCPUSESSE && defined(__SSE2__)) || (VMDQCPUSEAVX2 && defined(__AVX__) && defined(__AVX2__))
01090
01091
01092
01093
01094
01095 #endif
01096 #if (VMDQCPUSESSE && defined(__SSE2__)) || (VMDQCPUSEAVX2 && defined(__AVX__) && defined(__AVX2__))
01097 int useavx2=1;
01098 if (getenv("VMDNOAVX2")) {
01099 useavx2=0;
01100 }
01101 #endif
01102 #if (VMDQCPUSEAVX512 && defined(__AVX512F__))
01103 int useavx512=1;
01104 if (getenv("VMDNOAVX512")) {
01105 useavx512=0;
01106 }
01107 #endif
01108 #if (VMDQCPUSEVSX && defined(__VEC__))
01109 int usevsx=1;
01110 if (getenv("VMDNOVSX")) {
01111 usevsx=0;
01112 }
01113 #endif
01114
01115 #if 0
01116
01117
01118
01119 int sel_flg;
01120 if (num == sel->num_atoms) {
01121 sel_flg = 1;
01122 } else {
01123 sel_flg = 0;
01124 }
01125 #endif
01126
01127
01128 wkf_timerhandle timer;
01129 timer=wkf_timer_create();
01130 wkf_timer_start(timer);
01131
01132
01133
01134
01135
01136 int framecount = (last - first + 1) / step;
01137
01138 int padcnt = (num + 255) & ~255;
01139 int framecrdsz = padcnt + 256;
01140 float *crds = (float *) calloc(1, (framecount * 3L * framecrdsz + 256) * sizeof(float));
01141
01142 int frame;
01143 for (frame=first; frame<=last; frame+=step) {
01144 const float *framepos = (mymol->get_frame(frame))->pos;
01145 float *xc = crds + (frame * 3L * framecrdsz);
01146 float *yc = xc + framecrdsz;
01147 float *zc = xc + framecrdsz*2;
01148
01149 center_convert_single_soa(sel, num, framepos, weight, xc, yc, zc);
01150 }
01151
01152 double converttime = wkf_timer_timenow(timer);
01153
01154 #if !(defined(__PGIC__) && defined(_OPENACC))
01155 #if defined(VMDTHREADS)
01156 int numprocs = wkf_thread_numprocessors();
01157 #else
01158 int numprocs = 1;
01159 #endif
01160
01161
01162
01163
01164 qcprmsdthreadparms parms;
01165 memset(&parms, 0, sizeof(parms));
01166 parms.sel = sel;
01167 parms.rmsdmat = rmsdmat;
01168 parms.padcnt = padcnt;
01169 parms.framecrdsz = framecrdsz;
01170 parms.crds = crds;
01171 parms.first = first;
01172 parms.last = last;
01173 parms.step = step;
01174 #if (VMDQCPUSESSE && defined(__SSE2__)) || (VMDQCPUSEAVX2 && defined(__AVX__) && defined(__AVX2__))
01175 parms.useavx2 = useavx2;
01176 #endif
01177 #if (VMDQCPUSEAVX512 && defined(__AVX512F__))
01178 parms.useavx512 = useavx512;
01179 #endif
01180 #if (VMDQCPUSEVSX && defined(__VEC__))
01181 parms.usevsx = usevsx;
01182 #endif
01183
01184
01185 wkf_tasktile_t tile;
01186 tile.start=0;
01187 tile.end=(framecount-1)*(framecount-1)/2;
01188
01189 #if defined(VMDORBUSETHRPOOL)
01190 wkf_threadpool_sched_dynamic(app->thrpool, &tile);
01191 rc = wkf_threadpool_launch(app->thrpool, measure_rmsdmat_qcp_thread, &parms, 1);
01192 #else
01193 wkf_threadlaunch(numprocs, &parms, measure_rmsdmat_qcp_thread, &tile);
01194 #endif
01195 #elif defined(__PGIC__) && defined(_OPENACC)
01196
01197 rmsdmat_qcp_acc(sel->selected, padcnt, framecrdsz, framecount, crds,
01198
01199 rmsdmat);
01200 #else
01201 int i, j;
01202 for (j=0; j<framecount; j++) {
01203 float *xj = crds + (j * 3 * framecrdsz);
01204 float *yj = xj + framecrdsz;
01205 float *zj = xj + framecrdsz*2;
01206 for (i=0; i<j; i++) {
01207
01208 double A[9];
01209
01210 float *xi = crds + (i * 3 * framecrdsz);
01211 float *yi = xi + framecrdsz;
01212 float *zi = xi + framecrdsz*2;
01213
01214 double E0 = InnerProductSOA(A, xj, yj, zj, xi, yi, zi,
01215 sel->selected, NULL );
01216
01217
01218 FastCalcRMSDAndRotation(NULL, A, &rmsdmat[j*framecount + i],
01219 E0, sel->selected, -1);
01220
01221
01222 rmsdmat[i*framecount + j] = rmsdmat[j*framecount + i];
01223 }
01224 }
01225 #endif
01226
01227
01228 for (long l=0; l<framecount; l++) {
01229 rmsdmat[l*framecount + l] = 0.0;
01230 }
01231
01232 double rmsdtime = wkf_timer_timenow(timer) - converttime;
01233
01234
01235 free(crds);
01236
01237 #if 1
01238 double totaltime = wkf_timer_timenow(timer);
01239 printf("QCP RMSD Matrix calculation time: SOA selection: %.3f RMSD solve: %.3f total: %.3f\n", converttime, rmsdtime, totaltime);
01240 #endif
01241
01242 wkf_timer_destroy(timer);
01243
01244 return MEASURE_NOERR;
01245 }
01246
01247
01248
01249 int measure_rmsdmat_qcp_ooc(VMDApp *app,
01250 const AtomSel *sel, MoleculeList *mlist,
01251 int nfiles, const char **trjfileset,
01252 int num, float *weight,
01253 int first, int last, int step,
01254 int &framecount, float *&rmsdmat) {
01255 if (!sel) return MEASURE_ERR_NOSEL;
01256 if (sel->selected < 1) return MEASURE_ERR_NOSEL;
01257
01258
01259
01260
01261
01262
01263 framecount = (last - first + 1) / step;
01264
01265 printf("** measure_rmsdmat_qcp_ooc(): \n");
01266 printf("** first: %d last: %d step: %d nfiles: %d count: %d\n",
01267 first, last, step, nfiles, framecount);
01268
01269 rmsdmat = (float *) calloc(1, framecount * framecount * sizeof(float));
01270
01271
01272 #if defined(VMDCUDA)
01273 qcp_soa_gpu_ooc(app->cuda->get_cuda_devpool(),
01274 nfiles, trjfileset, sel, first, last, step, rmsdmat);
01275 #endif
01276
01277 #if 0
01278 Molecule *mymol = mlist->mol_from_id(sel->molid());
01279
01280 int maxframes = mymol->numframes();
01281
01282
01283 if (last == -1)
01284 last = maxframes-1;
01285
01286 if (maxframes == 0 || first < 0 || first > last ||
01287 last >= maxframes || step <= 0)
01288 return MEASURE_ERR_BADFRAMERANGE;
01289
01290
01291
01292
01293 #if (VMDQCPUSESSE && defined(__SSE2__)) || (VMDQCPUSEAVX2 && defined(__AVX__) && defined(__AVX2__))
01294 int useavx2=1;
01295 if (getenv("VMDNOAVX2")) {
01296 useavx2=0;
01297 }
01298 #endif
01299
01300
01301 wkf_timerhandle timer;
01302 timer=wkf_timer_create();
01303 wkf_timer_start(timer);
01304
01305
01306
01307
01308 int framecount = (last - first + 1) / step;
01309
01310 int padcnt = (num + 255) & ~255;
01311 int framecrdsz = padcnt + 256;
01312 float *crds = (float *) calloc(1, (framecount * 3L * framecrdsz + 256) * sizeof(float));
01313
01314 int frame;
01315 for (frame=first; frame<=last; frame+=step) {
01316 const float *framepos = (mymol->get_frame(frame))->pos;
01317 float *xc = crds + (frame * 3L * framecrdsz);
01318 float *yc = xc + framecrdsz;
01319 float *zc = xc + framecrdsz*2;
01320
01321 center_convert_single_soa(sel, num, framepos, weight, xc, yc, zc);
01322 }
01323
01324 double converttime = wkf_timer_timenow(timer);
01325
01326 #if defined(VMDTHREADS)
01327 int numprocs = wkf_thread_numprocessors();
01328 #else
01329 int numprocs = 1;
01330 #endif
01331
01332
01333
01334
01335 qcprmsdthreadparms parms;
01336 memset(&parms, 0, sizeof(parms));
01337 parms.sel = sel;
01338 parms.rmsdmat = rmsdmat;
01339 parms.padcnt = padcnt;
01340 parms.framecrdsz = framecrdsz;
01341 parms.crds = crds;
01342 parms.first = first;
01343 parms.last = last;
01344 parms.step = step;
01345 #if (VMDQCPUSESSE && defined(__SSE2__)) || (VMDQCPUSEAVX2 && defined(__AVX__) && defined(__AVX2__))
01346 parms.useavx2 = useavx2;
01347 #endif
01348
01349
01350 wkf_tasktile_t tile;
01351 tile.start=0;
01352 tile.end=(framecount-1)*(framecount-1)/2;
01353
01354 #if defined(VMDORBUSETHRPOOL)
01355 wkf_threadpool_sched_dynamic(app->thrpool, &tile);
01356 rc = wkf_threadpool_launch(app->thrpool, measure_rmsdmat_qcp_thread, &parms, 1);
01357 #else
01358 wkf_threadlaunch(numprocs, &parms, measure_rmsdmat_qcp_thread, &tile);
01359 #endif
01360
01361
01362 for (long l=0; l<framecount; l++) {
01363 rmsdmat[l*framecount + l] = 1.0;
01364 }
01365
01366 double rmsdtime = wkf_timer_timenow(timer) - converttime;
01367
01368
01369 free(crds);
01370
01371 #if 1
01372 double totaltime = wkf_timer_timenow(timer);
01373 printf("QCP RMSD Matrix calculation time: SOA selection: %.3f RMSD solve: %.3f total: %.3f\n", converttime, rmsdtime, totaltime);
01374 #endif
01375
01376 wkf_timer_destroy(timer);
01377 #endif
01378
01379 return MEASURE_NOERR;
01380 }
01381
01382
01383
01384
01385
01386
01387
01388
01389
01390
01391
01392
01393
01394
01395
01396
01397
01398
01399
01400
01401
01402
01403
01404
01405
01406
01407
01408
01409
01410
01411
01412
01413
01414
01415
01416
01417
01418
01419
01420
01421
01422
01423
01424
01425
01426
01427
01428
01429 #if defined(__PGIC__)
01430 #pragma acc routine seq
01431 #endif
01432 int FastCalcRMSDAndRotation(double *rot, double *A, float *rmsd,
01433 double E0, int len, double minScore) {
01434 double Sxx, Sxy, Sxz, Syx, Syy, Syz, Szx, Szy, Szz;
01435 double Szz2, Syy2, Sxx2, Sxy2, Syz2, Sxz2, Syx2, Szy2, Szx2,
01436 SyzSzymSyySzz2, Sxx2Syy2Szz2Syz2Szy2, Sxy2Sxz2Syx2Szx2,
01437 SxzpSzx, SyzpSzy, SxypSyx, SyzmSzy,
01438 SxzmSzx, SxymSyx, SxxpSyy, SxxmSyy;
01439 double C[4];
01440 int i;
01441 double mxEigenV;
01442 double oldg = 0.0;
01443 double b, a, delta, rms, qsqr;
01444 double q1, q2, q3, q4, normq;
01445 double a11, a12, a13, a14, a21, a22, a23, a24;
01446 double a31, a32, a33, a34, a41, a42, a43, a44;
01447 double a2, x2, y2, z2;
01448 double xy, az, zx, ay, yz, ax;
01449 double a3344_4334, a3244_4234, a3243_4233, a3143_4133,a3144_4134, a3142_4132;
01450 double evecprec = 1e-6;
01451 double evalprec = 1e-11;
01452
01453 Sxx = A[0]; Sxy = A[1]; Sxz = A[2];
01454 Syx = A[3]; Syy = A[4]; Syz = A[5];
01455 Szx = A[6]; Szy = A[7]; Szz = A[8];
01456
01457 Sxx2 = Sxx * Sxx;
01458 Syy2 = Syy * Syy;
01459 Szz2 = Szz * Szz;
01460
01461 Sxy2 = Sxy * Sxy;
01462 Syz2 = Syz * Syz;
01463 Sxz2 = Sxz * Sxz;
01464
01465 Syx2 = Syx * Syx;
01466 Szy2 = Szy * Szy;
01467 Szx2 = Szx * Szx;
01468
01469 SyzSzymSyySzz2 = 2.0*(Syz*Szy - Syy*Szz);
01470 Sxx2Syy2Szz2Syz2Szy2 = Syy2 + Szz2 - Sxx2 + Syz2 + Szy2;
01471
01472 C[2] = -2.0 * (Sxx2 + Syy2 + Szz2 + Sxy2 + Syx2 + Sxz2 + Szx2 + Syz2 + Szy2);
01473 C[1] = 8.0 * (Sxx*Syz*Szy + Syy*Szx*Sxz + Szz*Sxy*Syx - Sxx*Syy*Szz - Syz*Szx*Sxy - Szy*Syx*Sxz);
01474
01475 SxzpSzx = Sxz + Szx;
01476 SyzpSzy = Syz + Szy;
01477 SxypSyx = Sxy + Syx;
01478 SyzmSzy = Syz - Szy;
01479 SxzmSzx = Sxz - Szx;
01480 SxymSyx = Sxy - Syx;
01481 SxxpSyy = Sxx + Syy;
01482 SxxmSyy = Sxx - Syy;
01483 Sxy2Sxz2Syx2Szx2 = Sxy2 + Sxz2 - Syx2 - Szx2;
01484
01485 C[0] = Sxy2Sxz2Syx2Szx2 * Sxy2Sxz2Syx2Szx2
01486 + (Sxx2Syy2Szz2Syz2Szy2 + SyzSzymSyySzz2) * (Sxx2Syy2Szz2Syz2Szy2 - SyzSzymSyySzz2)
01487 + (-(SxzpSzx)*(SyzmSzy)+(SxymSyx)*(SxxmSyy-Szz)) * (-(SxzmSzx)*(SyzpSzy)+(SxymSyx)*(SxxmSyy+Szz))
01488 + (-(SxzpSzx)*(SyzpSzy)-(SxypSyx)*(SxxpSyy-Szz)) * (-(SxzmSzx)*(SyzmSzy)-(SxypSyx)*(SxxpSyy+Szz))
01489 + (+(SxypSyx)*(SyzpSzy)+(SxzpSzx)*(SxxmSyy+Szz)) * (-(SxymSyx)*(SyzmSzy)+(SxzpSzx)*(SxxpSyy+Szz))
01490 + (+(SxypSyx)*(SyzmSzy)+(SxzmSzx)*(SxxmSyy-Szz)) * (-(SxymSyx)*(SyzpSzy)+(SxzmSzx)*(SxxpSyy-Szz));
01491
01492
01493 mxEigenV = E0;
01494 for (i = 0; i < 50; ++i) {
01495 oldg = mxEigenV;
01496 x2 = mxEigenV*mxEigenV;
01497 b = (x2 + C[2])*mxEigenV;
01498 a = b + C[1];
01499 delta = ((a*mxEigenV + C[0])/(2.0*x2*mxEigenV + b + a));
01500 mxEigenV -= delta;
01501 #if 0
01502 printf("QCP diff[%3d]: %16g %16g %16g\n", i, mxEigenV - oldg, evalprec*mxEigenV, mxEigenV);
01503 #endif
01504 if (fabs(mxEigenV - oldg) < fabs(evalprec*mxEigenV))
01505 break;
01506 }
01507
01508 #if !defined(__PGIC__)
01509 if (i == 50)
01510 printf("MeasureQCP: More than %d iterations needed!\n", i);
01511 #endif
01512
01513
01514
01515 rms = sqrt(fabs(2.0 * (E0 - mxEigenV)/len));
01516 (*rmsd) = float(rms);
01517
01518
01519 if (minScore > 0)
01520 if (rms < minScore)
01521 return (-1);
01522
01523
01524
01525 if (rot != NULL) {
01526 a11 = SxxpSyy + Szz-mxEigenV; a12 = SyzmSzy; a13 = - SxzmSzx; a14 = SxymSyx;
01527 a21 = SyzmSzy; a22 = SxxmSyy - Szz-mxEigenV; a23 = SxypSyx; a24= SxzpSzx;
01528 a31 = a13; a32 = a23; a33 = Syy-Sxx-Szz - mxEigenV; a34 = SyzpSzy;
01529 a41 = a14; a42 = a24; a43 = a34; a44 = Szz - SxxpSyy - mxEigenV;
01530 a3344_4334 = a33 * a44 - a43 * a34; a3244_4234 = a32 * a44-a42*a34;
01531 a3243_4233 = a32 * a43 - a42 * a33; a3143_4133 = a31 * a43-a41*a33;
01532 a3144_4134 = a31 * a44 - a41 * a34; a3142_4132 = a31 * a42-a41*a32;
01533 q1 = a22*a3344_4334-a23*a3244_4234+a24*a3243_4233;
01534 q2 = -a21*a3344_4334+a23*a3144_4134-a24*a3143_4133;
01535 q3 = a21*a3244_4234-a22*a3144_4134+a24*a3142_4132;
01536 q4 = -a21*a3243_4233+a22*a3143_4133-a23*a3142_4132;
01537
01538 qsqr = q1 * q1 + q2 * q2 + q3 * q3 + q4 * q4;
01539
01540
01541
01542
01543
01544
01545 if (qsqr < evecprec) {
01546 q1 = a12*a3344_4334 - a13*a3244_4234 + a14*a3243_4233;
01547 q2 = -a11*a3344_4334 + a13*a3144_4134 - a14*a3143_4133;
01548 q3 = a11*a3244_4234 - a12*a3144_4134 + a14*a3142_4132;
01549 q4 = -a11*a3243_4233 + a12*a3143_4133 - a13*a3142_4132;
01550 qsqr = q1*q1 + q2 *q2 + q3*q3+q4*q4;
01551
01552 if (qsqr < evecprec) {
01553 double a1324_1423 = a13*a24 - a14*a23, a1224_1422 = a12*a24 - a14*a22;
01554 double a1223_1322 = a12*a23 - a13*a22, a1124_1421 = a11*a24 - a14*a21;
01555 double a1123_1321 = a11*a23 - a13*a21, a1122_1221 = a11*a22 - a12*a21;
01556
01557 q1 = a42 * a1324_1423 - a43 * a1224_1422 + a44 * a1223_1322;
01558 q2 = -a41 * a1324_1423 + a43 * a1124_1421 - a44 * a1123_1321;
01559 q3 = a41 * a1224_1422 - a42 * a1124_1421 + a44 * a1122_1221;
01560 q4 = -a41 * a1223_1322 + a42 * a1123_1321 - a43 * a1122_1221;
01561 qsqr = q1*q1 + q2 *q2 + q3*q3+q4*q4;
01562
01563 if (qsqr < evecprec) {
01564 q1 = a32 * a1324_1423 - a33 * a1224_1422 + a34 * a1223_1322;
01565 q2 = -a31 * a1324_1423 + a33 * a1124_1421 - a34 * a1123_1321;
01566 q3 = a31 * a1224_1422 - a32 * a1124_1421 + a34 * a1122_1221;
01567 q4 = -a31 * a1223_1322 + a32 * a1123_1321 - a33 * a1122_1221;
01568 qsqr = q1*q1 + q2 *q2 + q3*q3 + q4*q4;
01569
01570 if (qsqr < evecprec) {
01571
01572 rot[0] = rot[4] = rot[8] = 1.0;
01573 rot[1] = rot[2] = rot[3] = rot[5] = rot[6] = rot[7] = 0.0;
01574
01575 return(0);
01576 }
01577 }
01578 }
01579 }
01580
01581 normq = sqrt(qsqr);
01582 q1 /= normq;
01583 q2 /= normq;
01584 q3 /= normq;
01585 q4 /= normq;
01586
01587 a2 = q1 * q1;
01588 x2 = q2 * q2;
01589 y2 = q3 * q3;
01590 z2 = q4 * q4;
01591
01592 xy = q2 * q3;
01593 az = q1 * q4;
01594 zx = q4 * q2;
01595 ay = q1 * q3;
01596 yz = q3 * q4;
01597 ax = q1 * q2;
01598
01599 rot[0] = a2 + x2 - y2 - z2;
01600 rot[1] = 2 * (xy + az);
01601 rot[2] = 2 * (zx - ay);
01602 rot[3] = 2 * (xy - az);
01603 rot[4] = a2 - x2 + y2 - z2;
01604 rot[5] = 2 * (yz + ax);
01605 rot[6] = 2 * (zx + ay);
01606 rot[7] = 2 * (yz - ax);
01607 rot[8] = a2 - x2 - y2 + z2;
01608 }
01609
01610 return 1;
01611 }
01612