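/*
 * CUDA accelerated routines used by the VMD MDFF (molecular dynamics
 * flexible fitting) feature: fast gaussian density map synthesis,
 * density difference maps, and map-map cross correlation (CC) analysis.
 */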
00028 #include <stdio.h>
00029 #include <stdlib.h>
00030 #include <string.h>
00031 #include <cuda.h>
00032 #if CUDART_VERSION >= 9000
00033 #include <cuda_fp16.h>
00034 #endif
00035 #if CUDART_VERSION < 4000
00036 #error The VMD MDFF feature requires CUDA 4.0 or later
00037 #endif
00038
00039 #include <float.h>
00040
00041
00042 #include "Inform.h"
00043 #include "utilities.h"
00044 #include "WKFThreads.h"
00045 #include "WKFUtils.h"
00046 #include "CUDAKernels.h"
00047 #include "CUDASpatialSearch.h"
00048
00049 #include "AtomSel.h"
00050 #include "VMDApp.h"
00051 #include "MoleculeList.h"
00052 #include "VolumetricData.h"
00053 #include "VolMapCreate.h"
00054
00055 #include "CUDAMDFF.h"
00056
00057 #include <tcl.h>
00058 #include "TclCommands.h"
00059
00060 #define MIN(X,Y) (((X)<(Y))? (X) : (Y))
00061 #define MAX(X,Y) (((X)>(Y))? (X) : (Y))
00062
#if 1
#define CUERR { cudaError_t err; \
  if ((err = cudaGetLastError()) != cudaSuccess) { \
    printf("CUDA error: %s, %s line %d\n", cudaGetErrorString(err), __FILE__, __LINE__); \
    printf("Thread aborting...\n"); \
    return NULL; }}
#else
#define CUERR
#endif
00072
00073
00074
00075 static int check_gpu_compute20(cudaError_t &err) {
00076 cudaDeviceProp deviceProp;
00077 int dev;
00078 if (cudaGetDevice(&dev) != cudaSuccess)
00079 return -1;
00080
00081 memset(&deviceProp, 0, sizeof(cudaDeviceProp));
00082
00083 if (cudaGetDeviceProperties(&deviceProp, dev) != cudaSuccess) {
00084 err = cudaGetLastError();
00085 return -1;
00086 }
00087
00088
00089 if (deviceProp.major < 2)
00090 return -1;
00091
00092 return 0;
00093 }
00094
00095
00096
00097
00098
00099
00100
00101
00102 inline __host__ __device__ float2 operator+(float2 a, float2 b) {
00103 return make_float2(a.x + b.x, a.y + b.y);
00104 }
00105
00106
00107 inline __host__ __device__ void operator+=(float2 &a, float2 b) {
00108 a.x += b.x;
00109 a.y += b.y;
00110 }
00111
00112 inline __host__ __device__ float4 operator+(float4 a, float4 b) {
00113 return make_float4(a.x + b.x, a.y + b.y, a.z + b.z, a.w + b.w);
00114 }
00115
00116
00117 inline __host__ __device__ void operator+=(float4 &a, float4 b) {
00118 a.x += b.x;
00119 a.y += b.y;
00120 a.z += b.z;
00121 a.w += b.w;
00122 }
00123
00124
00125
00126
00127
00128
00129
00130
00131 inline __device__ void convert_density(float & df, float df2) {
00132 df = df2;
00133 }
00134
00135
00136
00137 inline __device__ void convert_density(unsigned short & dh, float df2) {
00138 dh = __float2half_rn(df2);
00139 }
00140
00141
00142
00143
00144
00145 #if 1 && (CUDART_VERSION >= 4000)
00146 #define RESTRICT __restrict__
00147 #else
00148 #define RESTRICT
00149 #endif
00150
00151
00152
00153
00154
00155 #define GGRIDSZ 8.0f
00156 #define GBLOCKSZX 8
00157 #define GBLOCKSZY 8
00158
00159 #define GBLOCKSZZ 2
00160 #define GUNROLL 4
00161
00162 #define TOTALBLOCKSZ (GBLOCKSZX * GBLOCKSZY * GBLOCKSZZ)
00163
00164
00165
00166
00167
00168
00169
00170
00171
00172
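// Device-global atomic counters used to detect the last thread block to
// finish, enabling single-pass global reductions on compute 2.0+ devices.
// reset_atomic_counter() clears the counter and issues a __threadfence()
// so the reset is visible before the counter is reused.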
00173 __device__ unsigned int tbcatomic[3] = {0, 0, 0};
00174
00175 __device__ void reset_atomic_counter(unsigned int *counter) {
00176 counter[0] = 0;
00177 __threadfence();
00178 }
00179
00180
00181
00182
00183
00184
00185
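// Compute the range of acceleration-grid cells (abmin..abmax) that can
// contribute atoms to the density points evaluated by this thread block,
// padded by one cell spacing on each side and clamped to the grid bounds.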
00186 inline __device__ void calc_ac_cell_ids(int3 &abmin, int3 &abmax,
00187 int3 acncells,
00188 float3 acoriginoffset,
00189 float gridspacing,
00190 float acgridspacing,
00191 float invacgridspacing) {
00192
00193 abmin.x = (acoriginoffset.x + (blockIdx.x * blockDim.x) * gridspacing - acgridspacing) * invacgridspacing;
00194 abmin.y = (acoriginoffset.y + (blockIdx.y * blockDim.y) * gridspacing - acgridspacing) * invacgridspacing;
00195 abmin.z = (acoriginoffset.z + (blockIdx.z * blockDim.z * GUNROLL) * gridspacing - acgridspacing) * invacgridspacing;
00196
00197
00198 abmax.x = (acoriginoffset.x + ((blockIdx.x+1) * blockDim.x) * gridspacing + acgridspacing) * invacgridspacing;
00199 abmax.y = (acoriginoffset.y + ((blockIdx.y+1) * blockDim.y) * gridspacing + acgridspacing) * invacgridspacing;
00200 abmax.z = (acoriginoffset.z + ((blockIdx.z+1) * blockDim.z * GUNROLL) * gridspacing + acgridspacing) * invacgridspacing;
00201
00202 abmin.x = (abmin.x < 0) ? 0 : abmin.x;
00203 abmin.y = (abmin.y < 0) ? 0 : abmin.y;
00204 abmin.z = (abmin.z < 0) ? 0 : abmin.z;
00205 abmax.x = (abmax.x >= acncells.x-1) ? acncells.x-1 : abmax.x;
00206 abmax.y = (abmax.y >= acncells.y-1) ? acncells.y-1 : abmax.y;
00207 abmax.z = (abmax.z >= acncells.z-1) ? acncells.z-1 : abmax.z;
00208 }
00209
00210
00211
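// Accumulate gaussian density contributions at a grid point by looping over
// the atoms in the surrounding acceleration-grid cells.  Each atom's .w
// component holds a precomputed negative inverse gaussian width factor, so
// the per-atom contribution is __expf(dist2 * atom.w).  Each thread evaluates
// GUNROLL consecutive points along the z axis to amortize the cell loop.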
00212 inline __device__ void calc_densities(int xindex, int yindex, int zindex,
00213 const float4 * RESTRICT sorted_xyzr,
00214 int3 abmin,
00215 int3 abmax,
00216 int3 acncells,
00217 float3 acoriginoffset,
00218 const uint2 * RESTRICT cellStartEnd,
00219 float gridspacing,
00220 float &densityval1
00221 #if GUNROLL >= 2
00222 ,float &densityval2
00223 #endif
00224 #if GUNROLL >= 4
00225 ,float &densityval3
00226 ,float &densityval4
00227 #endif
00228 ) {
00229 float coorx = acoriginoffset.x + gridspacing * xindex;
00230 float coory = acoriginoffset.y + gridspacing * yindex;
00231 float coorz = acoriginoffset.z + gridspacing * zindex;
00232
00233 int acplanesz = acncells.x * acncells.y;
00234 int xab, yab, zab;
00235 for (zab=abmin.z; zab<=abmax.z; zab++) {
00236 for (yab=abmin.y; yab<=abmax.y; yab++) {
00237 for (xab=abmin.x; xab<=abmax.x; xab++) {
00238 int abcellidx = zab * acplanesz + yab * acncells.x + xab;
00239 uint2 atomstartend = cellStartEnd[abcellidx];
00240 if (atomstartend.x != GRID_CELL_EMPTY) {
00241 unsigned int atomid;
00242 for (atomid=atomstartend.x; atomid<atomstartend.y; atomid++) {
00243 float4 atom = sorted_xyzr[atomid];
00244 float dx = coorx - atom.x;
00245 float dy = coory - atom.y;
00246 float dxy2 = dx*dx + dy*dy;
00247
00248 float dz = coorz - atom.z;
00249 float r21 = (dxy2 + dz*dz) * atom.w;
00250 densityval1 += __expf(r21);
00251
00252 #if GUNROLL >= 2
00253 float dz2 = dz + gridspacing;
00254 float r22 = (dxy2 + dz2*dz2) * atom.w;
00255 densityval2 += __expf(r22);
00256 #endif
00257 #if GUNROLL >= 4
00258 float dz3 = dz2 + gridspacing;
00259 float r23 = (dxy2 + dz3*dz3) * atom.w;
00260 densityval3 += __expf(r23);
00261
00262 float dz4 = dz3 + gridspacing;
00263 float r24 = (dxy2 + dz4*dz4) * atom.w;
00264 densityval4 += __expf(r24);
00265 #endif
00266 }
00267 }
00268 }
00269 }
00270 }
00271 }
00272
00273
00274
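// MAINDENSITYLOOP declares the per-thread density accumulators and invokes
// calc_densities() with the argument arity matching the compile-time GUNROLL
// setting, so the kernels below are independent of the unroll factor.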
00275 #if GUNROLL == 1
00276 #define MAINDENSITYLOOP \
00277 float densityval1=0.0f; \
00278 calc_densities(xindex, yindex, zindex, sorted_xyzr, abmin, abmax, acncells, \
00279 acoriginoffset, cellStartEnd, gridspacing, densityval1);
00280 #elif GUNROLL == 2
00281 #define MAINDENSITYLOOP \
00282 float densityval1=0.0f; \
00283 float densityval2=0.0f; \
00284 calc_densities(xindex, yindex, zindex, sorted_xyzr, abmin, abmax, acncells, \
00285 acoriginoffset, cellStartEnd, gridspacing, \
00286 densityval1, densityval2);
00287 #elif GUNROLL == 4
00288 #define MAINDENSITYLOOP \
00289 float densityval1=0.0f; \
00290 float densityval2=0.0f; \
00291 float densityval3=0.0f; \
00292 float densityval4=0.0f; \
00293 calc_densities(xindex, yindex, zindex, sorted_xyzr, abmin, abmax, acncells, \
00294 acoriginoffset, cellStartEnd, gridspacing, \
00295 densityval1, densityval2, densityval3, densityval4);
00296 #endif
00297
00298
00299
00300
00301
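// Compute a synthetic gaussian density map from the sorted atom array,
// writing GUNROLL z-planes of output per thread into densitygrid.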
00302 __global__ static void gaussdensity_fast(int natoms,
00303 const float4 * RESTRICT sorted_xyzr,
00304 int3 volsz,
00305 int3 acncells,
00306 float3 acoriginoffset,
00307 float acgridspacing,
00308 float invacgridspacing,
00309 const uint2 * RESTRICT cellStartEnd,
00310 float gridspacing, unsigned int z,
00311 float * RESTRICT densitygrid) {
00312 unsigned int xindex = (blockIdx.x * blockDim.x) + threadIdx.x;
00313 unsigned int yindex = (blockIdx.y * blockDim.y) + threadIdx.y;
00314 unsigned int zindex = ((blockIdx.z * blockDim.z) + threadIdx.z) * GUNROLL;
00315 unsigned int outaddr = zindex * volsz.x * volsz.y +
00316 yindex * volsz.x +
00317 xindex;
00318 zindex += z;
00319
00320
00321 if (xindex >= volsz.x || yindex >= volsz.y || zindex >= volsz.z)
00322 return;
00323
00324
00325 int3 abmin, abmax;
00326 calc_ac_cell_ids(abmin, abmax, acncells, acoriginoffset,
00327 gridspacing, acgridspacing, invacgridspacing);
00328
00329
00330 MAINDENSITYLOOP
00331
00332 densitygrid[outaddr ] = densityval1;
00333 #if GUNROLL >= 2
00334 int planesz = volsz.x * volsz.y;
00335 if (++zindex < volsz.z)
00336 densitygrid[outaddr + planesz] = densityval2;
00337 #endif
00338 #if GUNROLL >= 4
00339 if (++zindex < volsz.z)
00340 densitygrid[outaddr + 2*planesz] = densityval3;
00341 if (++zindex < volsz.z)
00342 densitygrid[outaddr + 3*planesz] = densityval4;
00343 #endif
00344 }
00345
00346
00347
00348
00349
00350
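// Same density calculation as gaussdensity_fast, but writes the difference
// (reference map minus synthetic density) into diffmap, indexing the
// reference map through refvolsz/refoffset since it may be a larger grid.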
00351 __global__ static void gaussdensity_diff(int natoms,
00352 const float4 * RESTRICT sorted_xyzr,
00353 int3 volsz,
00354 int3 acncells,
00355 float3 acoriginoffset,
00356 float acgridspacing,
00357 float invacgridspacing,
00358 const uint2 * RESTRICT cellStartEnd,
00359 float gridspacing, unsigned int z,
00360 int3 refvolsz,
00361 int3 refoffset,
00362 const float * RESTRICT refmap,
00363 float * RESTRICT diffmap) {
00364 unsigned int xindex = (blockIdx.x * blockDim.x) + threadIdx.x;
00365 unsigned int yindex = (blockIdx.y * blockDim.y) + threadIdx.y;
00366 unsigned int zindex = ((blockIdx.z * blockDim.z) + threadIdx.z) * GUNROLL;
00367 unsigned int refaddr = (zindex+refoffset.z) * refvolsz.x * refvolsz.y +
00368 (yindex+refoffset.y) * refvolsz.x +
00369 (xindex+refoffset.x);
00370 unsigned int outaddr = zindex * volsz.x * volsz.y +
00371 yindex * volsz.x +
00372 xindex;
00373 zindex += z;
00374
00375
00376 if (xindex >= volsz.x || yindex >= volsz.y || zindex >= volsz.z)
00377 return;
00378
00379
00380 int3 abmin, abmax;
00381 calc_ac_cell_ids(abmin, abmax, acncells, acoriginoffset,
00382 gridspacing, acgridspacing, invacgridspacing);
00383
00384
00385 MAINDENSITYLOOP
00386
00387 diffmap[outaddr ] = refmap[refaddr ] - densityval1;
00388 #if GUNROLL >= 2
00389 int planesz = volsz.x * volsz.y;
00390 int refplanesz = refvolsz.x * refvolsz.y;
00391 if (++zindex < volsz.z)
00392 diffmap[outaddr + planesz] = refmap[refaddr + refplanesz] - densityval2;
00393 #endif
00394 #if GUNROLL >= 4
00395 if (++zindex < volsz.z)
00396 diffmap[outaddr + 2*planesz] = refmap[refaddr + 2*refplanesz] - densityval3;
00397 if (++zindex < volsz.z)
00398 diffmap[outaddr + 3*planesz] = refmap[refaddr + 3*refplanesz] - densityval4;
00399 #endif
00400 }
00401
00402
00403
00404
00405
00406
00407 __device__ float sumabsdiff_sumreduction(int tid, int totaltb,
00408 float *sumabsdiffs_s,
00409 float *sumabsdiffs) {
00410 float sumabsdifftotal = 0.0f;
00411
00412
00413 if (tid < warpSize) {
00414 for (int i=tid; i<totaltb; i+=warpSize) {
00415 sumabsdifftotal += sumabsdiffs[i];
00416 }
00417
00418
00419 sumabsdiffs_s[tid] = sumabsdifftotal;
00420 }
00421 __syncthreads();
00422
00423
00424
00425 for (int s=warpSize>>1; s>0; s>>=1) {
00426 if (tid < s) {
00427 sumabsdiffs_s[tid] += sumabsdiffs_s[tid + s];
00428 }
00429 __syncthreads();
00430 }
00431
00432 return sumabsdiffs_s[0];
00433 }
00434
00435
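// Compute the sum of absolute differences between the reference map and the
// synthetic density.  Each thread block reduces its partial sums in shared
// memory and writes one value per block; on compute 2.0+ the last block to
// finish folds all per-block values into sumabsdiff[totaltb].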
00436 __global__ static void gaussdensity_sumabsdiff(int totaltb,
00437 int natoms,
00438 const float4 * RESTRICT sorted_xyzr,
00439 int3 volsz,
00440 int3 acncells,
00441 float3 acoriginoffset,
00442 float acgridspacing,
00443 float invacgridspacing,
00444 const uint2 * RESTRICT cellStartEnd,
00445 float gridspacing,
00446 unsigned int z,
00447 int3 refvolsz,
00448 int3 refoffset,
00449 const float * RESTRICT refmap,
00450 float * RESTRICT sumabsdiff) {
00451 int tid = threadIdx.z*blockDim.x*blockDim.y +
00452 threadIdx.y*blockDim.x + threadIdx.x;
00453
00454 #if __CUDA_ARCH__ >= 200
00455
00456 __shared__ bool isLastBlockDone;
00457 if (tid == 0)
00458 isLastBlockDone = 0;
00459 __syncthreads();
00460 #endif
00461
00462 unsigned int xindex = (blockIdx.x * blockDim.x) + threadIdx.x;
00463 unsigned int yindex = (blockIdx.y * blockDim.y) + threadIdx.y;
00464 unsigned int zindex = ((blockIdx.z * blockDim.z) + threadIdx.z) * GUNROLL;
00465 unsigned int refaddr = (zindex+refoffset.z) * refvolsz.x * refvolsz.y +
00466 (yindex+refoffset.y) * refvolsz.x +
00467 (xindex+refoffset.x);
00468 zindex += z;
00469
00470 float thread_sumabsdiff = 0.0f;
00471
00472
00473
00474 if (xindex < volsz.x && yindex < volsz.y && zindex < volsz.z) {
00475
00476 int3 abmin, abmax;
00477 calc_ac_cell_ids(abmin, abmax, acncells, acoriginoffset,
00478 gridspacing, acgridspacing, invacgridspacing);
00479
00480
00481 MAINDENSITYLOOP
00482
00483 thread_sumabsdiff += fabsf(refmap[refaddr ] - densityval1);
00484 #if GUNROLL >= 2
00485 int refplanesz = refvolsz.x * refvolsz.y;
00486 if (++zindex < volsz.z)
00487 thread_sumabsdiff += fabsf(refmap[refaddr + refplanesz] - densityval2);
00488 #endif
00489 #if GUNROLL >= 4
00490 if (++zindex < volsz.z)
00491 thread_sumabsdiff += fabsf(refmap[refaddr + 2*refplanesz] - densityval3);
00492 if (++zindex < volsz.z)
00493 thread_sumabsdiff += fabsf(refmap[refaddr + 3*refplanesz] - densityval4);
00494 #endif
00495 }
00496
00497
00498 __shared__ float sumabsdiff_s[TOTALBLOCKSZ];
00499
00500 sumabsdiff_s[tid] = thread_sumabsdiff;
00501 __syncthreads();
00502
00503
00504 if (tid < warpSize) {
00505 float tmp = 0.0f;
00506 for (int i=tid; i<TOTALBLOCKSZ; i+=warpSize) {
00507 tmp += sumabsdiff_s[i];
00508 }
00509
00510
00511 sumabsdiff_s[tid] = tmp;
00512 }
00513 __syncthreads();
00514
00515
00516
00517 for (int s=warpSize>>1; s>0; s>>=1) {
00518 if (tid < s) {
00519 sumabsdiff_s[tid] += sumabsdiff_s[tid + s];
00520 }
00521 __syncthreads();
00522 }
00523
00524
00525 if (tid == 0) {
00526 unsigned int bid = blockIdx.z * gridDim.x * gridDim.y +
00527 blockIdx.y * gridDim.x + blockIdx.x;
00528 sumabsdiff[bid] = sumabsdiff_s[0];
00529 #if __CUDA_ARCH__ >= 200
00530 __threadfence();
00531
00532 unsigned int value = atomicInc(&tbcatomic[0], totaltb);
00533 isLastBlockDone = (value == (totaltb - 1));
00534 }
00535 __syncthreads();
00536 if (isLastBlockDone) {
00537 float totalsumabsdiff = sumabsdiff_sumreduction(tid, totaltb, sumabsdiff_s, sumabsdiff);
00538
00539 if (tid == 0)
00540 sumabsdiff[totaltb] = totalsumabsdiff;
00541
00542 reset_atomic_counter(&tbcatomic[0]);
00543 #else
00544 if (bid==0)
00545 sumabsdiff[totaltb] = 0.0f;
00546 #endif
00547 }
00548 }
00549
00550
00551
00552
00553
00554
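// Compute a local cross correlation coefficient from running sums:
//   mean   = sum / lsize,   stddev = sqrtf(sumsq / lsize - mean*mean)
//   cc     = (lcc - lsize*mean_ref*mean_synth) / (lsize*stddev_ref*stddev_synth)
// where lcc is the accumulated sum of ref*synth products.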
00555 inline __device__ float calc_cc(float sums_ref, float sums_synth,
00556 float squares_ref, float squares_synth,
00557 int lsize, float lcc) {
00558 float cc = 0.0f;
00559
00560
00561
00562 if (lsize > 0) {
00563 float mean_ref = sums_ref / lsize;
00564 float mean_synth = sums_synth / lsize;
00565 float stddev_ref = sqrtf(squares_ref / lsize - mean_ref*mean_ref);
00566 float stddev_synth = sqrtf(squares_synth / lsize - mean_synth*mean_synth);
00567
00568
00569
00570
00571
00572
00573
00574
00575
00576 if (stddev_ref > 0.0f && stddev_synth > 0.0f) {
00577 cc = (lcc - lsize*mean_ref*mean_synth) /
00578 (lsize * stddev_ref * stddev_synth);
00579
00580
00581
      // ignore blocks with too few contributing voxels to give a meaningful CC
      if (lsize < 32)
        cc = 0.0f;

      // clamp out-of-range CC values caused by floating point rounding
      cc = (cc < -1.0f) ? -1.0f : ((cc > 1.0f) ? 1.0f : cc);
00587 } else {
00588 cc = 0.0f;
00589 }
00590 }
00591
00592 return cc;
00593 }
00594
00595
00596
00597
00598 #if defined(VMDUSESHUFFLE) && __CUDA_ARCH__ >= 300 && CUDART_VERSION >= 9000
00599
00600 inline __device__ void cc_sumreduction(int tid, int totaltb,
00601 float4 &total_cc_sums,
00602 float &total_lcc,
00603 int &total_lsize,
00604 float4 * RESTRICT tb_cc_sums,
00605 float * RESTRICT tb_lcc,
00606 int * RESTRICT tb_lsize) {
00607 total_cc_sums = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
00608 total_lcc = 0.0f;
00609 total_lsize = 0;
00610
00611
00612 if (tid < warpSize) {
00613 for (int i=tid; i<totaltb; i+=warpSize) {
00614 total_cc_sums += tb_cc_sums[i];
00615 total_lcc += tb_lcc[i];
00616 total_lsize += tb_lsize[i];
00617 }
00618
00619
00620
00621 for (int mask=warpSize/2; mask>0; mask>>=1) {
00622 total_cc_sums.x += __shfl_xor_sync(0xffffffff, total_cc_sums.x, mask);
00623 total_cc_sums.y += __shfl_xor_sync(0xffffffff, total_cc_sums.y, mask);
00624 total_cc_sums.z += __shfl_xor_sync(0xffffffff, total_cc_sums.z, mask);
00625 total_cc_sums.w += __shfl_xor_sync(0xffffffff, total_cc_sums.w, mask);
00626 total_lcc += __shfl_xor_sync(0xffffffff, total_lcc, mask);
00627 total_lsize += __shfl_xor_sync(0xffffffff, total_lsize, mask);
00628 }
00629 }
00630 }
00631 #else
00632
00633 inline __device__ void cc_sumreduction(int tid, int totaltb,
00634 float4 &total_cc_sums,
00635 float &total_lcc,
00636 int &total_lsize,
00637 float4 * RESTRICT tb_cc_sums,
00638 float * RESTRICT tb_lcc,
00639 int * RESTRICT tb_lsize) {
00640 total_cc_sums = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
00641 total_lcc = 0.0f;
00642 total_lsize = 0;
00643
00644
00645 if (tid < warpSize) {
00646 for (int i=tid; i<totaltb; i+=warpSize) {
00647 total_cc_sums += tb_cc_sums[i];
00648 total_lcc += tb_lcc[i];
00649 total_lsize += tb_lsize[i];
00650 }
00651
00652
00653 tb_cc_sums[tid] = total_cc_sums;
00654 tb_lcc[tid] = total_lcc;
00655 tb_lsize[tid] = total_lsize;
00656 }
00657 __syncthreads();
00658
00659
00660
00661 for (int s=warpSize>>1; s>0; s>>=1) {
00662 if (tid < s) {
00663 tb_cc_sums[tid] += tb_cc_sums[tid + s];
00664 tb_lcc[tid] += tb_lcc[tid + s];
00665 tb_lsize[tid] += tb_lsize[tid + s];
00666 }
00667 __syncthreads();
00668 }
00669
00670 total_cc_sums = tb_cc_sums[0];
00671 total_lcc = tb_lcc[0];
00672 total_lsize = tb_lsize[0];
00673 }
00674 #endif
00675
00676
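// Accumulate per-thread CC running sums (ref, synth, ref^2, synth^2,
// ref*synth, and sample count) for one voxel, skipping reference voxels
// flagged as NaN (presumably marking voxels outside the region of interest).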
00677 inline __device__ void thread_cc_sum(float ref, float density,
00678 float2 &thread_cc_means,
00679 float2 &thread_cc_squares,
00680 float &thread_lcc,
00681 int &thread_lsize) {
00682 if (!isnan(ref)) {
00683 thread_cc_means.x += ref;
00684 thread_cc_means.y += density;
00685 thread_cc_squares.x += ref*ref;
00686 thread_cc_squares.y += density*density;
00687 thread_lcc += ref * density;
00688 thread_lsize++;
00689 }
00690 }
00691
00692
00693
00694
00695
00696
00697 #if 0
00698 #define CKACCUM(a, b) \
00699 { \
00700 float tmp = a; \
00701 a += b; \
00702 float trunc = a - tmp; \
00703 if (b > 1e-5f && (((b - trunc) / b) > 0.01)) \
00704 printf("truncation: sum: %f incr: %f trunc: %f trunc_rem: %f\n", \
00705 a, b, trunc, (b-trunc)); \
00706 }
00707 #else
00708 #define CKACCUM(a, b) \
00709 a+=b;
00710 #endif
00711
00712
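// Compute cross correlation sums between the synthetic density and the
// reference map for all voxels whose synthetic density reaches 'threshold'.
// Per-block sums are reduced either with warp shuffle instructions
// (VMDUSESHUFFLE, compute 3.0+, CUDA 9+) or in shared memory, optionally
// emitting a per-block CC into tb_CC for spatial CC maps; the last block
// folds all per-block results into index [totaltb].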
00713 __global__ static void gaussdensity_cc(int totaltb,
00714 int natoms,
00715 const float4 *sorted_xyzr,
00716 int3 volsz,
00717 int3 acncells,
00718 float3 acoriginoffset,
00719 float acgridspacing,
00720 float invacgridspacing,
00721 const uint2 * RESTRICT cellStartEnd,
00722 float gridspacing,
00723 unsigned int z,
00724 float threshold,
00725 int3 refvolsz,
00726 int3 refoffset,
00727 const float *refmap,
00728 float4 * RESTRICT tb_cc_sums,
00729 float * RESTRICT tb_lcc,
00730 int * RESTRICT tb_lsize,
00731 float * RESTRICT tb_CC) {
00732 int tid = threadIdx.z*blockDim.x*blockDim.y +
00733 threadIdx.y*blockDim.x + threadIdx.x;
00734
00735 #if __CUDA_ARCH__ >= 200
00736
00737 __shared__ bool isLastBlockDone;
00738 if (tid == 0)
00739 isLastBlockDone = 0;
00740 __syncthreads();
00741 #endif
00742
00743 unsigned int xindex = (blockIdx.x * blockDim.x) + threadIdx.x;
00744 unsigned int yindex = (blockIdx.y * blockDim.y) + threadIdx.y;
00745 unsigned int zindex = ((blockIdx.z * blockDim.z) + threadIdx.z) * GUNROLL;
00746 unsigned int refaddr = (zindex+refoffset.z) * refvolsz.x * refvolsz.y +
00747 (yindex+refoffset.y) * refvolsz.x +
00748 (xindex+refoffset.x);
00749 zindex += z;
00750
00751 float2 thread_cc_means = make_float2(0.0f, 0.0f);
00752 float2 thread_cc_squares = make_float2(0.0f, 0.0f);
00753 float thread_lcc = 0.0f;
00754 int thread_lsize = 0;
00755
00756
00757
00758 if (xindex < volsz.x && yindex < volsz.y && zindex < volsz.z) {
00759
00760 int3 abmin, abmax;
00761 calc_ac_cell_ids(abmin, abmax, acncells, acoriginoffset,
00762 gridspacing, acgridspacing, invacgridspacing);
00763
00764
00765 MAINDENSITYLOOP
00766
00767
00768
00769
00770
00771 float ref;
00772 if (densityval1 >= threshold) {
00773 ref = refmap[refaddr ];
00774 thread_cc_sum(ref, densityval1, thread_cc_means, thread_cc_squares, thread_lcc, thread_lsize);
00775 }
00776 #if GUNROLL >= 2
00777 int refplanesz = refvolsz.x * refvolsz.y;
00778 if ((densityval2 >= threshold) && (++zindex < volsz.z)) {
00779 ref = refmap[refaddr + refplanesz];
00780 thread_cc_sum(ref, densityval2, thread_cc_means, thread_cc_squares, thread_lcc, thread_lsize);
00781 }
00782 #endif
00783 #if GUNROLL >= 4
00784 if ((densityval3 >= threshold) && (++zindex < volsz.z)) {
00785 ref = refmap[refaddr + 2*refplanesz];
00786 thread_cc_sum(ref, densityval3, thread_cc_means, thread_cc_squares, thread_lcc, thread_lsize);
00787 }
00788 if ((densityval4 >= threshold) && (++zindex < volsz.z)) {
00789 ref = refmap[refaddr + 3*refplanesz];
00790 thread_cc_sum(ref, densityval4, thread_cc_means, thread_cc_squares, thread_lcc, thread_lsize);
00791 }
00792 #endif
00793 }
00794
00795
00796 #if defined(VMDUSESHUFFLE) && __CUDA_ARCH__ >= 300 && CUDART_VERSION >= 9000
00797
00798 __shared__ float2 tb_cc_means_s[TOTALBLOCKSZ];
00799 __shared__ float2 tb_cc_squares_s[TOTALBLOCKSZ];
00800 __shared__ float tb_lcc_s[TOTALBLOCKSZ];
00801 __shared__ int tb_lsize_s[TOTALBLOCKSZ];
00802
00803 tb_cc_means_s[tid] = thread_cc_means;
00804 tb_cc_squares_s[tid] = thread_cc_squares;
00805 tb_lcc_s[tid] = thread_lcc;
00806 tb_lsize_s[tid] = thread_lsize;
00807 __syncthreads();
00808
00809
00810 if (tid < warpSize) {
00811 float2 tmp_cc_means = make_float2(0.0f, 0.0f);
00812 float2 tmp_cc_squares = make_float2(0.0f, 0.0f);
00813 float tmp_lcc = 0.0f;
00814 int tmp_lsize = 0;
00815 for (int i=tid; i<TOTALBLOCKSZ; i+=warpSize) {
00816 tmp_cc_means += tb_cc_means_s[i];
00817 tmp_cc_squares += tb_cc_squares_s[i];
00818 tmp_lcc += tb_lcc_s[i];
00819 tmp_lsize += tb_lsize_s[i];
00820 }
00821
00822
00823
00824 for (int mask=warpSize/2; mask>0; mask>>=1) {
00825 tmp_cc_means.x += __shfl_xor_sync(0xffffffff, tmp_cc_means.x, mask);
00826 tmp_cc_means.y += __shfl_xor_sync(0xffffffff, tmp_cc_means.y, mask);
00827 tmp_cc_squares.x += __shfl_xor_sync(0xffffffff, tmp_cc_squares.x, mask);
00828 tmp_cc_squares.y += __shfl_xor_sync(0xffffffff, tmp_cc_squares.y, mask);
00829 tmp_lcc += __shfl_xor_sync(0xffffffff, tmp_lcc, mask);
00830 tmp_lsize += __shfl_xor_sync(0xffffffff, tmp_lsize, mask);
00831 }
00832
00833
00834
00835
00836
00837
00838 if (tid == 0) {
00839 unsigned int bid = blockIdx.z * gridDim.x * gridDim.y +
00840 blockIdx.y * gridDim.x + blockIdx.x;
00841
00842 tb_cc_sums[bid] = make_float4(tmp_cc_means.x, tmp_cc_means.y,
00843 tmp_cc_squares.x, tmp_cc_squares.y);
00844 tb_lcc[bid] = tmp_lcc;
00845 tb_lsize[bid] = tmp_lsize;
00846
00847 if (tb_CC != NULL) {
        // use the warp-reduced totals held in registers; in the shuffle path
        // the shared memory copies only hold per-thread partial sums
        float cc = calc_cc(tmp_cc_means.x, tmp_cc_means.y,
                           tmp_cc_squares.x, tmp_cc_squares.y,
                           tmp_lsize, tmp_lcc);
00851
00852
00853 tb_CC[bid] = cc;
00854 }
00855
00856 __threadfence();
00857
00858 unsigned int value = atomicInc(&tbcatomic[0], totaltb);
00859 isLastBlockDone = (value == (totaltb - 1));
00860 }
00861 }
00862 __syncthreads();
00863
00864 if (isLastBlockDone) {
00865 float4 total_cc_sums;
00866 float total_lcc;
00867 int total_lsize;
00868 cc_sumreduction(tid, totaltb, total_cc_sums, total_lcc, total_lsize,
00869 tb_cc_sums, tb_lcc, tb_lsize);
00870
00871 if (tid == 0) {
00872 tb_cc_sums[totaltb] = total_cc_sums;
00873 tb_lcc[totaltb] = total_lcc;
00874 tb_lsize[totaltb] = total_lsize;
00875 }
00876
00877 reset_atomic_counter(&tbcatomic[0]);
00878 }
00879
00880 #else
00881
00882
00883 __shared__ float2 tb_cc_means_s[TOTALBLOCKSZ];
00884 __shared__ float2 tb_cc_squares_s[TOTALBLOCKSZ];
00885 __shared__ float tb_lcc_s[TOTALBLOCKSZ];
00886 __shared__ int tb_lsize_s[TOTALBLOCKSZ];
00887
00888 tb_cc_means_s[tid] = thread_cc_means;
00889 tb_cc_squares_s[tid] = thread_cc_squares;
00890 tb_lcc_s[tid] = thread_lcc;
00891 tb_lsize_s[tid] = thread_lsize;
00892 __syncthreads();
00893
00894
00895 if (tid < warpSize) {
00896 float2 tmp_cc_means = make_float2(0.0f, 0.0f);
00897 float2 tmp_cc_squares = make_float2(0.0f, 0.0f);
00898 float tmp_lcc = 0.0f;
00899 int tmp_lsize = 0;
00900 for (int i=tid; i<TOTALBLOCKSZ; i+=warpSize) {
00901 tmp_cc_means += tb_cc_means_s[i];
00902 tmp_cc_squares += tb_cc_squares_s[i];
00903 tmp_lcc += tb_lcc_s[i];
00904 tmp_lsize += tb_lsize_s[i];
00905 }
00906
00907
00908 tb_cc_means_s[tid] = tmp_cc_means;
00909 tb_cc_squares_s[tid] = tmp_cc_squares;
00910 tb_lcc_s[tid] = tmp_lcc;
00911 tb_lsize_s[tid] = tmp_lsize;
00912 }
00913 __syncthreads();
00914
00915
00916
00917 for (int s=warpSize>>1; s>0; s>>=1) {
00918 if (tid < s) {
00919 tb_cc_means_s[tid] += tb_cc_means_s[tid + s];
00920 tb_cc_squares_s[tid] += tb_cc_squares_s[tid + s];
00921 tb_lcc_s[tid] += tb_lcc_s[tid + s];
00922 tb_lsize_s[tid] += tb_lsize_s[tid + s];
00923 }
00924 __syncthreads();
00925 }
00926
00927
00928
00929
00930
00931
00932
00933 if (tid == 0) {
00934 unsigned int bid = blockIdx.z * gridDim.x * gridDim.y +
00935 blockIdx.y * gridDim.x + blockIdx.x;
00936 tb_cc_sums[bid] = make_float4(tb_cc_means_s[0].x, tb_cc_means_s[0].y,
00937 tb_cc_squares_s[0].x, tb_cc_squares_s[0].y);
00938 tb_lcc[bid] = tb_lcc_s[0];
00939 tb_lsize[bid] = tb_lsize_s[0];
00940
00941 if (tb_CC != NULL) {
00942 float cc = calc_cc(tb_cc_means_s[0].x, tb_cc_means_s[0].y,
00943 tb_cc_squares_s[0].x, tb_cc_squares_s[0].y,
00944 tb_lsize_s[0], tb_lcc_s[0]);
00945
00946
00947 tb_CC[bid] = cc;
00948 }
00949
00950 #if __CUDA_ARCH__ >= 200
00951 __threadfence();
00952
00953 unsigned int value = atomicInc(&tbcatomic[0], totaltb);
00954 isLastBlockDone = (value == (totaltb - 1));
00955 }
00956 __syncthreads();
00957 if (isLastBlockDone) {
00958 float4 total_cc_sums;
00959 float total_lcc;
00960 int total_lsize;
00961 cc_sumreduction(tid, totaltb, total_cc_sums, total_lcc, total_lsize,
00962 tb_cc_sums, tb_lcc, tb_lsize);
00963
00964 if (tid == 0) {
00965 tb_cc_sums[totaltb] = total_cc_sums;
00966 tb_lcc[totaltb] = total_lcc;
00967 tb_lsize[totaltb] = total_lsize;
00968 }
00969
00970 reset_atomic_counter(&tbcatomic[0]);
00971 #else
00972
00973 if (bid == 0) {
00974 tb_cc_sums[totaltb] = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
00975 tb_lcc[totaltb] = 0.0f;
00976 tb_lsize[totaltb] = 0;
00977 }
00978 #endif
00979 }
00980 #endif
00981 }
00982
00983
00984
00985
00986
00987
00988 #if 0
00989 inline __device__ void mmms_reduction(int tid, int totaltb,
00990 float4 &total_mmms,
00991 int &total_lsize,
00992 float4 * RESTRICT tb_mmms,
00993 int * RESTRICT tb_lsize) {
00994 total_mmms = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
00995 total_lsize = 0;
00996
00997
00998 if (tid < warpSize) {
00999 for (int i=tid; i<totaltb; i+=warpSize) {
01000 total_mmms += tb_mmms[i];
01001 total_lsize += tb_lsize[i];
01002 }
01003
01004
01005 tb_mmms[tid] = total_mmms;
01006 tb_lsize[tid] = total_lsize;
01007 }
01008 __syncthreads();
01009
01010
01011
01012 for (int s=warpSize>>1; s>0; s>>=1) {
01013 if (tid < s) {
01014 tb_mmms[tid] += tb_mmms[tid + s];
01015 tb_lsize[tid] += tb_lsize[tid + s];
01016 }
01017 __syncthreads();
01018 }
01019
01020 total_mmms = tb_mmms[0];
01021 total_lsize = tb_lsize[0];
01022 }
01023
01024
01025 inline __device__ void thread_mmms_reduce(float val,
01026 float2 &thread_minmax,
01027 float2 &thread_mean_square,
01028 int &thread_lsize) {
01029 if (!isnan(val)) {
01030 if (thread_lsize == 0) {
01031 thread_minmax.x = val;
01032 thread_minmax.y = val;
01033 } else {
01034 thread_minmax.x = fminf(thread_minmax.x, val);
01035 thread_minmax.y = fmaxf(thread_minmax.y, val);
01036 }
01037
01038 thread_mean_square.x += val;
01039 thread_mean_square.y += val*val;
01040 thread_lsize++;
01041 }
01042 }
01043
01044
01045 __global__ static void calc_minmax_mean_stddev(int totaltb,
01046 int3 volsz,
01047 float threshold,
01048 const float * RESTRICT map,
01049 float4 * RESTRICT tb_mmms,
01050 int * RESTRICT tb_lsize) {
01051 int tid = threadIdx.z*blockDim.x*blockDim.y +
01052 threadIdx.y*blockDim.x + threadIdx.x;
01053
01054 #if __CUDA_ARCH__ >= 200
01055
01056 __shared__ bool isLastBlockDone;
01057 if (tid == 0)
01058 isLastBlockDone = 0;
01059 __syncthreads();
01060 #endif
01061
01062 unsigned int xindex = (blockIdx.x * blockDim.x) + threadIdx.x;
01063 unsigned int yindex = (blockIdx.y * blockDim.y) + threadIdx.y;
01064 unsigned int zindex = ((blockIdx.z * blockDim.z) + threadIdx.z) * GUNROLL;
01065 unsigned int mapaddr = zindex * volsz.x * volsz.y +
01066 yindex * volsz.x +
01067 xindex;
01068
01069 float2 thread_minmax = make_float2(0.0f, 0.0f);
01070 float2 thread_mean_square = make_float2(0.0f, 0.0f);
01071 int thread_lsize = 0;
01072
01073
01074 if (xindex < volsz.x && yindex < volsz.y && zindex < volsz.z) {
01075
01076
01077
01078
01079 float val;
01080 val = map[mapaddr ];
01081 thread_mmms_reduce(val, thread_minmax, thread_mean_square, thread_lsize);
01082 #if GUNROLL >= 2
01083 int planesz = volsz.x * volsz.y;
01084 if (++zindex < volsz.z) {
01085 val = map[mapaddr + planesz];
01086 thread_mmms_reduce(val, thread_minmax, thread_mean_square, thread_lsize);
01087 }
01088 #endif
01089 #if GUNROLL >= 4
01090 if (++zindex < volsz.z) {
01091 val = map[mapaddr + 2*planesz];
01092 thread_mmms_reduce(val, thread_minmax, thread_mean_square, thread_lsize);
01093 }
01094 if (++zindex < volsz.z) {
01095 val = map[mapaddr + 3*planesz];
01096 thread_mmms_reduce(val, thread_minmax, thread_mean_square, thread_lsize);
01097 }
01098 #endif
01099 }
01100
01101
01102 __shared__ float2 tb_minmax_s[TOTALBLOCKSZ];
01103 __shared__ float2 tb_mean_square_s[TOTALBLOCKSZ];
01104 __shared__ int tb_lsize_s[TOTALBLOCKSZ];
01105
01106 tb_minmax_s[tid] = thread_minmax;
01107 tb_mean_square_s[tid] = thread_mean_square;
01108 tb_lsize_s[tid] = thread_lsize;
01109 __syncthreads();
01110
01111
01112 if (tid < warpSize) {
01113 float2 tmp_minmax = thread_minmax;
01114 float2 tmp_mean_square = make_float2(0.0f, 0.0f);
01115 int tmp_lsize = 0;
01116 for (int i=tid; i<TOTALBLOCKSZ; i+=warpSize) {
01117 tmp_minmax.x = fminf(tb_minmax_s[i].x, tmp_minmax.x);
        tmp_minmax.y = fmaxf(tb_minmax_s[i].y, tmp_minmax.y);
01119 tmp_mean_square += tb_mean_square_s[i];
01120 tmp_lsize += tb_lsize_s[i];
01121 }
01122
01123
01124 tb_minmax_s[tid] = tmp_minmax;
01125 tb_mean_square_s[tid] = tmp_mean_square;
01126 tb_lsize_s[tid] = tmp_lsize;
01127 }
01128 __syncthreads();
01129
01130
01131
01132 for (int s=warpSize>>1; s>0; s>>=1) {
01133 if (tid < s) {
      tb_minmax_s[tid].x = fminf(tb_minmax_s[tid].x, tb_minmax_s[tid + s].x);
      tb_minmax_s[tid].y = fmaxf(tb_minmax_s[tid].y, tb_minmax_s[tid + s].y);
01135 tb_mean_square_s[tid] += tb_mean_square_s[tid + s];
01136 tb_lsize_s[tid] += tb_lsize_s[tid + s];
01137 }
01138 __syncthreads();
01139 }
01140
01141
01142
01143
01144 if (tid == 0) {
01145 unsigned int bid = blockIdx.z * gridDim.x * gridDim.y +
01146 blockIdx.y * gridDim.x + blockIdx.x;
01147 tb_mmms[bid] = make_float4(tb_minmax_s[0].x, tb_minmax_s[0].y,
01148 tb_mean_square_s[0].x, tb_mean_square_s[0].y);
01149 tb_lsize[bid] = tb_lsize_s[0];
01150 #if __CUDA_ARCH__ >= 200
01151 __threadfence();
01152
01153 unsigned int value = atomicInc(&tbcatomic[0], totaltb);
01154 isLastBlockDone = (value == (totaltb - 1));
01155 }
01156 __syncthreads();
01157 if (isLastBlockDone) {
01158 float4 total_mmms;
01159 int total_lsize;
01160 mmms_reduction(tid, totaltb, total_mmms, total_lsize,
01161 tb_mmms, tb_lsize);
01162
01163 if (tid == 0) {
01164 tb_mmms[totaltb] = total_mmms;
01165 tb_lsize[totaltb] = total_lsize;
01166 }
01167
01168 reset_atomic_counter(&tbcatomic[0]);
01169 #else
01170
01171 if (bid == 0) {
01172 tb_mmms[totaltb] = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
01173 tb_lsize[totaltb] = 0;
01174 }
01175 #endif
01176 }
01177 }
01178
01179 #endif
01180
01181
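// Build the uniform acceleration grid used by the density kernels: the cell
// spacing defaults to gausslim * radscale * maxrad (overridable through the
// VMDACGRIDSPACING environment variable), atoms are packed into float4
// x/y/z/(precomputed gaussian exponent factor) form, uploaded to the GPU,
// and sorted into cells by vmd_cuda_build_density_atom_grid().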
01182 static int vmd_cuda_build_accel(cudaError_t &err,
01183 const long int natoms,
01184 const int3 volsz,
01185 const float gausslim,
01186 const float radscale,
01187 const float maxrad,
01188 const float gridspacing,
01189 const float *xyzr_f,
01190 float4 *& xyzr_d,
01191 float4 *& sorted_xyzr_d,
01192 uint2 *& cellStartEnd_d,
01193 float &acgridspacing,
01194 int3 &accelcells) {
01195
01196 acgridspacing = gausslim * radscale * maxrad;
01197
01198
01199 if (getenv("VMDACGRIDSPACING") != NULL) {
01200 acgridspacing = atof(getenv("VMDACGRIDSPACING"));
01201 }
01202
01203 accelcells.x = max(int((volsz.x*gridspacing) / acgridspacing), 1);
01204 accelcells.y = max(int((volsz.y*gridspacing) / acgridspacing), 1);
01205 accelcells.z = max(int((volsz.z*gridspacing) / acgridspacing), 1);
01206 #if 0
01207 printf(" Accel grid(%d, %d, %d) spacing %f\n",
01208 accelcells.x, accelcells.y, accelcells.z, acgridspacing);
01209 #endif
01210
01211
01212 cudaMalloc((void**)&xyzr_d, natoms * sizeof(float4));
01213
01214
01215
01216 int i, i4;
01217 float4 *xyzr = (float4 *) malloc(natoms * sizeof(float4));
01218 float log2e = log2f(2.718281828f);
01219 for (i=0,i4=0; i<natoms; i++,i4+=4) {
01220 xyzr[i].x = xyzr_f[i4 ];
01221 xyzr[i].y = xyzr_f[i4 + 1];
01222 xyzr[i].z = xyzr_f[i4 + 2];
01223
01224 float scaledrad = xyzr_f[i4 + 3] * radscale;
01225 float arinv = -1.0f * log2e / (2.0f*scaledrad*scaledrad);
01226
01227 xyzr[i].w = arinv;
01228 }
01229 cudaMemcpy(xyzr_d, xyzr, natoms * sizeof(float4), cudaMemcpyHostToDevice);
01230 free(xyzr);
01231
01232
01233 if (vmd_cuda_build_density_atom_grid(natoms, xyzr_d, sorted_xyzr_d,
01234 cellStartEnd_d,
01235 accelcells, 1.0f / acgridspacing) != 0) {
01236 cudaFree(xyzr_d);
01237 return -1;
01238 }
01239
01240 return 0;
01241 }
01242
01243
01244
01245 static int vmd_cuda_destroy_accel(float4 *& xyzr_d,
01246 float4 *& sorted_xyzr_d,
01247 uint2 *& cellStartEnd_d) {
01248 cudaFree(xyzr_d);
01249 xyzr_d = NULL;
01250
01251 cudaFree(sorted_xyzr_d);
01252 sorted_xyzr_d = NULL;
01253
01254 cudaFree(cellStartEnd_d);
01255 cellStartEnd_d = NULL;
01256
01257 return 0;
01258 }
01259
01260
01261
01262
01263
01264
01265
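// Choose kernel launch dimensions: GBLOCKSZX x GBLOCKSZY x GBLOCKSZZ thread
// blocks, with each thread computing GUNROLL z slices, so the grid is padded
// up to cover the full output volume.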
01266 static void vmd_cuda_kernel_launch_dims(int3 volsz, dim3 &Bsz, dim3 &Gsz) {
01267 Bsz = dim3(GBLOCKSZX, GBLOCKSZY, GBLOCKSZZ);
01268 Gsz = dim3((volsz.x+Bsz.x-1) / Bsz.x,
01269 (volsz.y+Bsz.y-1) / Bsz.y,
01270 (volsz.z+(Bsz.z*GUNROLL)-1) / (Bsz.z * GUNROLL));
01271
01272 #if 0
01273 printf("GPU kernel launch dimensions: Bsz: %dx%dx%d Gsz: %dx%dx%d\n",
01274 Bsz.x, Bsz.y, Bsz.z, Gsz.x, Gsz.y, Gsz.z);
01275 #endif
01276 }
01277
01278
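// Host-side driver: allocates device buffers, builds the acceleration grid,
// runs the density kernel, and when a reference map is supplied also runs
// the difference, sum-of-absolute-differences, and CC kernels, copying the
// density (and difference) maps back to the host and reporting timings.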
01279 static int vmd_cuda_gaussdensity_calc(long int natoms,
01280 float3 acoriginoffset,
01281 int3 acvolsz,
01282 const float *xyzr_f,
01283 float *volmap,
01284 int3 volsz, float maxrad,
01285 float radscale, float gridspacing,
01286 float gausslim,
01287 int3 refvolsz,
01288 int3 refoffset,
01289 const float *refmap,
01290 float *diffmap, int verbose) {
01291 float4 *xyzr_d = NULL;
01292 float4 *sorted_xyzr_d = NULL;
01293 uint2 *cellStartEnd_d = NULL;
01294
01295 wkf_timerhandle globaltimer = wkf_timer_create();
01296 wkf_timer_start(globaltimer);
01297
01298 cudaError_t err;
01299 if (check_gpu_compute20(err) != 0)
01300 return -1;
01301
01302
01303
01304
01305 unsigned long volmemsz = volsz.x * volsz.y * volsz.z * sizeof(float);
01306 float *devdensity = NULL;
01307 if (cudaMalloc((void**)&devdensity, volmemsz) != cudaSuccess) {
01308 err = cudaGetLastError();
01309 return -1;
01310 }
01311
01312 #if 1
01313 float *devrefmap = NULL;
01314 float *devdiffmap = NULL;
01315 if (refmap && diffmap) {
01316 unsigned long refmemsz = refvolsz.x * refvolsz.y * refvolsz.z * sizeof(float);
01317 if (cudaMalloc((void**)&devrefmap, refmemsz) != cudaSuccess) {
01318 err = cudaGetLastError();
01319 return -1;
01320 }
01321 cudaMemcpy(devrefmap, refmap, refmemsz, cudaMemcpyHostToDevice);
01322
01323 if (cudaMalloc((void**)&devdiffmap, volmemsz) != cudaSuccess) {
01324 err = cudaGetLastError();
01325 return -1;
01326 }
01327 }
01328 #endif
01329
01330 int3 accelcells = make_int3(1, 1, 1);
01331 float acgridspacing = 0.0f;
01332 if (vmd_cuda_build_accel(err, natoms, acvolsz, gausslim, radscale, maxrad,
01333 gridspacing, xyzr_f, xyzr_d, sorted_xyzr_d,
01334 cellStartEnd_d, acgridspacing, accelcells) != 0) {
01335 return -1;
01336 }
01337
01338 double sorttime = wkf_timer_timenow(globaltimer);
01339
01340 dim3 Bsz, Gsz;
01341 vmd_cuda_kernel_launch_dims(volsz, Bsz, Gsz);
01342
01343 gaussdensity_fast<<<Gsz, Bsz, 0>>>(natoms,
01344 sorted_xyzr_d,
01345 volsz,
01346 accelcells,
01347 acoriginoffset,
01348 acgridspacing,
01349 1.0f / acgridspacing,
01350 cellStartEnd_d,
01351 gridspacing, 0,
01352 devdensity);
01353 cudaDeviceSynchronize();
01354 double densitykerneltime = wkf_timer_timenow(globaltimer);
01355
01356 #if 1
01357 if (devrefmap && devdiffmap) {
01358 #if 1
01359 if (verbose) {
01360 printf("CUDA: refvol(%d,%d,%d) refoffset(%d,%d,%d)\n",
01361 refvolsz.x, refvolsz.y, refvolsz.z,
01362 refoffset.x, refoffset.y, refoffset.z);
01363 }
01364 #endif
01365 gaussdensity_diff<<<Gsz, Bsz, 0>>>(natoms,
01366 sorted_xyzr_d,
01367 volsz,
01368 accelcells,
01369 acoriginoffset,
01370 acgridspacing,
01371 1.0f / acgridspacing,
01372 cellStartEnd_d,
01373 gridspacing, 0,
01374 refvolsz,
01375 refoffset,
01376 devrefmap,
01377 devdiffmap);
01378 cudaDeviceSynchronize();
01379 }
01380 #endif
01381 double diffkerneltime = wkf_timer_timenow(globaltimer);
01382
01383
01384 #if 1
01385
01386
01387
01388 if (devrefmap) {
01389 unsigned int blockcount = Gsz.x * Gsz.y * Gsz.z;
01390 float *devsumabsdiffs = NULL;
01391 if (cudaMalloc((void**)&devsumabsdiffs, (blockcount+1) * sizeof(float)) != cudaSuccess) {
01392 err = cudaGetLastError();
01393 return -1;
01394 }
01395 gaussdensity_sumabsdiff<<<Gsz, Bsz, 0>>>(blockcount,
01396 natoms,
01397 sorted_xyzr_d,
01398 volsz,
01399 accelcells,
01400 acoriginoffset,
01401 acgridspacing,
01402 1.0f / acgridspacing,
01403 cellStartEnd_d,
01404 gridspacing, 0,
01405 refvolsz,
01406 refoffset,
01407 devrefmap,
01408 devsumabsdiffs);
01409 cudaDeviceSynchronize();
01410
01411 float *sumabsdiffs = new float[blockcount+1];
01412 cudaMemcpy(sumabsdiffs, devsumabsdiffs, (blockcount+1)*sizeof(float), cudaMemcpyDeviceToHost);
01413 cudaFree(devsumabsdiffs);
01414
01415 #if 0
01416 printf("Map of block-wise sumabsdiffs:\n");
01417 for (int i=0; i<blockcount; i++) {
01418 printf(" tb[%d] absdiff %f\n", i, sumabsdiffs[i]);
01419 }
01420 #endif
01421
01422 #if 1
01423 float sumabsdifftotal = 0.0f;
01424 for (int j=0; j<blockcount; j++) {
01425 sumabsdifftotal += sumabsdiffs[j];
01426 }
01427 if (verbose) {
01428 printf("Sum of absolute differences: %f\n", sumabsdifftotal);
01429 printf("Kernel sum of absolute differences: %f\n", sumabsdiffs[blockcount]);
01430 }
01431 #endif
01432
01433 delete [] sumabsdiffs;
01434 }
01435 #endif
01436 double sumabsdiffkerneltime = wkf_timer_timenow(globaltimer);
01437
01438
01439 #if 1
01440
01441
01442
01443 if (devrefmap) {
01444 unsigned int blockcount = Gsz.x * Gsz.y * Gsz.z;
01445 float4 *dev_cc_sums = NULL;
01446 float *dev_lcc = NULL;
01447 int *dev_lsize = NULL;
01448 float *dev_CC = NULL;
01449
01450 if (cudaMalloc((void**)&dev_cc_sums, (blockcount+1) * sizeof(float4)) != cudaSuccess) {
01451 err = cudaGetLastError();
01452 return -1;
01453 }
01454
01455 if (cudaMalloc((void**)&dev_lcc, (blockcount+1) * sizeof(float)) != cudaSuccess) {
01456 err = cudaGetLastError();
01457 return -1;
01458 }
01459
01460 if (cudaMalloc((void**)&dev_lsize, (blockcount+1) * sizeof(int)) != cudaSuccess) {
01461 err = cudaGetLastError();
01462 return -1;
01463 }
01464
01465 if (cudaMalloc((void**)&dev_CC, (blockcount+1) * sizeof(float)) != cudaSuccess) {
01466 err = cudaGetLastError();
01467 return -1;
01468 }
01469
01470
01471 gaussdensity_cc<<<Gsz, Bsz, 0>>>(blockcount,
01472 natoms,
01473 sorted_xyzr_d,
01474 volsz,
01475 accelcells,
01476 acoriginoffset,
01477 acgridspacing,
01478 1.0f / acgridspacing,
01479 cellStartEnd_d,
01480 gridspacing, 0,
01481 -FLT_MAX,
01482 refvolsz,
01483 refoffset,
01484 devrefmap,
01485 dev_cc_sums,
01486 dev_lcc,
01487 dev_lsize,
01488 dev_CC);
01489 cudaDeviceSynchronize();
01490
01491 float4 *cc_sums = new float4[blockcount+1];
01492 cudaMemcpy(cc_sums, dev_cc_sums, (blockcount+1)*sizeof(float4), cudaMemcpyDeviceToHost);
01493 cudaFree(dev_cc_sums);
01494
01495 float *lcc = new float[blockcount+1];
01496 cudaMemcpy(lcc, dev_lcc, (blockcount+1)*sizeof(float), cudaMemcpyDeviceToHost);
01497 cudaFree(dev_lcc);
01498
01499 int *lsize = new int[blockcount+1];
01500 cudaMemcpy(lsize, dev_lsize, (blockcount+1)*sizeof(int), cudaMemcpyDeviceToHost);
01501 cudaFree(dev_lsize);
01502
01503 float *CC = new float[blockcount+1];
01504 cudaMemcpy(CC, dev_CC, (blockcount+1)*sizeof(float), cudaMemcpyDeviceToHost);
01505 cudaFree(dev_CC);
01506
01507 #if 0
01508 float4 tmp_cc_sums = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
01509 float tmp_lcc = 0.0f;
01510 int tmp_lsize = 0;
01511 for (int j=0; j<blockcount; j++) {
01512 tmp_cc_sums.x += cc_sums[j].x;
01513 tmp_cc_sums.y += cc_sums[j].y;
01514 tmp_cc_sums.z += cc_sums[j].z;
01515 tmp_cc_sums.w += cc_sums[j].w;
01516 tmp_lcc += lcc[j];
01517 tmp_lsize += lsize[j];
01518 }
01519
01520 if (verbose) {
01521 printf("CC sums:\n");
01522 printf(" mean_ref: %f\n", tmp_cc_sums.x);
01523 printf(" mean_syn: %f\n", tmp_cc_sums.y);
01524 printf(" stddev_ref: %f\n", tmp_cc_sums.z);
01525 printf(" stddev_syn: %f\n", tmp_cc_sums.w);
01526 printf(" lcc: %f\n", tmp_lcc);
01527 printf(" lsize: %d\n", tmp_lsize);
01528
01529 printf("Kernel CC sums:\n");
01530 printf(" mean_ref: %f\n", cc_sums[blockcount].x);
01531 printf(" mean_syn: %f\n", cc_sums[blockcount].y);
01532 printf(" stddev_ref: %f\n", cc_sums[blockcount].z);
01533 printf(" stddev_syn: %f\n", cc_sums[blockcount].w);
01534 printf(" lcc: %f\n", lcc[blockcount]);
01535 printf(" lsize: %d\n", lsize[blockcount]);
01536 }
01537 #endif
01538
01539 float mean_ref = cc_sums[blockcount].x / lsize[blockcount];
01540 float mean_synth = cc_sums[blockcount].y / lsize[blockcount];
01541 float stddev_ref = sqrtf(cc_sums[blockcount].z / lsize[blockcount] - mean_ref*mean_ref);
01542 float stddev_synth = sqrtf(cc_sums[blockcount].w / lsize[blockcount] - mean_synth*mean_synth);
01543 float cc = (lcc[blockcount] - lsize[blockcount]*mean_ref*mean_synth) /
01544 (lsize[blockcount] * stddev_ref * stddev_synth);
01545
01546 #if 1
01547 if (verbose) {
01548 printf("Final MDFF Kernel CC results:\n");
01549 printf(" mean_ref: %f\n", mean_ref);
01550 printf(" mean_synth: %f\n", mean_synth);
01551 printf(" stdev_ref: %f\n", stddev_ref);
01552 printf(" stdev_synth: %f\n", stddev_synth);
01553 printf(" CC: %f\n", cc);
01554 }
01555 #endif
01556
01557 delete [] cc_sums;
01558 delete [] lcc;
01559 delete [] lsize;
01560 }
01561 #endif
01562
01563
01564 cudaMemcpy(volmap, devdensity, volmemsz, cudaMemcpyDeviceToHost);
01565 cudaFree(devdensity);
01566 #if 1
01567 if (devrefmap && devdiffmap) {
01568 cudaMemcpy(diffmap, devdiffmap, volmemsz, cudaMemcpyDeviceToHost);
01569 cudaFree(devrefmap);
01570 cudaFree(devdiffmap);
01571 }
01572 #endif
01573
01574
01575 vmd_cuda_destroy_accel(xyzr_d, sorted_xyzr_d, cellStartEnd_d);
01576
01577 err = cudaGetLastError();
01578
01579 wkf_timer_stop(globaltimer);
01580 double totalruntime = wkf_timer_time(globaltimer);
01581 wkf_timer_destroy(globaltimer);
01582
01583 if (err != cudaSuccess) {
01584 printf("CUDA error: %s, %s line %d\n", cudaGetErrorString(err), __FILE__, __LINE__);
01585 return -1;
01586 }
01587
01588 if (verbose) {
01589 printf("MDFF CC GPU runtime: %.3f [sort: %.3f density %.3f diff %.3f copy: %.3f]\n",
01590 totalruntime, sorttime, densitykerneltime-sorttime,
01591 diffkerneltime-densitykerneltime, totalruntime-densitykerneltime);
01592 }
01593
01594 return 0;
01595 }
01596
01597
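// Host-side driver that computes only the cross correlation (and optionally
// a coarse per-thread-block spatial CC volume) against a reference map that
// is already resident in GPU memory (devrefmap).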
01598 static int vmd_cuda_cc_calc(long int natoms,
01599 float3 acoriginoffset,
01600 int3 acvolsz,
01601 const float *xyzr_f,
01602 int3 volsz, float maxrad,
01603 float radscale, float gridspacing,
01604 float gausslim,
01605 int3 refvolsz,
01606 int3 refoffset,
01607 const float *devrefmap,
01608 float ccthreshdensity,
01609 float &result_cc,
01610 const float *origin,
01611 VolumetricData **spatialccvol,
01612 int verbose) {
01613 float4 *xyzr_d = NULL;
01614 float4 *sorted_xyzr_d = NULL;
01615 uint2 *cellStartEnd_d = NULL;
01616
01617 wkf_timerhandle globaltimer = wkf_timer_create();
01618 wkf_timer_start(globaltimer);
01619
01620 if (!devrefmap)
01621 return -1;
01622
01623 cudaError_t err;
01624 if (check_gpu_compute20(err) != 0)
01625 return -1;
01626
01627 double refcopytime = wkf_timer_timenow(globaltimer);
01628
01629 int3 accelcells = make_int3(1, 1, 1);
01630 float acgridspacing = 0.0f;
01631 if (vmd_cuda_build_accel(err, natoms, acvolsz, gausslim, radscale, maxrad,
01632 gridspacing, xyzr_f, xyzr_d, sorted_xyzr_d,
01633 cellStartEnd_d, acgridspacing, accelcells) != 0) {
01634 return -1;
01635 }
01636
01637 double sorttime = wkf_timer_timenow(globaltimer);
01638
01639 dim3 Bsz, Gsz;
01640 vmd_cuda_kernel_launch_dims(volsz, Bsz, Gsz);
01641
01642
01643
01644
01645 unsigned int blockcount = Gsz.x * Gsz.y * Gsz.z;
01646 float4 *dev_cc_sums = NULL;
01647 float *dev_lcc = NULL;
01648 int *dev_lsize = NULL;
01649 float *dev_tbCC = NULL;
01650
01651 cudaMalloc((void**)&dev_cc_sums, (blockcount+1) * sizeof(float4));
01652 cudaMalloc((void**)&dev_lcc, (blockcount+1) * sizeof(float));
01653
01654 if (spatialccvol != NULL) {
01655 cudaMalloc((void**)&dev_tbCC, (blockcount+1) * sizeof(float));
01656 }
01657
01658 if (cudaMalloc((void**)&dev_lsize, (blockcount+1) * sizeof(int)) != cudaSuccess) {
01659 err = cudaGetLastError();
01660 return -1;
01661 }
01662
01663 gaussdensity_cc<<<Gsz, Bsz, 0>>>(blockcount,
01664 natoms,
01665 sorted_xyzr_d,
01666 volsz,
01667 accelcells,
01668 acoriginoffset,
01669 acgridspacing,
01670 1.0f / acgridspacing,
01671 cellStartEnd_d,
01672 gridspacing, 0,
01673 ccthreshdensity,
01674 refvolsz,
01675 refoffset,
01676 devrefmap,
01677 dev_cc_sums,
01678 dev_lcc,
01679 dev_lsize,
01680 dev_tbCC);
01681 cudaDeviceSynchronize();
01682 double cctime = wkf_timer_timenow(globaltimer);
01683
01684 float4 *cc_sums = new float4[blockcount+1];
01685 cudaMemcpy(cc_sums, dev_cc_sums, (blockcount+1)*sizeof(float4), cudaMemcpyDeviceToHost);
01686 cudaFree(dev_cc_sums);
01687
01688 float *lcc = new float[blockcount+1];
01689 cudaMemcpy(lcc, dev_lcc, (blockcount+1)*sizeof(float), cudaMemcpyDeviceToHost);
01690 cudaFree(dev_lcc);
01691
01692 int *lsize = new int[blockcount+1];
01693 cudaMemcpy(lsize, dev_lsize, (blockcount+1)*sizeof(int), cudaMemcpyDeviceToHost);
01694 cudaFree(dev_lsize);
01695
01696 float mean_ref = cc_sums[blockcount].x / lsize[blockcount];
01697 float mean_synth = cc_sums[blockcount].y / lsize[blockcount];
01698 float stddev_ref = sqrtf(cc_sums[blockcount].z / lsize[blockcount] - mean_ref*mean_ref);
01699 float stddev_synth = sqrtf(cc_sums[blockcount].w / lsize[blockcount] - mean_synth*mean_synth);
01700
01701 float cc = 0.0f;
01702
01703
01704
01705 if (lsize[blockcount] > 0) {
01706 if (stddev_ref == 0.0f || stddev_synth == 0.0f) {
01707 printf("WARNING: Ill-conditioned CC calc. due to zero std. deviation:\n");
01708 printf("WARNING: stddev_ref: %f stddev_synth: %f\n", stddev_ref, stddev_synth);
01709 cc = 0.0f;
01710 } else {
01711 cc = (lcc[blockcount] - lsize[blockcount]*mean_ref*mean_synth) /
01712 (lsize[blockcount] * stddev_ref * stddev_synth);
01713 }
01714 }
01715
01716 if (spatialccvol != NULL) {
01717 char mapname[64] = { "mdff spatial CC map" };
01718 float spxaxis[3] = {1.0f, 0.0f, 0.0f};
01719 float spyaxis[3] = {0.0f, 1.0f, 0.0f};
01720 float spzaxis[3] = {0.0f, 0.0f, 1.0f};
01721 spxaxis[0] = Bsz.x * Gsz.x * gridspacing;
01722 spyaxis[1] = Bsz.y * Gsz.y * gridspacing;
01723 spzaxis[2] = Bsz.z * Gsz.z * gridspacing * GUNROLL;
01724
01725 if (verbose) {
01726 printf("Spatial CC map: blockcount[%d] x: %d y: %d z: %d cnt: %d\n",
01727 blockcount, Gsz.x, Gsz.y, Gsz.z, (Gsz.x*Gsz.y*Gsz.z));
01728 }
01729
01730 float *spatialccmap = new float[blockcount];
01731 cudaMemcpy(spatialccmap, dev_tbCC, blockcount*sizeof(float), cudaMemcpyDeviceToHost);
01732 cudaFree(dev_tbCC);
01733
01734 *spatialccvol = new VolumetricData(mapname, origin,
01735 spxaxis, spyaxis, spzaxis,
01736 Gsz.x, Gsz.y, Gsz.z, spatialccmap);
01737 }
01738
01739 #if 0
01740 if (verbose) {
01741 printf("Final MDFF Kernel CC results:\n");
01742 printf(" GPU CC sum data:\n");
01743 printf(" mean_ref: %f\n", cc_sums[blockcount].x);
01744 printf(" mean_synth: %f\n", cc_sums[blockcount].y);
01745 printf(" squares_ref: %f\n", cc_sums[blockcount].z);
01746 printf(" squares_synth: %f\n", cc_sums[blockcount].w);
01747 printf(" lcc: %f\n", lcc[blockcount]);
01748 printf(" lsize: %d\n", lsize[blockcount]);
01749 printf(" Reduced/finalized GPU CC data:\n");
01750 printf(" mean_ref: %f\n", mean_ref);
01751 printf(" mean_synth: %f\n", mean_synth);
01752 printf(" stdev_ref: %f\n", stddev_ref);
01753 printf(" stdev_synth: %f\n", stddev_synth);
01754 printf(" voxels used: %d (total %d)\n",
01755 lsize[blockcount], volsz.x*volsz.y*volsz.z);
01756 printf(" CC: %f\n", cc);
01757 }
01758 #endif
01759
01760 result_cc = cc;
01761
01762 delete [] cc_sums;
01763 delete [] lcc;
01764 delete [] lsize;
01765
01766
01767 vmd_cuda_destroy_accel(xyzr_d, sorted_xyzr_d, cellStartEnd_d);
01768
01769 err = cudaGetLastError();
01770
01771 wkf_timer_stop(globaltimer);
01772 double totalruntime = wkf_timer_time(globaltimer);
01773 wkf_timer_destroy(globaltimer);
01774
01775 if (err != cudaSuccess) {
01776 printf("CUDA error: %s, %s line %d\n", cudaGetErrorString(err), __FILE__, __LINE__);
01777 return -1;
01778 }
01779
01780 if (verbose) {
01781 printf("MDFF CC GPU runtime: %.3f [refcopy: %.3f sort: %.3f cc: %.3f copy: %.3f]\n",
01782 totalruntime, refcopytime, sorttime-refcopytime, cctime-sorttime, totalruntime-cctime);
01783 }
01784
01785 return 0;
01786 }
01787
01788
01789 #if 0
01790 static int vmd_cuda_gaussdensity_sumabsdiff(long int natoms,
01791 const float *xyzr_f,
01792 float *volmap, int3 volsz, float maxrad,
01793 float radscale, float gridspacing,
01794 float gausslim) {
01795 float4 *xyzr_d = NULL;
01796 float4 *sorted_xyzr_d = NULL;
01797 uint2 *cellStartEnd_d = NULL;
01798
01799 wkf_timerhandle globaltimer = wkf_timer_create();
01800 wkf_timer_start(globaltimer);
01801
01802 cudaError_t err;
01803 if (check_gpu_compute20(err) != 0)
01804 return -1;
01805
01806
01807
01808
01809 unsigned long volmemsz = volsz.x * volsz.y * volsz.z * sizeof(float);
01810 float *devdensity = NULL;
01811 if (cudaMalloc((void**)&devdensity, volmemsz) != cudaSuccess) {
01812 err = cudaGetLastError();
01813 return -1;
01814 }
01815
01816 int3 accelcells = make_int3(1, 1, 1);
01817 float acgridspacing = 0.0f;
01818 if (vmd_cuda_build_accel(err, natoms, volsz, gausslim, radscale, maxrad,
01819 gridspacing, xyzr_f, xyzr_d, sorted_xyzr_d,
01820 cellStartEnd_d, acgridspacing, accelcells) != 0) {
01821 return -1;
01822 }
01823
01824 double sorttime = wkf_timer_timenow(globaltimer);
01825
01826 dim3 Bsz, Gsz;
01827 vmd_cuda_kernel_launch_dims(volsz, Bsz, Gsz);
01828
01829 #if 0
01830 int3 refoffset = make_int3(0, 0, 0);
01831 gaussdensity_sumabsdiff<<<Gsz, Bsz, 0>>>(natoms,
01832 sorted_xyzr_d,
01833 volsz,
01834 accelcells,
01835 acgridspacing,
01836 1.0f / acgridspacing,
01837 cellStartEnd_d,
01838 gridspacing, 0,
01839 refoffset,
01840 #if 1
01841 NULL, NULL);
01842 #else
01843 densitygrid,
01844 densitysumabsdiff);
01845 #endif
01846 #endif
01847
01848 cudaDeviceSynchronize();
01849 double densitykerneltime = wkf_timer_timenow(globaltimer);
01850 cudaMemcpy(volmap, devdensity, volmemsz, cudaMemcpyDeviceToHost);
01851 cudaFree(devdensity);
01852
01853
01854 vmd_cuda_destroy_accel(xyzr_d, sorted_xyzr_d, cellStartEnd_d);
01855
01856 err = cudaGetLastError();
01857
01858 wkf_timer_stop(globaltimer);
01859 double totalruntime = wkf_timer_time(globaltimer);
01860 wkf_timer_destroy(globaltimer);
01861
01862 if (err != cudaSuccess) {
01863 printf("CUDA error: %s, %s line %d\n", cudaGetErrorString(err), __FILE__, __LINE__);
01864 return -1;
01865 }
01866
01867 printf("MDFF absdiff GPU runtime: %.3f [sort: %.3f density %.3f copy: %.3f]\n",
01868 totalruntime, sorttime, densitykerneltime-sorttime,
01869 totalruntime-densitykerneltime);
01870
01871 return 0;
01872 }
01873 #endif
01874
01875
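// Map the user-visible quality setting (0..3) to the gaussian cutoff
// multiplier used when sizing the acceleration grid; higher quality settings
// evaluate each gaussian out to a larger multiple of its width.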
01876 static float gausslim_quality(int quality) {
01877
01878 float gausslim = 2.0f;
01879 switch (quality) {
01880 case 3: gausslim = 4.0f; break;
01881
01882 case 2: gausslim = 3.0f; break;
01883
01884 case 1: gausslim = 2.5f; break;
01885
01886 case 0:
01887 default: gausslim = 2.0f;
01888 break;
01889 }
01890
01891 return gausslim;
01892 }
01893
01894
01895
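// Determine the density map origin, axes, and voxel counts from the padded
// bounding box of the selected atoms, using a radius- and radscale-dependent
// margin so truncated gaussians are not clipped at the map edges.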
01896 static int calc_density_bounds(const AtomSel *sel, MoleculeList *mlist,
01897 int verbose,
01898 int quality, float radscale, float gridspacing,
01899 float &maxrad, float *origin,
01900 float *xaxis, float *yaxis, float *zaxis,
01901 int3 & volsz) {
01902 Molecule *m = mlist->mol_from_id(sel->molid());
01903 const float *atompos = sel->coordinates(mlist);
01904 const float *atomradii = m->extraflt.data("radius");
01905
01906 vec_zero(origin);
01907 vec_zero(xaxis);
01908 vec_zero(yaxis);
01909 vec_zero(zaxis);
01910
01911
01912 float minrad = 1.0f;
01913 maxrad = 1.0f;
01914 m->get_radii_minmax(minrad, maxrad);
01915
01916 float fmin[3], fmax[3];
01917 minmax_selected_3fv_aligned(atompos, sel->on, sel->num_atoms,
01918 sel->firstsel, sel->lastsel, fmin, fmax);
01919
01920 float minx, miny, minz, maxx, maxy, maxz;
01921 minx = fmin[0]; miny = fmin[1]; minz = fmin[2];
01922 maxx = fmax[0]; maxy = fmax[1]; maxz = fmax[2];
01923
01924 if (verbose) {
01925 printf("DenBBox: rminmax %f %f min: %f %f %f max: %f %f %f\n",
01926 minrad, maxrad, minx, miny, minz, maxx, maxy, maxz);
01927 }
01928
01929 float mincoord[3], maxcoord[3];
01930 mincoord[0] = minx; mincoord[1] = miny; mincoord[2] = minz;
01931 maxcoord[0] = maxx; maxcoord[1] = maxy; maxcoord[2] = maxz;
01932
01933
01934
01935 float gridpadding = radscale * maxrad * 1.70f;
01936 float padrad = gridpadding;
01937 padrad = 0.65f * sqrtf(4.0f/3.0f*((float) VMD_PI)*padrad*padrad*padrad);
01938 gridpadding = MAX(gridpadding, padrad);
01939
01940 if (verbose) {
01941 printf("MDFFden: R*%.1f, H=%.1f Pad: %.1f minR: %.1f maxR: %.1f)\n",
01942 radscale, gridspacing, gridpadding, minrad, maxrad);
01943 }
01944
01945 mincoord[0] -= gridpadding;
01946 mincoord[1] -= gridpadding;
01947 mincoord[2] -= gridpadding;
01948 maxcoord[0] += gridpadding;
01949 maxcoord[1] += gridpadding;
01950 maxcoord[2] += gridpadding;
01951
01952
01953 volsz.x = (int) ceil((maxcoord[0]-mincoord[0]) / gridspacing);
01954 volsz.y = (int) ceil((maxcoord[1]-mincoord[1]) / gridspacing);
01955 volsz.z = (int) ceil((maxcoord[2]-mincoord[2]) / gridspacing);
01956
01957
01958 xaxis[0] = (volsz.x-1) * gridspacing;
01959 yaxis[1] = (volsz.y-1) * gridspacing;
01960 zaxis[2] = (volsz.z-1) * gridspacing;
01961 maxcoord[0] = mincoord[0] + xaxis[0];
01962 maxcoord[1] = mincoord[1] + yaxis[1];
01963 maxcoord[2] = mincoord[2] + zaxis[2];
01964
01965 if (verbose) {
01966 printf(" GridSZ: (%4d %4d %4d) BBox: (%.1f %.1f %.1f)->(%.1f %.1f %.1f)\n",
01967 volsz.x, volsz.y, volsz.z,
01968 mincoord[0], mincoord[1], mincoord[2],
01969 maxcoord[0], maxcoord[1], maxcoord[2]);
01970 }
01971
01972 vec_copy(origin, mincoord);
01973 return 0;
01974 }
01975
01976
01977 static int map_uniform_spacing(double xax, double yax, double zax,
01978 int szx, int szy, int szz) {
01979
01980 double xdelta = xax / (szx - 1);
01981 double ydelta = yax / (szy - 1);
01982 double zdelta = zax / (szz - 1);
01983
01984 if ((xdelta == ydelta) && (ydelta == zdelta))
01985 return 1;
01986
01987
01988
01989 const double relerrlim = 1e-7;
01990 double dxydelta = fabs(xdelta-ydelta);
01991 double dyzdelta = fabs(ydelta-zdelta);
01992 double dxzdelta = fabs(xdelta-zdelta);
  if (((dxydelta / xdelta) < relerrlim) && ((dxydelta / ydelta) < relerrlim) &&
01994 ((dyzdelta / ydelta) < relerrlim) && ((dyzdelta / zdelta) < relerrlim) &&
01995 ((dxzdelta / xdelta) < relerrlim) && ((dxzdelta / zdelta) < relerrlim))
01996 return 1;
01997
01998 return 0;
01999 }
02000
02001
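// Clip and align the synthetic map bounds to the reference map lattice so
// that voxels of the two maps coincide, returning the integer voxel offset
// of the synthetic map within the reference map (refoffset).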
02002 static int calc_density_bounds_overlapping_map(int verbose, float &gridspacing,
02003 float *origin, float *xaxis, float *yaxis, float *zaxis,
02004 int3 &volsz, int3 &refoffset,
02005 const VolumetricData * refmap) {
02006
02007 float xdelta = xaxis[0] / (volsz.x - 1);
02008 float ydelta = yaxis[1] / (volsz.y - 1);
02009 float zdelta = zaxis[2] / (volsz.z - 1);
02010
02011 if (verbose) {
02012 printf("calc_overlap: delta %f %f %f, gs: %f vsz: %d %d %d\n",
02013 xdelta, ydelta, zdelta, gridspacing, volsz.x, volsz.y, volsz.z);
02014
02015 if (gridspacing != xdelta) {
02016 printf("calc_overlap: WARNING grid spacing != ref map spacing: (%f != %f)\n", gridspacing, xdelta);
02017 }
02018 }
02019
02020 float refxdelta = float(refmap->xaxis[0] / (refmap->xsize - 1));
02021 float refydelta = float(refmap->yaxis[1] / (refmap->ysize - 1));
02022 float refzdelta = float(refmap->zaxis[2] / (refmap->zsize - 1));
02023 if (verbose) {
02024 printf("calc_overlap: refdelta %f %f %f, gs: %f vsz: %d %d %d\n",
02025 refxdelta, refydelta, refzdelta, gridspacing,
02026 refmap->xsize, refmap->ysize, refmap->zsize);
02027 }
02028
02029
02030 float fvoxorigin[3];
02031 fvoxorigin[0] = float((origin[0] - refmap->origin[0]) / refxdelta);
02032 fvoxorigin[1] = float((origin[1] - refmap->origin[1]) / refydelta);
02033 fvoxorigin[2] = float((origin[2] - refmap->origin[2]) / refzdelta);
02034
02035 refoffset.x = int((fvoxorigin[0] < 0) ? 0 : floor(fvoxorigin[0]));
02036 refoffset.y = int((fvoxorigin[1] < 0) ? 0 : floor(fvoxorigin[1]));
02037 refoffset.z = int((fvoxorigin[2] < 0) ? 0 : floor(fvoxorigin[2]));
02038 if (verbose) {
02039 printf("calc_overlap: refoffset: %d %d %d\n",
02040 refoffset.x, refoffset.y, refoffset.z);
02041 }
02042
02043 float maxcorner[3];
02044 maxcorner[0] = origin[0] + xaxis[0];
02045 maxcorner[1] = origin[1] + yaxis[1];
02046 maxcorner[2] = origin[2] + zaxis[2];
02047
02048 float refmaxcorner[3];
02049 refmaxcorner[0] = float(refmap->origin[0] + refmap->xaxis[0]);
02050 refmaxcorner[1] = float(refmap->origin[1] + refmap->yaxis[1]);
02051 refmaxcorner[2] = float(refmap->origin[2] + refmap->zaxis[2]);
02052
02053 maxcorner[0] = (maxcorner[0] > refmaxcorner[0]) ? refmaxcorner[0] : maxcorner[0];
02054 maxcorner[1] = (maxcorner[1] > refmaxcorner[1]) ? refmaxcorner[1] : maxcorner[1];
02055 maxcorner[2] = (maxcorner[2] > refmaxcorner[2]) ? refmaxcorner[2] : maxcorner[2];
02056
02057 origin[0] = float(refmap->origin[0] + refoffset.x * refxdelta);
02058 origin[1] = float(refmap->origin[1] + refoffset.y * refydelta);
02059 origin[2] = float(refmap->origin[2] + refoffset.z * refzdelta);
02060
02061 volsz.x = (int) round((maxcorner[0] - origin[0]) / refxdelta)+1;
02062 volsz.y = (int) round((maxcorner[1] - origin[1]) / refydelta)+1;
02063 volsz.z = (int) round((maxcorner[2] - origin[2]) / refzdelta)+1;
02064
02065
02066 xaxis[0] = ((volsz.x-1) * refxdelta);
02067 yaxis[1] = ((volsz.y-1) * refydelta);
02068 zaxis[2] = ((volsz.z-1) * refzdelta);
02069
02070 return 0;
02071 }
02072
02073
02074
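// Pack the coordinates and radii of the on-selection atoms into a contiguous
// x/y/z/radius array, translating the positions into the local frame defined
// by 'origin'.  The caller owns the returned buffer and must free() it.
// Note that the malloc() result is not checked; callers assume it succeeds.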
02075 static float *build_xyzr_from_coords(const AtomSel *sel, const float *atompos,
02076 const float *atomradii, const float *origin) {
02077
02078 int ind = sel->firstsel * 3;
02079 int ind4=0;
02080
02081 float *xyzr = (float *) malloc(sel->selected * sizeof(float) * 4);
02082
02083
02084 int i;
02085 for (i=sel->firstsel; i <= sel->lastsel; i++) {
02086 if (sel->on[i]) {
02087 const float *fp = atompos + ind;
02088 xyzr[ind4 ] = fp[0]-origin[0];
02089 xyzr[ind4 + 1] = fp[1]-origin[1];
02090 xyzr[ind4 + 2] = fp[2]-origin[2];
02091 xyzr[ind4 + 3] = atomradii[i];
02092 ind4 += 4;
02093 }
02094 ind += 3;
02095 }
02096
02097 return xyzr;
02098 }
02099
02100
02101
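// Convenience wrapper that looks up the selection's current coordinates and
// per-atom "radius" field from the molecule, then builds the packed xyzr
// array via build_xyzr_from_coords().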
02102 static float *build_xyzr_from_sel(const AtomSel *sel, MoleculeList *mlist,
02103 const float *origin) {
02104 Molecule *m = mlist->mol_from_id(sel->molid());
02105 const float *atompos = sel->coordinates(mlist);
02106 const float *atomradii = m->extraflt.data("radius");
02107
02108 float *xyzr = build_xyzr_from_coords(sel, atompos, atomradii, origin);
02109
02110 return xyzr;
02111 }
02112
02113
02114
02115
02116
02117
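// Compute a synthetic density map for the selected atoms on the GPU.  When a
// reference map is supplied, the synthetic map is computed on the reference
// map's lattice over the overlapping region; when 'diffvol' is also non-NULL
// (which is only safe if 'refmap' is given, since its data are dereferenced),
// a density difference map relative to the reference is returned as well.
// Returns 0 on success, or -1 if the GPU density calculation fails.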
02118 int vmd_cuda_calc_density(const AtomSel *sel, MoleculeList *mlist,
02119 int quality, float radscale, float gridspacing,
02120 VolumetricData ** synthvol,
02121 const VolumetricData * refmap,
02122 VolumetricData ** diffvol,
02123 int verbose) {
02124 wkf_timerhandle timer = wkf_timer_create();
02125 wkf_timer_start(timer);
02126
02127 float maxrad, acorigin[3], origin[3], xaxis[3], yaxis[3], zaxis[3];
02128 int3 volsz = make_int3(1, 1, 1);
02129 int3 refsz = make_int3(1, 1, 1);
02130 int3 acvolsz = make_int3(1, 1, 1);
02131 int3 refoffset = make_int3(0, 0, 0);
02132
02133 if (refmap) {
02134 gridspacing = float(refmap->xaxis[0] / (refmap->xsize - 1));
02135 if (verbose) {
02136 printf("refmap gridspacing: %f\n", gridspacing);
02137 }
02138 }
02139
02140 calc_density_bounds(sel, mlist, verbose, quality, radscale, gridspacing,
02141 maxrad, origin, xaxis, yaxis, zaxis, volsz);
02142 vec_copy(acorigin, origin);
02143 acvolsz = volsz;
02144
02145 if (verbose) {
02146 printf("dmap init orig: %f %f %f axes: %f %f %f sz: %d %d %d\n",
02147 origin[0], origin[1], origin[2],
02148 xaxis[0], yaxis[1], zaxis[2],
02149 volsz.x, volsz.y, volsz.z);
02150 }
02151
02152 if (refmap) {
02153 calc_density_bounds_overlapping_map(verbose, gridspacing, origin,
02154 xaxis, yaxis, zaxis, volsz,
02155 refoffset, refmap);
02156
02157 if (verbose) {
02158 printf("ref dvol orig: %f %f %f axes: %f %f %f sz: %d %d %d\n",
02159 refmap->origin[0], refmap->origin[1], refmap->origin[2],
02160 refmap->xaxis[0], refmap->yaxis[1], refmap->zaxis[2],
02161 refmap->xsize, refmap->ysize, refmap->zsize);
02162
02163 printf("dmap rmap orig: %f %f %f axes: %f %f %f sz: %d %d %d\n",
02164 origin[0], origin[1], origin[2],
02165 xaxis[0], yaxis[1], zaxis[2],
02166 volsz.x, volsz.y, volsz.z);
02167 }
02168 }
02169
02170 float3 acoriginoffset = make_float3(origin[0] - acorigin[0],
02171 origin[1] - acorigin[1],
02172 origin[2] - acorigin[2]);
02173
02174 float *xyzr = build_xyzr_from_sel(sel, mlist, acorigin);
02175 float gausslim = gausslim_quality(quality);
02176
02177 double pretime = wkf_timer_timenow(timer);
02178 float *volmap = new float[volsz.x*volsz.y*volsz.z];
02179
02180 int rc=0;
02181 if (!diffvol) {
02182 rc = vmd_cuda_gaussdensity_calc(sel->selected, acoriginoffset, acvolsz,
02183 xyzr, volmap, volsz,
02184 maxrad, radscale, gridspacing, gausslim,
02185 refsz, refoffset, NULL, NULL,
02186 verbose);
02187
02188 if (rc) {
02189 free(xyzr);
02190 delete [] volmap;
02191 return -1;
02192 }
02193 } else {
02194
02195 float *diffmap = new float[volsz.x*volsz.y*volsz.z];
02196
02197 refsz = make_int3(refmap->xsize, refmap->ysize, refmap->zsize);
02198 rc = vmd_cuda_gaussdensity_calc(sel->selected, acoriginoffset, acvolsz,
02199 xyzr, volmap, volsz,
02200 maxrad, radscale, gridspacing, gausslim,
02201 refsz, refoffset, refmap->data, diffmap,
02202 verbose);
02203
02204
02205 if (rc) {
02206 free(xyzr);
02207 delete [] volmap;
02208 delete [] diffmap;
02209 return -1;
02210 }
02211
02212 char diffdataname[64] = { "mdff density difference map" };
02213 *diffvol = new VolumetricData(diffdataname, origin, xaxis, yaxis, zaxis,
02214 volsz.x, volsz.y, volsz.z, diffmap);
02215 }
02216
02217 char dataname[64] = { "mdff synthetic density map" };
02218 *synthvol = new VolumetricData(dataname, origin, xaxis, yaxis, zaxis,
02219 volsz.x, volsz.y, volsz.z, volmap);
02220
02221 double voltime = wkf_timer_timenow(timer);
02222 free(xyzr);
02223
02224 if (verbose) {
02225 char strmsg[1024];
02226 sprintf(strmsg, "MDFFcc: %.3f [pre:%.3f vol:%.3f]",
02227 voltime, pretime, voltime-pretime);
02228 msgInfo << strmsg << sendmsg;
02229 }
02230 wkf_timer_destroy(timer);
02231
02232 return 0;
02233 }
02234
02235
02236
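// The following block is compiled out (#if 0); it appears to be an older
// standalone cross-correlation-only entry point whose functionality is now
// covered by vmd_cuda_compare_sel_refmap() below.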
02237 #if 0
02238 static int vmd_calc_cc(const AtomSel *sel, MoleculeList *mlist,
02239 int quality, float radscale, float gridspacing,
02240 const VolumetricData * refmap,
02241 int verbose, float ccthreshdensity, float &result_cc) {
02242 wkf_timerhandle timer = wkf_timer_create();
02243 wkf_timer_start(timer);
02244
02245 cudaError_t err = cudaGetLastError();
02246 if (err != cudaSuccess)
02247 return -1;
02248
02249 if (!refmap)
02250 return -1;
02251
02252 float maxrad, acorigin[3], origin[3], xaxis[3], yaxis[3], zaxis[3];
02253 int3 volsz = make_int3(1, 1, 1);
02254 int3 refsz = make_int3(1, 1, 1);
02255 int3 acvolsz = make_int3(1, 1, 1);
02256 int3 refoffset = make_int3(0, 0, 0);
02257
02258 gridspacing = refmap->xaxis[0] / (refmap->xsize - 1);
02259 calc_density_bounds(sel, mlist, verbose, quality, radscale, gridspacing,
02260 maxrad, origin, xaxis, yaxis, zaxis, volsz);
02261 vec_copy(acorigin, origin);
02262 acvolsz = volsz;
02263
02264 calc_density_bounds_overlapping_map(verbose, gridspacing, origin,
02265 xaxis, yaxis, zaxis, volsz,
02266 refoffset, refmap);
02267
02268 float3 acoriginoffset = make_float3(origin[0] - acorigin[0],
02269 origin[1] - acorigin[1],
02270 origin[2] - acorigin[2]);
02271
02272 float *xyzr = build_xyzr_from_sel(sel, mlist, acorigin);
02273 float gausslim = gausslim_quality(quality);
02274
02275 double pretime = wkf_timer_timenow(timer);
02276 refsz = make_int3(refmap->xsize, refmap->ysize, refmap->zsize);
02277
02278 if (refoffset.x >= refsz.x || refoffset.y >= refsz.y || refoffset.z >= refsz.z) {
02279 printf("MDFF CC: no overlap between synthetic map and reference map!\n");
02280 return -1;
02281 }
02282
02283
02284
02285 float *devrefmap = NULL;
02286 unsigned long refmemsz = refsz.x * refsz.y * refsz.z * sizeof(float);
02287 if (cudaMalloc((void**)&devrefmap, refmemsz) != cudaSuccess) {
02288 err = cudaGetLastError();
02289 return -1;
02290 }
02291 cudaMemcpy(devrefmap, refmap->data, refmemsz, cudaMemcpyHostToDevice);
02292
02293 result_cc = 0.0f;
02294 vmd_cuda_cc_calc(sel->selected, acoriginoffset, acvolsz, xyzr,
02295 volsz, maxrad, radscale, gridspacing,
02296 gausslim, refsz, refoffset, devrefmap,
02297 ccthreshdensity, result_cc, origin, NULL,
02298 verbose);
02299 cudaFree(devrefmap);
02300
02301 double cctime = wkf_timer_timenow(timer);
02302 free(xyzr);
02303
02304 if (verbose) {
02305 char strmsg[1024];
02306 sprintf(strmsg, "MDFFcc: %.3f [pre:%.3f cc:%.3f]",
02307 cctime, pretime, cctime-pretime);
02308 msgInfo << strmsg << sendmsg;
02309 }
02310 wkf_timer_destroy(timer);
02311
02312 return 0;
02313 }
02314
02315 #endif
02316
02317
02318
02319
02320
02321
02322
02323
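// Top-level GPU entry point, apparently used by the "mdffi" Tcl commands
// (judging from the messages below): computes any requested combination of
// the global cross correlation (CC), a spatially resolved CC map, a synthetic
// density map, and a density difference map for the selection against the
// reference map.  Returns -1 so the caller can fall back to the CPU
// implementation when the reference map has non-uniform spacing, when the
// maps do not overlap, or when GPU resources cannot be allocated.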
02324 int vmd_cuda_compare_sel_refmap(const AtomSel *sel, MoleculeList *mlist,
02325 int quality, float radscale, float gridspacing,
02326 const VolumetricData * refmap,
02327 VolumetricData **synthvol,
02328 VolumetricData **diffvol,
02329 VolumetricData **spatialccvol,
02330 float *CC, float ccthreshdensity,
02331 int verbose) {
02332 wkf_timerhandle timer = wkf_timer_create();
02333 wkf_timer_start(timer);
02334
02335   if (!refmap)
02336     return -1;
02337 
02338   if (!map_uniform_spacing(refmap->xaxis[0], refmap->yaxis[1], refmap->zaxis[2],
02339                            refmap->xsize, refmap->ysize, refmap->zsize)) {
02340     if (verbose)
02341       printf("mdffi cc: non-uniform grid spacing unimplemented on GPU, falling back to CPU!\n");
02342     return -1;
02343   }
02344 
02345   cudaError_t err = cudaGetLastError();
02346   if (err != cudaSuccess)
02347     return -1;
02348 
02349
02350 float maxrad, acorigin[3], origin[3], xaxis[3], yaxis[3], zaxis[3];
02351 int3 volsz = make_int3(1, 1, 1);
02352 int3 refsz = make_int3(1, 1, 1);
02353 int3 acvolsz = make_int3(1, 1, 1);
02354 int3 refoffset = make_int3(0, 0, 0);
02355
02356 if (verbose) {
02357 printf("vmd_cuda_compare_sel_refmap():\n");
02358 printf(" refmap xaxis: %f %f %f ",
02359 refmap->xaxis[0],
02360 refmap->xaxis[1],
02361 refmap->xaxis[2]);
02362 printf(" refmap size: %d %d %d\n",
02363 refmap->xsize, refmap->ysize, refmap->zsize);
02364 printf(" gridspacing (orig): %f ", gridspacing);
02365 }
02366 gridspacing = float(refmap->xaxis[0] / (refmap->xsize - 1));
02367 if (verbose) {
02368 printf("(new): %f\n", gridspacing);
02369 }
02370
02371
02372
02373 if (gridspacing == 0.0) {
02374 if (verbose)
02375 printf("GPU gridspacing is zero! bailing out!\n");
02376 return -1;
02377 }
02378
02379 calc_density_bounds(sel, mlist, verbose, quality, radscale, gridspacing,
02380 maxrad, origin, xaxis, yaxis, zaxis, volsz);
02381 vec_copy(acorigin, origin);
02382 acvolsz = volsz;
02383
02384 calc_density_bounds_overlapping_map(verbose, gridspacing, origin,
02385 xaxis, yaxis, zaxis, volsz,
02386 refoffset, refmap);
02387
02388 float3 acoriginoffset = make_float3(origin[0] - acorigin[0],
02389 origin[1] - acorigin[1],
02390 origin[2] - acorigin[2]);
02391
02392 float *xyzr = build_xyzr_from_sel(sel, mlist, acorigin);
02393 float gausslim = gausslim_quality(quality);
02394
02395 double pretime = wkf_timer_timenow(timer);
02396 refsz = make_int3(refmap->xsize, refmap->ysize, refmap->zsize);
02397
02398 if (refoffset.x >= refsz.x || refoffset.y >= refsz.y || refoffset.z >= refsz.z) {
02399 printf("MDFF CC: no overlap between synthetic map and reference map!\n");
02400     free(xyzr); return -1;
02401 }
02402
02403
02404
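  // Upload the full reference map into GPU global memory; the CC and density
  // difference kernels read it directly.  If the allocation fails, bail out
  // so the caller can fall back to the CPU path.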
02405 float *devrefmap = NULL;
02406 unsigned long refmemsz = refsz.x * refsz.y * refsz.z * sizeof(float);
02407 if (cudaMalloc((void**)&devrefmap, refmemsz) != cudaSuccess) {
02408 err = cudaGetLastError();
02409     free(xyzr); return -1;
02410 }
02411 cudaMemcpy(devrefmap, refmap->data, refmemsz, cudaMemcpyHostToDevice);
02412
02413 if (CC != NULL) {
02414 vmd_cuda_cc_calc(sel->selected, acoriginoffset, acvolsz, xyzr,
02415 volsz, maxrad, radscale, gridspacing,
02416 gausslim, refsz, refoffset, devrefmap,
02417 ccthreshdensity, *CC, origin, spatialccvol,
02418 verbose);
02419 }
02420
02421
02422 float *synthmap = NULL;
02423 float *diffmap = NULL;
02424 if (synthvol != NULL || diffvol != NULL) {
02425 if (synthvol != NULL)
02426 synthmap = new float[volsz.x*volsz.y*volsz.z];
02427
02428 if (diffvol != NULL)
02429 diffmap = new float[volsz.x*volsz.y*volsz.z];
02430
02431 vmd_cuda_gaussdensity_calc(sel->selected, acoriginoffset, acvolsz, xyzr,
02432 synthmap, volsz, maxrad, radscale, gridspacing,
02433 gausslim, refsz, refoffset, refmap->data, diffmap,
02434 verbose);
02435 }
02436
02437
02438 if (synthmap != NULL) {
02439 char mapname[64] = { "mdff synthetic density map" };
02440 *synthvol = new VolumetricData(mapname, origin, xaxis, yaxis, zaxis,
02441 volsz.x, volsz.y, volsz.z, synthmap);
02442 }
02443
02444 if (diffmap != NULL) {
02445     char mapname[64] = { "mdff density difference map" };
02446 *diffvol = new VolumetricData(mapname, origin, xaxis, yaxis, zaxis,
02447 volsz.x, volsz.y, volsz.z, diffmap);
02448 }
02449
02450 cudaFree(devrefmap);
02451
02452 double cctime = wkf_timer_timenow(timer);
02453 free(xyzr);
02454
02455 if (verbose) {
02456 char strmsg[1024];
02457 sprintf(strmsg, "MDFF comp: %.3f [pre:%.3f cc:%.3f]",
02458 cctime, pretime, cctime-pretime);
02459 msgInfo << strmsg << sendmsg;
02460 }
02461 wkf_timer_destroy(timer);
02462
02463 return 0;
02464 }
02465
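/*
 * Illustrative sketch (not part of the build): how a caller might drive
 * vmd_cuda_compare_sel_refmap() to obtain just the global cross correlation.
 * The selection, molecule list, reference map, and the quality/radscale/
 * gridspacing/ccthreshdensity/verbose parameters are assumed to come from the
 * usual VMD command layer; all local variable names here are made up.
 *
 *   float cc = 0.0f;
 *   int rc = vmd_cuda_compare_sel_refmap(sel, app->moleculeList,
 *                                        quality, radscale, gridspacing,
 *                                        refmap,
 *                                        NULL,   // no synthetic map requested
 *                                        NULL,   // no difference map requested
 *                                        NULL,   // no spatial CC map requested
 *                                        &cc, ccthreshdensity, verbose);
 *   if (rc != 0) {
 *     // fall back to the CPU MDFF implementation
 *   }
 */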
02466
02467 #if 0
02468
02469 static int vmd_test_sumabsdiff(const AtomSel *sel, MoleculeList *mlist,
02470 int quality, float radscale, float gridspacing,
02471 const VolumetricData * refmap, int verbose) {
02472 wkf_timerhandle timer = wkf_timer_create();
02473 wkf_timer_start(timer);
02474
02475 float maxrad, origin[3], xaxis[3], yaxis[3], zaxis[3];
02476 int3 volsz = make_int3(1, 1, 1);
02477 int3 refoffset = make_int3(0, 0, 0);
02478
02479 calc_density_bounds(sel, mlist, verbose, quality, radscale, gridspacing,
02480 maxrad, origin, xaxis, yaxis, zaxis, volsz);
02481
02482 if (refmap) {
02483 printf("dmap init orig: %f %f %f axes: %f %f %f sz: %d %d %d\n",
02484 origin[0], origin[1], origin[2],
02485 xaxis[0], yaxis[1], zaxis[2],
02486 volsz.x, volsz.y, volsz.z);
02487
02488 calc_density_bounds_overlapping_map(1, gridspacing, origin,
02489 xaxis, yaxis, zaxis, volsz,
02490 refoffset, refmap);
02491
02492 printf("ref dvol orig: %f %f %f axes: %f %f %f sz: %d %d %d\n",
02493 refmap->origin[0], refmap->origin[1], refmap->origin[2],
02494 refmap->xaxis[0], refmap->yaxis[1], refmap->zaxis[2],
02495 refmap->xsize, refmap->ysize, refmap->zsize);
02496
02497 printf("dmap rmap orig: %f %f %f axes: %f %f %f sz: %d %d %d\n",
02498 origin[0], origin[1], origin[2],
02499 xaxis[0], yaxis[1], zaxis[2],
02500 volsz.x, volsz.y, volsz.z);
02501 }
02502
02503 float *xyzr = build_xyzr_from_sel(sel, mlist, origin);
02504 float gausslim = gausslim_quality(quality);
02505
02506 double pretime = wkf_timer_timenow(timer);
02507
02508 #if 0
02509 float *volmap = new float[volsz.x*volsz.y*volsz.z];
02510 char dataname[64] = { "mdff synthetic density map" };
02511
02512 vmd_cuda_gaussdensity_calc(sel->selected, xyzr, volmap, volsz,
02513 maxrad, radscale, gridspacing, gausslim, verbose);
02514
02515 *synthvol = new VolumetricData(dataname, origin, xaxis, yaxis, zaxis,
02516 volsz.x, volsz.y, volsz.z, volmap);
02517 #endif
02518
02519 double voltime = wkf_timer_timenow(timer);
02520 free(xyzr);
02521
02522 if (verbose) {
02523 char strmsg[1024];
02524 sprintf(strmsg, "MDFFcc: %.3f [pre:%.3f vol:%.3f]",
02525 voltime, pretime, voltime-pretime);
02526 msgInfo << strmsg << sendmsg;
02527 }
02528 wkf_timer_destroy(timer);
02529
02530 return 0;
02531 }
02532 #endif
02533
02534
02535
02536