00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00042 #include <stdio.h>
00043 #include <stdlib.h>
00044 #include <string.h>
00045 #include <cuda.h>
00046 #if CUDART_VERSION >= 9000
00047 #include <cuda_fp16.h>
00048 #endif
00049
00050 #if CUDART_VERSION < 4000
00051 #error The VMD QuickSurf feature requires CUDA 4.0 or later
00052 #endif
00053
00054 #include "Inform.h"
00055 #include "utilities.h"
00056 #include "WKFThreads.h"
00057 #include "WKFUtils.h"
00058 #include "CUDAKernels.h"
00059 #include "CUDASpatialSearch.h"
00060 #include "CUDAMarchingCubes.h"
00061 #include "CUDAQuickSurf.h"
00062
00063 #include "DispCmds.h"
00064 #include "VMDDisplayList.h"
00065
00066 #if 1
00067 #define CUERR { cudaError_t err; \
00068 if ((err = cudaGetLastError()) != cudaSuccess) { \
00069 printf("CUDA error: %s, %s line %d\n", cudaGetErrorString(err), __FILE__, __LINE__); \
00070 printf("Thread aborting...\n"); \
00071 return NULL; }}
00072 #else
00073 #define CUERR
00074 #endif
00075
00076
00077
00078
00079
00080
00081
00082 inline __host__ __device__ float3 operator*(float3 a, float b) {
00083 return make_float3(b * a.x, b * a.y, b * a.z);
00084 }
00085
00086
00087 inline __host__ __device__ void operator+=(float3 &a, float3 b) {
00088 a.x += b.x;
00089 a.y += b.y;
00090 a.z += b.z;
00091 }
00092
00093
00094 inline __device__ float3 make_float3(float4 a) {
00095 float3 b;
00096 b.x = a.x;
00097 b.y = a.y;
00098 b.z = a.z;
00099 return b;
00100 }
00101
00102
00103
00104
00105
00106
00107
00108 inline __device__ void convert_density(float & df, float df2) {
00109 df = df2;
00110 }
00111
00112
00113
00114 inline __device__ void convert_density(unsigned short & dh, float df2) {
00115 dh = __float2half_rn(df2);
00116 }
00117
00118
00119
00120
00121
00122
00123
00124
00125 inline __device__ void convert_color(float3 & cf, float3 cf2, float invisovalue) {
00126 cf = cf2 * invisovalue;
00127 }
00128
00129
00130
00131
00132 inline __device__ void convert_color(uchar4 & cu, float3 cf, float invisovalue) {
00133
00134
00135
00136 #if 1
00137 float3 cftmp = cf * invisovalue;
00138 cu = make_uchar4(fminf(cftmp.x * 255.0f, 255.0f),
00139 fminf(cftmp.y * 255.0f, 255.0f),
00140 fminf(cftmp.z * 255.0f, 255.0f),
00141 255);
00142 #else
00143
00144
00145 float invmaxcolscale = __frcp_rn(fmaxf(fmaxf(fmaxf(cf.x, cf.y), cf.z), 1.0f)) * 255.0f;
00146
00147
00148 cu = make_uchar4(cf.x * invmaxcolscale,
00149 cf.y * invmaxcolscale,
00150 cf.z * invmaxcolscale,
00151 255);
00152 #endif
00153 }
00154
00155
00156
00157 inline __device__ void convert_color(float3 & cf, uchar4 cu) {
00158 const float i2f = 1.0f / 255.0f;
00159 cf.x = cu.x * i2f;
00160 cf.y = cu.y * i2f;
00161 cf.z = cu.z * i2f;
00162 }
00163
00164
00165
00166
00167
00168 #if 1 && (CUDART_VERSION >= 4000)
00169 #define RESTRICT __restrict__
00170 #else
00171 #define RESTRICT
00172 #endif
00173
00174
00175
00176
00177 #define GGRIDSZ 8.0f
00178 #define GBLOCKSZX 8
00179 #define GBLOCKSZY 8
00180
00181 #if 1
00182 #define GTEXBLOCKSZZ 2
00183 #define GTEXUNROLL 4
00184 #define GBLOCKSZZ 2
00185 #define GUNROLL 4
00186 #else
00187 #define GTEXBLOCKSZZ 8
00188 #define GTEXUNROLL 1
00189 #define GBLOCKSZZ 8
00190 #define GUNROLL 1
00191 #endif
00192
00193 #define MAXTHRDENS ( GBLOCKSZX * GBLOCKSZY * GBLOCKSZZ )
00194 #if __CUDA_ARCH__ >= 600
00195 #define MINBLOCKDENS 16
00196 #elif __CUDA_ARCH__ >= 300
00197 #define MINBLOCKDENS 16
00198 #elif __CUDA_ARCH__ >= 200
00199 #define MINBLOCKDENS 1
00200 #else
00201 #define MINBLOCKDENS 1
00202 #endif
00203
00204
00205
00206
00207
00208
00209
00210
00211 template<class DENSITY, class VOLTEX>
00212 __global__ static void
00213 __launch_bounds__ ( MAXTHRDENS, MINBLOCKDENS )
00214 gaussdensity_fast_tex_norm(int natoms,
00215 const float4 * RESTRICT sorted_xyzr,
00216 const float4 * RESTRICT sorted_color,
00217 int3 volsz,
00218 int3 acncells,
00219 float acgridspacing,
00220 float invacgridspacing,
00221 const uint2 * RESTRICT cellStartEnd,
00222 float gridspacing, unsigned int z,
00223 DENSITY * RESTRICT densitygrid,
00224 VOLTEX * RESTRICT voltexmap,
00225 float invisovalue) {
00226 unsigned int xindex = (blockIdx.x * blockDim.x) + threadIdx.x;
00227 unsigned int yindex = (blockIdx.y * blockDim.y) + threadIdx.y;
00228 unsigned int zindex = ((blockIdx.z * blockDim.z) + threadIdx.z) * GTEXUNROLL;
00229
00230
00231 unsigned int outaddr = zindex * volsz.x * volsz.y +
00232 yindex * volsz.x + xindex;
00233
00234
00235 if (xindex >= volsz.x || yindex >= volsz.y || zindex >= volsz.z)
00236 return;
00237
00238 zindex += z;
00239
00240
00241 int xabmin = ((blockIdx.x * blockDim.x) * gridspacing - acgridspacing) * invacgridspacing;
00242 int yabmin = ((blockIdx.y * blockDim.y) * gridspacing - acgridspacing) * invacgridspacing;
00243 int zabmin = ((z + blockIdx.z * blockDim.z * GTEXUNROLL) * gridspacing - acgridspacing) * invacgridspacing;
00244
00245
00246 int xabmax = (((blockIdx.x+1) * blockDim.x) * gridspacing + acgridspacing) * invacgridspacing;
00247 int yabmax = (((blockIdx.y+1) * blockDim.y) * gridspacing + acgridspacing) * invacgridspacing;
00248 int zabmax = ((z + (blockIdx.z+1) * blockDim.z * GTEXUNROLL) * gridspacing + acgridspacing) * invacgridspacing;
00249
00250 xabmin = (xabmin < 0) ? 0 : xabmin;
00251 yabmin = (yabmin < 0) ? 0 : yabmin;
00252 zabmin = (zabmin < 0) ? 0 : zabmin;
00253 xabmax = (xabmax >= acncells.x-1) ? acncells.x-1 : xabmax;
00254 yabmax = (yabmax >= acncells.y-1) ? acncells.y-1 : yabmax;
00255 zabmax = (zabmax >= acncells.z-1) ? acncells.z-1 : zabmax;
00256
00257 float coorx = gridspacing * xindex;
00258 float coory = gridspacing * yindex;
00259 float coorz = gridspacing * zindex;
00260
00261 float densityval1=0.0f;
00262 float3 densitycol1=make_float3(0.0f, 0.0f, 0.0f);
00263 #if GTEXUNROLL >= 2
00264 float densityval2=0.0f;
00265 float3 densitycol2=densitycol1;
00266 #endif
00267 #if GTEXUNROLL >= 4
00268 float densityval3=0.0f;
00269 float3 densitycol3=densitycol1;
00270 float densityval4=0.0f;
00271 float3 densitycol4=densitycol1;
00272 #endif
00273
00274 int acplanesz = acncells.x * acncells.y;
00275 int xab, yab, zab;
00276 for (zab=zabmin; zab<=zabmax; zab++) {
00277 for (yab=yabmin; yab<=yabmax; yab++) {
00278 for (xab=xabmin; xab<=xabmax; xab++) {
00279 int abcellidx = zab * acplanesz + yab * acncells.x + xab;
00280
00281
00282 uint2 atomstartend = cellStartEnd[abcellidx];
00283 if (atomstartend.x != GRID_CELL_EMPTY) {
00284 unsigned int atomid;
00285 for (atomid=atomstartend.x; atomid<atomstartend.y; atomid++) {
00286 float4 atom = sorted_xyzr[atomid];
00287 float4 color = sorted_color[atomid];
00288 float dx = coorx - atom.x;
00289 float dy = coory - atom.y;
00290 float dxy2 = dx*dx + dy*dy;
00291
00292 float dz = coorz - atom.z;
00293 float r21 = (dxy2 + dz*dz) * atom.w;
00294 float tmp1 = exp2f(r21);
00295 densityval1 += tmp1;
00296 densitycol1.x += tmp1 * color.x;
00297 densitycol1.y += tmp1 * color.y;
00298 densitycol1.z += tmp1 * color.z;
00299
00300 #if GTEXUNROLL >= 2
00301 float dz2 = dz + gridspacing;
00302 float r22 = (dxy2 + dz2*dz2) * atom.w;
00303 float tmp2 = exp2f(r22);
00304 densityval2 += tmp2;
00305 densitycol2.x += tmp2 * color.x;
00306 densitycol2.y += tmp2 * color.y;
00307 densitycol2.z += tmp2 * color.z;
00308 #endif
00309 #if GTEXUNROLL >= 4
00310 float dz3 = dz2 + gridspacing;
00311 float r23 = (dxy2 + dz3*dz3) * atom.w;
00312 float tmp3 = exp2f(r23);
00313 densityval3 += tmp3;
00314 densitycol3.x += tmp3 * color.x;
00315 densitycol3.y += tmp3 * color.y;
00316 densitycol3.z += tmp3 * color.z;
00317
00318 float dz4 = dz3 + gridspacing;
00319 float r24 = (dxy2 + dz4*dz4) * atom.w;
00320 float tmp4 = exp2f(r24);
00321 densityval4 += tmp4;
00322 densitycol4.x += tmp4 * color.x;
00323 densitycol4.y += tmp4 * color.y;
00324 densitycol4.z += tmp4 * color.z;
00325 #endif
00326 }
00327 }
00328 }
00329 }
00330 }
00331
00332 DENSITY densityout;
00333 VOLTEX texout;
00334 convert_density(densityout, densityval1 * invisovalue);
00335 densitygrid[outaddr ] = densityout;
00336 convert_color(texout, densitycol1, invisovalue);
00337 voltexmap[outaddr ] = texout;
00338
00339 #if GTEXUNROLL >= 2
00340 int planesz = volsz.x * volsz.y;
00341 convert_density(densityout, densityval2 * invisovalue);
00342 densitygrid[outaddr + planesz] = densityout;
00343 convert_color(texout, densitycol2, invisovalue);
00344 voltexmap[outaddr + planesz] = texout;
00345 #endif
00346 #if GTEXUNROLL >= 4
00347 convert_density(densityout, densityval3 * invisovalue);
00348 densitygrid[outaddr + 2*planesz] = densityout;
00349 convert_color(texout, densitycol3, invisovalue);
00350 voltexmap[outaddr + 2*planesz] = texout;
00351
00352 convert_density(densityout, densityval4 * invisovalue);
00353 densitygrid[outaddr + 3*planesz] = densityout;
00354 convert_color(texout, densitycol4, invisovalue);
00355 voltexmap[outaddr + 3*planesz] = texout;
00356 #endif
00357 }
00358
00359
00360 __global__ static void
00361 __launch_bounds__ ( MAXTHRDENS, MINBLOCKDENS )
00362 gaussdensity_fast_tex3f(int natoms,
00363 const float4 * RESTRICT sorted_xyzr,
00364 const float4 * RESTRICT sorted_color,
00365 int3 volsz,
00366 int3 acncells,
00367 float acgridspacing,
00368 float invacgridspacing,
00369 const uint2 * RESTRICT cellStartEnd,
00370 float gridspacing, unsigned int z,
00371 float * RESTRICT densitygrid,
00372 float3 * RESTRICT voltexmap,
00373 float invisovalue) {
00374 unsigned int xindex = (blockIdx.x * blockDim.x) + threadIdx.x;
00375 unsigned int yindex = (blockIdx.y * blockDim.y) + threadIdx.y;
00376 unsigned int zindex = ((blockIdx.z * blockDim.z) + threadIdx.z) * GTEXUNROLL;
00377
00378
00379 unsigned int outaddr = zindex * volsz.x * volsz.y +
00380 yindex * volsz.x + xindex;
00381
00382
00383 if (xindex >= volsz.x || yindex >= volsz.y || zindex >= volsz.z)
00384 return;
00385
00386 zindex += z;
00387
00388
00389 int xabmin = ((blockIdx.x * blockDim.x) * gridspacing - acgridspacing) * invacgridspacing;
00390 int yabmin = ((blockIdx.y * blockDim.y) * gridspacing - acgridspacing) * invacgridspacing;
00391 int zabmin = ((z + blockIdx.z * blockDim.z * GTEXUNROLL) * gridspacing - acgridspacing) * invacgridspacing;
00392
00393
00394 int xabmax = (((blockIdx.x+1) * blockDim.x) * gridspacing + acgridspacing) * invacgridspacing;
00395 int yabmax = (((blockIdx.y+1) * blockDim.y) * gridspacing + acgridspacing) * invacgridspacing;
00396 int zabmax = ((z + (blockIdx.z+1) * blockDim.z * GTEXUNROLL) * gridspacing + acgridspacing) * invacgridspacing;
00397
00398 xabmin = (xabmin < 0) ? 0 : xabmin;
00399 yabmin = (yabmin < 0) ? 0 : yabmin;
00400 zabmin = (zabmin < 0) ? 0 : zabmin;
00401 xabmax = (xabmax >= acncells.x-1) ? acncells.x-1 : xabmax;
00402 yabmax = (yabmax >= acncells.y-1) ? acncells.y-1 : yabmax;
00403 zabmax = (zabmax >= acncells.z-1) ? acncells.z-1 : zabmax;
00404
00405 float coorx = gridspacing * xindex;
00406 float coory = gridspacing * yindex;
00407 float coorz = gridspacing * zindex;
00408
00409 float densityval1=0.0f;
00410 float3 densitycol1=make_float3(0.0f, 0.0f, 0.0f);
00411 #if GTEXUNROLL >= 2
00412 float densityval2=0.0f;
00413 float3 densitycol2=densitycol1;
00414 #endif
00415 #if GTEXUNROLL >= 4
00416 float densityval3=0.0f;
00417 float3 densitycol3=densitycol1;
00418 float densityval4=0.0f;
00419 float3 densitycol4=densitycol1;
00420 #endif
00421
00422 int acplanesz = acncells.x * acncells.y;
00423 int xab, yab, zab;
00424 for (zab=zabmin; zab<=zabmax; zab++) {
00425 for (yab=yabmin; yab<=yabmax; yab++) {
00426 for (xab=xabmin; xab<=xabmax; xab++) {
00427 int abcellidx = zab * acplanesz + yab * acncells.x + xab;
00428
00429
00430 uint2 atomstartend = cellStartEnd[abcellidx];
00431 if (atomstartend.x != GRID_CELL_EMPTY) {
00432 unsigned int atomid;
00433 for (atomid=atomstartend.x; atomid<atomstartend.y; atomid++) {
00434 float4 atom = sorted_xyzr[atomid];
00435 float3 color = make_float3(sorted_color[atomid]);
00436 float dx = coorx - atom.x;
00437 float dy = coory - atom.y;
00438 float dxy2 = dx*dx + dy*dy;
00439
00440 float dz = coorz - atom.z;
00441 float r21 = (dxy2 + dz*dz) * atom.w;
00442 float tmp1 = exp2f(r21);
00443 densityval1 += tmp1;
00444 densitycol1 += color * tmp1;
00445
00446 #if GTEXUNROLL >= 2
00447 float dz2 = dz + gridspacing;
00448 float r22 = (dxy2 + dz2*dz2) * atom.w;
00449 float tmp2 = exp2f(r22);
00450 densityval2 += tmp2;
00451 densitycol2 += color * tmp2;
00452 #endif
00453 #if GTEXUNROLL >= 4
00454 float dz3 = dz2 + gridspacing;
00455 float r23 = (dxy2 + dz3*dz3) * atom.w;
00456 float tmp3 = exp2f(r23);
00457 densityval3 += tmp3;
00458 densitycol3 += color * tmp3;
00459
00460 float dz4 = dz3 + gridspacing;
00461 float r24 = (dxy2 + dz4*dz4) * atom.w;
00462 float tmp4 = exp2f(r24);
00463 densityval4 += tmp4;
00464 densitycol4 += color * tmp4;
00465 #endif
00466 }
00467 }
00468 }
00469 }
00470 }
00471
00472 float3 texout;
00473 densitygrid[outaddr ] = densityval1;
00474 convert_color(texout, densitycol1, invisovalue);
00475 voltexmap[outaddr ] = texout;
00476
00477 #if GTEXUNROLL >= 2
00478 int planesz = volsz.x * volsz.y;
00479 densitygrid[outaddr + planesz] = densityval2;
00480 convert_color(texout, densitycol2, invisovalue);
00481 voltexmap[outaddr + planesz] = texout;
00482 #endif
00483 #if GTEXUNROLL >= 4
00484 densitygrid[outaddr + 2*planesz] = densityval3;
00485 convert_color(texout, densitycol3, invisovalue);
00486 voltexmap[outaddr + 2*planesz] = texout;
00487
00488 densitygrid[outaddr + 3*planesz] = densityval4;
00489 convert_color(texout, densitycol4, invisovalue);
00490 voltexmap[outaddr + 3*planesz] = texout;
00491 #endif
00492 }
00493
00494
00495 __global__ static void
00496
00497 gaussdensity_fast(int natoms,
00498 const float4 * RESTRICT sorted_xyzr,
00499 int3 volsz,
00500 int3 acncells,
00501 float acgridspacing,
00502 float invacgridspacing,
00503 const uint2 * RESTRICT cellStartEnd,
00504 float gridspacing, unsigned int z,
00505 float * RESTRICT densitygrid) {
00506 unsigned int xindex = (blockIdx.x * blockDim.x) + threadIdx.x;
00507 unsigned int yindex = (blockIdx.y * blockDim.y) + threadIdx.y;
00508 unsigned int zindex = ((blockIdx.z * blockDim.z) + threadIdx.z) * GUNROLL;
00509 unsigned int outaddr = zindex * volsz.x * volsz.y +
00510 yindex * volsz.x +
00511 xindex;
00512
00513
00514 if (xindex >= volsz.x || yindex >= volsz.y || zindex >= volsz.z)
00515 return;
00516
00517 zindex += z;
00518
00519
00520 int xabmin = ((blockIdx.x * blockDim.x) * gridspacing - acgridspacing) * invacgridspacing;
00521 int yabmin = ((blockIdx.y * blockDim.y) * gridspacing - acgridspacing) * invacgridspacing;
00522 int zabmin = ((z + blockIdx.z * blockDim.z * GUNROLL) * gridspacing - acgridspacing) * invacgridspacing;
00523
00524
00525 int xabmax = (((blockIdx.x+1) * blockDim.x) * gridspacing + acgridspacing) * invacgridspacing;
00526 int yabmax = (((blockIdx.y+1) * blockDim.y) * gridspacing + acgridspacing) * invacgridspacing;
00527 int zabmax = ((z + (blockIdx.z+1) * blockDim.z * GUNROLL) * gridspacing + acgridspacing) * invacgridspacing;
00528
00529 xabmin = (xabmin < 0) ? 0 : xabmin;
00530 yabmin = (yabmin < 0) ? 0 : yabmin;
00531 zabmin = (zabmin < 0) ? 0 : zabmin;
00532 xabmax = (xabmax >= acncells.x-1) ? acncells.x-1 : xabmax;
00533 yabmax = (yabmax >= acncells.y-1) ? acncells.y-1 : yabmax;
00534 zabmax = (zabmax >= acncells.z-1) ? acncells.z-1 : zabmax;
00535
00536 float coorx = gridspacing * xindex;
00537 float coory = gridspacing * yindex;
00538 float coorz = gridspacing * zindex;
00539
00540 float densityval1=0.0f;
00541 #if GUNROLL >= 2
00542 float densityval2=0.0f;
00543 #endif
00544 #if GUNROLL >= 4
00545 float densityval3=0.0f;
00546 float densityval4=0.0f;
00547 #endif
00548
00549 int acplanesz = acncells.x * acncells.y;
00550 int xab, yab, zab;
00551 for (zab=zabmin; zab<=zabmax; zab++) {
00552 for (yab=yabmin; yab<=yabmax; yab++) {
00553 for (xab=xabmin; xab<=xabmax; xab++) {
00554 int abcellidx = zab * acplanesz + yab * acncells.x + xab;
00555 uint2 atomstartend = cellStartEnd[abcellidx];
00556 if (atomstartend.x != GRID_CELL_EMPTY) {
00557 unsigned int atomid;
00558 for (atomid=atomstartend.x; atomid<atomstartend.y; atomid++) {
00559 float4 atom = sorted_xyzr[atomid];
00560 float dx = coorx - atom.x;
00561 float dy = coory - atom.y;
00562 float dxy2 = dx*dx + dy*dy;
00563
00564 float dz = coorz - atom.z;
00565 float r21 = (dxy2 + dz*dz) * atom.w;
00566 densityval1 += exp2f(r21);
00567
00568 #if GUNROLL >= 2
00569 float dz2 = dz + gridspacing;
00570 float r22 = (dxy2 + dz2*dz2) * atom.w;
00571 densityval2 += exp2f(r22);
00572 #endif
00573 #if GUNROLL >= 4
00574 float dz3 = dz2 + gridspacing;
00575 float r23 = (dxy2 + dz3*dz3) * atom.w;
00576 densityval3 += exp2f(r23);
00577
00578 float dz4 = dz3 + gridspacing;
00579 float r24 = (dxy2 + dz4*dz4) * atom.w;
00580 densityval4 += exp2f(r24);
00581 #endif
00582 }
00583 }
00584 }
00585 }
00586 }
00587
00588 densitygrid[outaddr ] = densityval1;
00589 #if GUNROLL >= 2
00590 int planesz = volsz.x * volsz.y;
00591 densitygrid[outaddr + planesz] = densityval2;
00592 #endif
00593 #if GUNROLL >= 4
00594 densitygrid[outaddr + 2*planesz] = densityval3;
00595 densitygrid[outaddr + 3*planesz] = densityval4;
00596 #endif
00597 }
00598
00599
00600
00601 typedef struct {
00602 cudaDeviceProp deviceProp;
00603 int dev;
00604 int verbose;
00605
00607 long int natoms;
00608 int colorperatom;
00609 CUDAQuickSurf::VolTexFormat vtexformat;
00610 int acx;
00611 int acy;
00612 int acz;
00613 int gx;
00614 int gy;
00615 int gz;
00616
00617 CUDAMarchingCubes *mc;
00618
00619 float *devdensity;
00620 void *devvoltexmap;
00621 float4 *xyzr_d;
00622 float4 *sorted_xyzr_d;
00623 float4 *color_d;
00624 float4 *sorted_color_d;
00625
00626 unsigned int *atomIndex_d;
00627 unsigned int *atomHash_d;
00628 uint2 *cellStartEnd_d;
00629
00630 void *safety;
00631
00632 float3 *v3f_d;
00633 float3 *n3f_d;
00634 float3 *c3f_d;
00635 char3 *n3b_d;
00636 uchar4 *c4u_d;
00637 } qsurf_gpuhandle;
00638
00639
00640 CUDAQuickSurf::CUDAQuickSurf() {
00641 voidgpu = calloc(1, sizeof(qsurf_gpuhandle));
00642 qsurf_gpuhandle *gpuh = (qsurf_gpuhandle *) voidgpu;
00643
00644 if (getenv("VMDQUICKSURFVERBOSE") != NULL) {
00645 gpuh->verbose = 1;
00646 int tmp = atoi(getenv("VMDQUICKSURFVERBOSE"));
00647 if (tmp > 0)
00648 gpuh->verbose = tmp;
00649 }
00650
00651 gpuh->vtexformat = RGB3F;
00652
00653 if (cudaGetDevice(&gpuh->dev) != cudaSuccess) {
00654 gpuh->dev = -1;
00655 }
00656
00657 if (cudaGetDeviceProperties(&gpuh->deviceProp, gpuh->dev) != cudaSuccess) {
00658 cudaError_t err = cudaGetLastError();
00659 gpuh->dev = -1;
00660 }
00661 }
00662
00663
00664 CUDAQuickSurf::~CUDAQuickSurf() {
00665 qsurf_gpuhandle *gpuh = (qsurf_gpuhandle *) voidgpu;
00666
00667
00668 free_bufs();
00669
00670
00671 delete gpuh->mc;
00672
00673 free(voidgpu);
00674 }
00675
00676
00677 int CUDAQuickSurf::free_bufs() {
00678 qsurf_gpuhandle *gpuh = (qsurf_gpuhandle *) voidgpu;
00679
00680
00681 gpuh->natoms = 0;
00682 gpuh->colorperatom = 0;
00683 gpuh->acx = 0;
00684 gpuh->acy = 0;
00685 gpuh->acz = 0;
00686 gpuh->gx = 0;
00687 gpuh->gy = 0;
00688 gpuh->gz = 0;
00689
00690 if (gpuh->safety != NULL)
00691 cudaFree(gpuh->safety);
00692 gpuh->safety=NULL;
00693
00694 if (gpuh->devdensity != NULL)
00695 cudaFree(gpuh->devdensity);
00696 gpuh->devdensity=NULL;
00697
00698 if (gpuh->devvoltexmap != NULL)
00699 cudaFree(gpuh->devvoltexmap);
00700 gpuh->devvoltexmap=NULL;
00701
00702 if (gpuh->xyzr_d != NULL)
00703 cudaFree(gpuh->xyzr_d);
00704 gpuh->xyzr_d=NULL;
00705
00706 if (gpuh->sorted_xyzr_d != NULL)
00707 cudaFree(gpuh->sorted_xyzr_d);
00708 gpuh->sorted_xyzr_d=NULL;
00709
00710 if (gpuh->color_d != NULL)
00711 cudaFree(gpuh->color_d);
00712 gpuh->color_d=NULL;
00713
00714 if (gpuh->sorted_color_d != NULL)
00715 cudaFree(gpuh->sorted_color_d);
00716 gpuh->sorted_color_d=NULL;
00717
00718 if (gpuh->atomIndex_d != NULL)
00719 cudaFree(gpuh->atomIndex_d);
00720 gpuh->atomIndex_d=NULL;
00721
00722 if (gpuh->atomHash_d != NULL)
00723 cudaFree(gpuh->atomHash_d);
00724 gpuh->atomHash_d=NULL;
00725
00726 if (gpuh->cellStartEnd_d != NULL)
00727 cudaFree(gpuh->cellStartEnd_d);
00728 gpuh->cellStartEnd_d=NULL;
00729
00730 if (gpuh->v3f_d != NULL)
00731 cudaFree(gpuh->v3f_d);
00732 gpuh->v3f_d=NULL;
00733
00734 if (gpuh->n3f_d != NULL)
00735 cudaFree(gpuh->n3f_d);
00736 gpuh->n3f_d=NULL;
00737
00738 if (gpuh->c3f_d != NULL)
00739 cudaFree(gpuh->c3f_d);
00740 gpuh->c3f_d=NULL;
00741
00742 if (gpuh->n3b_d != NULL)
00743 cudaFree(gpuh->n3b_d);
00744 gpuh->n3b_d=NULL;
00745
00746 if (gpuh->c4u_d != NULL)
00747 cudaFree(gpuh->c4u_d);
00748 gpuh->c4u_d=NULL;
00749
00750
00751 return 0;
00752 }
00753
00754
00755 int CUDAQuickSurf::check_bufs(long int natoms, int colorperatom,
00756 VolTexFormat vtexformat,
00757 int acx, int acy, int acz,
00758 int gx, int gy, int gz) {
00759 qsurf_gpuhandle *gpuh = (qsurf_gpuhandle *) voidgpu;
00760
00761
00762
00763
00764
00765
00766 if (natoms <= gpuh->natoms &&
00767 colorperatom <= gpuh->colorperatom &&
00768 vtexformat == gpuh->vtexformat &&
00769 (acx*acy*acz) <= (gpuh->acx * gpuh->acy * gpuh->acz) &&
00770 (gx*gy*gz) <= (gpuh->gx * gpuh->gy * gpuh->gz))
00771 return 0;
00772
00773 return -1;
00774 }
00775
00776
00777 int CUDAQuickSurf::alloc_bufs(long int natoms, int colorperatom,
00778 VolTexFormat vtexformat,
00779 int acx, int acy, int acz,
00780 int gx, int gy, int gz) {
00781 qsurf_gpuhandle *gpuh = (qsurf_gpuhandle *) voidgpu;
00782
00783
00784
00785 if (check_bufs(natoms, colorperatom, vtexformat, acx, acy, acz, gx, gy, gz) == 0)
00786 return 0;
00787
00788
00789
00790 free_bufs();
00791
00792 long int acncells = ((long) acx) * ((long) acy) * ((long) acz);
00793 long int ncells = ((long) gx) * ((long) gy) * ((long) gz);
00794 long int volmemsz = ncells * sizeof(float);
00795 long int chunkmaxverts = 3L * ncells;
00796 long int MCsz = CUDAMarchingCubes::MemUsageMC(gx, gy, gz);
00797
00798
00799
00800
00801 long int totalmemsz =
00802 volmemsz +
00803 (2L * natoms * sizeof(unsigned int)) +
00804 (acncells * sizeof(uint2)) +
00805 (3L * chunkmaxverts * sizeof(float3)) +
00806 natoms*sizeof(float4) +
00807 8L * gx * gy * sizeof(float) +
00808 MCsz;
00809
00810 cudaMalloc((void**)&gpuh->devdensity, volmemsz);
00811 if (colorperatom) {
00812 int voltexsz = 0;
00813 switch (vtexformat) {
00814 case RGB3F:
00815 voltexsz = ncells * sizeof(float3);
00816 break;
00817
00818 case RGB4U:
00819 voltexsz = ncells * sizeof(uchar4);
00820 break;
00821 }
00822 gpuh->vtexformat = vtexformat;
00823
00824 cudaMalloc((void**)&gpuh->devvoltexmap, voltexsz);
00825 cudaMalloc((void**)&gpuh->color_d, natoms * sizeof(float4));
00826 cudaMalloc((void**)&gpuh->sorted_color_d, natoms * sizeof(float4));
00827 totalmemsz += 2 * natoms * sizeof(float4);
00828 }
00829 cudaMalloc((void**)&gpuh->xyzr_d, natoms * sizeof(float4));
00830 cudaMalloc((void**)&gpuh->sorted_xyzr_d, natoms * sizeof(float4));
00831 cudaMalloc((void**)&gpuh->atomIndex_d, natoms * sizeof(unsigned int));
00832 cudaMalloc((void**)&gpuh->atomHash_d, natoms * sizeof(unsigned int));
00833 cudaMalloc((void**)&gpuh->cellStartEnd_d, acncells * sizeof(uint2));
00834
00835
00836 cudaMalloc((void**)&gpuh->v3f_d, 3 * chunkmaxverts * sizeof(float3));
00837 #if 1
00838
00839 cudaMalloc((void**)&gpuh->n3b_d, 3 * chunkmaxverts * sizeof(char3));
00840 totalmemsz += 3 * chunkmaxverts * sizeof(char3);
00841 #else
00842 cudaMalloc((void**)&gpuh->n3f_d, 3 * chunkmaxverts * sizeof(float3));
00843 totalmemsz += 3 * chunkmaxverts * sizeof(float3);
00844 #endif
00845 #if 1
00846 cudaMalloc((void**)&gpuh->c4u_d, 3 * chunkmaxverts * sizeof(uchar4));
00847 totalmemsz += 3 * chunkmaxverts * sizeof(uchar4);
00848 #else
00849 cudaMalloc((void**)&gpuh->c3f_d, 3 * chunkmaxverts * sizeof(float3));
00850 totalmemsz += 3 * chunkmaxverts * sizeof(float3);
00851 #endif
00852
00853
00854
00855
00856
00857
00858 cudaMalloc(&gpuh->safety,
00859 natoms*sizeof(float4) +
00860 8 * gx * gy * sizeof(float) +
00861 MCsz);
00862
00863 if (gpuh->verbose > 1)
00864 printf("Total QuickSurf mem size: %ld MB\n", totalmemsz / (1024*1024));
00865
00866 cudaError_t err = cudaGetLastError();
00867 if (err != cudaSuccess)
00868 return -1;
00869
00870
00871
00872 gpuh->natoms = natoms;
00873 gpuh->colorperatom = colorperatom;
00874
00875 gpuh->acx = acx;
00876 gpuh->acy = acy;
00877 gpuh->acz = acz;
00878
00879 gpuh->gx = gx;
00880 gpuh->gy = gy;
00881 gpuh->gz = gz;
00882
00883 return 0;
00884 }
00885
00886
00887 int CUDAQuickSurf::get_chunk_bufs(int testexisting,
00888 long int natoms, int colorperatom,
00889 VolTexFormat vtexformat,
00890 int acx, int acy, int acz,
00891 int gx, int gy, int gz,
00892 int &cx, int &cy, int &cz,
00893 int &sx, int &sy, int &sz) {
00894 dim3 Bsz(GBLOCKSZX, GBLOCKSZY, GBLOCKSZZ);
00895 if (colorperatom)
00896 Bsz.z = GTEXBLOCKSZZ;
00897
00898 cudaError_t err = cudaGetLastError();
00899
00900
00901
00902
00903
00904 cz <<= 1;
00905 int chunkiters = 0;
00906 int chunkallocated = 0;
00907 while (!chunkallocated) {
00908
00909 chunkiters++;
00910 cz >>= 1;
00911
00912
00913
00914
00915
00916 if (cz != gz)
00917 cz-=4;
00918
00919
00920
00921 cz += (8 - (cz % 8));
00922
00923
00924
00925
00926 sx = cx;
00927 sy = cy;
00928 sz = cz;
00929
00930
00931
00932 cz+=4;
00933
00934 #if 0
00935 printf(" Trying slab size: %d (test: %d)\n", sz, testexisting);
00936 #endif
00937
00938 #if 1
00939
00940
00941 dim3 tGsz((sx+Bsz.x-1) / Bsz.x,
00942 (sy+Bsz.y-1) / Bsz.y,
00943 (sz+(Bsz.z*GUNROLL)-1) / (Bsz.z * GUNROLL));
00944 if (colorperatom) {
00945 tGsz.z = (sz+(Bsz.z*GTEXUNROLL)-1) / (Bsz.z * GTEXUNROLL);
00946 }
00947 if (tGsz.x * tGsz.y * tGsz.z > 65535)
00948 continue;
00949 #endif
00950
00951
00952
00953
00954 if (sz <= 8) {
00955 return -1;
00956 }
00957
00958 if (testexisting) {
00959 if (check_bufs(natoms, colorperatom, vtexformat, acx, acy, acz, cx, cy, cz) != 0)
00960 continue;
00961 } else {
00962 if (alloc_bufs(natoms, colorperatom, vtexformat, acx, acy, acz, cx, cy, cz) != 0)
00963 continue;
00964 }
00965
00966 chunkallocated=1;
00967 }
00968
00969 return 0;
00970 }
00971
00972
00973 int CUDAQuickSurf::calc_surf(long int natoms, const float *xyzr_f,
00974 const float *colors_f,
00975 int colorperatom, VolTexFormat voltexformat,
00976 float *origin, int *numvoxels, float maxrad,
00977 float radscale, float gridspacing,
00978 float isovalue, float gausslim,
00979 VMDDisplayList *cmdList) {
00980 qsurf_gpuhandle *gpuh = (qsurf_gpuhandle *) voidgpu;
00981
00982
00983
00984 if (gpuh->dev < 0)
00985 return -1;
00986
00987
00988
00989
00990
00991
00992 if (gpuh->deviceProp.major < 2) {
00993 return -1;
00994 }
00995
00996
00997 wkf_timerhandle globaltimer = wkf_timer_create();
00998 wkf_timer_start(globaltimer);
00999
01000 int vtexsize = 0;
01001
01002
01003 switch (voltexformat) {
01004 case RGB3F:
01005 vtexsize = sizeof(float3);
01006 break;
01007
01008 case RGB4U:
01009 vtexsize = sizeof(uchar4);
01010 break;
01011 }
01012
01013 float4 *colors = (float4 *) colors_f;
01014 int chunkmaxverts=0;
01015 int chunknumverts=0;
01016 int numverts=0;
01017 int numfacets=0;
01018
01019
01020 float acgridspacing = gausslim * radscale * maxrad;
01021
01022
01023 if (acgridspacing < gridspacing)
01024 acgridspacing = gridspacing;
01025
01026
01027
01028
01029 int3 volsz = make_int3(numvoxels[0], numvoxels[1], numvoxels[2]);
01030 int3 chunksz = volsz;
01031 int3 slabsz = volsz;
01032
01033
01034 if (getenv("VMDQUICKSURFMINMEM")) {
01035 if (volsz.z > 32) {
01036 slabsz.z = chunksz.z = 16;
01037 }
01038 }
01039
01040 int3 accelcells;
01041 accelcells.x = max(int((volsz.x*gridspacing) / acgridspacing), 1);
01042 accelcells.y = max(int((volsz.y*gridspacing) / acgridspacing), 1);
01043 accelcells.z = max(int((volsz.z*gridspacing) / acgridspacing), 1);
01044
01045 dim3 Bsz(GBLOCKSZX, GBLOCKSZY, GBLOCKSZZ);
01046 if (colorperatom)
01047 Bsz.z = GTEXBLOCKSZZ;
01048
01049
01050
01051
01052 if (gpuh->natoms == 0 ||
01053 get_chunk_bufs(1, natoms, colorperatom, voltexformat,
01054 accelcells.x, accelcells.y, accelcells.z,
01055 volsz.x, volsz.y, volsz.z,
01056 chunksz.x, chunksz.y, chunksz.z,
01057 slabsz.x, slabsz.y, slabsz.z) == -1) {
01058
01059
01060 chunksz = volsz;
01061 slabsz = volsz;
01062
01063
01064
01065 if (get_chunk_bufs(0, natoms, colorperatom, voltexformat,
01066 accelcells.x, accelcells.y, accelcells.z,
01067 volsz.x, volsz.y, volsz.z,
01068 chunksz.x, chunksz.y, chunksz.z,
01069 slabsz.x, slabsz.y, slabsz.z) == -1) {
01070 wkf_timer_destroy(globaltimer);
01071 return -1;
01072 }
01073 }
01074 chunkmaxverts = 3 * chunksz.x * chunksz.y * chunksz.z;
01075
01076
01077
01078 if (gpuh->safety != NULL)
01079 cudaFree(gpuh->safety);
01080 gpuh->safety = NULL;
01081
01082 if (gpuh->verbose > 1) {
01083 printf(" Using GPU chunk size: %d\n", chunksz.z);
01084 printf(" Accel grid(%d, %d, %d) spacing %f\n",
01085 accelcells.x, accelcells.y, accelcells.z, acgridspacing);
01086 }
01087
01088
01089
01090 int i, i4;
01091 float4 *xyzr = (float4 *) malloc(natoms * sizeof(float4));
01092 float log2e = log2f(2.718281828f);
01093 for (i=0,i4=0; i<natoms; i++,i4+=4) {
01094 xyzr[i].x = xyzr_f[i4 ];
01095 xyzr[i].y = xyzr_f[i4 + 1];
01096 xyzr[i].z = xyzr_f[i4 + 2];
01097
01098 float scaledrad = xyzr_f[i4 + 3] * radscale;
01099 float arinv = -1.0f * log2e / (2.0f*scaledrad*scaledrad);
01100
01101 xyzr[i].w = arinv;
01102 }
01103 cudaMemcpy(gpuh->xyzr_d, xyzr, natoms * sizeof(float4), cudaMemcpyHostToDevice);
01104 free(xyzr);
01105
01106 if (colorperatom)
01107 cudaMemcpy(gpuh->color_d, colors, natoms * sizeof(float4), cudaMemcpyHostToDevice);
01108
01109
01110 if (vmd_cuda_build_density_atom_grid(natoms, gpuh->xyzr_d, gpuh->color_d,
01111 gpuh->sorted_xyzr_d,
01112 gpuh->sorted_color_d,
01113 gpuh->atomIndex_d, gpuh->atomHash_d,
01114 gpuh->cellStartEnd_d,
01115 accelcells, 1.0f / acgridspacing) != 0) {
01116 wkf_timer_destroy(globaltimer);
01117 free_bufs();
01118 return -1;
01119 }
01120
01121 double sorttime = wkf_timer_timenow(globaltimer);
01122 double lastlooptime = sorttime;
01123
01124 double densitykerneltime = 0.0f;
01125 double densitytime = 0.0f;
01126 double mckerneltime = 0.0f;
01127 double mctime = 0.0f;
01128 double copycalltime = 0.0f;
01129 double copytime = 0.0f;
01130
01131 float *volslab_d = NULL;
01132 void *texslab_d = NULL;
01133
01134 int lzplane = GBLOCKSZZ * GUNROLL;
01135 if (colorperatom)
01136 lzplane = GTEXBLOCKSZZ * GTEXUNROLL;
01137
01138
01139 uint3 mgsz = make_uint3(chunksz.x, chunksz.y, chunksz.z);
01140 if (gpuh->mc == NULL) {
01141 gpuh->mc = new CUDAMarchingCubes();
01142 if (!gpuh->mc->Initialize(mgsz)) {
01143 printf("QuickSurf call to MC Initialize() failed\n");
01144 }
01145 } else {
01146 uint3 mcmaxgridsize = gpuh->mc->GetMaxGridSize();
01147 if (slabsz.x > mcmaxgridsize.x ||
01148 slabsz.y > mcmaxgridsize.y ||
01149 slabsz.z > mcmaxgridsize.z) {
01150 if (gpuh->verbose)
01151 printf("*** QuickSurf Allocating new MC object...\n");
01152
01153
01154 delete gpuh->mc;
01155
01156
01157 gpuh->mc = new CUDAMarchingCubes();
01158
01159 if (!gpuh->mc->Initialize(mgsz)) {
01160 printf("QuickSurf MC Initialize() call failed to recreate MC object\n");
01161 }
01162 }
01163 }
01164
01165 int z;
01166 int chunkcount=0;
01167 float invacgridspacing = 1.0f / acgridspacing;
01168 float invisovalue = 1.0f / isovalue;
01169 for (z=0; z<volsz.z; z+=slabsz.z) {
01170 int3 curslab = slabsz;
01171 if (z+curslab.z > volsz.z)
01172 curslab.z = volsz.z - z;
01173
01174 int slabplanesz = curslab.x * curslab.y;
01175
01176 dim3 Gsz((curslab.x+Bsz.x-1) / Bsz.x,
01177 (curslab.y+Bsz.y-1) / Bsz.y,
01178 (curslab.z+(Bsz.z*GUNROLL)-1) / (Bsz.z * GUNROLL));
01179 if (colorperatom)
01180 Gsz.z = (curslab.z+(Bsz.z*GTEXUNROLL)-1) / (Bsz.z * GTEXUNROLL);
01181
01182
01183
01184 dim3 Gszslice = Gsz;
01185
01186 if (gpuh->verbose > 1) {
01187 printf("CUDA device %d, grid size %dx%dx%d\n",
01188 0, Gsz.x, Gsz.y, Gsz.z);
01189 printf("CUDA: vol(%d,%d,%d) accel(%d,%d,%d)\n",
01190 curslab.x, curslab.y, curslab.z,
01191 accelcells.x, accelcells.y, accelcells.z);
01192 printf("Z=%d, curslab.z=%d\n", z, curslab.z);
01193 }
01194
01195
01196
01197
01198 if (z == 0) {
01199 volslab_d = gpuh->devdensity;
01200 if (colorperatom)
01201 texslab_d = gpuh->devvoltexmap;
01202 } else {
01203 int fourplanes = 4 * slabplanesz;
01204 cudaMemcpy(gpuh->devdensity,
01205 volslab_d + (slabsz.z-4) * slabplanesz,
01206 fourplanes * sizeof(float), cudaMemcpyDeviceToDevice);
01207 volslab_d = gpuh->devdensity + fourplanes;
01208
01209 if (colorperatom) {
01210 cudaMemcpy(gpuh->devvoltexmap,
01211 ((unsigned char *) texslab_d) + (slabsz.z-4) * slabplanesz * vtexsize,
01212 fourplanes * vtexsize, cudaMemcpyDeviceToDevice);
01213 texslab_d = ((unsigned char *) gpuh->devvoltexmap) + fourplanes * vtexsize;
01214 }
01215 }
01216
01217
01218 for (int lz=0; lz<Gsz.z; lz+=Gszslice.z) {
01219 int lzinc = lz * lzplane;
01220 float *volslice_d = volslab_d + lzinc * slabplanesz;
01221
01222 if (colorperatom) {
01223 void *texslice_d = ((unsigned char *) texslab_d) + lzinc * slabplanesz * vtexsize;
01224 switch (voltexformat) {
01225 case RGB3F:
01226 gaussdensity_fast_tex3f<<<Gszslice, Bsz, 0>>>(natoms,
01227 gpuh->sorted_xyzr_d, gpuh->sorted_color_d,
01228 curslab, accelcells, acgridspacing, invacgridspacing,
01229 gpuh->cellStartEnd_d, gridspacing, z+lzinc,
01230 volslice_d, (float3 *) texslice_d, invisovalue);
01231 break;
01232
01233 case RGB4U:
01234 gaussdensity_fast_tex_norm<float, uchar4><<<Gszslice, Bsz, 0>>>(natoms,
01235 gpuh->sorted_xyzr_d, gpuh->sorted_color_d,
01236 curslab, accelcells, acgridspacing, invacgridspacing,
01237 gpuh->cellStartEnd_d, gridspacing, z+lzinc,
01238 volslice_d, (uchar4 *) texslice_d, invisovalue);
01239 break;
01240 }
01241 } else {
01242 gaussdensity_fast<<<Gszslice, Bsz, 0>>>(natoms, gpuh->sorted_xyzr_d,
01243 curslab, accelcells, acgridspacing, invacgridspacing,
01244 gpuh->cellStartEnd_d, gridspacing, z+lzinc, volslice_d);
01245 }
01246 }
01247 cudaDeviceSynchronize();
01248 densitykerneltime = wkf_timer_timenow(globaltimer);
01249
01250 #if 0
01251 printf(" CUDA mcubes..."); fflush(stdout);
01252 #endif
01253
01254 uint3 gvsz = make_uint3(curslab.x, curslab.y, curslab.z);
01255
01256
01257
01258
01259 if (z != 0)
01260 gvsz.z=curslab.z + 4;
01261
01262 float3 bbox = make_float3(gvsz.x * gridspacing, gvsz.y * gridspacing,
01263 gvsz.z * gridspacing);
01264
01265 float3 gorigin = make_float3(origin[0], origin[1],
01266 origin[2] + (z * gridspacing));
01267 if (z != 0)
01268 gorigin.z = origin[2] + ((z-4) * gridspacing);
01269
01270 #if 0
01271 printf("\n ... vsz: %d %d %d\n", gvsz.x, gvsz.y, gvsz.z);
01272 printf(" ... org: %.2f %.2f %.2f\n", gorigin.x, gorigin.y, gorigin.z);
01273 printf(" ... bxs: %.2f %.2f %.2f\n", bbox.x, bbox.y, bbox.y);
01274 printf(" ... bbe: %.2f %.2f %.2f\n", gorigin.x+bbox.x, gorigin.y+bbox.y, gorigin.z+bbox.z);
01275 #endif
01276
01277
01278
01279
01280
01281 int skipstartplane=0;
01282 int skipendplane=0;
01283 if (chunksz.z < volsz.z) {
01284
01285 if (z != 0)
01286 skipstartplane=1;
01287
01288
01289 if (z+curslab.z < volsz.z)
01290 skipendplane=1;
01291 }
01292
01293
01294
01295
01296
01297
01298
01299 if (colorperatom && (voltexformat == RGB4U)) {
01300
01301
01302 gpuh->mc->SetIsovalue(1.0f);
01303 } else {
01304 gpuh->mc->SetIsovalue(isovalue);
01305 }
01306
01307 int mcstat = 0;
01308 switch (voltexformat) {
01309 case RGB3F:
01310 mcstat = gpuh->mc->SetVolumeData(gpuh->devdensity,
01311 (float3 *) gpuh->devvoltexmap,
01312 gvsz, gorigin, bbox, true);
01313 break;
01314
01315 case RGB4U:
01316 mcstat = gpuh->mc->SetVolumeData(gpuh->devdensity,
01317 (uchar4 *) gpuh->devvoltexmap,
01318 gvsz, gorigin, bbox, true);
01319 break;
01320 }
01321 if (!mcstat) {
01322 printf("QuickSurf call to MC SetVolumeData() failed\n");
01323 }
01324
01325
01326 if (skipstartplane || skipendplane) {
01327 uint3 volstart = make_uint3(0, 0, 0);
01328 uint3 volend = make_uint3(gvsz.x, gvsz.y, gvsz.z);
01329
01330 if (skipstartplane)
01331 volstart.z = 2;
01332
01333 if (skipendplane)
01334 volend.z = gvsz.z - 2;
01335
01336 gpuh->mc->SetSubVolume(volstart, volend);
01337 }
01338 if (gpuh->n3b_d) {
01339 gpuh->mc->computeIsosurface(gpuh->v3f_d, gpuh->n3b_d,
01340 gpuh->c4u_d, chunkmaxverts);
01341 } else if (gpuh->c4u_d) {
01342 gpuh->mc->computeIsosurface(gpuh->v3f_d, gpuh->n3f_d,
01343 gpuh->c4u_d, chunkmaxverts);
01344 } else {
01345 gpuh->mc->computeIsosurface(gpuh->v3f_d, gpuh->n3f_d,
01346 gpuh->c3f_d, chunkmaxverts);
01347 }
01348 chunknumverts = gpuh->mc->GetVertexCount();
01349
01350 #if 0
01351 printf("generated %d vertices, max vert limit: %d\n", chunknumverts, chunkmaxverts);
01352 #endif
01353 if (chunknumverts == chunkmaxverts)
01354 printf(" *** QuickSurf exceeded marching cubes vertex limit (%d verts)\n", chunknumverts);
01355
01356 cudaDeviceSynchronize();
01357 mckerneltime = wkf_timer_timenow(globaltimer);
01358
01359
01360 if (chunknumverts > 0) {
01361 DispCmdTriMesh cmdTriMesh;
01362 if (colorperatom) {
01363
01364 if (gpuh->n3b_d) {
01365 cmdTriMesh.cuda_putdata((const float *) gpuh->v3f_d,
01366 (const char *) gpuh->n3b_d,
01367 (const unsigned char *) gpuh->c4u_d,
01368 chunknumverts/3, cmdList);
01369 } else if (gpuh->c4u_d) {
01370 cmdTriMesh.cuda_putdata((const float *) gpuh->v3f_d,
01371 (const float *) gpuh->n3f_d,
01372 (const unsigned char *) gpuh->c4u_d,
01373 chunknumverts/3, cmdList);
01374 } else {
01375 cmdTriMesh.cuda_putdata((const float *) gpuh->v3f_d,
01376 (const float *) gpuh->n3f_d,
01377 (const float *) gpuh->c3f_d,
01378 chunknumverts/3, cmdList);
01379 }
01380 } else {
01381
01382 if (gpuh->n3b_d) {
01383 cmdTriMesh.cuda_putdata((const float *) gpuh->v3f_d,
01384 (const char *) gpuh->n3b_d,
01385 (const unsigned char *) NULL,
01386 chunknumverts/3, cmdList);
01387 } else {
01388 cmdTriMesh.cuda_putdata((const float *) gpuh->v3f_d,
01389 (const float *) gpuh->n3f_d,
01390 (const float *) NULL,
01391 chunknumverts/3, cmdList);
01392 }
01393 }
01394 }
01395 numverts+=chunknumverts;
01396 numfacets+=chunknumverts/3;
01397
01398 #if 0
01399
01400
01401
01402 int l;
01403 int vertstart = 3 * numverts;
01404 int vertbufsz = 3 * (numverts + chunknumverts) * sizeof(float);
01405 int facebufsz = (numverts + chunknumverts) * sizeof(int);
01406 int chunkvertsz = 3 * chunknumverts * sizeof(float);
01407
01408 v = (float*) realloc(v, vertbufsz);
01409 n = (float*) realloc(n, vertbufsz);
01410 c = (float*) realloc(c, vertbufsz);
01411 f = (int*) realloc(f, facebufsz);
01412 cudaMemcpy(v+vertstart, gpuh->v3f_d, chunkvertsz, cudaMemcpyDeviceToHost);
01413 cudaMemcpy(n+vertstart, gpuh->n3f_d, chunkvertsz, cudaMemcpyDeviceToHost);
01414 if (colorperatom) {
01415 cudaMemcpy(c+vertstart, gpuh->c3f_d, chunkvertsz, cudaMemcpyDeviceToHost);
01416 } else {
01417 float *color = c+vertstart;
01418 for (l=0; l<chunknumverts*3; l+=3) {
01419 color[l + 0] = colors[0].x;
01420 color[l + 1] = colors[0].y;
01421 color[l + 2] = colors[0].z;
01422 }
01423 }
01424 for (l=numverts; l<numverts+chunknumverts; l++) {
01425 f[l]=l;
01426 }
01427 numverts+=chunknumverts;
01428 numfacets+=chunknumverts/3;
01429 #endif
01430
01431 copycalltime = wkf_timer_timenow(globaltimer);
01432
01433 densitytime += densitykerneltime - lastlooptime;
01434 mctime += mckerneltime - densitykerneltime;
01435 copytime += copycalltime - mckerneltime;
01436
01437 lastlooptime = wkf_timer_timenow(globaltimer);
01438
01439 chunkcount++;
01440 }
01441
01442
01443
01444 cudaError_t err = cudaGetLastError();
01445
01446 wkf_timer_stop(globaltimer);
01447 double totalruntime = wkf_timer_time(globaltimer);
01448 wkf_timer_destroy(globaltimer);
01449
01450
01451
01452 if (err != cudaSuccess) {
01453 printf("CUDA error: %s, %s line %d\n", cudaGetErrorString(err), __FILE__, __LINE__);
01454 return -1;
01455 }
01456
01457 if (gpuh->verbose) {
01458 printf(" GPU generated %d vertices, %d facets, in %d passes\n",
01459 numverts, numfacets, chunkcount);
01460 printf(" GPU time (%s): %.3f [sort: %.3f density %.3f mcubes: %.3f copy: %.3f]\n",
01461 "SM >= 2.x", totalruntime, sorttime, densitytime, mctime, copytime);
01462 }
01463
01464 return 0;
01465 }
01466
01467
01468
01469
01470