00001 
00002 
00003 
00004 
00005 
00006 
00007 
00008 
00009 
00010 
00011 
00012 
00013 
00014 
00015 
00042 #include <stdio.h>
00043 #include <stdlib.h>
00044 #include <string.h>
00045 #include <cuda.h>
00046 #if CUDART_VERSION >= 9000
00047 #include <cuda_fp16.h>  
00048 #endif
00049 
00050 #if CUDART_VERSION < 4000
00051 #error The VMD QuickSurf feature requires CUDA 4.0 or later
00052 #endif
00053 
00054 #include "Inform.h"
00055 #include "utilities.h"
00056 #include "WKFThreads.h"
00057 #include "WKFUtils.h"
00058 #include "CUDAKernels.h" 
00059 #include "CUDASpatialSearch.h"
00060 #include "CUDAMarchingCubes.h"
00061 #include "CUDAQuickSurf.h" 
00062 
00063 #include "DispCmds.h"
00064 #include "VMDDisplayList.h"
00065 
00066 #if 1
00067 #define CUERR { cudaError_t err; \
00068   if ((err = cudaGetLastError()) != cudaSuccess) { \
00069   printf("CUDA error: %s, %s line %d\n", cudaGetErrorString(err), __FILE__, __LINE__); \
00070   printf("Thread aborting...\n"); \
00071   return NULL; }}
00072 #else
00073 #define CUERR
00074 #endif
00075 
00076 
00077 
00078 
00079 
00080 
00081 
00082 inline __host__ __device__ float3 operator*(float3 a, float b) {
00083   return make_float3(b * a.x, b * a.y, b * a.z);
00084 }
00085 
00086 
00087 inline __host__ __device__ void operator+=(float3 &a, float3 b) {
00088   a.x += b.x;
00089   a.y += b.y;
00090   a.z += b.z;
00091 }
00092 
00093 
00094 inline __device__ float3 make_float3(float4 a) {
00095   float3 b;
00096   b.x = a.x;
00097   b.y = a.y;
00098   b.z = a.z;
00099   return b;
00100 }
00101 
00102 
00103 
00104 
00105 
00106 
00107 
00108 inline __device__ void convert_density(float & df, float df2) {
00109   df = df2;
00110 }
00111 
00112 
00113 
00114 inline __device__ void convert_density(unsigned short & dh, float df2) {
00115   dh = __float2half_rn(df2);
00116 }
00117 
00118 
00119 
00120 
00121 
00122 
00123 
00124 
00125 inline __device__ void convert_color(float3 & cf, float3 cf2, float invisovalue) {
00126   cf = cf2 * invisovalue;
00127 }
00128 
00129 
00130 
00131 
00132 inline __device__ void convert_color(uchar4 & cu, float3 cf, float invisovalue) {
00133   
00134   
00135 
00136 #if 1
00137   float3 cftmp = cf * invisovalue;
00138   cu = make_uchar4(fminf(cftmp.x * 255.0f, 255.0f),
00139                    fminf(cftmp.y * 255.0f, 255.0f),
00140                    fminf(cftmp.z * 255.0f, 255.0f),
00141                    255); 
00142 #else
00143   
00144   
00145   float invmaxcolscale = __frcp_rn(fmaxf(fmaxf(fmaxf(cf.x, cf.y), cf.z), 1.0f)) * 255.0f;
00146 
00147   
00148   cu = make_uchar4(cf.x * invmaxcolscale,
00149                    cf.y * invmaxcolscale,
00150                    cf.z * invmaxcolscale,
00151                    255);
00152 #endif
00153 }
00154 
00155 
00156 
00157 inline __device__ void convert_color(float3 & cf, uchar4 cu) {
00158   const float i2f = 1.0f / 255.0f;
00159   cf.x = cu.x * i2f;
00160   cf.y = cu.y * i2f;
00161   cf.z = cu.z * i2f;
00162 }
00163 
00164 
00165 
00166 
00167 
00168 #if 1 && (CUDART_VERSION >= 4000)
00169 #define RESTRICT __restrict__
00170 #else
00171 #define RESTRICT
00172 #endif
00173 
00174 
00175 
00176 
00177 #define GGRIDSZ   8.0f
00178 #define GBLOCKSZX 8
00179 #define GBLOCKSZY 8
00180 
00181 #if 1
00182 #define GTEXBLOCKSZZ 2
00183 #define GTEXUNROLL   4
00184 #define GBLOCKSZZ    2
00185 #define GUNROLL      4
00186 #else
00187 #define GTEXBLOCKSZZ 8
00188 #define GTEXUNROLL   1
00189 #define GBLOCKSZZ    8
00190 #define GUNROLL      1
00191 #endif
00192 
00193 #define MAXTHRDENS  ( GBLOCKSZX * GBLOCKSZY * GBLOCKSZZ )
00194 #if __CUDA_ARCH__ >= 600
00195 #define MINBLOCKDENS 16
00196 #elif __CUDA_ARCH__ >= 300
00197 #define MINBLOCKDENS 16
00198 #elif __CUDA_ARCH__ >= 200
00199 #define MINBLOCKDENS 1
00200 #else
00201 #define MINBLOCKDENS 1
00202 #endif
00203 
00204 
00205 
00206 
00207 
00208 
00209 
00210 
00211 template<class DENSITY, class VOLTEX>
00212 __global__ static void 
00213 __launch_bounds__ ( MAXTHRDENS, MINBLOCKDENS )
00214 gaussdensity_fast_tex_norm(int natoms,
00215                       const float4 * RESTRICT sorted_xyzr, 
00216                       const float4 * RESTRICT sorted_color, 
00217                       int3 volsz,
00218                       int3 acncells,
00219                       float acgridspacing,
00220                       float invacgridspacing,
00221                       const uint2 * RESTRICT cellStartEnd,
00222                       float gridspacing, unsigned int z, 
00223                       DENSITY * RESTRICT densitygrid,
00224                       VOLTEX * RESTRICT voltexmap,
00225                       float invisovalue) {
00226   unsigned int xindex  = (blockIdx.x * blockDim.x) + threadIdx.x;
00227   unsigned int yindex  = (blockIdx.y * blockDim.y) + threadIdx.y;
00228   unsigned int zindex  = ((blockIdx.z * blockDim.z) + threadIdx.z) * GTEXUNROLL;
00229 
00230   
00231   unsigned int outaddr = zindex * volsz.x * volsz.y + 
00232                          yindex * volsz.x + xindex;
00233 
00234   
00235   if (xindex >= volsz.x || yindex >= volsz.y || zindex >= volsz.z)
00236     return;
00237 
00238   zindex += z;
00239 
00240   
00241   int xabmin = ((blockIdx.x * blockDim.x) * gridspacing - acgridspacing) * invacgridspacing;
00242   int yabmin = ((blockIdx.y * blockDim.y) * gridspacing - acgridspacing) * invacgridspacing;
00243   int zabmin = ((z + blockIdx.z * blockDim.z * GTEXUNROLL) * gridspacing - acgridspacing) * invacgridspacing;
00244 
00245   
00246   int xabmax = (((blockIdx.x+1) * blockDim.x) * gridspacing + acgridspacing) * invacgridspacing;
00247   int yabmax = (((blockIdx.y+1) * blockDim.y) * gridspacing + acgridspacing) * invacgridspacing;
00248   int zabmax = ((z + (blockIdx.z+1) * blockDim.z * GTEXUNROLL) * gridspacing + acgridspacing) * invacgridspacing;
00249 
00250   xabmin = (xabmin < 0) ? 0 : xabmin;
00251   yabmin = (yabmin < 0) ? 0 : yabmin;
00252   zabmin = (zabmin < 0) ? 0 : zabmin;
00253   xabmax = (xabmax >= acncells.x-1) ? acncells.x-1 : xabmax;
00254   yabmax = (yabmax >= acncells.y-1) ? acncells.y-1 : yabmax;
00255   zabmax = (zabmax >= acncells.z-1) ? acncells.z-1 : zabmax;
00256 
00257   float coorx = gridspacing * xindex;
00258   float coory = gridspacing * yindex;
00259   float coorz = gridspacing * zindex;
00260 
00261   float densityval1=0.0f;
00262   float3 densitycol1=make_float3(0.0f, 0.0f, 0.0f);
00263 #if GTEXUNROLL >= 2
00264   float densityval2=0.0f;
00265   float3 densitycol2=densitycol1;
00266 #endif
00267 #if GTEXUNROLL >= 4
00268   float densityval3=0.0f;
00269   float3 densitycol3=densitycol1;
00270   float densityval4=0.0f;
00271   float3 densitycol4=densitycol1;
00272 #endif
00273 
00274   int acplanesz = acncells.x * acncells.y;
00275   int xab, yab, zab;
00276   for (zab=zabmin; zab<=zabmax; zab++) {
00277     for (yab=yabmin; yab<=yabmax; yab++) {
00278       for (xab=xabmin; xab<=xabmax; xab++) {
00279         int abcellidx = zab * acplanesz + yab * acncells.x + xab;
00280         
00281         
00282         uint2 atomstartend = cellStartEnd[abcellidx];
00283         if (atomstartend.x != GRID_CELL_EMPTY) {
00284           unsigned int atomid;
00285           for (atomid=atomstartend.x; atomid<atomstartend.y; atomid++) {
00286             float4 atom  = sorted_xyzr[atomid];
00287             float4 color = sorted_color[atomid];
00288             float dx = coorx - atom.x;
00289             float dy = coory - atom.y;
00290             float dxy2 = dx*dx + dy*dy;
00291 
00292             float dz = coorz - atom.z;
00293             float r21 = (dxy2 + dz*dz) * atom.w;
00294             float tmp1 = exp2f(r21);
00295             densityval1 += tmp1;
00296             densitycol1.x += tmp1 * color.x;
00297             densitycol1.y += tmp1 * color.y;
00298             densitycol1.z += tmp1 * color.z;
00299 
00300 #if GTEXUNROLL >= 2
00301             float dz2 = dz + gridspacing;
00302             float r22 = (dxy2 + dz2*dz2) * atom.w;
00303             float tmp2 = exp2f(r22);
00304             densityval2 += tmp2;
00305             densitycol2.x += tmp2 * color.x;
00306             densitycol2.y += tmp2 * color.y;
00307             densitycol2.z += tmp2 * color.z;
00308 #endif
00309 #if GTEXUNROLL >= 4
00310             float dz3 = dz2 + gridspacing;
00311             float r23 = (dxy2 + dz3*dz3) * atom.w;
00312             float tmp3 = exp2f(r23);
00313             densityval3 += tmp3;
00314             densitycol3.x += tmp3 * color.x;
00315             densitycol3.y += tmp3 * color.y;
00316             densitycol3.z += tmp3 * color.z;
00317 
00318             float dz4 = dz3 + gridspacing;
00319             float r24 = (dxy2 + dz4*dz4) * atom.w;
00320             float tmp4 = exp2f(r24);
00321             densityval4 += tmp4;
00322             densitycol4.x += tmp4 * color.x;
00323             densitycol4.y += tmp4 * color.y;
00324             densitycol4.z += tmp4 * color.z;
00325 #endif
00326           }
00327         }
00328       }
00329     }
00330   }
00331 
00332   DENSITY densityout;
00333   VOLTEX texout;
00334   convert_density(densityout, densityval1 * invisovalue);
00335   densitygrid[outaddr          ] = densityout;
00336   convert_color(texout, densitycol1, invisovalue);
00337   voltexmap[outaddr          ] = texout;
00338 
00339 #if GTEXUNROLL >= 2
00340   int planesz = volsz.x * volsz.y;
00341   convert_density(densityout, densityval2 * invisovalue);
00342   densitygrid[outaddr + planesz] = densityout;
00343   convert_color(texout, densitycol2, invisovalue);
00344   voltexmap[outaddr + planesz] = texout;
00345 #endif
00346 #if GTEXUNROLL >= 4
00347   convert_density(densityout, densityval3 * invisovalue);
00348   densitygrid[outaddr + 2*planesz] = densityout;
00349   convert_color(texout, densitycol3, invisovalue);
00350   voltexmap[outaddr + 2*planesz] = texout;
00351 
00352   convert_density(densityout, densityval4 * invisovalue);
00353   densitygrid[outaddr + 3*planesz] = densityout;
00354   convert_color(texout, densitycol4, invisovalue);
00355   voltexmap[outaddr + 3*planesz] = texout;
00356 #endif
00357 }
00358 
00359 
00360 __global__ static void 
00361 __launch_bounds__ ( MAXTHRDENS, MINBLOCKDENS )
00362 gaussdensity_fast_tex3f(int natoms,
00363                         const float4 * RESTRICT sorted_xyzr, 
00364                         const float4 * RESTRICT sorted_color, 
00365                         int3 volsz,
00366                         int3 acncells,
00367                         float acgridspacing,
00368                         float invacgridspacing,
00369                         const uint2 * RESTRICT cellStartEnd,
00370                         float gridspacing, unsigned int z, 
00371                         float * RESTRICT densitygrid,
00372                         float3 * RESTRICT voltexmap,
00373                         float invisovalue) {
00374   unsigned int xindex  = (blockIdx.x * blockDim.x) + threadIdx.x;
00375   unsigned int yindex  = (blockIdx.y * blockDim.y) + threadIdx.y;
00376   unsigned int zindex  = ((blockIdx.z * blockDim.z) + threadIdx.z) * GTEXUNROLL;
00377 
00378   
00379   unsigned int outaddr = zindex * volsz.x * volsz.y + 
00380                          yindex * volsz.x + xindex;
00381 
00382   
00383   if (xindex >= volsz.x || yindex >= volsz.y || zindex >= volsz.z)
00384     return;
00385 
00386   zindex += z;
00387 
00388   
00389   int xabmin = ((blockIdx.x * blockDim.x) * gridspacing - acgridspacing) * invacgridspacing;
00390   int yabmin = ((blockIdx.y * blockDim.y) * gridspacing - acgridspacing) * invacgridspacing;
00391   int zabmin = ((z + blockIdx.z * blockDim.z * GTEXUNROLL) * gridspacing - acgridspacing) * invacgridspacing;
00392 
00393   
00394   int xabmax = (((blockIdx.x+1) * blockDim.x) * gridspacing + acgridspacing) * invacgridspacing;
00395   int yabmax = (((blockIdx.y+1) * blockDim.y) * gridspacing + acgridspacing) * invacgridspacing;
00396   int zabmax = ((z + (blockIdx.z+1) * blockDim.z * GTEXUNROLL) * gridspacing + acgridspacing) * invacgridspacing;
00397 
00398   xabmin = (xabmin < 0) ? 0 : xabmin;
00399   yabmin = (yabmin < 0) ? 0 : yabmin;
00400   zabmin = (zabmin < 0) ? 0 : zabmin;
00401   xabmax = (xabmax >= acncells.x-1) ? acncells.x-1 : xabmax;
00402   yabmax = (yabmax >= acncells.y-1) ? acncells.y-1 : yabmax;
00403   zabmax = (zabmax >= acncells.z-1) ? acncells.z-1 : zabmax;
00404 
00405   float coorx = gridspacing * xindex;
00406   float coory = gridspacing * yindex;
00407   float coorz = gridspacing * zindex;
00408 
00409   float densityval1=0.0f;
00410   float3 densitycol1=make_float3(0.0f, 0.0f, 0.0f);
00411 #if GTEXUNROLL >= 2
00412   float densityval2=0.0f;
00413   float3 densitycol2=densitycol1;
00414 #endif
00415 #if GTEXUNROLL >= 4
00416   float densityval3=0.0f;
00417   float3 densitycol3=densitycol1;
00418   float densityval4=0.0f;
00419   float3 densitycol4=densitycol1;
00420 #endif
00421 
00422   int acplanesz = acncells.x * acncells.y;
00423   int xab, yab, zab;
00424   for (zab=zabmin; zab<=zabmax; zab++) {
00425     for (yab=yabmin; yab<=yabmax; yab++) {
00426       for (xab=xabmin; xab<=xabmax; xab++) {
00427         int abcellidx = zab * acplanesz + yab * acncells.x + xab;
00428         
00429         
00430         uint2 atomstartend = cellStartEnd[abcellidx];
00431         if (atomstartend.x != GRID_CELL_EMPTY) {
00432           unsigned int atomid;
00433           for (atomid=atomstartend.x; atomid<atomstartend.y; atomid++) {
00434             float4 atom  = sorted_xyzr[atomid];
00435             float3 color = make_float3(sorted_color[atomid]);
00436             float dx = coorx - atom.x;
00437             float dy = coory - atom.y;
00438             float dxy2 = dx*dx + dy*dy;
00439 
00440             float dz = coorz - atom.z;
00441             float r21 = (dxy2 + dz*dz) * atom.w;
00442             float tmp1 = exp2f(r21);
00443             densityval1 += tmp1;
00444             densitycol1 += color * tmp1;
00445 
00446 #if GTEXUNROLL >= 2
00447             float dz2 = dz + gridspacing;
00448             float r22 = (dxy2 + dz2*dz2) * atom.w;
00449             float tmp2 = exp2f(r22);
00450             densityval2 += tmp2;
00451             densitycol2 += color * tmp2;
00452 #endif
00453 #if GTEXUNROLL >= 4
00454             float dz3 = dz2 + gridspacing;
00455             float r23 = (dxy2 + dz3*dz3) * atom.w;
00456             float tmp3 = exp2f(r23);
00457             densityval3 += tmp3;
00458             densitycol3 += color * tmp3;
00459 
00460             float dz4 = dz3 + gridspacing;
00461             float r24 = (dxy2 + dz4*dz4) * atom.w;
00462             float tmp4 = exp2f(r24);
00463             densityval4 += tmp4;
00464             densitycol4 += color * tmp4;
00465 #endif
00466           }
00467         }
00468       }
00469     }
00470   }
00471 
00472   float3 texout;
00473   densitygrid[outaddr            ] = densityval1;
00474   convert_color(texout, densitycol1, invisovalue);
00475   voltexmap[outaddr            ] = texout;
00476 
00477 #if GTEXUNROLL >= 2
00478   int planesz = volsz.x * volsz.y;
00479   densitygrid[outaddr +   planesz] = densityval2;
00480   convert_color(texout, densitycol2, invisovalue);
00481   voltexmap[outaddr +   planesz] = texout;
00482 #endif
00483 #if GTEXUNROLL >= 4
00484   densitygrid[outaddr + 2*planesz] = densityval3;
00485   convert_color(texout, densitycol3, invisovalue);
00486   voltexmap[outaddr + 2*planesz] = texout;
00487 
00488   densitygrid[outaddr + 3*planesz] = densityval4;
00489   convert_color(texout, densitycol4, invisovalue);
00490   voltexmap[outaddr + 3*planesz] = texout;
00491 #endif
00492 }
00493 
00494 
00495 __global__ static void 
00496 
00497 gaussdensity_fast(int natoms,
00498                   const float4 * RESTRICT sorted_xyzr, 
00499                   int3 volsz,
00500                   int3 acncells,
00501                   float acgridspacing,
00502                   float invacgridspacing,
00503                   const uint2 * RESTRICT cellStartEnd,
00504                   float gridspacing, unsigned int z, 
00505                   float * RESTRICT densitygrid) {
00506   unsigned int xindex  = (blockIdx.x * blockDim.x) + threadIdx.x;
00507   unsigned int yindex  = (blockIdx.y * blockDim.y) + threadIdx.y;
00508   unsigned int zindex  = ((blockIdx.z * blockDim.z) + threadIdx.z) * GUNROLL;
00509   unsigned int outaddr = zindex * volsz.x * volsz.y + 
00510                          yindex * volsz.x + 
00511                          xindex;
00512 
00513   
00514   if (xindex >= volsz.x || yindex >= volsz.y || zindex >= volsz.z)
00515     return;
00516 
00517   zindex += z;
00518 
00519   
00520   int xabmin = ((blockIdx.x * blockDim.x) * gridspacing - acgridspacing) * invacgridspacing;
00521   int yabmin = ((blockIdx.y * blockDim.y) * gridspacing - acgridspacing) * invacgridspacing;
00522   int zabmin = ((z + blockIdx.z * blockDim.z * GUNROLL) * gridspacing - acgridspacing) * invacgridspacing;
00523 
00524   
00525   int xabmax = (((blockIdx.x+1) * blockDim.x) * gridspacing + acgridspacing) * invacgridspacing;
00526   int yabmax = (((blockIdx.y+1) * blockDim.y) * gridspacing + acgridspacing) * invacgridspacing;
00527   int zabmax = ((z + (blockIdx.z+1) * blockDim.z * GUNROLL) * gridspacing + acgridspacing) * invacgridspacing;
00528 
00529   xabmin = (xabmin < 0) ? 0 : xabmin;
00530   yabmin = (yabmin < 0) ? 0 : yabmin;
00531   zabmin = (zabmin < 0) ? 0 : zabmin;
00532   xabmax = (xabmax >= acncells.x-1) ? acncells.x-1 : xabmax;
00533   yabmax = (yabmax >= acncells.y-1) ? acncells.y-1 : yabmax;
00534   zabmax = (zabmax >= acncells.z-1) ? acncells.z-1 : zabmax;
00535 
00536   float coorx = gridspacing * xindex;
00537   float coory = gridspacing * yindex;
00538   float coorz = gridspacing * zindex;
00539 
00540   float densityval1=0.0f;
00541 #if GUNROLL >= 2
00542   float densityval2=0.0f;
00543 #endif
00544 #if GUNROLL >= 4
00545   float densityval3=0.0f;
00546   float densityval4=0.0f;
00547 #endif
00548 
00549   int acplanesz = acncells.x * acncells.y;
00550   int xab, yab, zab;
00551   for (zab=zabmin; zab<=zabmax; zab++) {
00552     for (yab=yabmin; yab<=yabmax; yab++) {
00553       for (xab=xabmin; xab<=xabmax; xab++) {
00554         int abcellidx = zab * acplanesz + yab * acncells.x + xab;
00555         uint2 atomstartend = cellStartEnd[abcellidx];
00556         if (atomstartend.x != GRID_CELL_EMPTY) {
00557           unsigned int atomid;
00558           for (atomid=atomstartend.x; atomid<atomstartend.y; atomid++) {
00559             float4 atom = sorted_xyzr[atomid];
00560             float dx = coorx - atom.x;
00561             float dy = coory - atom.y;
00562             float dxy2 = dx*dx + dy*dy;
00563   
00564             float dz = coorz - atom.z;
00565             float r21 = (dxy2 + dz*dz) * atom.w;
00566             densityval1 += exp2f(r21);
00567 
00568 #if GUNROLL >= 2
00569             float dz2 = dz + gridspacing;
00570             float r22 = (dxy2 + dz2*dz2) * atom.w;
00571             densityval2 += exp2f(r22);
00572 #endif
00573 #if GUNROLL >= 4
00574             float dz3 = dz2 + gridspacing;
00575             float r23 = (dxy2 + dz3*dz3) * atom.w;
00576             densityval3 += exp2f(r23);
00577 
00578             float dz4 = dz3 + gridspacing;
00579             float r24 = (dxy2 + dz4*dz4) * atom.w;
00580             densityval4 += exp2f(r24);
00581 #endif
00582           }
00583         }
00584       }
00585     }
00586   }
00587 
00588   densitygrid[outaddr            ] = densityval1;
00589 #if GUNROLL >= 2
00590   int planesz = volsz.x * volsz.y;
00591   densitygrid[outaddr +   planesz] = densityval2;
00592 #endif
00593 #if GUNROLL >= 4
00594   densitygrid[outaddr + 2*planesz] = densityval3;
00595   densitygrid[outaddr + 3*planesz] = densityval4;
00596 #endif
00597 }
00598 
00599 
00600 
00601 typedef struct {
00602   cudaDeviceProp deviceProp; 
00603   int dev;                   
00604   int verbose;               
00605 
00607   long int natoms;           
00608   int colorperatom;          
00609   CUDAQuickSurf::VolTexFormat vtexformat;   
00610   int acx;                   
00611   int acy;                   
00612   int acz;                   
00613   int gx;                    
00614   int gy;                    
00615   int gz;                    
00616 
00617   CUDAMarchingCubes *mc;     
00618 
00619   float *devdensity;         
00620   void *devvoltexmap;        
00621   float4 *xyzr_d;            
00622   float4 *sorted_xyzr_d;     
00623   float4 *color_d;           
00624   float4 *sorted_color_d;    
00625 
00626   unsigned int *atomIndex_d; 
00627   unsigned int *atomHash_d;  
00628   uint2 *cellStartEnd_d;     
00629 
00630   void *safety;              
00631 
00632   float3 *v3f_d;             
00633   float3 *n3f_d;             
00634   float3 *c3f_d;             
00635   char3 *n3b_d;              
00636   uchar4 *c4u_d;             
00637 } qsurf_gpuhandle;
00638 
00639 
00640 CUDAQuickSurf::CUDAQuickSurf() {
00641   voidgpu = calloc(1, sizeof(qsurf_gpuhandle));
00642   qsurf_gpuhandle *gpuh = (qsurf_gpuhandle *) voidgpu;
00643 
00644   if (getenv("VMDQUICKSURFVERBOSE") != NULL) {
00645     gpuh->verbose = 1;
00646     int tmp = atoi(getenv("VMDQUICKSURFVERBOSE"));
00647     if (tmp > 0)
00648       gpuh->verbose = tmp;
00649   }
00650 
00651   gpuh->vtexformat = RGB3F; 
00652 
00653   if (cudaGetDevice(&gpuh->dev) != cudaSuccess) {
00654     gpuh->dev = -1; 
00655   }
00656 
00657   if (cudaGetDeviceProperties(&gpuh->deviceProp, gpuh->dev) != cudaSuccess) {
00658     cudaError_t err = cudaGetLastError(); 
00659     gpuh->dev = -1; 
00660   }
00661 }
00662 
00663 
00664 CUDAQuickSurf::~CUDAQuickSurf() {
00665   qsurf_gpuhandle *gpuh = (qsurf_gpuhandle *) voidgpu;
00666 
00667   
00668   free_bufs();
00669 
00670   
00671   delete gpuh->mc;
00672 
00673   free(voidgpu);
00674 }
00675 
00676 
00677 int CUDAQuickSurf::free_bufs() {
00678   qsurf_gpuhandle *gpuh = (qsurf_gpuhandle *) voidgpu;
00679 
00680   
00681   gpuh->natoms = 0;
00682   gpuh->colorperatom = 0;
00683   gpuh->acx = 0;
00684   gpuh->acy = 0;
00685   gpuh->acz = 0;
00686   gpuh->gx = 0;
00687   gpuh->gy = 0;
00688   gpuh->gz = 0;
00689 
00690   if (gpuh->safety != NULL)
00691     cudaFree(gpuh->safety);
00692   gpuh->safety=NULL;
00693 
00694   if (gpuh->devdensity != NULL)
00695     cudaFree(gpuh->devdensity);
00696   gpuh->devdensity=NULL;
00697 
00698   if (gpuh->devvoltexmap != NULL)
00699     cudaFree(gpuh->devvoltexmap);
00700   gpuh->devvoltexmap=NULL;
00701 
00702   if (gpuh->xyzr_d != NULL)
00703     cudaFree(gpuh->xyzr_d);
00704   gpuh->xyzr_d=NULL;
00705 
00706   if (gpuh->sorted_xyzr_d != NULL)
00707     cudaFree(gpuh->sorted_xyzr_d);  
00708   gpuh->sorted_xyzr_d=NULL;
00709 
00710   if (gpuh->color_d != NULL)
00711     cudaFree(gpuh->color_d);
00712   gpuh->color_d=NULL;
00713 
00714   if (gpuh->sorted_color_d != NULL)
00715     cudaFree(gpuh->sorted_color_d);
00716   gpuh->sorted_color_d=NULL;
00717 
00718   if (gpuh->atomIndex_d != NULL)
00719     cudaFree(gpuh->atomIndex_d);
00720   gpuh->atomIndex_d=NULL;
00721 
00722   if (gpuh->atomHash_d != NULL)
00723     cudaFree(gpuh->atomHash_d);
00724   gpuh->atomHash_d=NULL;
00725 
00726   if (gpuh->cellStartEnd_d != NULL)
00727     cudaFree(gpuh->cellStartEnd_d);
00728   gpuh->cellStartEnd_d=NULL;
00729 
00730   if (gpuh->v3f_d != NULL)
00731     cudaFree(gpuh->v3f_d);
00732   gpuh->v3f_d=NULL;
00733 
00734   if (gpuh->n3f_d != NULL)
00735     cudaFree(gpuh->n3f_d);
00736   gpuh->n3f_d=NULL;
00737 
00738   if (gpuh->c3f_d != NULL)
00739     cudaFree(gpuh->c3f_d);
00740   gpuh->c3f_d=NULL;
00741 
00742   if (gpuh->n3b_d != NULL)
00743     cudaFree(gpuh->n3b_d);
00744   gpuh->n3b_d=NULL;
00745 
00746   if (gpuh->c4u_d != NULL)
00747     cudaFree(gpuh->c4u_d);
00748   gpuh->c4u_d=NULL;
00749 
00750 
00751   return 0;
00752 }
00753 
00754 
00755 int CUDAQuickSurf::check_bufs(long int natoms, int colorperatom, 
00756                               VolTexFormat vtexformat, 
00757                               int acx, int acy, int acz,
00758                               int gx, int gy, int gz) {
00759   qsurf_gpuhandle *gpuh = (qsurf_gpuhandle *) voidgpu;
00760 
00761   
00762   
00763   
00764   
00765   
00766   if (natoms <= gpuh->natoms &&
00767       colorperatom <= gpuh->colorperatom &&
00768       vtexformat == gpuh->vtexformat &&
00769       (acx*acy*acz) <= (gpuh->acx * gpuh->acy * gpuh->acz) && 
00770       (gx*gy*gz) <= (gpuh->gx * gpuh->gy * gpuh->gz))
00771     return 0;
00772  
00773   return -1; 
00774 }
00775 
00776 
00777 int CUDAQuickSurf::alloc_bufs(long int natoms, int colorperatom, 
00778                               VolTexFormat vtexformat, 
00779                               int acx, int acy, int acz,
00780                               int gx, int gy, int gz) {
00781   qsurf_gpuhandle *gpuh = (qsurf_gpuhandle *) voidgpu;
00782 
00783   
00784   
00785   if (check_bufs(natoms, colorperatom, vtexformat, acx, acy, acz, gx, gy, gz) == 0)
00786     return 0;
00787 
00788   
00789   
00790   free_bufs();
00791 
00792   long int acncells = ((long) acx) * ((long) acy) * ((long) acz);
00793   long int ncells = ((long) gx) * ((long) gy) * ((long) gz);
00794   long int volmemsz = ncells * sizeof(float);
00795   long int chunkmaxverts = 3L * ncells; 
00796   long int MCsz = CUDAMarchingCubes::MemUsageMC(gx, gy, gz);
00797 
00798   
00799   
00800   
00801   long int totalmemsz = 
00802     volmemsz +                                       
00803     (2L * natoms * sizeof(unsigned int)) +           
00804     (acncells * sizeof(uint2)) +                     
00805     (3L * chunkmaxverts * sizeof(float3)) +          
00806     natoms*sizeof(float4) +                          
00807     8L * gx * gy * sizeof(float) +                   
00808     MCsz;                                            
00809 
00810   cudaMalloc((void**)&gpuh->devdensity, volmemsz);
00811   if (colorperatom) {
00812     int voltexsz = 0;
00813     switch (vtexformat) {
00814       case RGB3F:
00815         voltexsz = ncells * sizeof(float3);
00816         break;
00817 
00818       case RGB4U:
00819         voltexsz = ncells * sizeof(uchar4);
00820         break;
00821     }
00822     gpuh->vtexformat = vtexformat;
00823 
00824     cudaMalloc((void**)&gpuh->devvoltexmap, voltexsz);
00825     cudaMalloc((void**)&gpuh->color_d, natoms * sizeof(float4));
00826     cudaMalloc((void**)&gpuh->sorted_color_d, natoms * sizeof(float4));
00827     totalmemsz += 2 * natoms * sizeof(float4);
00828   }
00829   cudaMalloc((void**)&gpuh->xyzr_d, natoms * sizeof(float4));
00830   cudaMalloc((void**)&gpuh->sorted_xyzr_d, natoms * sizeof(float4));
00831   cudaMalloc((void**)&gpuh->atomIndex_d, natoms * sizeof(unsigned int));
00832   cudaMalloc((void**)&gpuh->atomHash_d, natoms * sizeof(unsigned int));
00833   cudaMalloc((void**)&gpuh->cellStartEnd_d, acncells * sizeof(uint2));
00834 
00835   
00836   cudaMalloc((void**)&gpuh->v3f_d, 3 * chunkmaxverts * sizeof(float3));
00837 #if 1
00838   
00839   cudaMalloc((void**)&gpuh->n3b_d, 3 * chunkmaxverts * sizeof(char3));
00840   totalmemsz += 3 * chunkmaxverts * sizeof(char3);   
00841 #else
00842   cudaMalloc((void**)&gpuh->n3f_d, 3 * chunkmaxverts * sizeof(float3));
00843   totalmemsz += 3 * chunkmaxverts * sizeof(float3);  
00844 #endif
00845 #if 1
00846   cudaMalloc((void**)&gpuh->c4u_d, 3 * chunkmaxverts * sizeof(uchar4));
00847   totalmemsz += 3 * chunkmaxverts * sizeof(uchar4);  
00848 #else
00849   cudaMalloc((void**)&gpuh->c3f_d, 3 * chunkmaxverts * sizeof(float3));
00850   totalmemsz += 3 * chunkmaxverts * sizeof(float3);  
00851 #endif
00852 
00853   
00854   
00855   
00856   
00857   
00858   cudaMalloc(&gpuh->safety, 
00859              natoms*sizeof(float4) +                          
00860              8 * gx * gy * sizeof(float) +                    
00861              MCsz);                                           
00862 
00863   if (gpuh->verbose > 1)
00864     printf("Total QuickSurf mem size: %ld MB\n", totalmemsz / (1024*1024));
00865 
00866   cudaError_t err = cudaGetLastError();
00867   if (err != cudaSuccess)
00868     return -1;
00869 
00870   
00871   
00872   gpuh->natoms = natoms;
00873   gpuh->colorperatom = colorperatom;
00874 
00875   gpuh->acx = acx;
00876   gpuh->acy = acy;
00877   gpuh->acz = acz;
00878 
00879   gpuh->gx = gx;
00880   gpuh->gy = gy;
00881   gpuh->gz = gz;
00882 
00883   return 0;
00884 }
00885 
00886 
00887 int CUDAQuickSurf::get_chunk_bufs(int testexisting,
00888                                   long int natoms, int colorperatom, 
00889                                   VolTexFormat vtexformat,
00890                                   int acx, int acy, int acz,
00891                                   int gx, int gy, int gz,
00892                                   int &cx, int &cy, int &cz,
00893                                   int &sx, int &sy, int &sz) {
00894   dim3 Bsz(GBLOCKSZX, GBLOCKSZY, GBLOCKSZZ);
00895   if (colorperatom)
00896     Bsz.z = GTEXBLOCKSZZ;
00897 
00898   cudaError_t err = cudaGetLastError(); 
00899 
00900   
00901   
00902   
00903   
00904   cz <<= 1; 
00905   int chunkiters = 0;
00906   int chunkallocated = 0;
00907   while (!chunkallocated) {
00908     
00909     chunkiters++;
00910     cz >>= 1;
00911 
00912     
00913     
00914     
00915     
00916     if (cz != gz)
00917       cz-=4;
00918 
00919     
00920     
00921     cz += (8 - (cz % 8));
00922 
00923     
00924     
00925     
00926     sx = cx;
00927     sy = cy;
00928     sz = cz;
00929 
00930     
00931     
00932     cz+=4;
00933 
00934 #if 0
00935     printf("  Trying slab size: %d (test: %d)\n", sz, testexisting);
00936 #endif
00937 
00938 #if 1
00939     
00940     
00941     dim3 tGsz((sx+Bsz.x-1) / Bsz.x, 
00942               (sy+Bsz.y-1) / Bsz.y,
00943               (sz+(Bsz.z*GUNROLL)-1) / (Bsz.z * GUNROLL));
00944     if (colorperatom) {
00945       tGsz.z = (sz+(Bsz.z*GTEXUNROLL)-1) / (Bsz.z * GTEXUNROLL);
00946     }
00947     if (tGsz.x * tGsz.y * tGsz.z > 65535)
00948       continue; 
00949 #endif
00950 
00951     
00952     
00953     
00954     if (sz <= 8) {
00955       return -1;
00956     }
00957  
00958     if (testexisting) {
00959       if (check_bufs(natoms, colorperatom, vtexformat, acx, acy, acz, cx, cy, cz) != 0)
00960         continue;
00961     } else {
00962       if (alloc_bufs(natoms, colorperatom, vtexformat, acx, acy, acz, cx, cy, cz) != 0)
00963         continue;
00964     }
00965 
00966     chunkallocated=1;
00967   }
00968 
00969   return 0;
00970 }
00971 
00972 
00973 int CUDAQuickSurf::calc_surf(long int natoms, const float *xyzr_f, 
00974                              const float *colors_f,
00975                              int colorperatom, VolTexFormat voltexformat,
00976                              float *origin, int *numvoxels, float maxrad,
00977                              float radscale, float gridspacing, 
00978                              float isovalue, float gausslim,
00979                              VMDDisplayList *cmdList) {
00980   qsurf_gpuhandle *gpuh = (qsurf_gpuhandle *) voidgpu;
00981 
00982   
00983   
00984   if (gpuh->dev < 0)
00985     return -1;
00986 
00987   
00988   
00989   
00990   
00991   
00992   if (gpuh->deviceProp.major < 2) {
00993     return -1;
00994   }
00995 
00996   
00997   wkf_timerhandle globaltimer = wkf_timer_create();
00998   wkf_timer_start(globaltimer);
00999 
01000   int vtexsize = 0;
01001 
01002 
01003   switch (voltexformat) {
01004     case RGB3F: 
01005       vtexsize = sizeof(float3);
01006       break;
01007 
01008     case RGB4U: 
01009       vtexsize = sizeof(uchar4);
01010       break;
01011   }
01012 
01013   float4 *colors = (float4 *) colors_f;
01014   int chunkmaxverts=0;
01015   int chunknumverts=0; 
01016   int numverts=0;
01017   int numfacets=0;
01018 
01019   
01020   float acgridspacing = gausslim * radscale * maxrad;
01021 
01022   
01023   if (acgridspacing < gridspacing)
01024     acgridspacing = gridspacing;
01025 
01026   
01027   
01028   
01029   int3 volsz = make_int3(numvoxels[0], numvoxels[1], numvoxels[2]);
01030   int3 chunksz = volsz;
01031   int3 slabsz = volsz;
01032 
01033   
01034   if (getenv("VMDQUICKSURFMINMEM")) {
01035     if (volsz.z > 32) {
01036       slabsz.z = chunksz.z = 16;
01037     }
01038   }
01039 
01040   int3 accelcells;
01041   accelcells.x = max(int((volsz.x*gridspacing) / acgridspacing), 1);
01042   accelcells.y = max(int((volsz.y*gridspacing) / acgridspacing), 1);
01043   accelcells.z = max(int((volsz.z*gridspacing) / acgridspacing), 1);
01044 
01045   dim3 Bsz(GBLOCKSZX, GBLOCKSZY, GBLOCKSZZ);
01046   if (colorperatom)
01047     Bsz.z = GTEXBLOCKSZZ;
01048 
01049   
01050   
01051   
01052   if (gpuh->natoms == 0 ||
01053       get_chunk_bufs(1, natoms, colorperatom, voltexformat,
01054                      accelcells.x, accelcells.y, accelcells.z,
01055                      volsz.x, volsz.y, volsz.z,
01056                      chunksz.x, chunksz.y, chunksz.z,
01057                      slabsz.x, slabsz.y, slabsz.z) == -1) {
01058     
01059     
01060     chunksz = volsz;
01061     slabsz = volsz;
01062 
01063     
01064     
01065     if (get_chunk_bufs(0, natoms, colorperatom, voltexformat,
01066                        accelcells.x, accelcells.y, accelcells.z,
01067                        volsz.x, volsz.y, volsz.z,
01068                        chunksz.x, chunksz.y, chunksz.z,
01069                        slabsz.x, slabsz.y, slabsz.z) == -1) {
01070       wkf_timer_destroy(globaltimer);
01071       return -1;
01072     }
01073   }
01074   chunkmaxverts = 3 * chunksz.x * chunksz.y * chunksz.z;
01075 
01076   
01077   
01078   if (gpuh->safety != NULL)
01079     cudaFree(gpuh->safety);
01080   gpuh->safety = NULL;
01081 
01082   if (gpuh->verbose > 1) {
01083     printf("  Using GPU chunk size: %d\n", chunksz.z);
01084     printf("  Accel grid(%d, %d, %d) spacing %f\n",
01085            accelcells.x, accelcells.y, accelcells.z, acgridspacing);
01086   }
01087 
01088   
01089   
01090   int i, i4;
01091   float4 *xyzr = (float4 *) malloc(natoms * sizeof(float4));
01092   float log2e = log2f(2.718281828f);
01093   for (i=0,i4=0; i<natoms; i++,i4+=4) {
01094     xyzr[i].x = xyzr_f[i4    ];
01095     xyzr[i].y = xyzr_f[i4 + 1];
01096     xyzr[i].z = xyzr_f[i4 + 2];
01097 
01098     float scaledrad = xyzr_f[i4 + 3] * radscale;
01099     float arinv = -1.0f * log2e / (2.0f*scaledrad*scaledrad);
01100 
01101     xyzr[i].w = arinv;
01102   }
01103   cudaMemcpy(gpuh->xyzr_d, xyzr, natoms * sizeof(float4), cudaMemcpyHostToDevice);
01104   free(xyzr);
01105 
01106   if (colorperatom)
01107     cudaMemcpy(gpuh->color_d, colors, natoms * sizeof(float4), cudaMemcpyHostToDevice);
01108  
01109   
01110   if (vmd_cuda_build_density_atom_grid(natoms, gpuh->xyzr_d, gpuh->color_d,
01111                                        gpuh->sorted_xyzr_d,
01112                                        gpuh->sorted_color_d,
01113                                        gpuh->atomIndex_d, gpuh->atomHash_d,
01114                                        gpuh->cellStartEnd_d, 
01115                                        accelcells, 1.0f / acgridspacing) != 0) {
01116     wkf_timer_destroy(globaltimer);
01117     free_bufs();
01118     return -1;
01119   }
01120 
01121   double sorttime = wkf_timer_timenow(globaltimer);
01122   double lastlooptime = sorttime;
01123 
01124   double densitykerneltime = 0.0f;
01125   double densitytime = 0.0f;
01126   double mckerneltime = 0.0f;
01127   double mctime = 0.0f; 
01128   double copycalltime = 0.0f;
01129   double copytime = 0.0f;
01130 
01131   float *volslab_d = NULL;
01132   void *texslab_d = NULL;
01133 
01134   int lzplane = GBLOCKSZZ * GUNROLL;
01135   if (colorperatom)
01136     lzplane = GTEXBLOCKSZZ * GTEXUNROLL;
01137 
01138   
01139   uint3 mgsz = make_uint3(chunksz.x, chunksz.y, chunksz.z);
01140   if (gpuh->mc == NULL) {
01141     gpuh->mc = new CUDAMarchingCubes(); 
01142     if (!gpuh->mc->Initialize(mgsz)) {
01143       printf("QuickSurf call to MC Initialize() failed\n");
01144     }
01145   } else {
01146     uint3 mcmaxgridsize = gpuh->mc->GetMaxGridSize();
01147     if (slabsz.x > mcmaxgridsize.x ||
01148         slabsz.y > mcmaxgridsize.y ||
01149         slabsz.z > mcmaxgridsize.z) {
01150       if (gpuh->verbose)
01151         printf("*** QuickSurf Allocating new MC object...\n");
01152  
01153       
01154       delete gpuh->mc;
01155 
01156       
01157       gpuh->mc = new CUDAMarchingCubes(); 
01158 
01159       if (!gpuh->mc->Initialize(mgsz)) {
01160         printf("QuickSurf MC Initialize() call failed to recreate MC object\n");
01161       }
01162     } 
01163   }
01164 
01165   int z;
01166   int chunkcount=0;
01167   float invacgridspacing = 1.0f / acgridspacing;
01168   float invisovalue = 1.0f / isovalue;
01169   for (z=0; z<volsz.z; z+=slabsz.z) {
01170     int3 curslab = slabsz;
01171     if (z+curslab.z > volsz.z)
01172       curslab.z = volsz.z - z; 
01173   
01174     int slabplanesz = curslab.x * curslab.y;
01175 
01176     dim3 Gsz((curslab.x+Bsz.x-1) / Bsz.x, 
01177              (curslab.y+Bsz.y-1) / Bsz.y,
01178              (curslab.z+(Bsz.z*GUNROLL)-1) / (Bsz.z * GUNROLL));
01179     if (colorperatom)
01180       Gsz.z = (curslab.z+(Bsz.z*GTEXUNROLL)-1) / (Bsz.z * GTEXUNROLL);
01181 
01182     
01183     
01184     dim3 Gszslice = Gsz;
01185 
01186     if (gpuh->verbose > 1) {
01187       printf("CUDA device %d, grid size %dx%dx%d\n", 
01188              0, Gsz.x, Gsz.y, Gsz.z);
01189       printf("CUDA: vol(%d,%d,%d) accel(%d,%d,%d)\n",
01190              curslab.x, curslab.y, curslab.z,
01191              accelcells.x, accelcells.y, accelcells.z);
01192       printf("Z=%d, curslab.z=%d\n", z, curslab.z);
01193     }
01194 
01195     
01196     
01197     
01198     if (z == 0) {
01199       volslab_d = gpuh->devdensity;
01200       if (colorperatom)
01201         texslab_d = gpuh->devvoltexmap;
01202     } else {
01203       int fourplanes = 4 * slabplanesz;
01204       cudaMemcpy(gpuh->devdensity,
01205                  volslab_d + (slabsz.z-4) * slabplanesz, 
01206                  fourplanes * sizeof(float), cudaMemcpyDeviceToDevice);
01207       volslab_d = gpuh->devdensity + fourplanes;
01208 
01209       if (colorperatom) {
01210         cudaMemcpy(gpuh->devvoltexmap,
01211                    ((unsigned char *) texslab_d) + (slabsz.z-4) * slabplanesz * vtexsize, 
01212                    fourplanes * vtexsize, cudaMemcpyDeviceToDevice);
01213         texslab_d = ((unsigned char *) gpuh->devvoltexmap) + fourplanes * vtexsize;
01214       }
01215     }
01216 
01217     
01218     for (int lz=0; lz<Gsz.z; lz+=Gszslice.z) {
01219       int lzinc = lz * lzplane;
01220       float *volslice_d = volslab_d + lzinc * slabplanesz;
01221 
01222       if (colorperatom) {
01223         void *texslice_d = ((unsigned char *) texslab_d) + lzinc * slabplanesz * vtexsize;
01224         switch (voltexformat) {
01225           case RGB3F:
01226             gaussdensity_fast_tex3f<<<Gszslice, Bsz, 0>>>(natoms, 
01227                 gpuh->sorted_xyzr_d, gpuh->sorted_color_d, 
01228                 curslab, accelcells, acgridspacing, invacgridspacing, 
01229                 gpuh->cellStartEnd_d, gridspacing, z+lzinc,
01230                 volslice_d, (float3 *) texslice_d, invisovalue);
01231             break;
01232 
01233           case RGB4U:
01234             gaussdensity_fast_tex_norm<float, uchar4><<<Gszslice, Bsz, 0>>>(natoms, 
01235                 gpuh->sorted_xyzr_d, gpuh->sorted_color_d, 
01236                 curslab, accelcells, acgridspacing, invacgridspacing, 
01237                 gpuh->cellStartEnd_d, gridspacing, z+lzinc,
01238                 volslice_d, (uchar4 *) texslice_d, invisovalue);
01239             break;
01240         }
01241       } else {
01242         gaussdensity_fast<<<Gszslice, Bsz, 0>>>(natoms, gpuh->sorted_xyzr_d, 
01243             curslab, accelcells, acgridspacing, invacgridspacing, 
01244             gpuh->cellStartEnd_d, gridspacing, z+lzinc, volslice_d);
01245       }
01246     }
01247     cudaDeviceSynchronize(); 
01248     densitykerneltime = wkf_timer_timenow(globaltimer);
01249 
01250 #if 0
01251     printf("  CUDA mcubes..."); fflush(stdout);
01252 #endif
01253 
01254     uint3 gvsz = make_uint3(curslab.x, curslab.y, curslab.z);
01255 
01256     
01257     
01258     
01259     if (z != 0)
01260       gvsz.z=curslab.z + 4;
01261 
01262     float3 bbox = make_float3(gvsz.x * gridspacing, gvsz.y * gridspacing,
01263                               gvsz.z * gridspacing);
01264 
01265     float3 gorigin = make_float3(origin[0], origin[1], 
01266                                  origin[2] + (z * gridspacing));
01267     if (z != 0)
01268       gorigin.z = origin[2] + ((z-4) * gridspacing);
01269 
01270 #if 0
01271 printf("\n  ... vsz: %d %d %d\n", gvsz.x, gvsz.y, gvsz.z);
01272 printf("  ... org: %.2f %.2f %.2f\n", gorigin.x, gorigin.y, gorigin.z);
01273 printf("  ... bxs: %.2f %.2f %.2f\n", bbox.x, bbox.y, bbox.y);
01274 printf("  ... bbe: %.2f %.2f %.2f\n", gorigin.x+bbox.x, gorigin.y+bbox.y, gorigin.z+bbox.z);
01275 #endif
01276 
01277     
01278     
01279     
01280     
01281     int skipstartplane=0;
01282     int skipendplane=0;
01283     if (chunksz.z < volsz.z) {
01284       
01285       if (z != 0)
01286         skipstartplane=1;
01287 
01288       
01289       if (z+curslab.z < volsz.z)
01290         skipendplane=1;
01291     }
01292 
01293     
01294     
01295     
01296 
01297     
01298     
01299     if (colorperatom && (voltexformat == RGB4U)) {
01300       
01301       
01302       gpuh->mc->SetIsovalue(1.0f);
01303     } else {
01304       gpuh->mc->SetIsovalue(isovalue);
01305     }
01306 
01307     int mcstat = 0;
01308     switch (voltexformat) {
01309       case RGB3F:
01310         mcstat = gpuh->mc->SetVolumeData(gpuh->devdensity, 
01311                                          (float3 *) gpuh->devvoltexmap,
01312                                          gvsz, gorigin, bbox, true);
01313         break;
01314 
01315       case RGB4U:
01316         mcstat = gpuh->mc->SetVolumeData(gpuh->devdensity, 
01317                                          (uchar4 *) gpuh->devvoltexmap,
01318                                          gvsz, gorigin, bbox, true);
01319         break;
01320     }
01321     if (!mcstat) {
01322       printf("QuickSurf call to MC SetVolumeData() failed\n");
01323     }
01324 
01325     
01326     if (skipstartplane || skipendplane) {
01327       uint3 volstart = make_uint3(0, 0, 0);
01328       uint3 volend = make_uint3(gvsz.x, gvsz.y, gvsz.z);
01329 
01330       if (skipstartplane)
01331         volstart.z = 2;
01332 
01333       if (skipendplane)
01334         volend.z = gvsz.z - 2;
01335 
01336       gpuh->mc->SetSubVolume(volstart, volend);
01337     }
01338     if (gpuh->n3b_d) {
01339       gpuh->mc->computeIsosurface(gpuh->v3f_d, gpuh->n3b_d, 
01340                                   gpuh->c4u_d, chunkmaxverts);
01341     } else if (gpuh->c4u_d) {
01342       gpuh->mc->computeIsosurface(gpuh->v3f_d, gpuh->n3f_d, 
01343                                   gpuh->c4u_d, chunkmaxverts);
01344     } else {
01345       gpuh->mc->computeIsosurface(gpuh->v3f_d, gpuh->n3f_d, 
01346                                   gpuh->c3f_d, chunkmaxverts);
01347     }
01348     chunknumverts = gpuh->mc->GetVertexCount();
01349 
01350 #if 0
01351     printf("generated %d vertices, max vert limit: %d\n", chunknumverts, chunkmaxverts);
01352 #endif
01353     if (chunknumverts == chunkmaxverts)
01354       printf("  *** QuickSurf exceeded marching cubes vertex limit (%d verts)\n", chunknumverts);
01355 
01356     cudaDeviceSynchronize(); 
01357     mckerneltime = wkf_timer_timenow(globaltimer);
01358 
01359     
01360     if (chunknumverts > 0) {
01361       DispCmdTriMesh cmdTriMesh;
01362       if (colorperatom) {
01363         
01364         if (gpuh->n3b_d) {
01365           cmdTriMesh.cuda_putdata((const float *) gpuh->v3f_d, 
01366                                   (const char *) gpuh->n3b_d, 
01367                                   (const unsigned char *) gpuh->c4u_d,
01368                                   chunknumverts/3, cmdList);
01369         } else if (gpuh->c4u_d) {
01370           cmdTriMesh.cuda_putdata((const float *) gpuh->v3f_d, 
01371                                   (const float *) gpuh->n3f_d, 
01372                                   (const unsigned char *) gpuh->c4u_d,
01373                                   chunknumverts/3, cmdList);
01374         } else {
01375           cmdTriMesh.cuda_putdata((const float *) gpuh->v3f_d, 
01376                                   (const float *) gpuh->n3f_d, 
01377                                   (const float *) gpuh->c3f_d,
01378                                   chunknumverts/3, cmdList);
01379         }
01380       } else {
01381         
01382         if (gpuh->n3b_d) {
01383           cmdTriMesh.cuda_putdata((const float *) gpuh->v3f_d, 
01384                                   (const char *) gpuh->n3b_d, 
01385                                   (const unsigned char *) NULL,
01386                                   chunknumverts/3, cmdList);
01387         } else {
01388           cmdTriMesh.cuda_putdata((const float *) gpuh->v3f_d,
01389                                   (const float *) gpuh->n3f_d, 
01390                                   (const float *) NULL,
01391                                   chunknumverts/3, cmdList);
01392         }
01393       }
01394     }
01395     numverts+=chunknumverts;
01396     numfacets+=chunknumverts/3;
01397 
01398 #if 0
01399    
01400    
01401    
01402     int l;
01403     int vertstart = 3 * numverts;
01404     int vertbufsz = 3 * (numverts + chunknumverts) * sizeof(float);
01405     int facebufsz = (numverts + chunknumverts) * sizeof(int);
01406     int chunkvertsz = 3 * chunknumverts * sizeof(float);
01407 
01408     v = (float*) realloc(v, vertbufsz);
01409     n = (float*) realloc(n, vertbufsz);
01410     c = (float*) realloc(c, vertbufsz);
01411     f = (int*)   realloc(f, facebufsz);
01412     cudaMemcpy(v+vertstart, gpuh->v3f_d, chunkvertsz, cudaMemcpyDeviceToHost);
01413     cudaMemcpy(n+vertstart, gpuh->n3f_d, chunkvertsz, cudaMemcpyDeviceToHost);
01414     if (colorperatom) {
01415       cudaMemcpy(c+vertstart, gpuh->c3f_d, chunkvertsz, cudaMemcpyDeviceToHost);
01416     } else {
01417       float *color = c+vertstart;
01418       for (l=0; l<chunknumverts*3; l+=3) {
01419         color[l + 0] = colors[0].x;
01420         color[l + 1] = colors[0].y;
01421         color[l + 2] = colors[0].z;
01422       }
01423     }
01424     for (l=numverts; l<numverts+chunknumverts; l++) {
01425       f[l]=l;
01426     }
01427     numverts+=chunknumverts;
01428     numfacets+=chunknumverts/3;
01429 #endif
01430 
01431     copycalltime = wkf_timer_timenow(globaltimer);
01432 
01433     densitytime += densitykerneltime - lastlooptime;
01434     mctime += mckerneltime - densitykerneltime;
01435     copytime += copycalltime - mckerneltime;
01436 
01437     lastlooptime = wkf_timer_timenow(globaltimer);
01438 
01439     chunkcount++; 
01440   }
01441 
01442   
01443   
01444   cudaError_t err = cudaGetLastError();
01445 
01446   wkf_timer_stop(globaltimer);
01447   double totalruntime = wkf_timer_time(globaltimer);
01448   wkf_timer_destroy(globaltimer);
01449 
01450   
01451   
01452   if (err != cudaSuccess) { 
01453     printf("CUDA error: %s, %s line %d\n", cudaGetErrorString(err), __FILE__, __LINE__);
01454     return -1;
01455   }
01456 
01457   if (gpuh->verbose) {
01458     printf("  GPU generated %d vertices, %d facets, in %d passes\n", 
01459            numverts, numfacets, chunkcount);
01460     printf("  GPU time (%s): %.3f [sort: %.3f density %.3f mcubes: %.3f copy: %.3f]\n", 
01461            "SM >= 2.x", totalruntime, sorttime, densitytime, mctime, copytime);
01462   }
01463 
01464   return 0;
01465 }
01466 
01467 
01468 
01469 
01470