CUDAQuickSurf.cu Source File

00001 /***************************************************************************
00002  *cr
00003  *cr            (C) Copyright 1995-2019 The Board of Trustees of the
00004  *cr                        University of Illinois
00005  *cr                         All Rights Reserved
00006  *cr
00007  ***************************************************************************/
00008 /***************************************************************************
00009  * RCS INFORMATION:
00010  *
00011  *      $RCSfile: CUDAQuickSurf.cu,v $
00012  *      $Author: johns $        $Locker:  $             $State: Exp $
00013  *      $Revision: 1.97 $      $Date: 2021/12/21 05:30:12 $
00014  *
00015  ***************************************************************************/
00042 #include <stdio.h>
00043 #include <stdlib.h>
00044 #include <string.h>
00045 #include <cuda.h>
00046 #if CUDART_VERSION >= 9000
00047 #include <cuda_fp16.h>  // need to explicitly include for CUDA 9.0
00048 #endif
00049 
00050 #if CUDART_VERSION < 4000
00051 #error The VMD QuickSurf feature requires CUDA 4.0 or later
00052 #endif
00053 
00054 #include "Inform.h"
00055 #include "utilities.h"
00056 #include "WKFThreads.h"
00057 #include "WKFUtils.h"
00058 #include "CUDAKernels.h" 
00059 #include "CUDASpatialSearch.h"
00060 #include "CUDAMarchingCubes.h"
00061 #include "CUDAQuickSurf.h" 
00062 
00063 #include "DispCmds.h"
00064 #include "VMDDisplayList.h"
00065 
00066 #if 1
00067 #define CUERR { cudaError_t err; \
00068   if ((err = cudaGetLastError()) != cudaSuccess) { \
00069   printf("CUDA error: %s, %s line %d\n", cudaGetErrorString(err), __FILE__, __LINE__); \
00070   printf("Thread aborting...\n"); \
00071   return NULL; }}
00072 #else
00073 #define CUERR
00074 #endif
00075 
00076 
00077 //
00078 // general operator overload helper routines
00079 //
00080 
00081 // "*" operator
00082 inline __host__ __device__ float3 operator*(float3 a, float b) {
00083   return make_float3(b * a.x, b * a.y, b * a.z);
00084 }
00085 
00086 // "+=" operator
00087 inline __host__ __device__ void operator+=(float3 &a, float3 b) {
00088   a.x += b.x;
00089   a.y += b.y;
00090   a.z += b.z;
00091 }
00092 
00093 // down-convert float4 to float3
00094 inline __device__ float3 make_float3(float4 a) {
00095   float3 b;
00096   b.x = a.x;
00097   b.y = a.y;
00098   b.z = a.z;
00099   return b;
00100 }
00101 
00102 
00103 //
00104 // density format conversion routines
00105 //
00106 
00107 // no-op conversion for float to float
00108 inline __device__ void convert_density(float & df, float df2) {
00109   df = df2;
00110 }
00111 
00112 // Convert float (32-bit) to half-precision (16-bit floating point) stored
00113 // into an unsigned short (16-bit integer type). 
00114 inline __device__ void convert_density(unsigned short & dh, float df2) {
00115   dh = __float2half_rn(df2);
00116 }
00117 
00118 
00119 
00120 //
00121 // color format conversion routines
00122 //
00123 
00124 // conversion for float3 to float3
00125 inline __device__ void convert_color(float3 & cf, float3 cf2, float invisovalue) {
00126   cf = cf2 * invisovalue;
00127 }
00128 
00129 
00130 // Convert float3 colors to uchar4, performing the necessary bias, scaling, 
00131 // and range clamping so we don't encounter integer wraparound, etc.
00132 inline __device__ void convert_color(uchar4 & cu, float3 cf, float invisovalue) {
00133   // conversion to GLubyte format, Table 2.6, p. 44 of OpenGL spec 1.2.1
00134   // c = f * (2^8-1)
00135 
00136 #if 1
00137   float3 cftmp = cf * invisovalue;
00138   cu = make_uchar4(fminf(cftmp.x * 255.0f, 255.0f),
00139                    fminf(cftmp.y * 255.0f, 255.0f),
00140                    fminf(cftmp.z * 255.0f, 255.0f),
00141                    255); 
00142 #else
00143   // scale color values to prevent overflow, 
00144   // and convert to fixed-point representation all at once
00145   float invmaxcolscale = __frcp_rn(fmaxf(fmaxf(fmaxf(cf.x, cf.y), cf.z), 1.0f)) * 255.0f;
00146 
00147   // clamp color values to prevent integer wraparound
00148   cu = make_uchar4(cf.x * invmaxcolscale,
00149                    cf.y * invmaxcolscale,
00150                    cf.z * invmaxcolscale,
00151                    255);
00152 #endif
00153 }
00154 
00155 
00156 // convert uchar4 colors to float3
00157 inline __device__ void convert_color(float3 & cf, uchar4 cu) {
00158   const float i2f = 1.0f / 255.0f;
00159   cf.x = cu.x * i2f;
00160   cf.y = cu.y * i2f;
00161   cf.z = cu.z * i2f;
00162 }
00163 
00164 
00165 //
00166 // Restrict macro to make it easy to do perf tuning tests
00167 //
00168 #if 1 && (CUDART_VERSION >= 4000)
00169 #define RESTRICT __restrict__
00170 #else
00171 #define RESTRICT
00172 #endif
00173 
00174 // 
00175 // Parameters for linear-time range-limited gaussian density kernels
00176 //
00177 #define GGRIDSZ   8.0f
00178 #define GBLOCKSZX 8
00179 #define GBLOCKSZY 8
00180 
00181 #if 1
00182 #define GTEXBLOCKSZZ 2
00183 #define GTEXUNROLL   4
00184 #define GBLOCKSZZ    2
00185 #define GUNROLL      4
00186 #else
00187 #define GTEXBLOCKSZZ 8
00188 #define GTEXUNROLL   1
00189 #define GBLOCKSZZ    8
00190 #define GUNROLL      1
00191 #endif
00192 
00193 #define MAXTHRDENS  ( GBLOCKSZX * GBLOCKSZY * GBLOCKSZZ )
00194 #if __CUDA_ARCH__ >= 600
00195 #define MINBLOCKDENS 16
00196 #elif __CUDA_ARCH__ >= 300
00197 #define MINBLOCKDENS 16
00198 #elif __CUDA_ARCH__ >= 200
00199 #define MINBLOCKDENS 1
00200 #else
00201 #define MINBLOCKDENS 1
00202 #endif
00203 
00204 
00205 //
00206 // Templated version of the density map kernel to handle multiple 
00207 // data formats for the output density volume and volumetric texture.
00208 // This variant of the density map algorithm normalizes densities so
00209 // that the target isovalue is a density of 1.0.
00210 //
00211 template<class DENSITY, class VOLTEX>
00212 __global__ static void 
00213 __launch_bounds__ ( MAXTHRDENS, MINBLOCKDENS )
00214 gaussdensity_fast_tex_norm(int natoms,
00215                       const float4 * RESTRICT sorted_xyzr, 
00216                       const float4 * RESTRICT sorted_color, 
00217                       int3 volsz,
00218                       int3 acncells,
00219                       float acgridspacing,
00220                       float invacgridspacing,
00221                       const uint2 * RESTRICT cellStartEnd,
00222                       float gridspacing, unsigned int z, 
00223                       DENSITY * RESTRICT densitygrid,
00224                       VOLTEX * RESTRICT voltexmap,
00225                       float invisovalue) {
00226   unsigned int xindex  = (blockIdx.x * blockDim.x) + threadIdx.x;
00227   unsigned int yindex  = (blockIdx.y * blockDim.y) + threadIdx.y;
00228   unsigned int zindex  = ((blockIdx.z * blockDim.z) + threadIdx.z) * GTEXUNROLL;
00229 
00230   // shave register use slightly
00231   unsigned int outaddr = zindex * volsz.x * volsz.y + 
00232                          yindex * volsz.x + xindex;
00233 
00234   // early exit if this thread is outside of the grid bounds
00235   if (xindex >= volsz.x || yindex >= volsz.y || zindex >= volsz.z)
00236     return;
00237 
00238   zindex += z;
00239 
00240   // compute ac grid index of lower corner minus gaussian radius
00241   int xabmin = ((blockIdx.x * blockDim.x) * gridspacing - acgridspacing) * invacgridspacing;
00242   int yabmin = ((blockIdx.y * blockDim.y) * gridspacing - acgridspacing) * invacgridspacing;
00243   int zabmin = ((z + blockIdx.z * blockDim.z * GTEXUNROLL) * gridspacing - acgridspacing) * invacgridspacing;
00244 
00245   // compute ac grid index of upper corner plus gaussian radius
00246   int xabmax = (((blockIdx.x+1) * blockDim.x) * gridspacing + acgridspacing) * invacgridspacing;
00247   int yabmax = (((blockIdx.y+1) * blockDim.y) * gridspacing + acgridspacing) * invacgridspacing;
00248   int zabmax = ((z + (blockIdx.z+1) * blockDim.z * GTEXUNROLL) * gridspacing + acgridspacing) * invacgridspacing;
00249 
00250   xabmin = (xabmin < 0) ? 0 : xabmin;
00251   yabmin = (yabmin < 0) ? 0 : yabmin;
00252   zabmin = (zabmin < 0) ? 0 : zabmin;
00253   xabmax = (xabmax >= acncells.x-1) ? acncells.x-1 : xabmax;
00254   yabmax = (yabmax >= acncells.y-1) ? acncells.y-1 : yabmax;
00255   zabmax = (zabmax >= acncells.z-1) ? acncells.z-1 : zabmax;
00256 
00257   float coorx = gridspacing * xindex;
00258   float coory = gridspacing * yindex;
00259   float coorz = gridspacing * zindex;
00260 
00261   float densityval1=0.0f;
00262   float3 densitycol1=make_float3(0.0f, 0.0f, 0.0f);
00263 #if GTEXUNROLL >= 2
00264   float densityval2=0.0f;
00265   float3 densitycol2=densitycol1;
00266 #endif
00267 #if GTEXUNROLL >= 4
00268   float densityval3=0.0f;
00269   float3 densitycol3=densitycol1;
00270   float densityval4=0.0f;
00271   float3 densitycol4=densitycol1;
00272 #endif
00273 
00274   int acplanesz = acncells.x * acncells.y;
00275   int xab, yab, zab;
00276   for (zab=zabmin; zab<=zabmax; zab++) {
00277     for (yab=yabmin; yab<=yabmax; yab++) {
00278       for (xab=xabmin; xab<=xabmax; xab++) {
00279         int abcellidx = zab * acplanesz + yab * acncells.x + xab;
00280         // this biggest latency hotspot in the kernel, if we could improve
00281         // packing of the grid cell map, we'd likely improve performance 
00282         uint2 atomstartend = cellStartEnd[abcellidx];
00283         if (atomstartend.x != GRID_CELL_EMPTY) {
00284           unsigned int atomid;
00285           for (atomid=atomstartend.x; atomid<atomstartend.y; atomid++) {
00286             float4 atom  = sorted_xyzr[atomid];
00287             float4 color = sorted_color[atomid];
00288             float dx = coorx - atom.x;
00289             float dy = coory - atom.y;
00290             float dxy2 = dx*dx + dy*dy;
00291 
00292             float dz = coorz - atom.z;
00293             float r21 = (dxy2 + dz*dz) * atom.w;
00294             float tmp1 = exp2f(r21);
00295             densityval1 += tmp1;
00296             densitycol1.x += tmp1 * color.x;
00297             densitycol1.y += tmp1 * color.y;
00298             densitycol1.z += tmp1 * color.z;
00299 
00300 #if GTEXUNROLL >= 2
00301             float dz2 = dz + gridspacing;
00302             float r22 = (dxy2 + dz2*dz2) * atom.w;
00303             float tmp2 = exp2f(r22);
00304             densityval2 += tmp2;
00305             densitycol2.x += tmp2 * color.x;
00306             densitycol2.y += tmp2 * color.y;
00307             densitycol2.z += tmp2 * color.z;
00308 #endif
00309 #if GTEXUNROLL >= 4
00310             float dz3 = dz2 + gridspacing;
00311             float r23 = (dxy2 + dz3*dz3) * atom.w;
00312             float tmp3 = exp2f(r23);
00313             densityval3 += tmp3;
00314             densitycol3.x += tmp3 * color.x;
00315             densitycol3.y += tmp3 * color.y;
00316             densitycol3.z += tmp3 * color.z;
00317 
00318             float dz4 = dz3 + gridspacing;
00319             float r24 = (dxy2 + dz4*dz4) * atom.w;
00320             float tmp4 = exp2f(r24);
00321             densityval4 += tmp4;
00322             densitycol4.x += tmp4 * color.x;
00323             densitycol4.y += tmp4 * color.y;
00324             densitycol4.z += tmp4 * color.z;
00325 #endif
00326           }
00327         }
00328       }
00329     }
00330   }
00331 
00332   DENSITY densityout;
00333   VOLTEX texout;
00334   convert_density(densityout, densityval1 * invisovalue);
00335   densitygrid[outaddr          ] = densityout;
00336   convert_color(texout, densitycol1, invisovalue);
00337   voltexmap[outaddr          ] = texout;
00338 
00339 #if GTEXUNROLL >= 2
00340   int planesz = volsz.x * volsz.y;
00341   convert_density(densityout, densityval2 * invisovalue);
00342   densitygrid[outaddr + planesz] = densityout;
00343   convert_color(texout, densitycol2, invisovalue);
00344   voltexmap[outaddr + planesz] = texout;
00345 #endif
00346 #if GTEXUNROLL >= 4
00347   convert_density(densityout, densityval3 * invisovalue);
00348   densitygrid[outaddr + 2*planesz] = densityout;
00349   convert_color(texout, densitycol3, invisovalue);
00350   voltexmap[outaddr + 2*planesz] = texout;
00351 
00352   convert_density(densityout, densityval4 * invisovalue);
00353   densitygrid[outaddr + 3*planesz] = densityout;
00354   convert_color(texout, densitycol4, invisovalue);
00355   voltexmap[outaddr + 3*planesz] = texout;
00356 #endif
00357 }
00358 
00359 
00360 __global__ static void 
00361 __launch_bounds__ ( MAXTHRDENS, MINBLOCKDENS )
00362 gaussdensity_fast_tex3f(int natoms,
00363                         const float4 * RESTRICT sorted_xyzr, 
00364                         const float4 * RESTRICT sorted_color, 
00365                         int3 volsz,
00366                         int3 acncells,
00367                         float acgridspacing,
00368                         float invacgridspacing,
00369                         const uint2 * RESTRICT cellStartEnd,
00370                         float gridspacing, unsigned int z, 
00371                         float * RESTRICT densitygrid,
00372                         float3 * RESTRICT voltexmap,
00373                         float invisovalue) {
00374   unsigned int xindex  = (blockIdx.x * blockDim.x) + threadIdx.x;
00375   unsigned int yindex  = (blockIdx.y * blockDim.y) + threadIdx.y;
00376   unsigned int zindex  = ((blockIdx.z * blockDim.z) + threadIdx.z) * GTEXUNROLL;
00377 
00378   // shave register use slightly
00379   unsigned int outaddr = zindex * volsz.x * volsz.y + 
00380                          yindex * volsz.x + xindex;
00381 
00382   // early exit if this thread is outside of the grid bounds
00383   if (xindex >= volsz.x || yindex >= volsz.y || zindex >= volsz.z)
00384     return;
00385 
00386   zindex += z;
00387 
00388   // compute ac grid index of lower corner minus gaussian radius
00389   int xabmin = ((blockIdx.x * blockDim.x) * gridspacing - acgridspacing) * invacgridspacing;
00390   int yabmin = ((blockIdx.y * blockDim.y) * gridspacing - acgridspacing) * invacgridspacing;
00391   int zabmin = ((z + blockIdx.z * blockDim.z * GTEXUNROLL) * gridspacing - acgridspacing) * invacgridspacing;
00392 
00393   // compute ac grid index of upper corner plus gaussian radius
00394   int xabmax = (((blockIdx.x+1) * blockDim.x) * gridspacing + acgridspacing) * invacgridspacing;
00395   int yabmax = (((blockIdx.y+1) * blockDim.y) * gridspacing + acgridspacing) * invacgridspacing;
00396   int zabmax = ((z + (blockIdx.z+1) * blockDim.z * GTEXUNROLL) * gridspacing + acgridspacing) * invacgridspacing;
00397 
00398   xabmin = (xabmin < 0) ? 0 : xabmin;
00399   yabmin = (yabmin < 0) ? 0 : yabmin;
00400   zabmin = (zabmin < 0) ? 0 : zabmin;
00401   xabmax = (xabmax >= acncells.x-1) ? acncells.x-1 : xabmax;
00402   yabmax = (yabmax >= acncells.y-1) ? acncells.y-1 : yabmax;
00403   zabmax = (zabmax >= acncells.z-1) ? acncells.z-1 : zabmax;
00404 
00405   float coorx = gridspacing * xindex;
00406   float coory = gridspacing * yindex;
00407   float coorz = gridspacing * zindex;
00408 
00409   float densityval1=0.0f;
00410   float3 densitycol1=make_float3(0.0f, 0.0f, 0.0f);
00411 #if GTEXUNROLL >= 2
00412   float densityval2=0.0f;
00413   float3 densitycol2=densitycol1;
00414 #endif
00415 #if GTEXUNROLL >= 4
00416   float densityval3=0.0f;
00417   float3 densitycol3=densitycol1;
00418   float densityval4=0.0f;
00419   float3 densitycol4=densitycol1;
00420 #endif
00421 
00422   int acplanesz = acncells.x * acncells.y;
00423   int xab, yab, zab;
00424   for (zab=zabmin; zab<=zabmax; zab++) {
00425     for (yab=yabmin; yab<=yabmax; yab++) {
00426       for (xab=xabmin; xab<=xabmax; xab++) {
00427         int abcellidx = zab * acplanesz + yab * acncells.x + xab;
00428         // this biggest latency hotspot in the kernel, if we could improve
00429         // packing of the grid cell map, we'd likely improve performance 
00430         uint2 atomstartend = cellStartEnd[abcellidx];
00431         if (atomstartend.x != GRID_CELL_EMPTY) {
00432           unsigned int atomid;
00433           for (atomid=atomstartend.x; atomid<atomstartend.y; atomid++) {
00434             float4 atom  = sorted_xyzr[atomid];
00435             float3 color = make_float3(sorted_color[atomid]);
00436             float dx = coorx - atom.x;
00437             float dy = coory - atom.y;
00438             float dxy2 = dx*dx + dy*dy;
00439 
00440             float dz = coorz - atom.z;
00441             float r21 = (dxy2 + dz*dz) * atom.w;
00442             float tmp1 = exp2f(r21);
00443             densityval1 += tmp1;
00444             densitycol1 += color * tmp1;
00445 
00446 #if GTEXUNROLL >= 2
00447             float dz2 = dz + gridspacing;
00448             float r22 = (dxy2 + dz2*dz2) * atom.w;
00449             float tmp2 = exp2f(r22);
00450             densityval2 += tmp2;
00451             densitycol2 += color * tmp2;
00452 #endif
00453 #if GTEXUNROLL >= 4
00454             float dz3 = dz2 + gridspacing;
00455             float r23 = (dxy2 + dz3*dz3) * atom.w;
00456             float tmp3 = exp2f(r23);
00457             densityval3 += tmp3;
00458             densitycol3 += color * tmp3;
00459 
00460             float dz4 = dz3 + gridspacing;
00461             float r24 = (dxy2 + dz4*dz4) * atom.w;
00462             float tmp4 = exp2f(r24);
00463             densityval4 += tmp4;
00464             densitycol4 += color * tmp4;
00465 #endif
00466           }
00467         }
00468       }
00469     }
00470   }
00471 
00472   float3 texout;
00473   densitygrid[outaddr            ] = densityval1;
00474   convert_color(texout, densitycol1, invisovalue);
00475   voltexmap[outaddr            ] = texout;
00476 
00477 #if GTEXUNROLL >= 2
00478   int planesz = volsz.x * volsz.y;
00479   densitygrid[outaddr +   planesz] = densityval2;
00480   convert_color(texout, densitycol2, invisovalue);
00481   voltexmap[outaddr +   planesz] = texout;
00482 #endif
00483 #if GTEXUNROLL >= 4
00484   densitygrid[outaddr + 2*planesz] = densityval3;
00485   convert_color(texout, densitycol3, invisovalue);
00486   voltexmap[outaddr + 2*planesz] = texout;
00487 
00488   densitygrid[outaddr + 3*planesz] = densityval4;
00489   convert_color(texout, densitycol4, invisovalue);
00490   voltexmap[outaddr + 3*planesz] = texout;
00491 #endif
00492 }
00493 
00494 
00495 __global__ static void 
00496 // __launch_bounds__ ( MAXTHRDENS, MINBLOCKDENS )
00497 gaussdensity_fast(int natoms,
00498                   const float4 * RESTRICT sorted_xyzr, 
00499                   int3 volsz,
00500                   int3 acncells,
00501                   float acgridspacing,
00502                   float invacgridspacing,
00503                   const uint2 * RESTRICT cellStartEnd,
00504                   float gridspacing, unsigned int z, 
00505                   float * RESTRICT densitygrid) {
00506   unsigned int xindex  = (blockIdx.x * blockDim.x) + threadIdx.x;
00507   unsigned int yindex  = (blockIdx.y * blockDim.y) + threadIdx.y;
00508   unsigned int zindex  = ((blockIdx.z * blockDim.z) + threadIdx.z) * GUNROLL;
00509   unsigned int outaddr = zindex * volsz.x * volsz.y + 
00510                          yindex * volsz.x + 
00511                          xindex;
00512 
00513   // early exit if this thread is outside of the grid bounds
00514   if (xindex >= volsz.x || yindex >= volsz.y || zindex >= volsz.z)
00515     return;
00516 
00517   zindex += z;
00518 
00519   // compute ac grid index of lower corner minus gaussian radius
00520   int xabmin = ((blockIdx.x * blockDim.x) * gridspacing - acgridspacing) * invacgridspacing;
00521   int yabmin = ((blockIdx.y * blockDim.y) * gridspacing - acgridspacing) * invacgridspacing;
00522   int zabmin = ((z + blockIdx.z * blockDim.z * GUNROLL) * gridspacing - acgridspacing) * invacgridspacing;
00523 
00524   // compute ac grid index of upper corner plus gaussian radius
00525   int xabmax = (((blockIdx.x+1) * blockDim.x) * gridspacing + acgridspacing) * invacgridspacing;
00526   int yabmax = (((blockIdx.y+1) * blockDim.y) * gridspacing + acgridspacing) * invacgridspacing;
00527   int zabmax = ((z + (blockIdx.z+1) * blockDim.z * GUNROLL) * gridspacing + acgridspacing) * invacgridspacing;
00528 
00529   xabmin = (xabmin < 0) ? 0 : xabmin;
00530   yabmin = (yabmin < 0) ? 0 : yabmin;
00531   zabmin = (zabmin < 0) ? 0 : zabmin;
00532   xabmax = (xabmax >= acncells.x-1) ? acncells.x-1 : xabmax;
00533   yabmax = (yabmax >= acncells.y-1) ? acncells.y-1 : yabmax;
00534   zabmax = (zabmax >= acncells.z-1) ? acncells.z-1 : zabmax;
00535 
00536   float coorx = gridspacing * xindex;
00537   float coory = gridspacing * yindex;
00538   float coorz = gridspacing * zindex;
00539 
00540   float densityval1=0.0f;
00541 #if GUNROLL >= 2
00542   float densityval2=0.0f;
00543 #endif
00544 #if GUNROLL >= 4
00545   float densityval3=0.0f;
00546   float densityval4=0.0f;
00547 #endif
00548 
00549   int acplanesz = acncells.x * acncells.y;
00550   int xab, yab, zab;
00551   for (zab=zabmin; zab<=zabmax; zab++) {
00552     for (yab=yabmin; yab<=yabmax; yab++) {
00553       for (xab=xabmin; xab<=xabmax; xab++) {
00554         int abcellidx = zab * acplanesz + yab * acncells.x + xab;
00555         uint2 atomstartend = cellStartEnd[abcellidx];
00556         if (atomstartend.x != GRID_CELL_EMPTY) {
00557           unsigned int atomid;
00558           for (atomid=atomstartend.x; atomid<atomstartend.y; atomid++) {
00559             float4 atom = sorted_xyzr[atomid];
00560             float dx = coorx - atom.x;
00561             float dy = coory - atom.y;
00562             float dxy2 = dx*dx + dy*dy;
00563   
00564             float dz = coorz - atom.z;
00565             float r21 = (dxy2 + dz*dz) * atom.w;
00566             densityval1 += exp2f(r21);
00567 
00568 #if GUNROLL >= 2
00569             float dz2 = dz + gridspacing;
00570             float r22 = (dxy2 + dz2*dz2) * atom.w;
00571             densityval2 += exp2f(r22);
00572 #endif
00573 #if GUNROLL >= 4
00574             float dz3 = dz2 + gridspacing;
00575             float r23 = (dxy2 + dz3*dz3) * atom.w;
00576             densityval3 += exp2f(r23);
00577 
00578             float dz4 = dz3 + gridspacing;
00579             float r24 = (dxy2 + dz4*dz4) * atom.w;
00580             densityval4 += exp2f(r24);
00581 #endif
00582           }
00583         }
00584       }
00585     }
00586   }
00587 
00588   densitygrid[outaddr            ] = densityval1;
00589 #if GUNROLL >= 2
00590   int planesz = volsz.x * volsz.y;
00591   densitygrid[outaddr +   planesz] = densityval2;
00592 #endif
00593 #if GUNROLL >= 4
00594   densitygrid[outaddr + 2*planesz] = densityval3;
00595   densitygrid[outaddr + 3*planesz] = densityval4;
00596 #endif
00597 }
00598 
00599 
00600 // per-GPU handle with various memory buffer pointers, etc.
00601 typedef struct {
00602   cudaDeviceProp deviceProp; 
00603   int dev;                   
00604   int verbose;               
00605 
00607   long int natoms;           
00608   int colorperatom;          
00609   CUDAQuickSurf::VolTexFormat vtexformat;   
00610   int acx;                   
00611   int acy;                   
00612   int acz;                   
00613   int gx;                    
00614   int gy;                    
00615   int gz;                    
00616 
00617   CUDAMarchingCubes *mc;     
00618 
00619   float *devdensity;         
00620   void *devvoltexmap;        
00621   float4 *xyzr_d;            
00622   float4 *sorted_xyzr_d;     
00623   float4 *color_d;           
00624   float4 *sorted_color_d;    
00625 
00626   unsigned int *atomIndex_d; 
00627   unsigned int *atomHash_d;  
00628   uint2 *cellStartEnd_d;     
00629 
00630   void *safety;              
00631 
00632   float3 *v3f_d;             
00633   float3 *n3f_d;             
00634   float3 *c3f_d;             
00635   char3 *n3b_d;              
00636   uchar4 *c4u_d;             
00637 } qsurf_gpuhandle;
00638 
00639 
00640 CUDAQuickSurf::CUDAQuickSurf() {
00641   voidgpu = calloc(1, sizeof(qsurf_gpuhandle));
00642   qsurf_gpuhandle *gpuh = (qsurf_gpuhandle *) voidgpu;
00643 
00644   if (getenv("VMDQUICKSURFVERBOSE") != NULL) {
00645     gpuh->verbose = 1;
00646     int tmp = atoi(getenv("VMDQUICKSURFVERBOSE"));
00647     if (tmp > 0)
00648       gpuh->verbose = tmp;
00649   }
00650 
00651   gpuh->vtexformat = RGB3F; // large size allocs by default
00652 
00653   if (cudaGetDevice(&gpuh->dev) != cudaSuccess) {
00654     gpuh->dev = -1; // flag GPU as unusable
00655   }
00656 
00657   if (cudaGetDeviceProperties(&gpuh->deviceProp, gpuh->dev) != cudaSuccess) {
00658     cudaError_t err = cudaGetLastError(); // eat error so next CUDA op succeeds
00659     gpuh->dev = -1; // flag GPU as unusable
00660   }
00661 }
00662 
00663 
00664 CUDAQuickSurf::~CUDAQuickSurf() {
00665   qsurf_gpuhandle *gpuh = (qsurf_gpuhandle *) voidgpu;
00666 
00667   // free all working buffers if not done already
00668   free_bufs();
00669 
00670   // delete marching cubes object
00671   delete gpuh->mc;
00672 
00673   free(voidgpu);
00674 }
00675 
00676 
00677 int CUDAQuickSurf::free_bufs() {
00678   qsurf_gpuhandle *gpuh = (qsurf_gpuhandle *) voidgpu;
00679 
00680   // zero out max buffer capacities
00681   gpuh->natoms = 0;
00682   gpuh->colorperatom = 0;
00683   gpuh->acx = 0;
00684   gpuh->acy = 0;
00685   gpuh->acz = 0;
00686   gpuh->gx = 0;
00687   gpuh->gy = 0;
00688   gpuh->gz = 0;
00689 
00690   if (gpuh->safety != NULL)
00691     cudaFree(gpuh->safety);
00692   gpuh->safety=NULL;
00693 
00694   if (gpuh->devdensity != NULL)
00695     cudaFree(gpuh->devdensity);
00696   gpuh->devdensity=NULL;
00697 
00698   if (gpuh->devvoltexmap != NULL)
00699     cudaFree(gpuh->devvoltexmap);
00700   gpuh->devvoltexmap=NULL;
00701 
00702   if (gpuh->xyzr_d != NULL)
00703     cudaFree(gpuh->xyzr_d);
00704   gpuh->xyzr_d=NULL;
00705 
00706   if (gpuh->sorted_xyzr_d != NULL)
00707     cudaFree(gpuh->sorted_xyzr_d);  
00708   gpuh->sorted_xyzr_d=NULL;
00709 
00710   if (gpuh->color_d != NULL)
00711     cudaFree(gpuh->color_d);
00712   gpuh->color_d=NULL;
00713 
00714   if (gpuh->sorted_color_d != NULL)
00715     cudaFree(gpuh->sorted_color_d);
00716   gpuh->sorted_color_d=NULL;
00717 
00718   if (gpuh->atomIndex_d != NULL)
00719     cudaFree(gpuh->atomIndex_d);
00720   gpuh->atomIndex_d=NULL;
00721 
00722   if (gpuh->atomHash_d != NULL)
00723     cudaFree(gpuh->atomHash_d);
00724   gpuh->atomHash_d=NULL;
00725 
00726   if (gpuh->cellStartEnd_d != NULL)
00727     cudaFree(gpuh->cellStartEnd_d);
00728   gpuh->cellStartEnd_d=NULL;
00729 
00730   if (gpuh->v3f_d != NULL)
00731     cudaFree(gpuh->v3f_d);
00732   gpuh->v3f_d=NULL;
00733 
00734   if (gpuh->n3f_d != NULL)
00735     cudaFree(gpuh->n3f_d);
00736   gpuh->n3f_d=NULL;
00737 
00738   if (gpuh->c3f_d != NULL)
00739     cudaFree(gpuh->c3f_d);
00740   gpuh->c3f_d=NULL;
00741 
00742   if (gpuh->n3b_d != NULL)
00743     cudaFree(gpuh->n3b_d);
00744   gpuh->n3b_d=NULL;
00745 
00746   if (gpuh->c4u_d != NULL)
00747     cudaFree(gpuh->c4u_d);
00748   gpuh->c4u_d=NULL;
00749 
00750 
00751   return 0;
00752 }
00753 
00754 
00755 int CUDAQuickSurf::check_bufs(long int natoms, int colorperatom, 
00756                               VolTexFormat vtexformat, 
00757                               int acx, int acy, int acz,
00758                               int gx, int gy, int gz) {
00759   qsurf_gpuhandle *gpuh = (qsurf_gpuhandle *) voidgpu;
00760 
00761   // If the current atom count, texturing mode, and total voxel count
00762   // use the same or less storage than the size of the existing buffers,
00763   // we can reuse the same buffers without having to go through the 
00764   // complex allocation and validation loops.  This is a big performance
00765   // benefit during trajectory animation.
00766   if (natoms <= gpuh->natoms &&
00767       colorperatom <= gpuh->colorperatom &&
00768       vtexformat == gpuh->vtexformat &&
00769       (acx*acy*acz) <= (gpuh->acx * gpuh->acy * gpuh->acz) && 
00770       (gx*gy*gz) <= (gpuh->gx * gpuh->gy * gpuh->gz))
00771     return 0;
00772  
00773   return -1; // no existing bufs, or too small to be used 
00774 }
00775 
00776 
00777 int CUDAQuickSurf::alloc_bufs(long int natoms, int colorperatom, 
00778                               VolTexFormat vtexformat, 
00779                               int acx, int acy, int acz,
00780                               int gx, int gy, int gz) {
00781   qsurf_gpuhandle *gpuh = (qsurf_gpuhandle *) voidgpu;
00782 
00783   // early exit from allocation call if we've already got existing
00784   // buffers that are large enough to support the request
00785   if (check_bufs(natoms, colorperatom, vtexformat, acx, acy, acz, gx, gy, gz) == 0)
00786     return 0;
00787 
00788   // If we have any existing allocations, trash them as they weren't
00789   // usable for this new request and we need to reallocate them from scratch
00790   free_bufs();
00791 
00792   long int acncells = ((long) acx) * ((long) acy) * ((long) acz);
00793   long int ncells = ((long) gx) * ((long) gy) * ((long) gz);
00794   long int volmemsz = ncells * sizeof(float);
00795   long int chunkmaxverts = 3L * ncells; // assume worst case 50% triangle occupancy
00796   long int MCsz = CUDAMarchingCubes::MemUsageMC(gx, gy, gz);
00797 
00798   // Allocate all of the memory buffers our algorithms will need up-front,
00799   // so we can retry and gracefully reduce the sizes of various buffers
00800   // to attempt to fit within available GPU memory 
00801   long int totalmemsz = 
00802     volmemsz +                                       // volume
00803     (2L * natoms * sizeof(unsigned int)) +           // bin sort
00804     (acncells * sizeof(uint2)) +                     // bin sort
00805     (3L * chunkmaxverts * sizeof(float3)) +          // MC vertex bufs 
00806     natoms*sizeof(float4) +                          // thrust
00807     8L * gx * gy * sizeof(float) +                   // thrust
00808     MCsz;                                            // mcubes
00809 
00810   cudaMalloc((void**)&gpuh->devdensity, volmemsz);
00811   if (colorperatom) {
00812     int voltexsz = 0;
00813     switch (vtexformat) {
00814       case RGB3F:
00815         voltexsz = ncells * sizeof(float3);
00816         break;
00817 
00818       case RGB4U:
00819         voltexsz = ncells * sizeof(uchar4);
00820         break;
00821     }
00822     gpuh->vtexformat = vtexformat;
00823 
00824     cudaMalloc((void**)&gpuh->devvoltexmap, voltexsz);
00825     cudaMalloc((void**)&gpuh->color_d, natoms * sizeof(float4));
00826     cudaMalloc((void**)&gpuh->sorted_color_d, natoms * sizeof(float4));
00827     totalmemsz += 2 * natoms * sizeof(float4);
00828   }
00829   cudaMalloc((void**)&gpuh->xyzr_d, natoms * sizeof(float4));
00830   cudaMalloc((void**)&gpuh->sorted_xyzr_d, natoms * sizeof(float4));
00831   cudaMalloc((void**)&gpuh->atomIndex_d, natoms * sizeof(unsigned int));
00832   cudaMalloc((void**)&gpuh->atomHash_d, natoms * sizeof(unsigned int));
00833   cudaMalloc((void**)&gpuh->cellStartEnd_d, acncells * sizeof(uint2));
00834 
00835   // allocate marching cubes output buffers
00836   cudaMalloc((void**)&gpuh->v3f_d, 3 * chunkmaxverts * sizeof(float3));
00837 #if 1
00838   // Note: The CUDA char3 type is explicitly a signed type
00839   cudaMalloc((void**)&gpuh->n3b_d, 3 * chunkmaxverts * sizeof(char3));
00840   totalmemsz += 3 * chunkmaxverts * sizeof(char3);   // MC normal bufs 
00841 #else
00842   cudaMalloc((void**)&gpuh->n3f_d, 3 * chunkmaxverts * sizeof(float3));
00843   totalmemsz += 3 * chunkmaxverts * sizeof(float3);  // MC normal bufs 
00844 #endif
00845 #if 1
00846   cudaMalloc((void**)&gpuh->c4u_d, 3 * chunkmaxverts * sizeof(uchar4));
00847   totalmemsz += 3 * chunkmaxverts * sizeof(uchar4);  // MC vertex color bufs 
00848 #else
00849   cudaMalloc((void**)&gpuh->c3f_d, 3 * chunkmaxverts * sizeof(float3));
00850   totalmemsz += 3 * chunkmaxverts * sizeof(float3);  // MC vertex color bufs 
00851 #endif
00852 
00853   // Allocate an extra phantom array to act as a safety net to
00854   // ensure that subsequent allocations performed internally by 
00855   // the NVIDIA thrust template library or by our 
00856   // marching cubes implementation don't fail, since we can't 
00857   // currently pre-allocate all of them.
00858   cudaMalloc(&gpuh->safety, 
00859              natoms*sizeof(float4) +                          // thrust
00860              8 * gx * gy * sizeof(float) +                    // thrust
00861              MCsz);                                           // mcubes
00862 
00863   if (gpuh->verbose > 1)
00864     printf("Total QuickSurf mem size: %ld MB\n", totalmemsz / (1024*1024));
00865 
00866   cudaError_t err = cudaGetLastError();
00867   if (err != cudaSuccess)
00868     return -1;
00869 
00870   // once the allocation has succeeded, we update the GPU handle info 
00871   // so that the next test/allocation pass knows the latest state.
00872   gpuh->natoms = natoms;
00873   gpuh->colorperatom = colorperatom;
00874 
00875   gpuh->acx = acx;
00876   gpuh->acy = acy;
00877   gpuh->acz = acz;
00878 
00879   gpuh->gx = gx;
00880   gpuh->gy = gy;
00881   gpuh->gz = gz;
00882 
00883   return 0;
00884 }
00885 
00886 
00887 int CUDAQuickSurf::get_chunk_bufs(int testexisting,
00888                                   long int natoms, int colorperatom, 
00889                                   VolTexFormat vtexformat,
00890                                   int acx, int acy, int acz,
00891                                   int gx, int gy, int gz,
00892                                   int &cx, int &cy, int &cz,
00893                                   int &sx, int &sy, int &sz) {
00894   dim3 Bsz(GBLOCKSZX, GBLOCKSZY, GBLOCKSZZ);
00895   if (colorperatom)
00896     Bsz.z = GTEXBLOCKSZZ;
00897 
00898   cudaError_t err = cudaGetLastError(); // eat error so next CUDA op succeeds
00899 
00900   // enter loop to attempt a single-pass computation, but if the
00901   // allocation fails, cut the chunk size Z dimension by half
00902   // repeatedly until we either run chunks of 8 planes at a time,
00903   // otherwise we assume it is hopeless.
00904   cz <<= 1; // premultiply by two to simplify loop body
00905   int chunkiters = 0;
00906   int chunkallocated = 0;
00907   while (!chunkallocated) {
00908     // Cut the Z chunk size in half
00909     chunkiters++;
00910     cz >>= 1;
00911 
00912     // if we've already dropped to a subvolume size, subtract off the
00913     // four extra Z planes from last time before we do the modulo padding
00914     // calculation so we don't hit an infinite loop trying to go below 
00915     // 16 planes due the padding math below.
00916     if (cz != gz)
00917       cz-=4;
00918 
00919     // Pad the chunk to a multiple of the computational tile size since
00920     // each thread computes multiple elements (unrolled in the Z direction)
00921     cz += (8 - (cz % 8));
00922 
00923     // The density map "slab" size is the chunk size but without the extra
00924     // plane used to copy the last plane of the previous chunk's density
00925     // into the start, for use by the marching cubes.
00926     sx = cx;
00927     sy = cy;
00928     sz = cz;
00929 
00930     // Add four extra Z-planes for copying the previous end planes into 
00931     // the start of the next chunk.
00932     cz+=4;
00933 
00934 #if 0
00935     printf("  Trying slab size: %d (test: %d)\n", sz, testexisting);
00936 #endif
00937 
00938 #if 1
00939     // test to see if total number of thread blocks exceeds maximum
00940     // number we can reasonably run prior to a kernel timeout error
00941     dim3 tGsz((sx+Bsz.x-1) / Bsz.x, 
00942               (sy+Bsz.y-1) / Bsz.y,
00943               (sz+(Bsz.z*GUNROLL)-1) / (Bsz.z * GUNROLL));
00944     if (colorperatom) {
00945       tGsz.z = (sz+(Bsz.z*GTEXUNROLL)-1) / (Bsz.z * GTEXUNROLL);
00946     }
00947     if (tGsz.x * tGsz.y * tGsz.z > 65535)
00948       continue; 
00949 #endif
00950 
00951     // Bail out if we can't get enough memory to run at least
00952     // 8 slices in a single pass (making sure we've freed any allocations
00953     // beforehand, so they aren't leaked).
00954     if (sz <= 8) {
00955       return -1;
00956     }
00957  
00958     if (testexisting) {
00959       if (check_bufs(natoms, colorperatom, vtexformat, acx, acy, acz, cx, cy, cz) != 0)
00960         continue;
00961     } else {
00962       if (alloc_bufs(natoms, colorperatom, vtexformat, acx, acy, acz, cx, cy, cz) != 0)
00963         continue;
00964     }
00965 
00966     chunkallocated=1;
00967   }
00968 
00969   return 0;
00970 }
00971 
00972 
00973 int CUDAQuickSurf::calc_surf(long int natoms, const float *xyzr_f, 
00974                              const float *colors_f,
00975                              int colorperatom, VolTexFormat voltexformat,
00976                              float *origin, int *numvoxels, float maxrad,
00977                              float radscale, float gridspacing, 
00978                              float isovalue, float gausslim,
00979                              VMDDisplayList *cmdList) {
00980   qsurf_gpuhandle *gpuh = (qsurf_gpuhandle *) voidgpu;
00981 
00982   // if there were any problems when the constructor tried to get
00983   // GPU hardware capabilities, we consider it to be unusable for all time.
00984   if (gpuh->dev < 0)
00985     return -1;
00986 
00987   // This code currently requires compute capability 2.x or greater.
00988   // We absolutely depend on hardware broadcasts for 
00989   // global memory reads by multiple threads reading the same element,
00990   // and the code more generally assumes the Fermi L1 cache and prefers
00991   // to launch 3-D grids where possible. 
00992   if (gpuh->deviceProp.major < 2) {
00993     return -1;
00994   }
00995 
00996   // start timing...
00997   wkf_timerhandle globaltimer = wkf_timer_create();
00998   wkf_timer_start(globaltimer);
00999 
01000   int vtexsize = 0;
01001 //  const VolTexFormat voltexformat = RGB4U; // XXX caller may want to set this
01002 //  const VolTexFormat voltexformat = RGB3F; // XXX caller may want to set this
01003   switch (voltexformat) {
01004     case RGB3F: 
01005       vtexsize = sizeof(float3);
01006       break;
01007 
01008     case RGB4U: 
01009       vtexsize = sizeof(uchar4);
01010       break;
01011   }
01012 
01013   float4 *colors = (float4 *) colors_f;
01014   int chunkmaxverts=0;
01015   int chunknumverts=0; 
01016   int numverts=0;
01017   int numfacets=0;
01018 
01019   // compute grid spacing for the acceleration grid
01020   float acgridspacing = gausslim * radscale * maxrad;
01021 
01022   // ensure acceleration grid spacing >= density grid spacing
01023   if (acgridspacing < gridspacing)
01024     acgridspacing = gridspacing;
01025 
01026   // Allocate output arrays for the gaussian density map and 3-D texture map
01027   // We test for errors carefully here since this is the most likely place
01028   // for a memory allocation failure due to the size of the grid.
01029   int3 volsz = make_int3(numvoxels[0], numvoxels[1], numvoxels[2]);
01030   int3 chunksz = volsz;
01031   int3 slabsz = volsz;
01032 
01033   // An alternative scheme to minimize the QuickSurf GPU memory footprint
01034   if (getenv("VMDQUICKSURFMINMEM")) {
01035     if (volsz.z > 32) {
01036       slabsz.z = chunksz.z = 16;
01037     }
01038   }
01039 
01040   int3 accelcells;
01041   accelcells.x = max(int((volsz.x*gridspacing) / acgridspacing), 1);
01042   accelcells.y = max(int((volsz.y*gridspacing) / acgridspacing), 1);
01043   accelcells.z = max(int((volsz.z*gridspacing) / acgridspacing), 1);
01044 
01045   dim3 Bsz(GBLOCKSZX, GBLOCKSZY, GBLOCKSZZ);
01046   if (colorperatom)
01047     Bsz.z = GTEXBLOCKSZZ;
01048 
01049   // check to see if it's possible to use an existing allocation,
01050   // if so, just leave things as they are, and do the computation 
01051   // using the existing buffers
01052   if (gpuh->natoms == 0 ||
01053       get_chunk_bufs(1, natoms, colorperatom, voltexformat,
01054                      accelcells.x, accelcells.y, accelcells.z,
01055                      volsz.x, volsz.y, volsz.z,
01056                      chunksz.x, chunksz.y, chunksz.z,
01057                      slabsz.x, slabsz.y, slabsz.z) == -1) {
01058     // reset the chunksz and slabsz after failing to try and
01059     // fit them into the existing allocations...
01060     chunksz = volsz;
01061     slabsz = volsz;
01062 
01063     // reallocate the chunk buffers from scratch since we weren't
01064     // able to reuse them
01065     if (get_chunk_bufs(0, natoms, colorperatom, voltexformat,
01066                        accelcells.x, accelcells.y, accelcells.z,
01067                        volsz.x, volsz.y, volsz.z,
01068                        chunksz.x, chunksz.y, chunksz.z,
01069                        slabsz.x, slabsz.y, slabsz.z) == -1) {
01070       wkf_timer_destroy(globaltimer);
01071       return -1;
01072     }
01073   }
01074   chunkmaxverts = 3 * chunksz.x * chunksz.y * chunksz.z;
01075 
01076   // Free the "safety padding" memory we allocate to ensure we dont
01077   // have trouble with thrust calls that allocate their own memory later
01078   if (gpuh->safety != NULL)
01079     cudaFree(gpuh->safety);
01080   gpuh->safety = NULL;
01081 
01082   if (gpuh->verbose > 1) {
01083     printf("  Using GPU chunk size: %d\n", chunksz.z);
01084     printf("  Accel grid(%d, %d, %d) spacing %f\n",
01085            accelcells.x, accelcells.y, accelcells.z, acgridspacing);
01086   }
01087 
01088   // pre-process the atom coordinates and radii as needed
01089   // short-term fix until a new CUDA kernel takes care of this
01090   int i, i4;
01091   float4 *xyzr = (float4 *) malloc(natoms * sizeof(float4));
01092   float log2e = log2f(2.718281828f);
01093   for (i=0,i4=0; i<natoms; i++,i4+=4) {
01094     xyzr[i].x = xyzr_f[i4    ];
01095     xyzr[i].y = xyzr_f[i4 + 1];
01096     xyzr[i].z = xyzr_f[i4 + 2];
01097 
01098     float scaledrad = xyzr_f[i4 + 3] * radscale;
01099     float arinv = -1.0f * log2e / (2.0f*scaledrad*scaledrad);
01100 
01101     xyzr[i].w = arinv;
01102   }
01103   cudaMemcpy(gpuh->xyzr_d, xyzr, natoms * sizeof(float4), cudaMemcpyHostToDevice);
01104   free(xyzr);
01105 
01106   if (colorperatom)
01107     cudaMemcpy(gpuh->color_d, colors, natoms * sizeof(float4), cudaMemcpyHostToDevice);
01108  
01109   // build uniform grid acceleration structure
01110   if (vmd_cuda_build_density_atom_grid(natoms, gpuh->xyzr_d, gpuh->color_d,
01111                                        gpuh->sorted_xyzr_d,
01112                                        gpuh->sorted_color_d,
01113                                        gpuh->atomIndex_d, gpuh->atomHash_d,
01114                                        gpuh->cellStartEnd_d, 
01115                                        accelcells, 1.0f / acgridspacing) != 0) {
01116     wkf_timer_destroy(globaltimer);
01117     free_bufs();
01118     return -1;
01119   }
01120 
01121   double sorttime = wkf_timer_timenow(globaltimer);
01122   double lastlooptime = sorttime;
01123 
01124   double densitykerneltime = 0.0f;
01125   double densitytime = 0.0f;
01126   double mckerneltime = 0.0f;
01127   double mctime = 0.0f; 
01128   double copycalltime = 0.0f;
01129   double copytime = 0.0f;
01130 
01131   float *volslab_d = NULL;
01132   void *texslab_d = NULL;
01133 
01134   int lzplane = GBLOCKSZZ * GUNROLL;
01135   if (colorperatom)
01136     lzplane = GTEXBLOCKSZZ * GTEXUNROLL;
01137 
01138   // initialize CUDA marching cubes class instance or rebuild it if needed
01139   uint3 mgsz = make_uint3(chunksz.x, chunksz.y, chunksz.z);
01140   if (gpuh->mc == NULL) {
01141     gpuh->mc = new CUDAMarchingCubes(); 
01142     if (!gpuh->mc->Initialize(mgsz)) {
01143       printf("QuickSurf call to MC Initialize() failed\n");
01144     }
01145   } else {
01146     uint3 mcmaxgridsize = gpuh->mc->GetMaxGridSize();
01147     if (slabsz.x > mcmaxgridsize.x ||
01148         slabsz.y > mcmaxgridsize.y ||
01149         slabsz.z > mcmaxgridsize.z) {
01150       if (gpuh->verbose)
01151         printf("*** QuickSurf Allocating new MC object...\n");
01152  
01153       // delete marching cubes object
01154       delete gpuh->mc;
01155 
01156       // create and initialize CUDA marching cubes class instance
01157       gpuh->mc = new CUDAMarchingCubes(); 
01158 
01159       if (!gpuh->mc->Initialize(mgsz)) {
01160         printf("QuickSurf MC Initialize() call failed to recreate MC object\n");
01161       }
01162     } 
01163   }
01164 
01165   int z;
01166   int chunkcount=0;
01167   float invacgridspacing = 1.0f / acgridspacing;
01168   float invisovalue = 1.0f / isovalue;
01169   for (z=0; z<volsz.z; z+=slabsz.z) {
01170     int3 curslab = slabsz;
01171     if (z+curslab.z > volsz.z)
01172       curslab.z = volsz.z - z; 
01173   
01174     int slabplanesz = curslab.x * curslab.y;
01175 
01176     dim3 Gsz((curslab.x+Bsz.x-1) / Bsz.x, 
01177              (curslab.y+Bsz.y-1) / Bsz.y,
01178              (curslab.z+(Bsz.z*GUNROLL)-1) / (Bsz.z * GUNROLL));
01179     if (colorperatom)
01180       Gsz.z = (curslab.z+(Bsz.z*GTEXUNROLL)-1) / (Bsz.z * GTEXUNROLL);
01181 
01182     // For SM >= 2.x, we can run the entire slab in one pass by launching
01183     // a 3-D grid of thread blocks.
01184     dim3 Gszslice = Gsz;
01185 
01186     if (gpuh->verbose > 1) {
01187       printf("CUDA device %d, grid size %dx%dx%d\n", 
01188              0, Gsz.x, Gsz.y, Gsz.z);
01189       printf("CUDA: vol(%d,%d,%d) accel(%d,%d,%d)\n",
01190              curslab.x, curslab.y, curslab.z,
01191              accelcells.x, accelcells.y, accelcells.z);
01192       printf("Z=%d, curslab.z=%d\n", z, curslab.z);
01193     }
01194 
01195     // For all but the first density slab, we copy the last four 
01196     // planes of the previous run into the start of the next run so
01197     // that we can extract the isosurface with no discontinuities
01198     if (z == 0) {
01199       volslab_d = gpuh->devdensity;
01200       if (colorperatom)
01201         texslab_d = gpuh->devvoltexmap;
01202     } else {
01203       int fourplanes = 4 * slabplanesz;
01204       cudaMemcpy(gpuh->devdensity,
01205                  volslab_d + (slabsz.z-4) * slabplanesz, 
01206                  fourplanes * sizeof(float), cudaMemcpyDeviceToDevice);
01207       volslab_d = gpuh->devdensity + fourplanes;
01208 
01209       if (colorperatom) {
01210         cudaMemcpy(gpuh->devvoltexmap,
01211                    ((unsigned char *) texslab_d) + (slabsz.z-4) * slabplanesz * vtexsize, 
01212                    fourplanes * vtexsize, cudaMemcpyDeviceToDevice);
01213         texslab_d = ((unsigned char *) gpuh->devvoltexmap) + fourplanes * vtexsize;
01214       }
01215     }
01216 
01217     // loop over the planes/slices in a slab and compute density and texture
01218     for (int lz=0; lz<Gsz.z; lz+=Gszslice.z) {
01219       int lzinc = lz * lzplane;
01220       float *volslice_d = volslab_d + lzinc * slabplanesz;
01221 
01222       if (colorperatom) {
01223         void *texslice_d = ((unsigned char *) texslab_d) + lzinc * slabplanesz * vtexsize;
01224         switch (voltexformat) {
01225           case RGB3F:
01226             gaussdensity_fast_tex3f<<<Gszslice, Bsz, 0>>>(natoms, 
01227                 gpuh->sorted_xyzr_d, gpuh->sorted_color_d, 
01228                 curslab, accelcells, acgridspacing, invacgridspacing, 
01229                 gpuh->cellStartEnd_d, gridspacing, z+lzinc,
01230                 volslice_d, (float3 *) texslice_d, invisovalue);
01231             break;
01232 
01233           case RGB4U:
01234             gaussdensity_fast_tex_norm<float, uchar4><<<Gszslice, Bsz, 0>>>(natoms, 
01235                 gpuh->sorted_xyzr_d, gpuh->sorted_color_d, 
01236                 curslab, accelcells, acgridspacing, invacgridspacing, 
01237                 gpuh->cellStartEnd_d, gridspacing, z+lzinc,
01238                 volslice_d, (uchar4 *) texslice_d, invisovalue);
01239             break;
01240         }
01241       } else {
01242         gaussdensity_fast<<<Gszslice, Bsz, 0>>>(natoms, gpuh->sorted_xyzr_d, 
01243             curslab, accelcells, acgridspacing, invacgridspacing, 
01244             gpuh->cellStartEnd_d, gridspacing, z+lzinc, volslice_d);
01245       }
01246     }
01247     cudaDeviceSynchronize(); 
01248     densitykerneltime = wkf_timer_timenow(globaltimer);
01249 
01250 #if 0
01251     printf("  CUDA mcubes..."); fflush(stdout);
01252 #endif
01253 
01254     uint3 gvsz = make_uint3(curslab.x, curslab.y, curslab.z);
01255 
01256     // For all but the first density slab, we copy the last four
01257     // planes of the previous run into the start of the next run so
01258     // that we can extract the isosurface with no discontinuities
01259     if (z != 0)
01260       gvsz.z=curslab.z + 4;
01261 
01262     float3 bbox = make_float3(gvsz.x * gridspacing, gvsz.y * gridspacing,
01263                               gvsz.z * gridspacing);
01264 
01265     float3 gorigin = make_float3(origin[0], origin[1], 
01266                                  origin[2] + (z * gridspacing));
01267     if (z != 0)
01268       gorigin.z = origin[2] + ((z-4) * gridspacing);
01269 
01270 #if 0
01271 printf("\n  ... vsz: %d %d %d\n", gvsz.x, gvsz.y, gvsz.z);
01272 printf("  ... org: %.2f %.2f %.2f\n", gorigin.x, gorigin.y, gorigin.z);
01273 printf("  ... bxs: %.2f %.2f %.2f\n", bbox.x, bbox.y, bbox.y);
01274 printf("  ... bbe: %.2f %.2f %.2f\n", gorigin.x+bbox.x, gorigin.y+bbox.y, gorigin.z+bbox.z);
01275 #endif
01276 
01277     // If we are computing the volume using multiple passes, we have to 
01278     // overlap the marching cubes grids and compute a sub-volume to exclude
01279     // the end planes, except for the first and last sub-volume, in order to
01280     // get correct per-vertex normals at the edges of each sub-volume 
01281     int skipstartplane=0;
01282     int skipendplane=0;
01283     if (chunksz.z < volsz.z) {
01284       // on any but the first pass, we skip the first Z plane
01285       if (z != 0)
01286         skipstartplane=1;
01287 
01288       // on any but the last pass, we skip the last Z plane
01289       if (z+curslab.z < volsz.z)
01290         skipendplane=1;
01291     }
01292 
01293     //
01294     // Extract density map isosurface using marching cubes
01295     //
01296 
01297     // Choose the isovalue depending on whether the desnity map 
01298     // contains normalized or un-normalized density values
01299     if (colorperatom && (voltexformat == RGB4U)) {
01300       // incoming densities are pre-normalized so that the target isovalue
01301       // is represented as a density of 1.0f
01302       gpuh->mc->SetIsovalue(1.0f);
01303     } else {
01304       gpuh->mc->SetIsovalue(isovalue);
01305     }
01306 
01307     int mcstat = 0;
01308     switch (voltexformat) {
01309       case RGB3F:
01310         mcstat = gpuh->mc->SetVolumeData(gpuh->devdensity, 
01311                                          (float3 *) gpuh->devvoltexmap,
01312                                          gvsz, gorigin, bbox, true);
01313         break;
01314 
01315       case RGB4U:
01316         mcstat = gpuh->mc->SetVolumeData(gpuh->devdensity, 
01317                                          (uchar4 *) gpuh->devvoltexmap,
01318                                          gvsz, gorigin, bbox, true);
01319         break;
01320     }
01321     if (!mcstat) {
01322       printf("QuickSurf call to MC SetVolumeData() failed\n");
01323     }
01324 
01325     // set the sub-volume starting/ending indices if needed
01326     if (skipstartplane || skipendplane) {
01327       uint3 volstart = make_uint3(0, 0, 0);
01328       uint3 volend = make_uint3(gvsz.x, gvsz.y, gvsz.z);
01329 
01330       if (skipstartplane)
01331         volstart.z = 2;
01332 
01333       if (skipendplane)
01334         volend.z = gvsz.z - 2;
01335 
01336       gpuh->mc->SetSubVolume(volstart, volend);
01337     }
01338     if (gpuh->n3b_d) {
01339       gpuh->mc->computeIsosurface(gpuh->v3f_d, gpuh->n3b_d, 
01340                                   gpuh->c4u_d, chunkmaxverts);
01341     } else if (gpuh->c4u_d) {
01342       gpuh->mc->computeIsosurface(gpuh->v3f_d, gpuh->n3f_d, 
01343                                   gpuh->c4u_d, chunkmaxverts);
01344     } else {
01345       gpuh->mc->computeIsosurface(gpuh->v3f_d, gpuh->n3f_d, 
01346                                   gpuh->c3f_d, chunkmaxverts);
01347     }
01348     chunknumverts = gpuh->mc->GetVertexCount();
01349 
01350 #if 0
01351     printf("generated %d vertices, max vert limit: %d\n", chunknumverts, chunkmaxverts);
01352 #endif
01353     if (chunknumverts == chunkmaxverts)
01354       printf("  *** QuickSurf exceeded marching cubes vertex limit (%d verts)\n", chunknumverts);
01355 
01356     cudaDeviceSynchronize(); 
01357     mckerneltime = wkf_timer_timenow(globaltimer);
01358 
01359     // Create a triangle mesh
01360     if (chunknumverts > 0) {
01361       DispCmdTriMesh cmdTriMesh;
01362       if (colorperatom) {
01363         // emit triangle mesh with per-vertex colors
01364         if (gpuh->n3b_d) {
01365           cmdTriMesh.cuda_putdata((const float *) gpuh->v3f_d, 
01366                                   (const char *) gpuh->n3b_d, 
01367                                   (const unsigned char *) gpuh->c4u_d,
01368                                   chunknumverts/3, cmdList);
01369         } else if (gpuh->c4u_d) {
01370           cmdTriMesh.cuda_putdata((const float *) gpuh->v3f_d, 
01371                                   (const float *) gpuh->n3f_d, 
01372                                   (const unsigned char *) gpuh->c4u_d,
01373                                   chunknumverts/3, cmdList);
01374         } else {
01375           cmdTriMesh.cuda_putdata((const float *) gpuh->v3f_d, 
01376                                   (const float *) gpuh->n3f_d, 
01377                                   (const float *) gpuh->c3f_d,
01378                                   chunknumverts/3, cmdList);
01379         }
01380       } else {
01381         // emit triangle mesh with no colors, uses current rendering state
01382         if (gpuh->n3b_d) {
01383           cmdTriMesh.cuda_putdata((const float *) gpuh->v3f_d, 
01384                                   (const char *) gpuh->n3b_d, 
01385                                   (const unsigned char *) NULL,
01386                                   chunknumverts/3, cmdList);
01387         } else {
01388           cmdTriMesh.cuda_putdata((const float *) gpuh->v3f_d,
01389                                   (const float *) gpuh->n3f_d, 
01390                                   (const float *) NULL,
01391                                   chunknumverts/3, cmdList);
01392         }
01393       }
01394     }
01395     numverts+=chunknumverts;
01396     numfacets+=chunknumverts/3;
01397 
01398 #if 0
01399    // XXX we'll hold onto this as we'll want to rescue this approach
01400    //     for electrostatics coloring where we have to have the 
01401    //     entire triangle mesh in order to do the calculation
01402     int l;
01403     int vertstart = 3 * numverts;
01404     int vertbufsz = 3 * (numverts + chunknumverts) * sizeof(float);
01405     int facebufsz = (numverts + chunknumverts) * sizeof(int);
01406     int chunkvertsz = 3 * chunknumverts * sizeof(float);
01407 
01408     v = (float*) realloc(v, vertbufsz);
01409     n = (float*) realloc(n, vertbufsz);
01410     c = (float*) realloc(c, vertbufsz);
01411     f = (int*)   realloc(f, facebufsz);
01412     cudaMemcpy(v+vertstart, gpuh->v3f_d, chunkvertsz, cudaMemcpyDeviceToHost);
01413     cudaMemcpy(n+vertstart, gpuh->n3f_d, chunkvertsz, cudaMemcpyDeviceToHost);
01414     if (colorperatom) {
01415       cudaMemcpy(c+vertstart, gpuh->c3f_d, chunkvertsz, cudaMemcpyDeviceToHost);
01416     } else {
01417       float *color = c+vertstart;
01418       for (l=0; l<chunknumverts*3; l+=3) {
01419         color[l + 0] = colors[0].x;
01420         color[l + 1] = colors[0].y;
01421         color[l + 2] = colors[0].z;
01422       }
01423     }
01424     for (l=numverts; l<numverts+chunknumverts; l++) {
01425       f[l]=l;
01426     }
01427     numverts+=chunknumverts;
01428     numfacets+=chunknumverts/3;
01429 #endif
01430 
01431     copycalltime = wkf_timer_timenow(globaltimer);
01432 
01433     densitytime += densitykerneltime - lastlooptime;
01434     mctime += mckerneltime - densitykerneltime;
01435     copytime += copycalltime - mckerneltime;
01436 
01437     lastlooptime = wkf_timer_timenow(globaltimer);
01438 
01439     chunkcount++; // increment number of chunks processed
01440   }
01441 
01442   // catch any errors that may have occured so that at the very least,
01443   // all of the subsequent resource deallocation calls will succeed
01444   cudaError_t err = cudaGetLastError();
01445 
01446   wkf_timer_stop(globaltimer);
01447   double totalruntime = wkf_timer_time(globaltimer);
01448   wkf_timer_destroy(globaltimer);
01449 
01450   // If an error occured, we print it and return an error code, once
01451   // all of the memory deallocations have completed.
01452   if (err != cudaSuccess) { 
01453     printf("CUDA error: %s, %s line %d\n", cudaGetErrorString(err), __FILE__, __LINE__);
01454     return -1;
01455   }
01456 
01457   if (gpuh->verbose) {
01458     printf("  GPU generated %d vertices, %d facets, in %d passes\n", 
01459            numverts, numfacets, chunkcount);
01460     printf("  GPU time (%s): %.3f [sort: %.3f density %.3f mcubes: %.3f copy: %.3f]\n", 
01461            "SM >= 2.x", totalruntime, sorttime, densitytime, mctime, copytime);
01462   }
01463 
01464   return 0;
01465 }
01466 
01467 
01468 
01469 
01470