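//
// CUDA GPU benchmarking code: multiply-add arithmetic throughput,
// host-GPU bus bandwidth and latency, global memory bandwidth, thread
// pool and kernel launch latencies, and out-of-core GPU-direct
// trajectory I/O.
//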
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <cuda.h>

#include "Inform.h"
#include "WKFThreads.h"
#include "WKFUtils.h"
#include "CUDAKernels.h"
#include "Measure.h"

// enable restrict-qualified pointers so the compiler may assume no aliasing
#if 1
#define RESTRICT __restrict__
#else
#define RESTRICT
#endif

#if defined(VMDUSECUDAGDS)
// hard-coded paths to the GDS beta toolkit and the jsplugin sources
// used during development of this benchmark
#include </usr/local/gds-beta-0.7.1/lib/cufile.h>

#define VMDJSPLUGININCLUDESRC 1
#include "/home/johns/plugins/molfile_plugin/src/jsplugin.c"
#endif

// report the most recent CUDA error and bail out of the calling benchmark
#define CUERR { cudaError_t err; \
  if ((err = cudaGetLastError()) != cudaSuccess) { \
  printf("CUDA error: %s, %s line %d\n", cudaGetErrorString(err), __FILE__, __LINE__); \
  return -1; }}

// FMADD16: sixteen dependent multiply-adds spread over two interleaved
// register chains, keeping the FP pipelines busy without memory traffic
#define FMADD16 \
    tmp0  = tmp0*tmp4+tmp7;     \
    tmp1  = tmp1*tmp5+tmp0;     \
    tmp2  = tmp2*tmp6+tmp1;     \
    tmp3  = tmp3*tmp7+tmp2;     \
    tmp4  = tmp4*tmp0+tmp3;     \
    tmp5  = tmp5*tmp1+tmp4;     \
    tmp6  = tmp6*tmp2+tmp5;     \
    tmp7  = tmp7*tmp3+tmp6;     \
    tmp8  = tmp8*tmp12+tmp15;   \
    tmp9  = tmp9*tmp13+tmp8;    \
    tmp10 = tmp10*tmp14+tmp9;   \
    tmp11 = tmp11*tmp15+tmp10;  \
    tmp12 = tmp12*tmp8+tmp11;   \
    tmp13 = tmp13*tmp9+tmp12;   \
    tmp14 = tmp14*tmp10+tmp13;  \
    tmp15 = tmp15*tmp11+tmp14;

#define GRIDSIZEX       6144  // number of 1-D thread blocks
#define BLOCKSIZEX      64    // number of threads per 1-D block
#define GLOOPS          2000  // iteration count (all threads)
#define FMADD16COUNT    32    // 32 reps
#define FLOPSPERFMADD16 32    // 16 MULs and 16 ADDs

#define FLOPSPERLOOP (FMADD16COUNT * FLOPSPERFMADD16)
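
// Per kernel launch, the arithmetic work totals
//   GLOOPS * FLOPSPERLOOP * BLOCKSIZEX * GRIDSIZEX
//   = 2000 * 1024 * 64 * 6144 ~= 8.05e11 FLOPs,
// which cudamaddgflops() below multiplies by its timing loop count.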

__global__ static void madd_kernel(float *doutput) {
  int tid = blockIdx.x * blockDim.x + threadIdx.x;
  float tmp0,tmp1,tmp2,tmp3,tmp4,tmp5,tmp6,tmp7;
  float tmp8,tmp9,tmp10,tmp11,tmp12,tmp13,tmp14,tmp15;
  tmp0=tmp1=tmp2=tmp3=tmp4=tmp5=tmp6=tmp7=0.0f;
  tmp8=tmp9=tmp10=tmp11=tmp12=tmp13=tmp14=tmp15=0.0f;

  // seed the register chains with block-dependent values so the
  // compiler cannot constant-fold the arithmetic away
  tmp15=tmp7 = blockIdx.x * 0.001f;
  tmp1 = blockIdx.y * 0.001f;

  // FMADD16COUNT (32) repetitions of FMADD16 per loop iteration
  int loop;
  for(loop=0; loop<GLOOPS; loop++){
    FMADD16
    FMADD16
    FMADD16
    FMADD16
    FMADD16
    FMADD16
    FMADD16
    FMADD16
    FMADD16
    FMADD16
    FMADD16
    FMADD16
    FMADD16
    FMADD16
    FMADD16
    FMADD16
    FMADD16
    FMADD16
    FMADD16
    FMADD16
    FMADD16
    FMADD16
    FMADD16
    FMADD16
    FMADD16
    FMADD16
    FMADD16
    FMADD16
    FMADD16
    FMADD16
    FMADD16
    FMADD16
  }

  // write out the accumulated sum so the work is not dead-code eliminated
  doutput[tid] = tmp0+tmp1+tmp2+tmp3+tmp4+tmp5+tmp6+tmp7
                 +tmp8+tmp9+tmp10+tmp11+tmp12+tmp13+tmp14+tmp15;
}


static int cudamaddgflops(int cudadev, double *gflops, int testloops) {
  float *doutput = NULL;
  dim3 Bsz, Gsz;
  wkf_timerhandle timer;
  int i;

  // attach to the requested device; tolerate the case where the
  // runtime reports the device is already attached to this thread
  cudaError_t rc;
  rc = cudaSetDevice(cudadev);
  if (rc != cudaSuccess) {
#if CUDART_VERSION >= 2010
    rc = cudaGetLastError();
    if (rc != cudaErrorSetOnActiveProcess)
      return -1;
#else
    cudaGetLastError();
#endif
  }

  // setup kernel launch configuration
  Bsz.x = BLOCKSIZEX;
  Bsz.y = 1;
  Bsz.z = 1;
  Gsz.x = GRIDSIZEX;
  Gsz.y = 1;
  Gsz.z = 1;

  // allocate output array
  cudaMalloc((void**)&doutput, BLOCKSIZEX * GRIDSIZEX * sizeof(float));
  CUERR

  // warm-up run to absorb one-time initialization costs
  madd_kernel<<<Gsz, Bsz>>>(doutput);
  cudaDeviceSynchronize();

  // time the benchmark kernel launches
  timer=wkf_timer_create();
  wkf_timer_start(timer);
  for (i=0; i<testloops; i++) {
    madd_kernel<<<Gsz, Bsz>>>(doutput);
  }
  cudaDeviceSynchronize();
  CUERR
  wkf_timer_stop(timer);

  double runtime = wkf_timer_time(timer);
  double gflop = ((double) GLOOPS) * ((double) FLOPSPERLOOP) *
                  ((double) BLOCKSIZEX) * ((double) GRIDSIZEX) * (1.0e-9) * testloops;

  *gflops = gflop / runtime;

  cudaFree(doutput);
  CUERR

  wkf_timer_destroy(timer);

  return 0;
}

// per-device benchmark thread parameters
typedef struct {
  int deviceid;
  int testloops;
  double gflops;
} maddthrparms;

static void * cudamaddthread(void *voidparms) {
  maddthrparms *parms = (maddthrparms *) voidparms;
  cudamaddgflops(parms->deviceid, &parms->gflops, parms->testloops);
  return NULL;
}

int vmd_cuda_madd_gflops(int numdevs, int *devlist, double *gflops,
                         int testloops) {
  maddthrparms *parms;
  wkf_thread_t * threads;
  int i;

  // allocate array of threads
  threads = (wkf_thread_t *) calloc(numdevs * sizeof(wkf_thread_t), 1);

  // allocate and initialize per-thread parameters
  parms = (maddthrparms *) malloc(numdevs * sizeof(maddthrparms));
  for (i=0; i<numdevs; i++) {
    if (devlist != NULL)
      parms[i].deviceid = devlist[i];
    else
      parms[i].deviceid = i;

    parms[i].testloops = testloops;
    parms[i].gflops = 0.0;
  }

#if defined(VMDTHREADS)
  // spawn one benchmark thread per device
  for (i=0; i<numdevs; i++) {
    wkf_thread_create(&threads[i], cudamaddthread, &parms[i]);
  }

  // wait for all worker threads to complete
  for (i=0; i<numdevs; i++) {
    wkf_thread_join(threads[i], NULL);
  }
#else
  // single-threaded builds only test the first device
  cudamaddthread((void *) &parms[0]);
#endif

  for (i=0; i<numdevs; i++) {
    gflops[i] = parms[i].gflops;
  }

  // free thread parms
  free(parms);
  free(threads);

  return 0;
}
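
// Minimal usage sketch (hypothetical caller, not part of this file):
//   int devs[2] = {0, 1};
//   double gflops[2];
//   vmd_cuda_madd_gflops(2, devs, gflops, 10);
//   // gflops[i] now holds the multiply-add rate measured on device i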

#define BWITER      500    // transfer count for bandwidth measurements
#define LATENCYITER 50000  // transfer count for latency measurements
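
// The bus benchmark below times two regimes for each direction and buffer
// type: 4MB copies, where per-call overhead is negligible and the result
// approximates peak bandwidth, and 1-byte copies, where the result is
// dominated by per-call latency.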

static int cudabusbw(int cudadev, 
                     double *hdmbsec, double *hdlatusec, 
                     double *phdmbsec, double *phdlatusec, 
                     double *dhmbsec, double *dhlatusec,
                     double *pdhmbsec, double *pdhlatusec) {
  float *hdata = NULL;   // pageable host buffer
  float *phdata = NULL;  // pinned host buffer
  float *ddata = NULL;   // device buffer
  int i;
  double runtime;
  wkf_timerhandle timer;
  int memsz = 1024 * 1024 * sizeof(float);

  // zero out all returned results
  *hdmbsec = 0.0;
  *hdlatusec = 0.0;
  *dhmbsec = 0.0;
  *dhlatusec = 0.0;
  *phdmbsec = 0.0;
  *phdlatusec = 0.0;
  *pdhmbsec = 0.0;
  *pdhlatusec = 0.0;

  // attach to the requested device; tolerate an already-attached state
  cudaError_t rc;
  rc = cudaSetDevice(cudadev);
  if (rc != cudaSuccess) {
#if CUDART_VERSION >= 2010
    rc = cudaGetLastError();
    if (rc != cudaErrorSetOnActiveProcess)
      return -1;
#else
    cudaGetLastError();
#endif
  }

  // allocate pageable host buffer
  hdata = (float *) malloc(memsz);

  // allocate pinned (page-locked) host buffer
  cudaMallocHost((void**) &phdata, memsz);
  CUERR

  // allocate device buffer
  cudaMalloc((void**) &ddata, memsz);
  CUERR

  timer=wkf_timer_create();

  //
  // host-to-device transfers from the pageable buffer
  //

  // bandwidth: large transfers
  wkf_timer_start(timer);
  for (i=0; i<BWITER; i++) {
    cudaMemcpy(ddata, hdata, memsz,  cudaMemcpyHostToDevice);
  }
  wkf_timer_stop(timer);
  CUERR
  runtime = wkf_timer_time(timer);
  *hdmbsec = ((double) BWITER) * ((double) memsz) / runtime / (1024.0 * 1024.0);

  // latency: 1-byte transfers
  wkf_timer_start(timer);
  for (i=0; i<LATENCYITER; i++) {
    cudaMemcpy(ddata, hdata, 1,  cudaMemcpyHostToDevice);
  }
  wkf_timer_stop(timer);
  CUERR
  runtime = wkf_timer_time(timer);
  *hdlatusec = runtime * 1.0e6 / ((double) LATENCYITER);

  //
  // host-to-device transfers from the pinned buffer
  //

  // bandwidth: large transfers
  wkf_timer_start(timer);
  for (i=0; i<BWITER; i++) {
    cudaMemcpy(ddata, phdata, memsz,  cudaMemcpyHostToDevice);
  }
  wkf_timer_stop(timer);
  CUERR
  runtime = wkf_timer_time(timer);
  *phdmbsec = ((double) BWITER) * ((double) memsz) / runtime / (1024.0 * 1024.0);

  // latency: 1-byte transfers
  wkf_timer_start(timer);
  for (i=0; i<LATENCYITER; i++) {
    cudaMemcpy(ddata, phdata, 1,  cudaMemcpyHostToDevice);
  }
  wkf_timer_stop(timer);
  CUERR
  runtime = wkf_timer_time(timer);
  *phdlatusec = runtime * 1.0e6 / ((double) LATENCYITER);

  //
  // device-to-host transfers into the pageable buffer
  //

  // bandwidth: large transfers
  wkf_timer_start(timer);
  for (i=0; i<BWITER; i++) {
    cudaMemcpy(hdata, ddata, memsz,  cudaMemcpyDeviceToHost);
  }
  wkf_timer_stop(timer);
  CUERR
  runtime = wkf_timer_time(timer);
  *dhmbsec = ((double) BWITER) * ((double) memsz) / runtime / (1024.0 * 1024.0);

  // latency: 1-byte transfers
  wkf_timer_start(timer);
  for (i=0; i<LATENCYITER; i++) {
    cudaMemcpy(hdata, ddata, 1,  cudaMemcpyDeviceToHost);
  }
  wkf_timer_stop(timer);
  CUERR
  runtime = wkf_timer_time(timer);
  *dhlatusec = runtime * 1.0e6 / ((double) LATENCYITER);

  //
  // device-to-host transfers into the pinned buffer
  //

  // bandwidth: large transfers
  wkf_timer_start(timer);
  for (i=0; i<BWITER; i++) {
    cudaMemcpy(phdata, ddata, memsz,  cudaMemcpyDeviceToHost);
  }
  wkf_timer_stop(timer);
  CUERR
  runtime = wkf_timer_time(timer);
  *pdhmbsec = ((double) BWITER) * ((double) memsz) / runtime / (1024.0 * 1024.0);

  // latency: 1-byte transfers
  wkf_timer_start(timer);
  for (i=0; i<LATENCYITER; i++) {
    cudaMemcpy(phdata, ddata, 1,  cudaMemcpyDeviceToHost);
  }
  wkf_timer_stop(timer);
  CUERR
  runtime = wkf_timer_time(timer);
  *pdhlatusec = runtime * 1.0e6 / ((double) LATENCYITER);

  cudaFree(ddata);
  CUERR
  cudaFreeHost(phdata);
  CUERR
  free(hdata);

  wkf_timer_destroy(timer);

  return 0;
}
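
// Pinned buffers are expected to outperform pageable ones here: the GPU
// can DMA directly to and from page-locked memory, whereas pageable
// transfers are staged through an intermediate pinned buffer by the
// CUDA runtime.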

typedef struct {
  int deviceid;
  double hdmbsec;
  double hdlatusec;
  double phdmbsec;
  double phdlatusec;
  double dhmbsec;
  double dhlatusec;
  double pdhmbsec;
  double pdhlatusec;
} busbwthrparms;

static void * cudabusbwthread(void *voidparms) {
  busbwthrparms *parms = (busbwthrparms *) voidparms;
  cudabusbw(parms->deviceid, 
            &parms->hdmbsec, &parms->hdlatusec,
            &parms->phdmbsec, &parms->phdlatusec,
            &parms->dhmbsec, &parms->dhlatusec,
            &parms->pdhmbsec, &parms->pdhlatusec);
  return NULL;
}

int vmd_cuda_bus_bw(int numdevs, int *devlist, 
                    double *hdmbsec, double *hdlatusec,
                    double *phdmbsec, double *phdlatusec,
                    double *dhmbsec, double *dhlatusec,
                    double *pdhmbsec, double *pdhlatusec) {
  busbwthrparms *parms;
  wkf_thread_t * threads;
  int i;

  // allocate array of threads
  threads = (wkf_thread_t *) calloc(numdevs * sizeof(wkf_thread_t), 1);

  // allocate and initialize per-thread parameters
  parms = (busbwthrparms *) malloc(numdevs * sizeof(busbwthrparms));
  for (i=0; i<numdevs; i++) {
    if (devlist != NULL)
      parms[i].deviceid = devlist[i];
    else
      parms[i].deviceid = i;
    parms[i].hdmbsec = 0.0;
    parms[i].hdlatusec = 0.0;
    parms[i].phdmbsec = 0.0;
    parms[i].phdlatusec = 0.0;
    parms[i].dhmbsec = 0.0;
    parms[i].dhlatusec = 0.0;
    parms[i].pdhmbsec = 0.0;
    parms[i].pdhlatusec = 0.0;
  }

#if defined(VMDTHREADS)
  // spawn one benchmark thread per device
  for (i=0; i<numdevs; i++) {
    wkf_thread_create(&threads[i], cudabusbwthread, &parms[i]);
  }

  // wait for all worker threads to complete
  for (i=0; i<numdevs; i++) {
    wkf_thread_join(threads[i], NULL);
  }
#else
  // single-threaded builds only test the first device
  cudabusbwthread((void *) &parms[0]);
#endif

  // copy per-device results back to the caller's arrays
  for (i=0; i<numdevs; i++) {
    hdmbsec[i] = parms[i].hdmbsec;
    hdlatusec[i] = parms[i].hdlatusec;
    phdmbsec[i] = parms[i].phdmbsec;
    phdlatusec[i] = parms[i].phdlatusec;
    dhmbsec[i] = parms[i].dhmbsec;
    dhlatusec[i] = parms[i].dhlatusec;
    pdhmbsec[i] = parms[i].pdhmbsec;
    pdhlatusec[i] = parms[i].pdhlatusec;
  }

  // free thread parms
  free(parms);
  free(threads);

  return 0;
}


// copy kernel: each thread moves one element, so the test streams the
// whole array through global memory once in and once out
template <class T>
__global__ void gpuglobmemcpybw(T *dest, const T *src) {
  const unsigned int idx = threadIdx.x + blockIdx.x * blockDim.x;
  dest[idx] = src[idx];
}

// set kernel: each thread writes one element
template <class T>
__global__ void gpuglobmemsetbw(T *dest, const T val) {
  int idx = threadIdx.x + blockIdx.x * blockDim.x;
  dest[idx] = val;
}

typedef float4 datatype;
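
// Using float4 as the element type gives each thread a 16-byte vectorized
// load/store, which helps saturate global memory bandwidth with fewer
// threads than a plain float element would need.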

static int cudaglobmembw(int cudadev, double *gpumemsetgbsec, double *gpumemcpygbsec) {
  int i;
  int len = 1 << 22;  // buffer length in elements
  int loops = 500;
  datatype *src, *dest;
  datatype val=make_float4(1.0f, 1.0f, 1.0f, 1.0f);

  // initialize to zero so a failure still returns zeroed results
  float memsettime = 0.0f;
  float memcpytime = 0.0f;
  *gpumemsetgbsec = 0.0;
  *gpumemcpygbsec = 0.0;

  // attach to the requested device; tolerate an already-attached state
  cudaError_t rc;
  rc = cudaSetDevice(cudadev);
  if (rc != cudaSuccess) {
#if CUDART_VERSION >= 2010
    rc = cudaGetLastError();
    if (rc != cudaErrorSetOnActiveProcess)
      return -1;
#else
    cudaGetLastError();
#endif
  }

  cudaMalloc((void **) &src, sizeof(datatype)*len);
  CUERR
  cudaMalloc((void **) &dest, sizeof(datatype)*len);
  CUERR

  dim3 BSz(256, 1, 1);
  dim3 GSz(len / (BSz.x * BSz.y * BSz.z), 1, 1);  // one thread per element

  // warm-up runs to initialize the buffers and absorb startup costs
  gpuglobmemsetbw<datatype><<< GSz, BSz >>>(src, val);
  CUERR
  gpuglobmemsetbw<datatype><<< GSz, BSz >>>(dest, val);
  CUERR
  gpuglobmemcpybw<datatype><<< GSz, BSz >>>(dest, src);
  CUERR

  cudaEvent_t start, end;
  cudaEventCreate(&start);
  cudaEventCreate(&end);

  // time the memset kernel
  cudaEventRecord(start, 0);
  for (i=0; i<loops; i++) {
    gpuglobmemsetbw<datatype><<< GSz, BSz >>>(dest, val);
  }
  CUERR
  cudaEventRecord(end, 0);
  CUERR
  cudaEventSynchronize(start);
  CUERR
  cudaEventSynchronize(end);
  CUERR
  cudaEventElapsedTime(&memsettime, start, end);
  CUERR

  // time the memcpy kernel
  cudaEventRecord(start, 0);
  for (i=0; i<loops; i++) {
    gpuglobmemcpybw<datatype><<< GSz, BSz >>>(dest, src);
  }
  cudaEventRecord(end, 0);
  CUERR
  cudaEventSynchronize(start);
  CUERR
  cudaEventSynchronize(end);
  CUERR
  cudaEventElapsedTime(&memcpytime, start, end);
  CUERR

  cudaEventDestroy(start);
  CUERR
  cudaEventDestroy(end);
  CUERR

  // elapsed event times are in milliseconds, so MB per millisecond
  // works out to roughly GB/sec
  *gpumemsetgbsec = (len * sizeof(datatype) / (1024.0 * 1024.0)) / (memsettime / loops);
  *gpumemcpygbsec = (2 * len * sizeof(datatype) / (1024.0 * 1024.0)) / (memcpytime / loops);
  cudaFree(dest);
  cudaFree(src);
  CUERR

  return 0;
}
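
// Byte accounting differs between the two tests: the set kernel writes
// len elements once, while the copy kernel both reads and writes, moving
// 2*len elements of traffic, hence the factor of 2 in the memcpy formula.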

typedef struct {
  int deviceid;
  double memsetgbsec;
  double memcpygbsec;
} globmembwthrparms;

static void * cudaglobmembwthread(void *voidparms) {
  globmembwthrparms *parms = (globmembwthrparms *) voidparms;
  cudaglobmembw(parms->deviceid, &parms->memsetgbsec, &parms->memcpygbsec);
  return NULL;
}

int vmd_cuda_globmem_bw(int numdevs, int *devlist, 
                        double *memsetgbsec, double *memcpygbsec) {
  globmembwthrparms *parms;
  wkf_thread_t * threads;
  int i;

  // allocate array of threads
  threads = (wkf_thread_t *) calloc(numdevs * sizeof(wkf_thread_t), 1);

  // allocate and initialize per-thread parameters
  parms = (globmembwthrparms *) malloc(numdevs * sizeof(globmembwthrparms));
  for (i=0; i<numdevs; i++) {
    if (devlist != NULL)
      parms[i].deviceid = devlist[i];
    else
      parms[i].deviceid = i;
    parms[i].memsetgbsec = 0.0;
    parms[i].memcpygbsec = 0.0;
  }

#if defined(VMDTHREADS)
  // spawn one benchmark thread per device
  for (i=0; i<numdevs; i++) {
    wkf_thread_create(&threads[i], cudaglobmembwthread, &parms[i]);
  }

  // wait for all worker threads to complete
  for (i=0; i<numdevs; i++) {
    wkf_thread_join(threads[i], NULL);
  }
#else
  // single-threaded builds only test the first device
  cudaglobmembwthread((void *) &parms[0]);
#endif

  for (i=0; i<numdevs; i++) {
    memsetgbsec[i] = parms[i].memsetgbsec;
    memcpygbsec[i] = parms[i].memcpygbsec;
  }

  // free thread parms
  free(parms);
  free(threads);

  return 0;
}


// empty worker function, used to measure thread pool launch/join overhead
static void * vmddevpoollatencythread(void *voidparms) {
  return NULL;
}

// worker that drains the dynamic tile scheduler without doing any work,
// used to measure per-tile scheduling overhead
static void * vmddevpooltilelatencythread(void *voidparms) {
  int threadid=-1;
  int tilesize=1;
  void *parms=NULL;
  wkf_threadpool_worker_getid(voidparms, &threadid, NULL);
  wkf_threadpool_worker_getdata(voidparms, (void **) &parms);

  // consume tiles until the scheduler reports completion
  wkf_tasktile_t tile;
  while (wkf_threadpool_next_tile(voidparms, tilesize, &tile) != WKF_SCHED_DONE) {
    // do nothing; only the scheduling cost is of interest
  }

  return NULL;
}


// NOP kernel: computes an output index and conditionally stores to it,
// so it survives compilation but performs negligible work
__global__ static void nopkernel(float * ddata) {
  unsigned int xindex  = blockIdx.x * blockDim.x + threadIdx.x;
  unsigned int yindex  = blockIdx.y * blockDim.y + threadIdx.y;
  unsigned int outaddr = gridDim.x * blockDim.x * yindex + xindex;

  if (ddata != NULL)
    ddata[outaddr] = outaddr;
}

// empty kernel used to measure bare launch latency
__global__ static void voidkernel(void) {
  return;
}

static void * vmddevpoolcudatilelatencythread(void *voidparms) {
  int threadid=-1;
  int tilesize=1;
  float *parms=NULL;
  wkf_threadpool_worker_getid(voidparms, &threadid, NULL);

  // obtain the output pointer (may be NULL for this latency test)
  wkf_threadpool_worker_getdata(voidparms, (void **) &parms);

#if 0
  // scale tile size by device performance
  tilesize=4;
  wkf_threadpool_worker_devscaletile(voidparms, &tilesize);
#endif

  // launch one NOP kernel per scheduled tile
  wkf_tasktile_t tile;
  dim3 Gsz(1,1,1);  // grid dimensions must all be >= 1
  dim3 Bsz(8,8,1);
  while (wkf_threadpool_next_tile(voidparms, tilesize, &tile) != WKF_SCHED_DONE) {
    nopkernel<<<Gsz, Bsz, 0>>>(parms);
  }

  // wait for all GPU kernels to complete
  cudaDeviceSynchronize();

  return NULL;
}


int vmd_cuda_devpool_latency(wkf_threadpool_t *devpool, int tilesize,
                             double *kernlaunchlatency,
                             double *barlatency,
                             double *cyclelatency, 
                             double *tilelatency,
                             double *kernellatency) {
  int i;
  wkf_tasktile_t tile;
  wkf_timerhandle timer;
  int loopcount;

  timer=wkf_timer_create();

  // measure bare CUDA kernel launch latency
  loopcount = 15000;
  dim3 VGsz(1,1,1);  // grid dimensions must all be >= 1
  dim3 VBsz(8,8,1);
  wkf_timer_start(timer);
  for (i=0; i<loopcount; i++) {
    voidkernel<<<VGsz, VBsz, 0>>>();
  }
  cudaDeviceSynchronize();
  wkf_timer_stop(timer);
  *kernlaunchlatency = wkf_timer_time(timer) / ((double) loopcount);

  // measure thread pool barrier latency
  loopcount = 15000;
  wkf_timer_start(timer);
  for (i=0; i<loopcount; i++) {
    wkf_threadpool_wait(devpool);
  }
  wkf_timer_stop(timer);
  *barlatency = wkf_timer_time(timer) / ((double) loopcount);

  // measure full pool launch/join cycle latency with an empty worker
  loopcount = 5000;
  wkf_timer_start(timer);
  for (i=0; i<loopcount; i++) {
    tile.start=0;
    tile.end=0;
    wkf_threadpool_sched_dynamic(devpool, &tile);
    wkf_threadpool_launch(devpool, vmddevpoollatencythread, NULL, 1);
  }
  wkf_timer_stop(timer);
  *cyclelatency = wkf_timer_time(timer) / ((double) loopcount);

  // measure cycle latency including dynamic tile scheduling
  loopcount = 5000;
  wkf_timer_start(timer);
  for (i=0; i<loopcount; i++) {
    tile.start=0;
    tile.end=tilesize;
    wkf_threadpool_sched_dynamic(devpool, &tile);
    wkf_threadpool_launch(devpool, vmddevpooltilelatencythread, NULL, 1);
  }
  wkf_timer_stop(timer);
  *tilelatency = wkf_timer_time(timer) / ((double) loopcount);

  // measure cycle latency including per-tile NOP kernel launches
  loopcount = 2000;
  wkf_timer_start(timer);
  for (i=0; i<loopcount; i++) {
    tile.start=0;
    tile.end=tilesize;
    wkf_threadpool_sched_dynamic(devpool, &tile);
    wkf_threadpool_launch(devpool, vmddevpoolcudatilelatencythread, NULL, 1);
  }
  wkf_timer_stop(timer);
  *kernellatency = wkf_timer_time(timer) / ((double) loopcount);

  wkf_timer_destroy(timer);

#if 1
  vmd_cuda_measure_latencies(devpool);
#endif

  return 0;
}


typedef struct {
  int deviceid;
  int testloops;
  double kernlatency;
  double bcopylatency;
  double kbseqlatency;
} latthrparms;

static void * vmddevpoolcudalatencythread(void *voidparms) {
  int threadid=-1;
  latthrparms *parms=NULL;

  wkf_threadpool_worker_getid(voidparms, &threadid, NULL);
  wkf_threadpool_worker_getdata(voidparms, (void **) &parms);
  if (parms->deviceid == threadid) {
    wkf_timerhandle timer;
    timer=wkf_timer_create();
    printf("Thread/device %d running...\n", threadid);
    cudaStream_t devstream;
    cudaStreamCreate(&devstream);

    char *hostbuf = (char *) calloc(1, 65536 * sizeof(char));
    char *gpubuf = NULL;
    cudaMalloc((void**)&gpubuf, 65536 * sizeof(char));

    dim3 Gsz(1,1,1);  // grid dimensions must all be >= 1
    dim3 Bsz(8,8,1);

    // measure back-to-back NOP kernel launch latency on a private stream
    wkf_timer_start(timer);
    int i;
    for (i=0; i<parms->testloops; i++) {
      nopkernel<<<Gsz, Bsz, 0, devstream>>>(NULL);
    }
    cudaStreamSynchronize(devstream);
    wkf_timer_stop(timer);
    parms->kernlatency = 1000000 * wkf_timer_time(timer) / ((double) parms->testloops);

    // measure round-trip latency of 1-byte async host-device copies,
    // stored in bcopylatency so the kernel result is not overwritten
    wkf_timer_start(timer);
    for (i=0; i<parms->testloops; i++) {
      cudaMemcpyAsync(gpubuf, hostbuf, 1, cudaMemcpyHostToDevice, devstream);
      cudaMemcpyAsync(hostbuf, gpubuf, 1, cudaMemcpyDeviceToHost, devstream);
    }
    cudaStreamSynchronize(devstream);
    wkf_timer_stop(timer);
    parms->bcopylatency = 1000000 * wkf_timer_time(timer) / ((double) parms->testloops);

    printf("NULL kernel launch latency (usec): %.2f\n", parms->kernlatency);
    printf("1-byte copy round-trip latency (usec): %.2f\n", parms->bcopylatency);

    cudaStreamDestroy(devstream);
    cudaFree(gpubuf);
    free(hostbuf);
    wkf_timer_destroy(timer);
  }

  return NULL;
}


int vmd_cuda_measure_latencies(wkf_threadpool_t *devpool) {
  latthrparms thrparms;
  int workers = wkf_threadpool_get_workercount(devpool);
  int i;
  printf("vmd_cuda_measure_latencies()...\n");
  for (i=0; i<workers; i++) {
    memset(&thrparms, 0, sizeof(thrparms));
    thrparms.deviceid = i;
    thrparms.testloops = 2500;
    wkf_threadpool_launch(devpool, vmddevpoolcudalatencythread, &thrparms, 1);
  }

  return 0;
}
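
// Note: thrparms lives on the caller's stack and is reused across loop
// iterations, so this pattern assumes that wkf_threadpool_launch() with a
// final argument of 1 blocks until the workers are done with the block.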


#if defined(VMDUSECUDAGDS)
typedef struct {
  int nfiles;
  const char **trjfileset;
  jshandle **jshandles;
  CUfileHandle_t *cfh;
  int devcount;
  int natoms;
  const AtomSel *sel;
  int first;
  int last;
  int step;
} gpuoocbenchthreadparms;

#define VMDGDSMAXFRAMEBUF     8
static void * gpu_ooc_bench_thread(void *voidparms) {
  int threadid, numthreads;
  gpuoocbenchthreadparms *parms = NULL;
  wkf_threadpool_worker_getdata(voidparms, (void **) &parms);
  wkf_threadpool_worker_getid(voidparms, &threadid, &numthreads);

  // copy benchmark parameters into locals
  int nfiles = parms->nfiles;
  int natoms = parms->natoms;
  const AtomSel *sel = parms->sel;
  int first  = parms->first;
  int last   = parms->last;
  int step   = parms->step;

  int usecufile = 1;
  fio_fd *hostfds = NULL;
  int pinhostiobuffer = 1;
  if (getenv("VMDGDSHOSTNOPIN")) {
    pinhostiobuffer=0;
  }

  // fall back to host-side I/O when VMDGDSUSEHOST is set
  if (getenv("VMDGDSUSEHOST")) {
    usecufile=0;
    hostfds = (fio_fd *) calloc(1, nfiles * sizeof(fio_fd));

    int hostusedirectio = 1;
    if (getenv("VMDGDSHOSTBUFFEREDIO") != NULL)
      hostusedirectio = 0;

    int openmode = FIO_READ;
    if (hostusedirectio)
      openmode |= FIO_DIRECT;

    int i;
    for (i=0; i<nfiles; i++) {
      if (fio_open(parms->trjfileset[i], openmode, &hostfds[i]) < 0) {
        if (hostusedirectio) {
          printf("Thr[%d] direct I/O unavailable or can't open file '%s'\n",
                 threadid, parms->trjfileset[i]);
        } else {
          printf("Thr[%d] can't open file '%s'\n",
                 threadid, parms->trjfileset[i]);
        }
        return NULL;
      }
    }
  }

  if (hostfds && usecufile) {
    printf("Inconsistent cufile/hostfds state, aborting!\n");
    return NULL;
  }

  // per-frame transfer size: three float coordinate arrays plus one
  // direct I/O block of padding
  long blocksz = MOLFILE_DIRECTIO_MIN_BLOCK_SIZE;
  long sz = 3L*sizeof(float)*natoms + blocksz;

  // round the buffer size up to the next direct I/O block boundary
  size_t blockpadsz = (sz + (blocksz - 1)) & (~(blocksz - 1));
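  // e.g. with a 4096-byte block size and natoms == 1000, sz is
  // 3*4*1000 + 4096 = 16096 bytes, which rounds up to blockpadsz == 16384;
  // the real block size comes from the plugin, 4096 is only illustrative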

  int framecount = (last - first + 1) / step;
  int framesperfile = framecount / nfiles;

  if (threadid == 0) {
    printf("Thr[%2d] %d frames total, natoms: %d selected: %d\n",
           threadid, framecount, natoms, sel->selected);
    printf("Thr[%2d] %d frames/file\n", threadid, framesperfile);
  }

  cudaError_t crc;
  cudaStream_t oocstream;
  float *devptr=NULL;
  float *hostptr=NULL;
  float *hostptr_unaligned=NULL;

  float *crdx1=NULL, *crdy1=NULL, *crdz1=NULL;
  float *crdx2=NULL, *crdy2=NULL, *crdz2=NULL;

  // optionally read multiple frames per I/O call, capped at the buffer depth
  int multiframeio = 0;
  if (getenv("VMDGDSMULTIFRAME"))
    multiframeio = atoi(getenv("VMDGDSMULTIFRAME"));
  if (multiframeio > VMDGDSMAXFRAMEBUF)
    multiframeio = VMDGDSMAXFRAMEBUF;

  // kernel launch configuration sized to one thread per atom
  dim3 IOBsz = dim3(256, 1, 1);
  dim3 IOGsz = dim3((natoms + IOBsz.x - 1) / IOBsz.x, 1, 1);

  if (parms->devcount > 0) {
    long gpuallocsz = (VMDGDSMAXFRAMEBUF+1) * blockpadsz;

    if (threadid == 0) {
      printf("Thr[%2d] Allocating GPU timestep I/O buf: %ld \n",
             threadid, gpuallocsz);
    }
    crc = cudaMalloc((void**) &devptr, gpuallocsz);

    // host-side staging buffer, pinned unless disabled via environment
    if (hostfds != NULL) {
      if (pinhostiobuffer) {
        crc = cudaMallocHost((void**) &hostptr, gpuallocsz);
      } else {
        hostptr = (float *) alloc_aligned_ptr(gpuallocsz, 4096,
                                              (void**) &hostptr_unaligned);
        if (!hostptr) {
          printf("Thr[%d]: Failed allocation!\n", threadid);
          return NULL;
        }
      }
    }

    long crdsz = sel->selected * sizeof(float);

    // allocate double-buffered coordinate arrays for the selected atoms
    crc = cudaMalloc((void**) &crdx1, crdsz);
    crc = cudaMalloc((void**) &crdy1, crdsz);
    crc = cudaMalloc((void**) &crdz1, crdsz);
    crc = cudaMalloc((void**) &crdx2, crdsz);
    crc = cudaMalloc((void**) &crdy2, crdsz);
    crc = cudaMalloc((void**) &crdz2, crdsz);
    if (crc != cudaSuccess) {
      printf("Thr[%2d], Failed to allocate GPU buffer!\n", threadid);
      return NULL; 
    }

    cudaStreamCreate(&oocstream);

#if defined(VMDUSECUDAGDS)
    // register the device buffer with cuFile for GDS transfers
    cuFileBufRegister(devptr, gpuallocsz, 0);
#endif
  }

  int verbose = (getenv("VMDGDSVERBOSE") != NULL) ? 1 : 0;

  int filestrategy = 0;
  if (getenv("VMDGDSFILESTRATEGY")) {
    filestrategy = atoi(getenv("VMDGDSFILESTRATEGY"));
  }
  if (threadid == 0) {
    printf("Thr[%2d] file strategy set to: %d\n", threadid, filestrategy);
  }

  // consume tiles of frame indices from the dynamic scheduler
  wkf_tasktile_t tile;
  while (wkf_threadlaunch_next_tile(voidparms, VMDGDSMAXFRAMEBUF * 1, &tile) != WKF_SCHED_DONE) {
    int idx;
    int threadspergpu;
    if (parms->devcount > 0)
      threadspergpu = numthreads / parms->devcount;
    else 
      threadspergpu = 1;

    for (idx=tile.start; idx<tile.end; idx++) {
      int myfileidx, fileframeidx;

      // map the global frame index onto a file and a frame within it
      switch (filestrategy) {
        case 1:
          myfileidx = (idx / multiframeio) % nfiles;
          fileframeidx = idx % framesperfile;
          break;

        case 2:
          myfileidx = (idx / (multiframeio * threadspergpu)) % nfiles;
          fileframeidx = idx % framesperfile;
          break;

        case 3:
          myfileidx = (threadid / 4) % nfiles;
          fileframeidx = idx % framesperfile;
          break;

        case 0:
        default:
          myfileidx = (threadid / threadspergpu) % nfiles;
          fileframeidx = idx % framesperfile;
          break;
      }
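      // strategy 0 (default) binds each group of threadspergpu threads to
      // one file; strategies 1 and 2 rotate files with the frame index at
      // different granularities; strategy 3 binds files to groups of four
      // threads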

      // look up the byte offset and length of this timestep within the file
      long startoffset, foffset, readlen;
      read_js_timestep_index_offsets(parms->jshandles[myfileidx],
                                     natoms, fileframeidx, 0, natoms, NULL,
                                     &startoffset, &foffset, &readlen);
      if (multiframeio) {
        // when reading multiple frames per call, extend the read length to
        // span from this frame through the last frame in the batch
        long multistartoffset, multifoffset, multireadlen;
        read_js_timestep_index_offsets(parms->jshandles[myfileidx], natoms,
                                       fileframeidx+multiframeio-1,
                                       0, natoms, NULL,
                                       &multistartoffset, &multifoffset,
                                       &multireadlen);

        multireadlen = (multifoffset + multireadlen) - foffset;

        readlen = multireadlen;
        idx+=multiframeio-1;  // skip the frames consumed by this read
      }

      // issue the read, either via GDS/cuFile directly into GPU memory,
      // or via host I/O staged through a host buffer
      long ret=0;
      if (usecufile) {
        ret = cuFileRead(parms->cfh[myfileidx], (char *) devptr, readlen, foffset, 0);
      } else if (hostfds) {
        foffset=0;
        ret=fio_fseek(hostfds[myfileidx], foffset, FIO_SEEK_SET);
        if (ret<0) { printf("fio_fseek() error!\n"); return NULL; }
        ret=fio_fread(hostptr, readlen, 1, hostfds[myfileidx]);
        if (ret<0) { printf("fio_fread() error!\n"); return NULL; }
        cudaMemcpy(devptr, hostptr, readlen, cudaMemcpyHostToDevice);
      } else {
        printf("Inconsistent cufile/hostfds state, aborting!\n");
        return NULL;
      }

      // check for read errors
      if (ret < 0) {
        printf("Thr[%2d] Error: cuFileRead(): %ld\n", threadid, ret);
        return NULL; 
      }

      if (verbose) {
        printf("Thr[%2d]F[%d][tile: %d to %d] frame: %d cuFile len: %ld off: %ld\n",
               threadid, myfileidx, tile.start, tile.end, idx, 
               readlen, foffset);
      }
    }
  }

  cudaFree(crdx1);
  cudaFree(crdy1);
  cudaFree(crdz1);
  cudaFree(crdx2);
  cudaFree(crdy2);
  cudaFree(crdz2);

#if defined(VMDUSECUDAGDS)
  if (usecufile) {
    cuFileBufDeregister(devptr);
  }
#endif

  if (hostfds != NULL) {
    int i;
    for (i=0; i<nfiles; i++) {
      fio_fclose(hostfds[i]);
    }
    free(hostfds);
  }

  if (hostptr != NULL) {
    if (pinhostiobuffer) {
      cudaFreeHost(hostptr);
    } else {
      free(hostptr_unaligned);
    }
  }

  return NULL;
}

#endif
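
// Environment variables honored by the OOC benchmark code:
//   VMDGDSUSEHOST        - use host I/O APIs instead of GDS/cuFile
//   VMDGDSHOSTNOPIN      - use unpinned host I/O buffers
//   VMDGDSHOSTBUFFEREDIO - use buffered I/O rather than direct I/O
//   VMDGDSMULTIFRAME     - frames to read per I/O call (capped at 8)
//   VMDGDSVERBOSE        - per-read diagnostic output
//   VMDGDSFILESTRATEGY   - file-to-thread assignment strategy (0-3)
//   VMDGDSTHREADSPERGPU  - worker threads created per GPU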

int gpu_ooc_bench(wkf_threadpool_t *devpool, 
                  int nfiles, const char **trjfileset, const AtomSel *sel,
                  int first, int last, int step) {
  printf("gpu_ooc_bench()\n");
  wkf_threadpool_t *bigpool = NULL;

#if defined(VMDUSECUDAGDS)
  int devcount;
  cudaError_t crc = cudaGetDeviceCount(&devcount);
  printf("gpu_ooc_bench) GPU device count: %d\n", devcount);
  if (devcount==0)
    printf("gpu_ooc_bench)   No GPU devices, continuing with host only...\n");

  // allocate and zero cuFile handle and descriptor arrays, one per file
  CUfileHandle_t * cfh = (CUfileHandle_t *) calloc(1, nfiles * sizeof(CUfileHandle_t));
  CUfileDescr_t * cfhdesc = (CUfileDescr_t *) calloc(1, nfiles * sizeof(CUfileDescr_t));
  memset(&cfh[0], 0, nfiles * sizeof(CUfileHandle_t));
  memset(&cfhdesc[0], 0, nfiles * sizeof(CUfileDescr_t));

  int natomschk = 0;
  jshandle **jshandles = (jshandle **) calloc(1, nfiles * sizeof(jshandle *));
  fio_fd *directio_fds = (fio_fd *) calloc(1, nfiles * sizeof(fio_fd));

  // open all trajectory files and register them with cuFile
  int i;
  for (i=0; i<nfiles; i++) {
    const char *filename = trjfileset[i];
    printf("gpu_ooc_bench) File[%d] GDS setup, opening '%s'\n", i, filename);
    jshandles[i] = (jshandle *) open_js_read(filename, "js", &natomschk);
    if (!jshandles[i]) {
      printf("gpu_ooc_bench) File[%d] open_js_read failed for file %s\n", i, filename);
      return -1; 
    }

#if vmdplugin_ABIVERSION > 17
    // verify the plugin's page alignment matches the direct I/O block size
    long blocksz = MOLFILE_DIRECTIO_MIN_BLOCK_SIZE;
    int filepgalignsz = 1;
    read_js_timestep_pagealign_size(jshandles[i], &filepgalignsz);
    if (filepgalignsz != blocksz) {
      printf("gpu_ooc_bench) File[%d] Plugin-returned page alignment size mismatch!\n", i);
    } else {
      printf("gpu_ooc_bench) File[%d] Page alignment size: %d\n", i, filepgalignsz);
    }
#endif

    read_js_timestep_index_offsets(jshandles[i], natomschk, 0, 0, 0,
                                   &directio_fds[i], NULL, NULL, NULL);

    cfhdesc[i].handle.fd = directio_fds[i];  // low-level file descriptor
    cfhdesc[i].type = CU_FILE_HANDLE_TYPE_OPAQUE_FD;
    CUfileError_t cferr = cuFileHandleRegister(&cfh[i], &cfhdesc[i]);

    if (cferr.err != CU_FILE_SUCCESS) {
      printf("gpu_ooc_bench) File[%d] cuFileHandleRegister on fd %d failed!\n",
             i, cfhdesc[i].handle.fd);
      return -1; 
    }
  }

  // fill out benchmark thread parameters
  gpuoocbenchthreadparms parms;
  memset(&parms, 0, sizeof(parms));
  parms.devcount = devcount;
  parms.nfiles = nfiles;
  parms.trjfileset = trjfileset;
  parms.jshandles = jshandles;
  parms.cfh = cfh;
  parms.natoms = sel->num_atoms;
  parms.sel = sel;
  parms.first = first;
  parms.last = last;
  parms.step = step;

  int framecount = nfiles * (last / step);

  wkf_timerhandle timer;
  timer=wkf_timer_create();

  // schedule the full frame range as one dynamic tile set
  wkf_tasktile_t tile;
  tile.start=0;
  tile.end=framecount - 1;

  printf("gpu_ooc_bench) tile start: %d  end: %d\n", tile.start, tile.end);

  int gdsthreadspergpu = 1;
  if (getenv("VMDGDSTHREADSPERGPU") != NULL)
    gdsthreadspergpu = atoi(getenv("VMDGDSTHREADSPERGPU"));

  printf("gpu_ooc_bench) gdsthreadspergpu: %d\n", gdsthreadspergpu);

  if (gdsthreadspergpu > 1) {
    // create a larger thread pool with multiple workers per GPU
    int workercount = devcount * gdsthreadspergpu;

    int *devlist = new int[workercount];
    int k;
    for (k=0; k<workercount; k++) {
      devlist[k] = k / gdsthreadspergpu;  // assign GPUs to worker threads
    }

    msgInfo << "Creating Multi-worker ("
            << gdsthreadspergpu << " per-GPU) CUDA device pool..." << sendmsg;
    bigpool=wkf_threadpool_create(workercount, devlist);
    delete [] devlist;

    // associate each worker thread with a specific GPU device
    if (getenv("VMDCUDAVERBOSE") != NULL)
      wkf_threadpool_launch(bigpool, vmd_cuda_devpool_setdeviceonly, (void*)"VMD CUDA Dev Init", 1);
    else
      wkf_threadpool_launch(bigpool, vmd_cuda_devpool_setdeviceonly, NULL, 1);

    // clear all device memory allocations
    wkf_threadpool_launch(bigpool, vmd_cuda_devpool_clear_device_mem, NULL, 1);

    // run the benchmark on the expanded pool
    devpool = bigpool;
  }

  // set CPU affinity for the worker threads
  wkf_threadpool_launch(devpool, vmd_cuda_affinitize_threads, NULL, 1);

  wkf_threadpool_sched_dynamic(devpool, &tile);
  wkf_timer_start(timer);
  wkf_threadpool_launch(devpool, gpu_ooc_bench_thread, &parms, 1);
  wkf_timer_stop(timer);

  double runtime = wkf_timer_time(timer);
  double gbytes = sel->num_atoms * 12L * (tile.end+1) / (1024.0 * 1024.0 * 1024.0);

  printf("gpu_ooc_bench) natoms: %d, fsz: %ld, tsz: %ld\n",
         sel->num_atoms, sel->num_atoms * 12L,
         sel->num_atoms * 12L * (tile.end+1));

  int pinhostiobuffer = 1;
  if (getenv("VMDGDSHOSTNOPIN"))
    pinhostiobuffer=0;

  int hostusedirectio = 1;
  if (getenv("VMDGDSHOSTBUFFEREDIO") != NULL)
    hostusedirectio = 0;

  int usecufile=1;
  if (getenv("VMDGDSUSEHOST"))
    usecufile=0;

  if (usecufile) {
    printf("OOC I/O via GDS + cuFile\n");
  } else {
    printf("OOC I/O via host, %s APIs, %s memory buffers\n",
           (hostusedirectio) ? "Direct I/O" : "Buffered I/O",
           (pinhostiobuffer) ? "pinned" : "unpinned");
  }

  int multiframeio = 0;
  if (getenv("VMDGDSMULTIFRAME"))
    multiframeio = atoi(getenv("VMDGDSMULTIFRAME"));
  if (multiframeio > VMDGDSMAXFRAMEBUF)
    multiframeio = VMDGDSMAXFRAMEBUF;
  if (multiframeio) {
    printf("GDS multi-frame read opt: %d frames per call, %ld bytes\n",
           multiframeio,
           multiframeio * sel->num_atoms * 12L);
  }

  printf("OOC runtime: %.1f, %.2fGB/sec\n", runtime, gbytes/runtime);

  // deregister cuFile handles and close all trajectory files
  for (i=0; i<nfiles; i++) {
#if defined(VMDUSECUDAGDS)
    cuFileHandleDeregister(cfh[i]);
#endif
    close_js_read(jshandles[i]);
  }
#endif

#if defined(VMDUSECUDAGDS)
  if (cfh != NULL)
    free(cfh);

  if (cfhdesc != NULL)
    free(cfhdesc);

  if (jshandles != NULL)
    free(jshandles);

  if (directio_fds != NULL)
    free(directio_fds);
#endif

  // destroy the expanded thread pool if one was created
  if (bigpool != NULL)
    wkf_threadpool_destroy(bigpool);

  return 0;
}