CUDABench.cu Source File

00001 /***************************************************************************
00002  *cr
00003  *cr            (C) Copyright 1995-2019 The Board of Trustees of the
00004  *cr                        University of Illinois
00005  *cr                         All Rights Reserved
00006  *cr
00007  ***************************************************************************/
00008 /***************************************************************************
00009  * RCS INFORMATION:
00010  *
00011  *      $RCSfile: CUDABench.cu,v $
00012  *      $Author: johns $        $Locker:  $             $State: Exp $
00013  *      $Revision: 1.42 $      $Date: 2022/02/09 04:03:19 $
00014  *
00015  ***************************************************************************/
00021 #include <stdio.h>
00022 #include <stdlib.h>
00023 #include <string.h>
00024 #include <cuda.h>
00025 
00026 #include "Inform.h"
00027 #include "WKFThreads.h"
00028 #include "WKFUtils.h"
00029 #include "CUDAKernels.h" 
00030 #include "Measure.h"
00031 
00032 
00033 //
00034 // Restrict macro to make it easy to do perf tuning tests
00035 //
00036 #if 1
00037 #define RESTRICT __restrict__
00038 #else
00039 #define RESTRICT
00040 #endif
00041 
00042 
00043 //#define VMDUSECUDAGDS 1
00044 #if defined(VMDUSECUDAGDS)
00045 #include </usr/local/gds-beta-0.7.1/lib/cufile.h>        // GPU-Direct Storage
00046 
00047 // direct calls to JS plugin for devel/testing until the plugin manager
00048 // and headers incorporate the new out-of-core GPU-direct I/O hooks.
00049 #define VMDJSPLUGININCLUDESRC 1
00050 #include "/home/johns/plugins/molfile_plugin/src/jsplugin.c"
00051 #endif
00052 
00053 
// CUERR: fetch (and thereby reset) the CUDA runtime's last error; if it is
// anything other than cudaSuccess, print it with the source location and
// make the enclosing function return -1.  Written as a bare brace block so
// that call sites can use it without a trailing semicolon.
#define CUERR { \
  cudaError_t cuerr_status_ = cudaGetLastError(); \
  if (cuerr_status_ != cudaSuccess) { \
    printf("CUDA error: %s, %s line %d\n", \
           cudaGetErrorString(cuerr_status_), __FILE__, __LINE__); \
    return -1; \
  } \
}
00058 
00059 
00060 //
00061 // Benchmark peak Multiply-Add instruction performance, in GFLOPS
00062 //
00063 
// FMADD16 macro contains a sequence of operations that the compiler
// won't optimize out, and will translate into a densely packed block
// of multiply-add instructions with no intervening register copies/moves
// or other instructions. 
//
// Each expansion performs 16 statements of the form a = a*b + c on the
// float variables tmp0..tmp15, which must be declared by the enclosing
// function.  The statements form two groups (tmp0..tmp7 and tmp8..tmp15)
// that never read each other's variables; within a group, every statement
// after the first adds in the value assigned by the statement before it.
#define FMADD16 \
    tmp0  = tmp0*tmp4+tmp7;     \
    tmp1  = tmp1*tmp5+tmp0;     \
    tmp2  = tmp2*tmp6+tmp1;     \
    tmp3  = tmp3*tmp7+tmp2;     \
    tmp4  = tmp4*tmp0+tmp3;     \
    tmp5  = tmp5*tmp1+tmp4;     \
    tmp6  = tmp6*tmp2+tmp5;     \
    tmp7  = tmp7*tmp3+tmp6;     \
    tmp8  = tmp8*tmp12+tmp15;   \
    tmp9  = tmp9*tmp13+tmp8;    \
    tmp10 = tmp10*tmp14+tmp9;   \
    tmp11 = tmp11*tmp15+tmp10;  \
    tmp12 = tmp12*tmp8+tmp11;   \
    tmp13 = tmp13*tmp9+tmp12;   \
    tmp14 = tmp14*tmp10+tmp13;  \
    tmp15 = tmp15*tmp11+tmp14;
00085 
00086 // CUDA grid, thread block, loop, and MADD operation counts
00087 #define GRIDSIZEX       6144  // number of 1-D thread blocks
00088 #define BLOCKSIZEX      64    // number of threads per 1-D block
00089 #define GLOOPS          2000  // iteration count (all threads)
00090 #define FMADD16COUNT    32    // 32 reps
00091 #define FLOPSPERFMADD16 32    // 16 MULs and 16 ADDs
00092 
00093 // FLOP counting
00094 #define FLOPSPERLOOP (FMADD16COUNT * FLOPSPERFMADD16)
00095 
// Multiply-add throughput kernel.
//
// Every thread runs GLOOPS iterations of 32 FMADD16 expansions on sixteen
// private float accumulators and then stores their sum to doutput[tid],
// where tid = blockIdx.x * blockDim.x + threadIdx.x.  doutput therefore
// needs at least gridDim.x * blockDim.x elements.  The stored value only
// exists to keep the arithmetic from being eliminated.
__global__ static void madd_kernel(float *doutput) {
  const int tid = blockIdx.x * blockDim.x + threadIdx.x;

  // Accumulators start at zero, except for three that are seeded from the
  // block index so the compiler can't optimize out the body of the loop.
  float tmp0  = 0.0f;
  float tmp1  = blockIdx.y * 0.001f;
  float tmp2  = 0.0f;
  float tmp3  = 0.0f;
  float tmp4  = 0.0f;
  float tmp5  = 0.0f;
  float tmp6  = 0.0f;
  float tmp7  = blockIdx.x * 0.001f;
  float tmp8  = 0.0f;
  float tmp9  = 0.0f;
  float tmp10 = 0.0f;
  float tmp11 = 0.0f;
  float tmp12 = 0.0f;
  float tmp13 = 0.0f;
  float tmp14 = 0.0f;
  float tmp15 = tmp7;

  for (int pass=0; pass<GLOOPS; pass++) {
    // 32 expansions per pass, 4 per source line
    FMADD16 FMADD16 FMADD16 FMADD16
    FMADD16 FMADD16 FMADD16 FMADD16
    FMADD16 FMADD16 FMADD16 FMADD16
    FMADD16 FMADD16 FMADD16 FMADD16
    FMADD16 FMADD16 FMADD16 FMADD16
    FMADD16 FMADD16 FMADD16 FMADD16
    FMADD16 FMADD16 FMADD16 FMADD16
    FMADD16 FMADD16 FMADD16 FMADD16
  }

  doutput[tid] = tmp0+tmp1+tmp2+tmp3+tmp4+tmp5+tmp6+tmp7
                 +tmp8+tmp9+tmp10+tmp11+tmp12+tmp13+tmp14+tmp15;
}
00145 
00146 
// Measure multiply-add throughput of one CUDA device.
//
//   cudadev   - CUDA device index to attach the calling thread to
//   gflops    - receives the measured rate in GFLOPS; written only when the
//               timed run completed without a CUDA error
//   testloops - number of timed madd_kernel launches
//
// Returns 0 on success and -1 if the device could not be selected or a
// CUDA error was detected.  The timer and the device output buffer are
// released on every path that created them.
static int cudamaddgflops(int cudadev, double *gflops, int testloops) {
  float *doutput = NULL;
  wkf_timerhandle timer;
  cudaError_t rc;
  int i;

  rc = cudaSetDevice(cudadev);
  if (rc != cudaSuccess) {
#if CUDART_VERSION >= 2010
    rc = cudaGetLastError(); // query last error and reset error state
    if (rc != cudaErrorSetOnActiveProcess)
      return -1; // abort and return an error
#else
    cudaGetLastError(); // just ignore and reset error state, since older CUDA
                        // revs don't have a cudaErrorSetOnActiveProcess enum
#endif
  }

  // setup CUDA grid and block sizes
  dim3 Bsz(BLOCKSIZEX, 1, 1);
  dim3 Gsz(GRIDSIZEX, 1, 1);

  // allocate output array, one float per launched thread
  cudaMalloc((void**)&doutput, BLOCKSIZEX * GRIDSIZEX * sizeof(float));
  CUERR // nothing else has been acquired yet, so a plain return is safe

  // warmup run
  madd_kernel<<<Gsz, Bsz>>>(doutput);
  cudaDeviceSynchronize(); // wait for kernel to finish

  // benchmark run
  timer=wkf_timer_create();
  wkf_timer_start(timer);
  for (i=0; i<testloops; i++) {
    madd_kernel<<<Gsz, Bsz>>>(doutput);
  }
  cudaDeviceSynchronize(); // wait for kernel to finish
  rc = cudaGetLastError(); // check and clear any existing errors
  wkf_timer_stop(timer);

  double runtime = wkf_timer_time(timer);
  wkf_timer_destroy(timer); // the timer is no longer needed on any path

  if (rc != cudaSuccess) {
    printf("CUDA error: %s, %s line %d\n", cudaGetErrorString(rc), __FILE__, __LINE__);
    cudaFree(doutput);   // don't leak the device buffer on failure
    cudaGetLastError();  // reset any error state left by the cleanup
    return -1;
  }

  double gflop = ((double) GLOOPS) * ((double) FLOPSPERLOOP) *
                  ((double) BLOCKSIZEX) * ((double) GRIDSIZEX) * (1.0e-9) * testloops;

  // avoid dividing by zero if the timer reported no elapsed time
  *gflops = (runtime > 0.0) ? (gflop / runtime) : 0.0;

  cudaFree(doutput);
  CUERR // check and clear any existing errors

  return 0;
}
00206 
/* Per-device request/result record for the multiply-add benchmark */
typedef struct {
  int    deviceid;   /* CUDA device index passed to cudamaddgflops()      */
  int    testloops;  /* number of timed kernel launches to perform        */
  double gflops;     /* measured rate, filled in by the worker            */
} maddthrparms;
00212 
/* Thread entry point: run the multiply-add benchmark described by a      */
/* maddthrparms record and store the result back into that record.        */
/* The status returned by cudamaddgflops() is not propagated.             */
static void * cudamaddthread(void *voidparms) {
  maddthrparms *work = static_cast<maddthrparms *>(voidparms);
  cudamaddgflops(work->deviceid, &(work->gflops), work->testloops);
  return NULL;
}
00218 
/*
 * Run the multiply-add benchmark on a set of CUDA devices.
 *
 *   numdevs   - number of devices to test
 *   devlist   - device indices to use, or NULL to use 0..numdevs-1
 *   gflops    - output array with at least numdevs entries
 *   testloops - number of timed kernel launches per device
 *
 * Returns 0 on success (including numdevs == 0, where nothing is done),
 * and -1 if numdevs is negative or the bookkeeping arrays can't be
 * allocated.  Entries for devices whose benchmark failed are left at 0.0.
 *
 * When built without VMDTHREADS only the first listed device is
 * benchmarked; the remaining gflops[] entries are reported as 0.0.
 */
int vmd_cuda_madd_gflops(int numdevs, int *devlist, double *gflops,
                         int testloops) {
  maddthrparms *parms;
  wkf_thread_t * threads;
  int i;

  /* nothing to run; also keeps parms[0] below from indexing an empty array */
  if (numdevs < 1)
    return (numdevs == 0) ? 0 : -1;

  /* allocate array of threads */
  threads = (wkf_thread_t *) calloc((size_t) numdevs, sizeof(wkf_thread_t));

  /* allocate array of thread parameters */
  parms = (maddthrparms *) malloc(((size_t) numdevs) * sizeof(maddthrparms));

  if (threads == NULL || parms == NULL) {
    free(parms);
    free(threads);
    return -1;
  }

  /* initialize thread parameters */
  for (i=0; i<numdevs; i++) {
    parms[i].deviceid  = (devlist != NULL) ? devlist[i] : i;
    parms[i].testloops = testloops;
    parms[i].gflops    = 0.0;
  }

#if defined(VMDTHREADS)
  /* spawn child threads to do the work */
  /* thread 0 must also be processed this way otherwise    */
  /* we'll permanently bind the main thread to some device */
  for (i=0; i<numdevs; i++) {
    wkf_thread_create(&threads[i], cudamaddthread, &parms[i]);
  }

  /* join the threads after work is done */
  for (i=0; i<numdevs; i++) {
    wkf_thread_join(threads[i], NULL);
  }
#else
  /* single thread does all of the work */
  cudamaddthread((void *) &parms[0]);
#endif

  for (i=0; i<numdevs; i++) {
    gflops[i] = parms[i].gflops;
  }

  /* free thread parms */
  free(parms);
  free(threads);

  return 0;
}
00267 
00268 
00269 
00270 
00271 
00272 
00273 //
00274 // Host-GPU memcpy I/O bandwidth benchmark
00275 //
00276 
00277 #define BWITER 500
00278 #define LATENCYITER 50000
00279 
// Fetch and reset the CUDA runtime's last error.  Returns 0 if there was
// none; otherwise prints it in the same format as CUERR, tagged with the
// caller-supplied source line, and returns 1.
static int cudabusbw_cuda_failed(int line) {
  cudaError_t err = cudaGetLastError();
  if (err == cudaSuccess)
    return 0;

  printf("CUDA error: %s, %s line %d\n", cudaGetErrorString(err), __FILE__, line);
  return 1;
}

// Time 'iters' back-to-back cudaMemcpy() calls of 'nbytes' each.
// On success stores the elapsed time reported by the timer in *runtime and
// returns 0; returns -1 if a CUDA error is pending after the loop.
static int cudabusbw_timed_copies(wkf_timerhandle timer,
                                  void *dst, const void *src, size_t nbytes,
                                  cudaMemcpyKind kind, int iters,
                                  double *runtime) {
  int i;

  wkf_timer_start(timer);
  for (i=0; i<iters; i++) {
    cudaMemcpy(dst, src, nbytes, kind);
  }
  wkf_timer_stop(timer);

  if (cudabusbw_cuda_failed(__LINE__))
    return -1;

  *runtime = wkf_timer_time(timer);
  return 0;
}

// Measure one direction/buffer combination: first BWITER full-size copies
// for bandwidth (MB/sec), then LATENCYITER 1-byte copies for latency (usec).
// Each output is written as soon as its measurement succeeds.
// Returns 0 on success, -1 on the first CUDA error.
static int cudabusbw_measure(wkf_timerhandle timer,
                             void *dst, const void *src, size_t memsz,
                             cudaMemcpyKind kind,
                             double *mbsec, double *latusec) {
  double runtime = 0.0;

  if (cudabusbw_timed_copies(timer, dst, src, memsz, kind, BWITER, &runtime) != 0)
    return -1;
  *mbsec = ((double) BWITER) * ((double) memsz) / runtime / (1024.0 * 1024.0);

  if (cudabusbw_timed_copies(timer, dst, src, 1, kind, LATENCYITER, &runtime) != 0)
    return -1;
  *latusec = runtime * 1.0e6 / ((double) LATENCYITER);

  return 0;
}

// Benchmark host<->device cudaMemcpy() bandwidth and latency on one device.
//
//   cudadev               - CUDA device index to attach to
//   hdmbsec,  hdlatusec   - host->device, non-pinned host buffer
//   phdmbsec, phdlatusec  - host->device, pinned host buffer
//   dhmbsec,  dhlatusec   - device->host, non-pinned host buffer
//   pdhmbsec, pdhlatusec  - device->host, pinned host buffer
//
// All outputs are zeroed first; bandwidths are in MB/sec (1024*1024 bytes),
// latencies in microseconds per 1-byte copy.  Returns 0 on success and -1
// on any failure, in which case the outputs measured before the failure
// keep their values.  All buffers and the timer are released on every path.
static int cudabusbw(int cudadev, 
                     double *hdmbsec, double *hdlatusec, 
                     double *phdmbsec, double *phdlatusec, 
                     double *dhmbsec, double *dhlatusec,
                     double *pdhmbsec, double *pdhlatusec) {
  float *hdata = NULL;   // non-pinned DMA buffer
  float *phdata = NULL;  // pinned DMA buffer
  float *ddata = NULL;   // device buffer
  int memsz = 1024 * 1024 * sizeof(float);
  int status = -1;

  *hdmbsec = 0.0;
  *hdlatusec = 0.0;
  *dhmbsec = 0.0;
  *dhlatusec = 0.0;
  *phdmbsec = 0.0;
  *phdlatusec = 0.0;
  *pdhmbsec = 0.0;
  *pdhlatusec = 0.0;

  // attach to the selected device
  cudaError_t rc;
  rc = cudaSetDevice(cudadev);
  if (rc != cudaSuccess) {
#if CUDART_VERSION >= 2010
    rc = cudaGetLastError(); // query last error and reset error state
    if (rc != cudaErrorSetOnActiveProcess)
      return -1; // abort and return an error
#else
    cudaGetLastError(); // just ignore and reset error state, since older CUDA
                        // revs don't have a cudaErrorSetOnActiveProcess enum
#endif
  }

  // allocate non-pinned host buffer
  hdata = (float *) malloc(memsz);
  if (hdata == NULL)
    return -1;

  // allocate pinned host buffer, then device memory; only run the
  // measurements when both allocations succeeded
  cudaMallocHost((void**) &phdata, memsz);
  if (!cudabusbw_cuda_failed(__LINE__)) {
    cudaMalloc((void**) &ddata, memsz);
    if (!cudabusbw_cuda_failed(__LINE__)) {
      wkf_timerhandle timer = wkf_timer_create();

      // same measurement order as always: host-to-device (non-pinned,
      // pinned), then device-to-host (non-pinned, pinned)
      if (cudabusbw_measure(timer, ddata, hdata, memsz,
                            cudaMemcpyHostToDevice, hdmbsec, hdlatusec) == 0 &&
          cudabusbw_measure(timer, ddata, phdata, memsz,
                            cudaMemcpyHostToDevice, phdmbsec, phdlatusec) == 0 &&
          cudabusbw_measure(timer, hdata, ddata, memsz,
                            cudaMemcpyDeviceToHost, dhmbsec, dhlatusec) == 0 &&
          cudabusbw_measure(timer, phdata, ddata, memsz,
                            cudaMemcpyDeviceToHost, pdhmbsec, pdhlatusec) == 0) {
        status = 0;
      }

      wkf_timer_destroy(timer);
    }
  }

  // release whatever was acquired, regardless of how far we got
  if (ddata != NULL) {
    cudaFree(ddata);
    if (cudabusbw_cuda_failed(__LINE__))
      status = -1;
  }
  if (phdata != NULL) {
    cudaFreeHost(phdata);
    if (cudabusbw_cuda_failed(__LINE__))
      status = -1;
  }
  free(hdata);

  return status;
}
00432 
/* Per-device request/result record for the host-GPU bus benchmark.       */
/* The eight result fields mirror the output parameters of cudabusbw().   */
typedef struct {
  int deviceid;                   /* CUDA device index to benchmark        */
  double hdmbsec,  hdlatusec;     /* host->device, non-pinned host buffer  */
  double phdmbsec, phdlatusec;    /* host->device, pinned host buffer      */
  double dhmbsec,  dhlatusec;     /* device->host, non-pinned host buffer  */
  double pdhmbsec, pdhlatusec;    /* device->host, pinned host buffer      */
} busbwthrparms;
00444 
/* Thread entry point: run the bus bandwidth/latency benchmark for the    */
/* device named in a busbwthrparms record and store the eight results     */
/* back into it.  The status returned by cudabusbw() is not propagated.   */
static void * cudabusbwthread(void *voidparms) {
  busbwthrparms *work = static_cast<busbwthrparms *>(voidparms);

  cudabusbw(work->deviceid,
            &(work->hdmbsec),  &(work->hdlatusec),
            &(work->phdmbsec), &(work->phdlatusec),
            &(work->dhmbsec),  &(work->dhlatusec),
            &(work->pdhmbsec), &(work->pdhlatusec));

  return NULL;
}
00454 
/*
 * Run the host-GPU bus bandwidth/latency benchmark on a set of devices.
 *
 *   numdevs - number of devices to test
 *   devlist - device indices to use, or NULL to use 0..numdevs-1
 *   hdmbsec ... pdhlatusec - output arrays, each with at least numdevs
 *             entries, receiving the per-device results of cudabusbw()
 *
 * Returns 0 on success (including numdevs == 0, where nothing is done),
 * and -1 if numdevs is negative or the bookkeeping arrays can't be
 * allocated.  Results for devices whose benchmark failed stay at 0.0 or
 * at whatever was measured before the failure.
 *
 * When built without VMDTHREADS only the first listed device is
 * benchmarked; the remaining entries are reported as 0.0.
 */
int vmd_cuda_bus_bw(int numdevs, int *devlist, 
                    double *hdmbsec, double *hdlatusec,
                    double *phdmbsec,double *phdlatusec,
                    double *dhmbsec, double *dhlatusec,
                    double *pdhmbsec, double *pdhlatusec) {
  busbwthrparms *parms;
  wkf_thread_t * threads;
  int i;

  /* nothing to run; also keeps parms[0] below from indexing an empty array */
  if (numdevs < 1)
    return (numdevs == 0) ? 0 : -1;

  /* allocate array of threads */
  threads = (wkf_thread_t *) calloc((size_t) numdevs, sizeof(wkf_thread_t));

  /* allocate array of thread parameters */
  parms = (busbwthrparms *) malloc(((size_t) numdevs) * sizeof(busbwthrparms));

  if (threads == NULL || parms == NULL) {
    free(parms);
    free(threads);
    return -1;
  }

  /* initialize thread parameters */
  for (i=0; i<numdevs; i++) {
    parms[i].deviceid = (devlist != NULL) ? devlist[i] : i;
    parms[i].hdmbsec = 0.0;
    parms[i].hdlatusec = 0.0;
    parms[i].phdmbsec = 0.0;
    parms[i].phdlatusec = 0.0;
    parms[i].dhmbsec = 0.0;
    parms[i].dhlatusec = 0.0;
    parms[i].pdhmbsec = 0.0;
    parms[i].pdhlatusec = 0.0;
  }

#if defined(VMDTHREADS)
  /* spawn child threads to do the work */
  /* thread 0 must also be processed this way otherwise    */
  /* we'll permanently bind the main thread to some device */
  for (i=0; i<numdevs; i++) {
    wkf_thread_create(&threads[i], cudabusbwthread, &parms[i]);
  }

  /* join the threads after work is done */
  for (i=0; i<numdevs; i++) {
    wkf_thread_join(threads[i], NULL);
  }
#else
  /* single thread does all of the work */
  cudabusbwthread((void *) &parms[0]);
#endif

  for (i=0; i<numdevs; i++) {
    hdmbsec[i] = parms[i].hdmbsec;
    hdlatusec[i] = parms[i].hdlatusec;
    phdmbsec[i] = parms[i].phdmbsec;
    phdlatusec[i] = parms[i].phdlatusec;
    dhmbsec[i] = parms[i].dhmbsec;
    dhlatusec[i] = parms[i].dhlatusec;
    pdhmbsec[i] = parms[i].pdhmbsec;
    pdhlatusec[i] = parms[i].pdhlatusec;
  }

  /* free thread parms */
  free(parms);
  free(threads);

  return 0;
}
00518 
00519 
00520 
00521 //
00522 // GPU device global memory bandwidth benchmark
00523 //
// Copy kernel: each thread copies exactly one element of type T from src
// to dest at its flat 1-D index.  There is no bounds check, so the launch
// must not create more threads than both arrays have elements.
template <class T>
__global__ void gpuglobmemcpybw(T *dest, const T *src) {
  const unsigned int elem = blockIdx.x * blockDim.x + threadIdx.x;
  dest[elem] = src[elem];
}
00529 
// Fill kernel: each thread stores val into one element of dest at its flat
// 1-D index.  There is no bounds check, so the launch must not create more
// threads than dest has elements.
template <class T>
__global__ void gpuglobmemsetbw(T *dest, const T val) {
  const int elem = blockIdx.x * blockDim.x + threadIdx.x;
  dest[elem] = val;
}
00535 
// element type moved by the global memory bandwidth kernels
typedef float4 datatype;

// Fetch and reset the CUDA runtime's last error.  Returns 0 if there was
// none; otherwise prints it in the same format as CUERR, tagged with the
// caller-supplied source line, and returns 1.
static int cudaglobmembw_cuda_failed(int line) {
  cudaError_t err = cudaGetLastError();
  if (err == cudaSuccess)
    return 0;

  printf("CUDA error: %s, %s line %d\n", cudaGetErrorString(err), __FILE__, line);
  return 1;
}

// Benchmark device global memory fill and copy rates on one device.
//
//   cudadev        - CUDA device index to attach to
//   gpumemsetgbsec - receives the fill rate
//   gpumemcpygbsec - receives the copy rate (bytes read plus bytes written)
//
// Both outputs are zeroed first and only overwritten when the whole
// benchmark, including cleanup, succeeded.  The rates are computed as
// (bytes / (1024*1024)) per millisecond of CUDA event time, averaged over
// the timed launches.
//
// Returns 0 on success, -1 on any failure.  Device buffers and events are
// released on every path.
static int cudaglobmembw(int cudadev, double *gpumemsetgbsec, double *gpumemcpygbsec) {
  int i;
  int len = 1 << 22; // one thread per data element
  int loops = 500;
  datatype *src = NULL;
  datatype *dest = NULL;
  datatype val=make_float4(1.0f, 1.0f, 1.0f, 1.0f);

  // initialize to zero for starters
  float memsettime = 0.0f;
  float memcpytime = 0.0f;
  *gpumemsetgbsec = 0.0;
  *gpumemcpygbsec = 0.0;

  // attach to the selected device
  cudaError_t rc;
  rc = cudaSetDevice(cudadev);
  if (rc != cudaSuccess) {
#if CUDART_VERSION >= 2010
    rc = cudaGetLastError(); // query last error and reset error state
    if (rc != cudaErrorSetOnActiveProcess)
      return -1; // abort and return an error
#else
    cudaGetLastError(); // just ignore and reset error state, since older CUDA
                        // revs don't have a cudaErrorSetOnActiveProcess enum
#endif
  }

  int status = -1;
  int havestart = 0;
  int haveend = 0;
  cudaEvent_t start, end;

  // single-pass block: any failure breaks out to the shared cleanup below
  do {
    cudaMalloc((void **) &src, sizeof(datatype)*len);
    if (cudaglobmembw_cuda_failed(__LINE__)) break;
    cudaMalloc((void **) &dest, sizeof(datatype)*len);
    if (cudaglobmembw_cuda_failed(__LINE__)) break;

    dim3 BSz(256, 1, 1);
    dim3 GSz(len / (BSz.x * BSz.y * BSz.z), 1, 1);

    // do a warm-up pass
    gpuglobmemsetbw<datatype><<< GSz, BSz >>>(src, val);
    if (cudaglobmembw_cuda_failed(__LINE__)) break;
    gpuglobmemsetbw<datatype><<< GSz, BSz >>>(dest, val);
    if (cudaglobmembw_cuda_failed(__LINE__)) break;
    gpuglobmemcpybw<datatype><<< GSz, BSz >>>(dest, src);
    if (cudaglobmembw_cuda_failed(__LINE__)) break;

    cudaEventCreate(&start);
    if (cudaglobmembw_cuda_failed(__LINE__)) break;
    havestart = 1;
    cudaEventCreate(&end);
    if (cudaglobmembw_cuda_failed(__LINE__)) break;
    haveend = 1;

    // execute the memset kernel
    cudaEventRecord(start, 0);
    for (i=0; i<loops; i++) {
      gpuglobmemsetbw<datatype><<< GSz, BSz >>>(dest, val);
    }
    cudaEventRecord(end, 0);
    if (cudaglobmembw_cuda_failed(__LINE__)) break;
    cudaEventSynchronize(end); // 'start' was recorded earlier on the same stream
    if (cudaglobmembw_cuda_failed(__LINE__)) break;
    cudaEventElapsedTime(&memsettime, start, end);
    if (cudaglobmembw_cuda_failed(__LINE__)) break;

    // execute the memcpy kernel
    cudaEventRecord(start, 0);
    for (i=0; i<loops; i++) {
      gpuglobmemcpybw<datatype><<< GSz, BSz >>>(dest, src);
    }
    cudaEventRecord(end, 0);
    if (cudaglobmembw_cuda_failed(__LINE__)) break;
    cudaEventSynchronize(end);
    if (cudaglobmembw_cuda_failed(__LINE__)) break;
    cudaEventElapsedTime(&memcpytime, start, end);
    if (cudaglobmembw_cuda_failed(__LINE__)) break;

    status = 0;
  } while (0);

  // release everything that was acquired, whether or not the run succeeded
  if (havestart) {
    cudaEventDestroy(start);
    if (cudaglobmembw_cuda_failed(__LINE__))
      status = -1;
  }
  if (haveend) {
    cudaEventDestroy(end);
    if (cudaglobmembw_cuda_failed(__LINE__))
      status = -1;
  }
  cudaFree(dest);
  cudaFree(src);
  if (cudaglobmembw_cuda_failed(__LINE__))
    status = -1;

  // report rates only for a fully successful run with usable timings
  if (status == 0 && memsettime > 0.0f && memcpytime > 0.0f) {
    *gpumemsetgbsec = (len * sizeof(datatype) / (1024.0 * 1024.0)) / (memsettime / loops);
    *gpumemcpygbsec = (2 * len * sizeof(datatype) / (1024.0 * 1024.0)) / (memcpytime / loops);
  }

  return status;
}
00627 
/* Per-device request/result record for the global memory benchmark.      */
typedef struct {
  int deviceid;                      /* CUDA device index to benchmark     */
  double memsetgbsec, memcpygbsec;   /* fill and copy rates from           */
                                     /* cudaglobmembw()                    */
} globmembwthrparms;
00633 
/* Thread entry point: run the global memory benchmark for the device     */
/* named in a globmembwthrparms record and store both rates back into it. */
/* The status returned by cudaglobmembw() is not propagated.              */
static void * cudaglobmembwthread(void *voidparms) {
  globmembwthrparms *work = static_cast<globmembwthrparms *>(voidparms);
  cudaglobmembw(work->deviceid, &(work->memsetgbsec), &(work->memcpygbsec));
  return NULL;
}
00639 
/*
 * Run the GPU global memory bandwidth benchmark on a set of devices.
 *
 *   numdevs     - number of devices to test
 *   devlist     - device indices to use, or NULL to use 0..numdevs-1
 *   memsetgbsec - output array (>= numdevs entries) of fill rates
 *   memcpygbsec - output array (>= numdevs entries) of copy rates
 *
 * Returns 0 on success (including numdevs == 0, where nothing is done),
 * and -1 if numdevs is negative or the bookkeeping arrays can't be
 * allocated.  Entries for devices whose benchmark failed are left at 0.0.
 *
 * When built without VMDTHREADS only the first listed device is
 * benchmarked; the remaining entries are reported as 0.0.
 */
int vmd_cuda_globmem_bw(int numdevs, int *devlist, 
                        double *memsetgbsec, double *memcpygbsec) {
  globmembwthrparms *parms;
  wkf_thread_t * threads;
  int i;

  /* nothing to run; also keeps parms[0] below from indexing an empty array */
  if (numdevs < 1)
    return (numdevs == 0) ? 0 : -1;

  /* allocate array of threads */
  threads = (wkf_thread_t *) calloc((size_t) numdevs, sizeof(wkf_thread_t));

  /* allocate array of thread parameters */
  parms = (globmembwthrparms *) malloc(((size_t) numdevs) * sizeof(globmembwthrparms));

  if (threads == NULL || parms == NULL) {
    free(parms);
    free(threads);
    return -1;
  }

  /* initialize thread parameters */
  for (i=0; i<numdevs; i++) {
    parms[i].deviceid = (devlist != NULL) ? devlist[i] : i;
    parms[i].memsetgbsec = 0.0;
    parms[i].memcpygbsec = 0.0;
  }

#if defined(VMDTHREADS)
  /* spawn child threads to do the work */
  /* thread 0 must also be processed this way otherwise    */
  /* we'll permanently bind the main thread to some device */
  for (i=0; i<numdevs; i++) {
    wkf_thread_create(&threads[i], cudaglobmembwthread, &parms[i]);
  }

  /* join the threads after work is done */
  for (i=0; i<numdevs; i++) {
    wkf_thread_join(threads[i], NULL);
  }
#else
  /* single thread does all of the work */
  cudaglobmembwthread((void *) &parms[0]);
#endif

  for (i=0; i<numdevs; i++) {
    memsetgbsec[i] = parms[i].memsetgbsec;
    memcpygbsec[i] = parms[i].memcpygbsec;
  }

  /* free thread parms */
  free(parms);
  free(threads);

  return 0;
}
00688 
00689 
00690 //
00691 // Benchmark latency for complete threadpool barrier wakeup/run/sleep cycle
00692 //
// Worker body that does no work at all; used to time a bare
// launch/join cycle of the thread pool.
static void * vmddevpoollatencythread(void *voidparms) {
  (void) voidparms; // intentionally unused
  return NULL;
}
00696 
// Worker body that requests task tiles one at a time until the scheduler
// reports WKF_SCHED_DONE, doing no work for any of them.  Used to time the
// tile scheduling overhead of the thread pool.
static void * vmddevpooltilelatencythread(void *voidparms) {
  const int tilesize = 1;
  int workerid = -1;
  void *userdata = NULL;
  wkf_tasktile_t tile;

  // worker id and user data are queried but not otherwise used
  wkf_threadpool_worker_getid(voidparms, &workerid, NULL);
  wkf_threadpool_worker_getdata(voidparms, (void **) &userdata);

  // grind through task tiles until none are left
  for (;;) {
    if (wkf_threadpool_next_tile(voidparms, tilesize, &tile) == WKF_SCHED_DONE)
      break;
    // do nothing but eat work units...
  }

  return NULL;
}
00712 
00713 
// Near-no-op kernel for timing kernel launches.
// With ddata == NULL the kernel does nothing.  Otherwise every thread of
// the 2-D launch writes its own flat index (row-major over the whole grid)
// into ddata, so ddata must hold one float per launched thread.
__global__ static void nopkernel(float * ddata) {
  if (ddata == NULL)
    return;

  const unsigned int col  = blockIdx.x * blockDim.x + threadIdx.x;
  const unsigned int row  = blockIdx.y * blockDim.y + threadIdx.y;
  const unsigned int flat = gridDim.x * blockDim.x * row + col;

  ddata[flat] = flat;
}
00723 
// Kernel with no parameters and an empty body, for timing kernel launches.
__global__ static void voidkernel(void) {
}
00728 
// Worker body that launches one nopkernel per task tile until the
// scheduler reports WKF_SCHED_DONE, then waits for the device to finish.
// Used to time tile scheduling combined with CUDA kernel launches.
//
// The worker's user data pointer is handed straight to nopkernel, so it
// must be NULL or a CUDA global memory pointer with room for the 64 floats
// written by the single 8x8 thread block launched here.
static void * vmddevpoolcudatilelatencythread(void *voidparms) {
  int threadid=-1;
  int tilesize=1;
  float *parms=NULL;
  wkf_threadpool_worker_getid(voidparms, &threadid, NULL);

  // XXX Note that we expect parms to be set to NULL or a valid CUDA
  //     global memory pointer for correct operation of the NOP kernel below
  wkf_threadpool_worker_getdata(voidparms, (void **) &parms);

#if 0
  // scale tile size by device performance
  tilesize=4; // GTX 280, Tesla C1060 starting point tile size
  wkf_threadpool_worker_devscaletile(voidparms, &tilesize);
#endif

  // grind through task tiles until none are left
  wkf_tasktile_t tile;
  // Every grid dimension must be at least 1: a zero z extent is an invalid
  // launch configuration, so the kernel would never actually be launched.
  dim3 Gsz(1,1,1);
  dim3 Bsz(8,8,1);
  while (wkf_threadpool_next_tile(voidparms, tilesize, &tile) != WKF_SCHED_DONE) {
    // launch a no-op CUDA kernel
    nopkernel<<<Gsz, Bsz, 0>>>(parms);
  }

  // wait for all GPU kernels to complete
  cudaDeviceSynchronize();

  return NULL;
}
00759 
00760 
// Measure several per-operation latencies of the device thread pool.
//
//   devpool           - thread pool to exercise
//   tilesize          - number of work units scheduled per launch in the
//                       tile-based tests
//   kernlaunchlatency - empty-kernel launch issued from the calling thread
//   barlatency        - wkf_threadpool_wait() call
//   cyclelatency      - pool launch/join cycle with a worker that does nothing
//   tilelatency       - pool launch/join cycle with workers consuming tiles
//   kernellatency     - as tilelatency, with one kernel launch per tile
//
// Each output is the elapsed timer value divided by the iteration count of
// its loop.  Always returns 0.  Finishes by calling
// vmd_cuda_measure_latencies() on the same pool.
int vmd_cuda_devpool_latency(wkf_threadpool_t *devpool, int tilesize,
                             double *kernlaunchlatency,
                             double *barlatency,
                             double *cyclelatency, 
                             double *tilelatency,
                             double *kernellatency) {
  int i;
  wkf_tasktile_t tile;
  wkf_timerhandle timer;
  int loopcount;

  timer=wkf_timer_create();

  // execute just a CUDA kernel launch and measure latency on whatever
  // GPU we get.
  loopcount = 15000;
  // Every grid dimension must be at least 1: a zero z extent is an invalid
  // launch configuration, so no kernel would actually be launched.
  dim3 VGsz(1,1,1);
  dim3 VBsz(8,8,1);
  wkf_timer_start(timer);
  for (i=0; i<loopcount; i++) {
    voidkernel<<<VGsz, VBsz, 0>>>();
  }
  // wait for GPU kernels to complete
  cudaDeviceSynchronize();
  wkf_timer_stop(timer);
  *kernlaunchlatency = wkf_timer_time(timer) / ((double) loopcount);

  // execute just a raw barrier sync and measure latency
  loopcount = 15000;
  wkf_timer_start(timer);
  for (i=0; i<loopcount; i++) {
    wkf_threadpool_wait(devpool);
  }
  wkf_timer_stop(timer);
  *barlatency = wkf_timer_time(timer) / ((double) loopcount);

  // time wake-up, launch, and sleep/join of device pool doing a no-op
  loopcount = 5000;
  wkf_timer_start(timer);
  for (i=0; i<loopcount; i++) {
    tile.start=0;
    tile.end=0;
    wkf_threadpool_sched_dynamic(devpool, &tile);
    wkf_threadpool_launch(devpool, vmddevpoollatencythread, NULL, 1);
  }
  wkf_timer_stop(timer);
  *cyclelatency = wkf_timer_time(timer) / ((double) loopcount);

  // time wake-up, launch, and sleep/join of device pool eating tiles
  loopcount = 5000;
  wkf_timer_start(timer);
  for (i=0; i<loopcount; i++) {
    tile.start=0;
    tile.end=tilesize;
    wkf_threadpool_sched_dynamic(devpool, &tile);
    wkf_threadpool_launch(devpool, vmddevpooltilelatencythread, NULL, 1);
  }
  wkf_timer_stop(timer);
  *tilelatency = wkf_timer_time(timer) / ((double) loopcount);

  // time wake-up, launch, and sleep/join of device pool eating tiles
  // and launching one CUDA kernel per tile
  loopcount = 2000;
  wkf_timer_start(timer);
  for (i=0; i<loopcount; i++) {
    tile.start=0;
    tile.end=tilesize;
    wkf_threadpool_sched_dynamic(devpool, &tile);
    wkf_threadpool_launch(devpool, vmddevpoolcudatilelatencythread, NULL, 1);
  }
  wkf_timer_stop(timer);
  *kernellatency = wkf_timer_time(timer) / ((double) loopcount);

  wkf_timer_destroy(timer);

#if 1
  vmd_cuda_measure_latencies(devpool);
#endif

  return 0;
}
00841 
00842 
00843 //
00844 // Benchmark CUDA kernel launch and memory copy latencies in isolation
00845 //
/* Request/result record for the isolated CUDA latency measurements.      */
typedef struct {
  int deviceid;         /* only the worker whose id matches this runs      */
  int testloops;        /* iterations per timed loop                       */
  double kernlatency;   /* result slot: kernel launch latency              */
  double bcopylatency;  /* result slot: memory copy latency                */
  double kbseqlatency;  /* result slot; NOTE(review): presumably a         */
                        /* kernel+copy sequence latency -- confirm         */
} latthrparms;
00853 
// Pool worker that measures CUDA latencies in isolation on one device.
//
// The worker's user data must point to a latthrparms record.  Only the
// worker whose id equals parms->deviceid does anything; all others return
// immediately.  The selected worker times, on a private stream:
//   - parms->testloops back-to-back nopkernel launches, stored in
//     parms->kernlatency (usec per launch)
//   - parms->testloops pairs of 1-byte host->device / device->host async
//     copies, stored in parms->bcopylatency (usec per pair)
// and prints both values.  parms->kbseqlatency is not measured here.
// Always returns NULL; on allocation or stream creation failure the
// results are left untouched.
static void * vmddevpoolcudalatencythread(void *voidparms) {
  int threadid=-1;
  latthrparms *parms=NULL;

  wkf_threadpool_worker_getid(voidparms, &threadid, NULL);
  wkf_threadpool_worker_getdata(voidparms, (void **) &parms);

  // no work description, or the work is meant for a different worker
  if (parms == NULL || parms->deviceid != threadid)
    return NULL;

  printf("Thread/device %d running...\n", threadid);

  const size_t bufsz = 65536 * sizeof(char);
  char *hostbuf = (char *) calloc(1, bufsz);
  char  *gpubuf = NULL;
  cudaStream_t devstream;
  cudaError_t err;

  if (hostbuf == NULL)
    return NULL;

  err = cudaStreamCreate(&devstream);
  if (err != cudaSuccess) {
    printf("CUDA error: %s, %s line %d\n", cudaGetErrorString(err), __FILE__, __LINE__);
    cudaGetLastError(); // reset error state
    free(hostbuf);
    return NULL;
  }

  err = cudaMalloc((void**)&gpubuf, bufsz);
  if (err != cudaSuccess) {
    printf("CUDA error: %s, %s line %d\n", cudaGetErrorString(err), __FILE__, __LINE__);
    cudaGetLastError(); // reset error state
    cudaStreamDestroy(devstream);
    free(hostbuf);
    return NULL;
  }

  wkf_timerhandle timer = wkf_timer_create();
  const int loops = parms->testloops;
  int i;

  // Every grid dimension must be at least 1: a zero z extent is an invalid
  // launch configuration, so no kernel would actually be launched.
  dim3 Gsz(1,1,1);
  dim3 Bsz(8,8,1);

  // measure back-to-back NULL kernel launches
  wkf_timer_start(timer);
  for (i=0; i<loops; i++) {
    // launch a no-op CUDA kernel
    nopkernel<<<Gsz, Bsz, 0, devstream>>>(NULL);
  }
  // wait for all GPU kernels to complete
  cudaStreamSynchronize(devstream);
  wkf_timer_stop(timer);
  if (loops > 0)
    parms->kernlatency = 1000000 * wkf_timer_time(timer) / ((double) loops);

  // measure back-to-back round-trip 1-byte memcpy latencies
  wkf_timer_start(timer);
  for (i=0; i<loops; i++) {
    cudaMemcpyAsync(gpubuf, hostbuf, 1, cudaMemcpyHostToDevice, devstream);
    cudaMemcpyAsync(hostbuf, gpubuf, 1, cudaMemcpyDeviceToHost, devstream);
  }
  // wait for all copies to complete
  cudaStreamSynchronize(devstream);
  wkf_timer_stop(timer);
  // stored in its own field so the kernel launch result is not overwritten
  if (loops > 0)
    parms->bcopylatency = 1000000 * wkf_timer_time(timer) / ((double) loops);

  printf("NULL kernel launch latency (usec): %.2f\n", parms->kernlatency);
  printf("1-byte memcpy round-trip latency (usec): %.2f\n", parms->bcopylatency);

  cudaStreamDestroy(devstream);
  cudaFree(gpubuf);
  free(hostbuf);
  wkf_timer_destroy(timer);

  return NULL;
}
00907 
00908 
// Run the isolated CUDA latency measurement once per pool worker.
// For each worker index a freshly zeroed latthrparms record naming that
// index (2500 test loops) is handed to the pool; the measured values are
// printed by the worker and not returned to the caller.  Always returns 0.
int vmd_cuda_measure_latencies(wkf_threadpool_t *devpool) {
  latthrparms request;
  const int nworkers = wkf_threadpool_get_workercount(devpool);

  printf("vmd_cuda_measure_latencies()...\n");

  int dev = 0;
  while (dev < nworkers) {
    memset(&request, 0, sizeof(request));
    request.deviceid  = dev;
    request.testloops = 2500;
    wkf_threadpool_launch(devpool, vmddevpoolcudalatencythread, &request, 1);
    dev++;
  }

  return 0;
}
00923 
00924 
00925 #if defined(VMDUSECUDAGDS)
// Shared parameters for the out-of-core GPU I/O benchmark workers
typedef struct {
  int nfiles;               // number of entries in trjfileset
  const char **trjfileset;  // trajectory file names, one per file
  jshandle **jshandles;     // NOTE(review): presumably one JS plugin handle
                            // per file -- confirm against the caller
  CUfileHandle_t *cfh;      // NOTE(review): presumably one cuFile handle
                            // per file -- confirm against the caller
  int devcount;             // NOTE(review): presumably the number of GPUs
                            // in use -- confirm against the caller
  int natoms;               // atoms per trajectory frame
  const AtomSel *sel;       // atom selection
  int first, last, step;    // frame range: (last - first + 1) / step frames
} gpuoocbenchthreadparms;
00938 
00939 #define VMDGDSMAXFRAMEBUF     8
00940 static void * gpu_ooc_bench_thread(void *voidparms) {
00941   int threadid, numthreads;
00942   gpuoocbenchthreadparms *parms = NULL;
00943   wkf_threadpool_worker_getdata(voidparms, (void **) &parms);
00944   wkf_threadpool_worker_getid(voidparms, &threadid, &numthreads);
00945 
00946   //
00947   // copy in per-thread parameters
00948   //
00949   int nfiles = parms->nfiles;
00950   int natoms = parms->natoms;
00951   const AtomSel *sel = parms->sel;
00952   int first  = parms->first;
00953   int last   = parms->last;
00954   int step   = parms->step;
00955 
00956   int usecufile = 1;
00957   fio_fd *hostfds = NULL;
00958   int pinhostiobuffer = 1;
00959   if (getenv("VMDGDSHOSTNOPIN")) {
00960     pinhostiobuffer=0;
00961   }
00962 
00963   if (getenv("VMDGDSUSEHOST")) {
00964     usecufile=0;
00965     hostfds = (fio_fd *) calloc(1, nfiles * sizeof(fio_fd));
00966 
00967     int hostusedirectio = 1;
00968     if (getenv("VMDGDSHOSTBUFFEREDIO") != NULL)
00969       hostusedirectio = 0;
00970 
00971     int openmode = FIO_READ;
00972     if (hostusedirectio)
00973       openmode |= FIO_DIRECT;
00974 
00975     int i;
00976     for (i=0; i<nfiles; i++) {
00977       if (fio_open(parms->trjfileset[i], openmode, &hostfds[i]) < 0) {
00978         if (hostusedirectio) {
00979           printf("Thr[%d] direct I/O unavailable or can't open file '%s'\n",
00980                  threadid, parms->trjfileset[i]);
00981         } else {
00982           printf("Thr[%d] can't open file '%s'\n",
00983                  threadid, parms->trjfileset[i]);
00984         }
00985         return NULL;
00986       }
00987     }
00988   }
00989 
00990   if (hostfds && usecufile) {
00991     printf("Inconsistent cufile/hostfds state, aborting!\n");
00992     return NULL;
00993   }
00994 
00995   /* ensure we have a large enough allocation so we can align */
00996   /* the starting pointer to a blocksz page boundary          */
00997   long blocksz = MOLFILE_DIRECTIO_MIN_BLOCK_SIZE;
00998   long sz = 3L*sizeof(float)*natoms + blocksz;
00999 
01000   /* pad the allocation to an even multiple of the block size */
01001   size_t blockpadsz = (sz + (blocksz - 1)) & (~(blocksz - 1));
01002 
01003   int framecount = (last - first + 1) / step;
01004   int framesperfile = framecount / nfiles;
01005 
01006   if (threadid == 0) {
01007     printf("Thr[%2d] %d frames total, natoms: %d selected: %d\n",
01008            threadid, framecount, natoms, sel->selected);
01009     printf("Thr[%2d] %d frames/file\n", threadid, framesperfile);
01010   }
01011 
01012   cudaError_t crc;
01013   cudaStream_t oocstream;
01014   float *devptr=NULL;
01015   float *hostptr=NULL;
01016   float *hostptr_unaligned=NULL;
01017 
01018   float *crdx1=NULL, *crdy1=NULL, *crdz1=NULL;
01019   float *crdx2=NULL, *crdy2=NULL, *crdz2=NULL;
01020   int multiframeio = 0;
01021   if (getenv("VMDGDSMULTIFRAME"))
01022     multiframeio = atoi(getenv("VMDGDSMULTIFRAME"));
01023   if (multiframeio > VMDGDSMAXFRAMEBUF)
01024     multiframeio = VMDGDSMAXFRAMEBUF;
01025 
01026   // set block sizes and counts for IO bench calcs
01027   dim3 IOBsz = dim3(256, 1, 1);
01028   dim3 IOGsz = dim3((natoms + IOBsz.x - 1) / IOBsz.x, 1, 1);
01029 
01030   if (parms->devcount > 0) {
01031     long gpuallocsz = (VMDGDSMAXFRAMEBUF+1) * blockpadsz;
01032 
01033     if (threadid == 0) {
01034       printf("Thr[%2d] Allocating GPU timestep I/O buf: %ld \n",
01035              threadid, gpuallocsz);
01036     }
01037     crc = cudaMalloc((void**) &devptr, gpuallocsz);
01038 
01039     if (hostfds != NULL) {
01040       if (pinhostiobuffer) {
01041         crc = cudaMallocHost((void**) &hostptr, gpuallocsz);
01042       } else {
01043         hostptr = (float *) alloc_aligned_ptr(gpuallocsz, 4096,
01044                                               (void**) &hostptr_unaligned);
01045         if (!hostptr) {
01046           printf("Thr[%d]: Failed allocation!\n", threadid);
01047           return NULL;
01048         }
01049       }
01050     }
01051 
01052     long crdsz = sel->selected * sizeof(float);
01053 
01054     // atomic coord buffers
01055     crc = cudaMalloc((void**) &crdx1, crdsz);
01056     crc = cudaMalloc((void**) &crdy1, crdsz);
01057     crc = cudaMalloc((void**) &crdz1, crdsz);
01058     crc = cudaMalloc((void**) &crdx2, crdsz);
01059     crc = cudaMalloc((void**) &crdy2, crdsz);
01060     crc = cudaMalloc((void**) &crdz2, crdsz);
01061     if (crc != cudaSuccess) {
01062       printf("Thr[%2d], Failed to allocate GPU buffer!\n", threadid);
01063       return NULL; // XXX error handling needs to be done here
01064     }
01065 
01066     cudaStreamCreate(&oocstream);
01067 
01068 #if defined(VMDUSECUDAGDS)
01069     cuFileBufRegister(devptr, gpuallocsz, 0);
01070 #endif
01071   }
01072 
01073   int verbose = (getenv("VMDGDSVERBOSE") != NULL) ? 1 : 0;
01074   
01075   int filestrategy = 0;
01076   if (getenv("VMDGDSFILESTRATEGY")) {
01077     filestrategy = atoi(getenv("VMDGDSFILESTRATEGY"));
01078   }
01079   if (threadid == 0) {
01080     printf("Thr[%2d] file strategy set to: %d\n", threadid, filestrategy);
01081   }
01082 
01083   wkf_tasktile_t tile;
01084   while (wkf_threadlaunch_next_tile(voidparms, VMDGDSMAXFRAMEBUF * 1, &tile) != WKF_SCHED_DONE) {
01085     //
01086     // simple I/O + compute benchmarking...
01087     //
01088     int idx;
01089     int threadspergpu;
01090     if (parms->devcount > 0)
01091       threadspergpu = numthreads / parms->devcount;
01092     else 
01093       threadspergpu = 1;
01094 
01095     for (idx=tile.start; idx<tile.end; idx++) {
01096       int myfileidx, fileframeidx;
01097 
01098       switch (filestrategy) {
01099         case 1:
01100           myfileidx = (idx / multiframeio) % nfiles;
01101           fileframeidx = idx % framesperfile;
01102           break;
01103 
01104         case 2:
01105           myfileidx = (idx / (multiframeio * threadspergpu)) % nfiles;
01106           fileframeidx = idx % framesperfile;
01107           break;
01108 
01109         case 3:
01110           myfileidx = (threadid / 4) % nfiles;
01111           fileframeidx = idx % framesperfile;
01112           break;
01113 
01114         case 0:
01115         default:
01116           myfileidx = (threadid / threadspergpu) % nfiles;
01117           fileframeidx = idx % framesperfile;
01118           break;
01119       }
01120 
01121       //
01122       // compute multi-frame or single-frame I/O offsets and sizes
01123       //
01124       long startoffset, foffset, readlen;
01125       read_js_timestep_index_offsets(parms->jshandles[myfileidx],
01126                                      natoms, fileframeidx, 0, natoms, NULL,
01127                                      &startoffset, &foffset, &readlen);
01128       if (multiframeio) {
01129         // multi-frame reads use the same starting offset, but the 
01130         // read length is computed from the first and last frames 
01131         // in the group.
01132         long multistartoffset, multifoffset, multireadlen;
01133         read_js_timestep_index_offsets(parms->jshandles[myfileidx], natoms,
01134                                        fileframeidx+multiframeio-1,
01135                                        0, natoms, NULL,
01136                                        &multistartoffset, &multifoffset,
01137                                        &multireadlen);
01138 
01139         multireadlen = (multifoffset + multireadlen) - foffset;
01140 
01141         //printf("** readlen: %ld  multireadlen: %ld\n", readlen, multireadlen);
01142         readlen = multireadlen;
01143         idx+=multiframeio-1; // add in the required increment...
01144       }
01145 
01146       //
01147       // perform the required I/O via GDS or by host kernel I/O
01148       //
01149       long ret=0;
01150       if (usecufile) {
01151         ret = cuFileRead(parms->cfh[myfileidx], (char *) devptr, readlen, foffset, 0);
01152       } else if (hostfds) {
01153         foffset=0;
01154         ret=fio_fseek(hostfds[myfileidx], foffset, FIO_SEEK_SET);
01155         if (ret<0) { printf("fio_fseek() error!\n"); return NULL; }
01156         ret=fio_fread(hostptr, readlen, 1, hostfds[myfileidx]);
01157         if (ret<0) { printf("fio_fseek() error!\n"); return NULL; }
01158         cudaMemcpy(devptr, hostptr, readlen, cudaMemcpyHostToDevice);
01159       } else {
01160         printf("Inconsistent cufile/hostfds state, aborting!\n");
01161         return NULL;
01162       }
01163 
01164       // handle errors if they have occured
01165       if (ret < 0) {
01166         printf("Thr[%2d] Error: cuFileRead(): %ld\n", threadid, ret);
01167         return NULL; // XXX error handling needs to be done here
01168       }
01169 
01170       if (verbose) {
01171         printf("Thr[%2d]F[%d][tile: %d to %d] frame: %d cuFile len: %ld off: %ld\n",
01172                threadid, myfileidx, tile.start, tile.end, idx, 
01173                readlen, foffset);
01174       }
01175     }
01176   }
01177 
01178   cudaFree(crdx1);
01179   cudaFree(crdy1);
01180   cudaFree(crdz1);
01181   cudaFree(crdx2);
01182   cudaFree(crdy2);
01183   cudaFree(crdz2);
01184 
01185 #if defined(VMDUSECUDAGDS)
01186   if (usecufile) {
01187     cuFileBufDeregister(devptr);
01188   }
01189 #endif
01190 
01191   if (hostfds != NULL) {
01192     int i;
01193     for (i=0; i<nfiles; i++) {
01194       fio_fclose(hostfds[i]);
01195     }
01196     free(hostfds);
01197   }
01198 
01199   if (hostptr != NULL) {
01200     if (pinhostiobuffer) {
01201       cudaFreeHost(hostptr);
01202     } else {
01203       free(hostptr_unaligned);
01204     }
01205   }
01206 
01207   return NULL;
01208 }
01209 
01210 #endif
01211 
01212 
01213 int gpu_ooc_bench(wkf_threadpool_t *devpool, // VMD GPU worker thread pool
01214                   int nfiles, const char **trjfileset, const AtomSel *sel,
01215                   int first, int last, int step) {
01216   printf("gpu_ooc_bench()\n");
01217   wkf_threadpool_t *bigpool = NULL;
01218 
01219 #if defined(VMDUSECUDAGDS)
01220   int devcount;
01221   cudaError_t crc = cudaGetDeviceCount(&devcount);
01222   printf("gpu_ooc_bench) GPU device count: %d\n", devcount);
01223   if (devcount==0)
01224     printf("gpu_ooc_bench)   No GPU devices, continuing with host only...\n");
01225 
01226   CUfileHandle_t * cfh = (CUfileHandle_t *) calloc(1, nfiles * sizeof(CUfileHandle_t));
01227   CUfileDescr_t * cfhdesc = (CUfileDescr_t *) calloc(1, nfiles * sizeof(CUfileDescr_t));
01228   memset(&cfh[0], 0, sizeof(cfh));
01229   memset(&cfhdesc[0], 0, sizeof(cfhdesc));
01230 
01231   int natomschk = 0;
01232   jshandle **jshandles = (jshandle **) calloc(1, nfiles * sizeof(jshandle *));
01233   fio_fd *directio_fds = (fio_fd *) calloc(1, nfiles * sizeof(fio_fd));
01234 
01235   int i;
01236   for (i=0; i<nfiles; i++) {
01237     const char *filename = trjfileset[i];
01238     printf("gpu_ooc_bench) File[%d] GDS setup, opening '%s'\n", i, filename);
01239     jshandles[i] = (jshandle *) open_js_read(filename, "js", &natomschk);
01240     if (!jshandles[i]) {
01241       printf("gpu_ooc_bench) File[%d] open_js_read failed for file %s\n", i, filename);
01242       return -1; // deal with error handling later
01243     }
01244 
01245 #if vmdplugin_ABIVERSION > 17
01246     long blocksz = MOLFILE_DIRECTIO_MIN_BLOCK_SIZE;
01247     int filepgalignsz = 1;
01248     read_js_timestep_pagealign_size(jshandles[i], &filepgalignsz);
01249     if (filepgalignsz != blocksz) {
01250       printf("gpu_ooc_bench) File[%d] Plugin-returned page alignment size mismatch!\n", i);
01251     } else {
01252       printf("gpu_ooc_bench) File[%d] Page alignment size: %d\n", i, filepgalignsz);
01253     }
01254 #endif
01255 
01256     read_js_timestep_index_offsets(jshandles[i], natomschk, 0, 0, 0,
01257                                    &directio_fds[i], NULL, NULL, NULL);
01258 
01259     cfhdesc[i].handle.fd = directio_fds[i]; // typedef of Unix FD
01260     cfhdesc[i].type = CU_FILE_HANDLE_TYPE_OPAQUE_FD;
01261     CUfileError_t cferr = cuFileHandleRegister(&cfh[i], &cfhdesc[i]);
01262 
01263     if (cferr.err != CU_FILE_SUCCESS) {
01264       printf("gpu_ooc_bench) File[%d] cuFileImportExternalFile on fd %d failed!\n",
01265              i, cfhdesc[i].handle.fd);
01266       return -1; // XXX error handling needs to be done here
01267     }
01268   }
01269 
01270 
01271   //
01272   // copy in per-thread parameters
01273   //
01274   gpuoocbenchthreadparms parms;
01275   memset(&parms, 0, sizeof(parms));
01276   parms.devcount = devcount;
01277   parms.nfiles = nfiles;
01278   parms.trjfileset = trjfileset;
01279   parms.jshandles = jshandles;
01280   parms.cfh = cfh;
01281   parms.natoms = sel->num_atoms;
01282   parms.sel = sel;
01283   parms.first = first;
01284   parms.last = last;
01285   parms.step = step;
01286 
01287   int framecount = nfiles * (last / step);
01288 
01289   // create timers
01290   wkf_timerhandle timer;
01291   timer=wkf_timer_create();
01292 
01293   // spawn child threads to do the work
01294   wkf_tasktile_t tile;
01295   tile.start=0;
01296   tile.end=framecount - 1; // first row only
01297 
01298   printf("gpu_ooc_bench) tile start: %d  end: %d\n", tile.start, tile.end);
01299 
01300   int gdsthreadspergpu = 1;
01301   if (getenv("VMDGDSTHREADSPERGPU") != NULL)
01302     gdsthreadspergpu = atoi(getenv("VMDGDSTHREADSPERGPU"));
01303 
01304   printf("gpu_ooc_bench) gdsthreadspergpu: %d\n", gdsthreadspergpu);
01305 
01306   if (gdsthreadspergpu > 1) {
01307     // XXX extra-large GPU device thread pool
01308     int workercount = devcount * gdsthreadspergpu;
01309 
01310     int *devlist = new int[workercount];
01311     int k;
01312     for (k=0; k<workercount; k++) {
01313       devlist[k] = k / gdsthreadspergpu; // XXX ignores VMD CUDA device masks
01314     }
01315 
01316     msgInfo << "Creating Multi-worker ("
01317             << gdsthreadspergpu << " per-GPU) CUDA device pool..." << sendmsg;
01318     bigpool=wkf_threadpool_create(workercount, devlist);
01319     delete [] devlist;
01320 
01321     // associate each worker thread with a specific GPU
01322     if (getenv("VMDCUDAVERBOSE") != NULL)
01323       wkf_threadpool_launch(bigpool, vmd_cuda_devpool_setdeviceonly, (void*)"VMD CUDA Dev Init", 1);
01324     else
01325       wkf_threadpool_launch(bigpool, vmd_cuda_devpool_setdeviceonly, NULL, 1);
01326 
01327     // clear all available device memory on each of the GPUs
01328     wkf_threadpool_launch(bigpool, vmd_cuda_devpool_clear_device_mem, NULL, 1);
01329 
01330     // XXX override which GPU device pool we're going to use
01331     devpool = bigpool;
01332   }
01333 
01334   // XXX affinitize GPU worker threads for best perf
01335   wkf_threadpool_launch(devpool, vmd_cuda_affinitize_threads, NULL, 1);
01336 
01337   wkf_threadpool_sched_dynamic(devpool, &tile);
01338   wkf_timer_start(timer);
01339   wkf_threadpool_launch(devpool, gpu_ooc_bench_thread, &parms, 1);
01340   wkf_timer_stop(timer);
01341 
01342   double runtime = wkf_timer_time(timer);
01343   double gbytes = sel->num_atoms * 12L * (tile.end+1) / (1024.0 * 1024.0 * 1024.0);
01344 
01345   printf("gpu_ooc_bench) natoms: %d, fsz: %ld, tsz: %ld\n",
01346          sel->num_atoms, sel->num_atoms * 12L,
01347          sel->num_atoms * 12L * (tile.end+1));
01348 
01349   int pinhostiobuffer = 1;
01350   if (getenv("VMDGDSHOSTNOPIN"))
01351     pinhostiobuffer=0;
01352 
01353   int hostusedirectio = 1;
01354   if (getenv("VMDGDSHOSTBUFFEREDIO") != NULL)
01355     hostusedirectio = 0;
01356 
01357   int usecufile=1;
01358   if (getenv("VMDGDSUSEHOST"))
01359     usecufile=0;
01360 
01361   if (usecufile) {
01362     printf("OOC I/O via GDS + cuFile\n");
01363   } else {
01364     printf("OOC I/O via host, %s APIs, %s memory buffers\n",
01365            (hostusedirectio) ? "Direct I/O" : "Buffered I/O",
01366            (pinhostiobuffer) ? "pinned" : "unpinned");
01367   }
01368 
01369   int multiframeio = 0;
01370   if (getenv("VMDGDSMULTIFRAME"))
01371     multiframeio = atoi(getenv("VMDGDSMULTIFRAME"));
01372   if (multiframeio > VMDGDSMAXFRAMEBUF)
01373     multiframeio = VMDGDSMAXFRAMEBUF;
01374   if (multiframeio) {
01375     printf("GDS multi-frame read opt: %d frames per call, %ld bytes\n",
01376            multiframeio,
01377            multiframeio * sel->num_atoms * 12L);
01378   }
01379 
01380   printf("OOC runtime: %.1f, %.2fGB/sec\n", runtime, gbytes/runtime);
01381 
01382   for (i=0; i<nfiles; i++) {
01383 #if defined(VMDUSECUDAGDS)
01384     cuFileHandleDeregister(cfh[i]);
01385 #endif
01386     close_js_read(jshandles[i]);
01387   }
01388 #endif
01389 
01390 #if defined(VMDUSECUDAGDS)
01391   if (cfh != NULL)
01392     free(cfh);
01393 
01394   if (cfhdesc != NULL)
01395      free(cfhdesc);
01396 
01397   if (jshandles != NULL)
01398     free(jshandles);
01399 
01400   if (directio_fds != NULL)
01401     free(directio_fds);
01402 #endif
01403 
01404   // if we created an extra large-thread-count-per-GPU thread pool, we
01405   // need to destroy it here...
01406   if (bigpool != NULL)
01407     wkf_threadpool_destroy(bigpool);
01408 
01409   return 0;
01410 } 
01411 
01412 
01413