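/*
 * CUDA kernels and host-side worker threads for QCP (quaternion
 * characteristic polynomial) RMSD matrix calculations in VMD, including an
 * out-of-core path that streams trajectory frames directly into GPU memory
 * via GPUDirect Storage (cuFile) and the js plugin's direct I/O support.
 */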
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>
#include <cuda.h>

#include "molfile_plugin.h"

#include "Inform.h"
#include "utilities.h"
#include "WKFThreads.h"
#include "WKFUtils.h"
#include "CUDAKernels.h"
#include "Measure.h"

// enable use of restricted pointers in device kernels
#if 1
#define RESTRICT __restrict__
#else
#define RESTRICT
#endif

#if defined(VMDUSECUDAGDS)
#include </usr/local/gds-beta-0.7.1/lib/cufile.h>

// pull in the js plugin source for direct, page-aligned trajectory I/O
#define VMDJSPLUGININCLUDESRC 1
#include "/home/johns/plugins/molfile_plugin/src/jsplugin.c"

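// Error checking macro: on any pending CUDA error, print the error string
// with file/line info and abort the calling worker thread by returning NULL.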
#if 1
#define CUERR { cudaError_t err; \
  if ((err = cudaGetLastError()) != cudaSuccess) { \
  printf("CUDA error: %s, %s line %d\n", cudaGetErrorString(err), __FILE__, __LINE__); \
  printf("Thread aborting...\n"); \
  return NULL; }}
#else
#define CUERR
#endif

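// Parameters passed to the out-of-core QCP RMSD matrix worker threads,
// carrying the trajectory file set, js plugin handles, cuFile handles,
// device count, atom selection, frame range, and output RMSD matrix.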
typedef struct {
  int nfiles;
  const char **trjfileset;
  jshandle **jshandles;
  CUfileHandle_t *cfh;
  int devcount;
  int natoms;
  const AtomSel *sel;
  int first;
  int last;
  int step;
  float *rmsdmat;
} gpuqcprmsdmatoocthreadparms;
#else
typedef int fio_fd;
#endif

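// Parameters passed to the in-core QCP RMSD matrix worker threads,
// referencing preloaded SoA coordinate buffers for the selected atoms.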
typedef struct {
  const AtomSel *sel;
  int first;
  int last;
  int step;
  float *rmsdmat;
  int padcnt;
  int framecrdsz;
  float *crds;
} gpuqcprmsdthreadparms;


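// Device-global block counter used by the device-wide inner product kernel
// to detect the last thread block to finish, so the per-block partial sums
// can be reduced within a single kernel launch.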
__device__ unsigned int glob_block_count = 0;

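// Warp-level parallel sum reduction using shuffle instructions.
// The final sum is only valid in lane 0 of each warp.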
template <typename T>
__inline__ __device__ T warp_sum_reduction(T v) {
  for (int offset = warpSize/2; offset > 0; offset >>= 1) {
#if CUDART_VERSION >= 9000
    v += __shfl_down_sync(0xffffffff, v, offset);
#else
    v += __shfl_down(v, offset);
#endif
  }
  return v;
}


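// Block-level parallel sum reduction built on warp_sum_reduction(), using a
// small shared memory array to combine the per-warp partial sums.
// The final sum is only valid in thread 0 of the block.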
template <typename T>
__inline__ __device__ T block_sum_reduction(T v) {
  static __shared__ T shared[32];
  int lane = threadIdx.x % warpSize;
  int wid = threadIdx.x / warpSize;

  v = warp_sum_reduction(v);      // each warp performs a partial reduction

  __syncthreads();

  if (lane == 0)
    shared[wid] = v;              // lane 0 of each warp publishes its partial sum

  __syncthreads();

  // only the first blockDim.x/warpSize threads reload per-warp partial sums
  if (threadIdx.x < blockDim.x / warpSize) {
    v = shared[lane];
  } else {
    v = 0;
  }

  if (wid == 0)
    v = warp_sum_reduction(v);    // final reduction within the first warp

  return v;
}


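// Convert an array-of-structures (float3) coordinate buffer into separate
// x/y/z arrays (structure-of-arrays layout).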
__global__ static void
vmd_float3_aos_to_soa(int natoms, float3 *xyz,
                      float *crdx, float *crdy, float *crdz) {
  unsigned int tid  = (blockIdx.x * blockDim.x) + threadIdx.x;
  unsigned int tcnt = gridDim.x * blockDim.x;
  int i;
  for (i=tid; i<natoms; i+=tcnt) {
    float3 crd = xyz[i];
    crdx[i] = crd.x;
    crdy[i] = crd.y;
    crdz[i] = crd.z;
  }
}


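// AoS-to-SoA conversion for a subset of atoms: idxmap[] provides the
// destination SoA index for each source atom, and entries that are not
// positive are skipped.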
__global__ static void
vmd_float3_aos_to_soa_selected(int natoms, int *idxmap, float3 *xyz,
                               float *crdx, float *crdy, float *crdz) {
  unsigned int tid  = (blockIdx.x * blockDim.x) + threadIdx.x;
  unsigned int tcnt = gridDim.x * blockDim.x;
  int i;
  for (i=tid; i<natoms; i+=tcnt) {
    int dst = idxmap[i];
    if (dst > 0) {
      float3 crd = xyz[i];
      crdx[dst] = crd.x;
      crdy[dst] = crd.y;
      crdz[dst] = crd.z;
    }
  }
}

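// Build a GPU-resident list of the atom indices enabled in the given atom
// selection.  The caller is responsible for cudaFree()ing the returned buffer.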
int * vmd_gpu_selection_indexlist(const AtomSel *sel) {
  int i, j;
  int *cpuidxlist = new int[sel->selected];

  // compact the enabled atom indices into a contiguous list
  for (j=0,i=sel->firstsel; i<=sel->lastsel; i++) {
    if (sel->on[i]) {
      cpuidxlist[j] = i;
      j++;
    }
  }

  int *gpuidxlist = NULL;
  cudaMalloc((void**) &gpuidxlist, sel->selected * sizeof(int));
  cudaMemcpy(gpuidxlist, cpuidxlist, sel->selected * sizeof(int),
             cudaMemcpyHostToDevice);
  delete [] cpuidxlist;

  return gpuidxlist;
}


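// Device-wide QCP inner product kernel: computes the nine inner product
// matrix elements a0..a8 and the E0 prefactor term G for a pair of SoA
// coordinate sets (optionally weighted).  Per-block partial sums are written
// to pr[] strided by gridDim.x, and the last block to finish reduces each
// term's partials into pr[term*gridDim.x], with term 9 holding G.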
__global__ static void
vmd_qcp_innerprod_soa_devicewide(double *pr,
                                 const float * RESTRICT crdx1,
                                 const float * RESTRICT crdy1,
                                 const float * RESTRICT crdz1,
                                 const float * RESTRICT crdx2,
                                 const float * RESTRICT crdy2,
                                 const float * RESTRICT crdz2,
                                 const int natoms,
                                 const float * RESTRICT weight) {
  unsigned int tid  = (blockIdx.x * blockDim.x) + threadIdx.x;
  unsigned int tcnt = gridDim.x * blockDim.x;

  __shared__ int isLastBlock[1];
  if (threadIdx.x == 0) {
    isLastBlock[0] = 0;
  }
  __syncthreads();

  double G, a0, a1, a2, a3, a4, a5, a6, a7, a8;
  G=a0=a1=a2=a3=a4=a5=a6=a7=a8=0.0;

  int start = tid;
  int stop  = natoms;
  if (weight != NULL) {
    for (int i=start; i<stop; i+=tcnt) {
      float fw = weight[i];
      float fx1 = crdx1[i];
      float fy1 = crdy1[i];
      float fz1 = crdz1[i];

      float fx2 = crdx2[i];
      float fy2 = crdy2[i];
      float fz2 = crdz2[i];

      G += fw * ((fx1*fx1 + fy1*fy1 + fz1*fz1) + (fx2*fx2 + fy2*fy2 + fz2*fz2));

      a0 += fx1 * fx2;
      a1 += fx1 * fy2;
      a2 += fx1 * fz2;

      a3 += fy1 * fx2;
      a4 += fy1 * fy2;
      a5 += fy1 * fz2;

      a6 += fz1 * fx2;
      a7 += fz1 * fy2;
      a8 += fz1 * fz2;
    }
  } else {
    for (int i=start; i<stop; i+=tcnt) {
      float fx1 = crdx1[i];
      float fy1 = crdy1[i];
      float fz1 = crdz1[i];

      float fx2 = crdx2[i];
      float fy2 = crdy2[i];
      float fz2 = crdz2[i];

      G += ((fx1*fx1 + fy1*fy1 + fz1*fz1) + (fx2*fx2 + fy2*fy2 + fz2*fz2));

      a0 += fx1 * fx2;
      a1 += fx1 * fy2;
      a2 += fx1 * fz2;

      a3 += fy1 * fx2;
      a4 += fy1 * fy2;
      a5 += fy1 * fz2;

      a6 += fz1 * fx2;
      a7 += fz1 * fy2;
      a8 += fz1 * fz2;
    }
  }

  __syncthreads();

  G *= 0.5;

  // block-level reduction of the per-thread partial sums
  double bG  = block_sum_reduction(G);
  double ba0 = block_sum_reduction(a0);
  double ba1 = block_sum_reduction(a1);
  double ba2 = block_sum_reduction(a2);
  double ba3 = block_sum_reduction(a3);
  double ba4 = block_sum_reduction(a4);
  double ba5 = block_sum_reduction(a5);
  double ba6 = block_sum_reduction(a6);
  double ba7 = block_sum_reduction(a7);
  double ba8 = block_sum_reduction(a8);

  __syncthreads();

  // thread 0 of each block writes its partial sums, strided by gridDim.x
  if (threadIdx.x == 0) {
    pr[(0*gridDim.x)+blockIdx.x] = ba0;
    pr[(1*gridDim.x)+blockIdx.x] = ba1;
    pr[(2*gridDim.x)+blockIdx.x] = ba2;
    pr[(3*gridDim.x)+blockIdx.x] = ba3;
    pr[(4*gridDim.x)+blockIdx.x] = ba4;
    pr[(5*gridDim.x)+blockIdx.x] = ba5;
    pr[(6*gridDim.x)+blockIdx.x] = ba6;
    pr[(7*gridDim.x)+blockIdx.x] = ba7;
    pr[(8*gridDim.x)+blockIdx.x] = ba8;
    pr[(9*gridDim.x)+blockIdx.x] = bG;

    __threadfence();

    // count completed blocks to determine which block finishes last
    unsigned int old_block_count = atomicInc(&glob_block_count, gridDim.x);
    isLastBlock[0] = (old_block_count == (gridDim.x - 1));
  }

  __syncthreads();

  // the last block to finish reduces all per-block partial sums, leaving the
  // final total for each of the 10 terms in pr[l*gridDim.x]
  if (isLastBlock[0]) {
    glob_block_count = 0;
    __threadfence();

    for (int l=0; l < 10; ++l) {
      double *pr_a = pr+(l*gridDim.x);

      // accumulate in double to match the per-block partial sums
      double sum = 0;
      for (int j = threadIdx.x; j < gridDim.x; j += blockDim.x) {
        sum += pr_a[j];
      }

      sum = block_sum_reduction(sum);
      if (threadIdx.x==0)
        pr_a[0]=sum;
    }
  }
}


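// Per-pair QCP inner product kernel: each thread block loops over structure
// pairs, computes the same 10 inner product terms as the device-wide kernel,
// and writes 10 contiguous doubles per structure into results[].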
__global__ static void
vmd_qcp_innerprod_soa_blockperpair(double *results,
                                   const float * RESTRICT crdx1,
                                   const float * RESTRICT crdy1,
                                   const float * RESTRICT crdz1,
                                   const float * RESTRICT crdx2,
                                   const float * RESTRICT crdy2,
                                   const float * RESTRICT crdz2,
                                   const int natoms,
                                   const float * RESTRICT weight,
                                   const int num_structs) {
  unsigned int tid  = (blockIdx.x * blockDim.x) + threadIdx.x;
  unsigned int tcnt = gridDim.x * blockDim.x;

  double G, a0, a1, a2, a3, a4, a5, a6, a7, a8;

  // each block strides over the structure pairs assigned to it
  for (int struct_id = blockIdx.x; struct_id < num_structs; struct_id+=gridDim.x) {
    G=a0=a1=a2=a3=a4=a5=a6=a7=a8=0.0;
    int struct_offset = struct_id * natoms;   // element (not byte) offset of this structure
    int start = struct_offset + tid;
    int stop  = struct_offset + natoms;
    if (weight != NULL) {
      for (int i=start; i<stop; i+=tcnt) {
        float fw = weight[i];
        float fx1 = crdx1[i];
        float fy1 = crdy1[i];
        float fz1 = crdz1[i];

        float fx2 = crdx2[i];
        float fy2 = crdy2[i];
        float fz2 = crdz2[i];

        G += fw * ((fx1*fx1 + fy1*fy1 + fz1*fz1) + (fx2*fx2 + fy2*fy2 + fz2*fz2));

        a0 += fx1 * fx2;
        a1 += fx1 * fy2;
        a2 += fx1 * fz2;

        a3 += fy1 * fx2;
        a4 += fy1 * fy2;
        a5 += fy1 * fz2;

        a6 += fz1 * fx2;
        a7 += fz1 * fy2;
        a8 += fz1 * fz2;
      }
    } else {
      for (int i=start; i<stop; i+=tcnt) {
        float fx1 = crdx1[i];
        float fy1 = crdy1[i];
        float fz1 = crdz1[i];

        float fx2 = crdx2[i];
        float fy2 = crdy2[i];
        float fz2 = crdz2[i];

        G += ((fx1*fx1 + fy1*fy1 + fz1*fz1) + (fx2*fx2 + fy2*fy2 + fz2*fz2));

        a0 += fx1 * fx2;
        a1 += fx1 * fy2;
        a2 += fx1 * fz2;

        a3 += fy1 * fx2;
        a4 += fy1 * fy2;
        a5 += fy1 * fz2;

        a6 += fz1 * fx2;
        a7 += fz1 * fy2;
        a8 += fz1 * fz2;
      }
    }

    __syncthreads();

    G *= 0.5;

    double bG  = block_sum_reduction(G);
    double ba0 = block_sum_reduction(a0);
    double ba1 = block_sum_reduction(a1);
    double ba2 = block_sum_reduction(a2);
    double ba3 = block_sum_reduction(a3);
    double ba4 = block_sum_reduction(a4);
    double ba5 = block_sum_reduction(a5);
    double ba6 = block_sum_reduction(a6);
    double ba7 = block_sum_reduction(a7);
    double ba8 = block_sum_reduction(a8);

    __syncthreads();

    if (threadIdx.x == 0) {
      int addr = struct_id * 10;
      results[addr    ] = ba0;
      results[addr + 1] = ba1;
      results[addr + 2] = ba2;
      results[addr + 3] = ba3;
      results[addr + 4] = ba4;
      results[addr + 5] = ba5;
      results[addr + 6] = ba6;
      results[addr + 7] = ba7;
      results[addr + 8] = ba8;
      results[addr + 9] = bG;
    }
  }
}


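// Map a linear work index over the unique off-diagonal frame pairs of an
// N-frame triangle to a pair of frame indices.  Returns 0 on success, or
// -1 if the index falls outside the triangle.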
static int idx2sub_tril(long N, long ind, long *J, long *I) {
  if (ind > (N*(N+1)/2)) {
    return -1;
  }

#if 1
  long i = long(N - 2 - floor(sqrt(-8*ind + 4*N*(N-1)-7)/2.0 - 0.5));
  long j = ind + i + 1 - N*(N-1)/2 + (N-i)*((N-i)-1)/2;
#elif 0
  long i, j;
  double tmp2np1 = 2*N+1;
  i = floor((tmp2np1 - sqrt(tmp2np1*tmp2np1 - 8.0*ind)) / 2);
  j = ind - N*i + i*(i-1)/2 + i;
#endif

  *I = i;
  *J = j;

  return 0;
}


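// Worker thread for the in-core QCP RMSD matrix computation: consumes tiles
// of linear pair indices, maps each index to a frame pair, and computes the
// pairwise RMSD via FastCalcRMSDAndRotation().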
static void * measure_rmsdmat_qcp_thread(void *voidparms) {
  int threadid;
  gpuqcprmsdthreadparms *parms = NULL;
  wkf_threadpool_worker_getdata(voidparms, (void **) &parms);
  wkf_threadpool_worker_getid(voidparms, &threadid, NULL);

  // copy in per-thread parameters
  float *rmsdmat = parms->rmsdmat;
#if 0
  int framecrdsz = parms->framecrdsz;
  float *crds = parms->crds;
#endif
  const AtomSel *sel = parms->sel;
  int first = parms->first;
  int last = parms->last;
  int step = parms->step;

#if 1
  printf("QCP GPU[%d] running...\n", threadid);
#endif

  int framecount = (last - first + 1) / step;

  wkf_tasktile_t tile;
  while (wkf_threadlaunch_next_tile(voidparms, 8, &tile) != WKF_SCHED_DONE) {
    long idx;

    for (idx=tile.start; idx<tile.end; idx++) {
      long i, j;

      // map the linear work index to a frame pair
      if (idx2sub_tril(framecount-1, idx, &i, &j)) {
        printf("qcpthread[%d]: work idx %ld out of triangle!\n", threadid, idx);
        break;
      }

      double A[9];
      double E0 = 0;

#if 0
      float *xj = crds + (j * 3 * framecrdsz);
      float *yj = xj + framecrdsz;
      float *zj = xj + framecrdsz*2;

      float *xi = crds + (i * 3 * framecrdsz);
      float *yi = xi + framecrdsz;
      float *zi = xi + framecrdsz*2;

      E0 = InnerProductSOA(A, xj, yj, zj, xi, yi, zi,
                           sel->selected, NULL);
#endif

      FastCalcRMSDAndRotation(NULL, A, &rmsdmat[j*framecount + i],
                              E0, sel->selected, -1);
    }
  }

  return NULL;
}


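// Host entry point for the in-core QCP RMSD matrix computation: packs the
// per-thread parameters and launches measure_rmsdmat_qcp_thread() on the
// CUDA device worker pool using dynamic tile scheduling.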
int qcp_soa_gpu(wkf_threadpool_t *devpool,
                const AtomSel *sel, float *crds, int framecrdsz, int padcnt,
                int first, int last, int step, int framecount,
                float *rmsdmat) {

  gpuqcprmsdthreadparms parms;
  memset(&parms, 0, sizeof(parms));

  parms.sel = sel;
  parms.rmsdmat = rmsdmat;
  parms.padcnt = padcnt;
  parms.framecrdsz = framecrdsz;
  parms.crds = crds;
  parms.first = first;
  parms.last = last;
  parms.step = step;

  // schedule the off-diagonal frame pairs as a dynamically distributed tile set
  wkf_tasktile_t tile;
  tile.start=0;
  tile.end=(framecount-1)*(framecount-1)/2;

#if 1
  wkf_threadpool_sched_dynamic(devpool, &tile);
  wkf_threadpool_launch(devpool, measure_rmsdmat_qcp_thread, &parms, 1);
#else
  // serial host reference implementation
  int i, j;
  for (j=0; j<framecount; j++) {
    float *xj = crds + (j * 3 * framecrdsz);
    float *yj = xj + framecrdsz;
    float *zj = xj + framecrdsz*2;
    for (i=0; i<j; i++) {
      double A[9];

      float *xi = crds + (i * 3 * framecrdsz);
      float *yi = xi + framecrdsz;
      float *zi = xi + framecrdsz*2;

      double E0 = InnerProductSOA(A, xj, yj, zj, xi, yi, zi,
                                  sel->selected, NULL);

      FastCalcRMSDAndRotation(NULL, A, &rmsdmat[j*framecount + i],
                              E0, sel->selected, -1);
    }
  }
#endif

  return 0;
}


#if defined(VMDUSECUDAGDS)
#define VMDGDSMAXFRAMEBUF 8
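// Worker thread for the out-of-core QCP RMSD matrix path: reads trajectory
// timesteps directly into GPU memory with cuFile/GDS (or through host buffers
// when VMDGDSUSEHOST is set), and in the non-benchmark code paths converts
// them to SoA layout and runs the QCP inner product kernels.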
static void * measure_rmsdmat_qcp_ooc_thread(void *voidparms) {
  int threadid;
  gpuqcprmsdmatoocthreadparms *parms = NULL;
  wkf_threadpool_worker_getdata(voidparms, (void **) &parms);
  wkf_threadpool_worker_getid(voidparms, &threadid, NULL);

  // copy in per-thread parameters
  float *rmsdmat = parms->rmsdmat;

  int nfiles = parms->nfiles;
  int natoms = parms->natoms;
  const AtomSel *sel = parms->sel;
  int first = parms->first;
  int last = parms->last;
  int step = parms->step;

  int usecufile = 1;
  fio_fd *hostfds = NULL;
  int pinhostiobuffer = 1;
  if (getenv("VMDGDSHOSTNOPIN")) {
    pinhostiobuffer=0;
  }

  if (getenv("VMDGDSUSEHOST")) {
    usecufile=0;
    hostfds = (fio_fd *) calloc(1, nfiles * sizeof(fio_fd));

    int hostusedirectio = 1;
    if (getenv("VMDGDSHOSTBUFFEREDIO") != NULL)
      hostusedirectio = 0;

    int openmode = FIO_READ;
    if (hostusedirectio)
      openmode |= FIO_DIRECT;

    int i;
    for (i=0; i<nfiles; i++) {
      if (fio_open(parms->trjfileset[i], openmode, &hostfds[i]) < 0) {
        if (hostusedirectio) {
          printf("Thr[%d] direct I/O unavailable or can't open file '%s'\n",
                 threadid, parms->trjfileset[i]);
        } else {
          printf("Thr[%d] can't open file '%s'\n",
                 threadid, parms->trjfileset[i]);
        }
        return NULL;
      }
#if 0
      else {
        printf("Thr[%d]: fd[%d]: %d, '%s'\n", threadid, i, hostfds[i],
               parms->trjfileset[i]);
      }
#endif
    }
  }

  if (hostfds && usecufile) {
    printf("Inconsistent cufile/hostfds state, aborting!\n");
    return NULL;
  }

  // pad the per-frame buffer size up to the direct I/O block size
  long blocksz = MOLFILE_DIRECTIO_MIN_BLOCK_SIZE;
  long sz = 3L*sizeof(float)*natoms + blocksz;
  size_t blockpadsz = (sz + (blocksz - 1)) & (~(blocksz - 1));

  int framecount = (last - first + 1) / step;
  int framesperfile = framecount / nfiles;

  if (threadid == 0) {
    printf("Thr[%2d] running frame: %d natoms: %d selected: %d\n",
           threadid, framecount, natoms, sel->selected);
  }

  cudaError_t crc;
  cudaStream_t qcpstream;
  double *pr=NULL;
  float *devptr=NULL;
  float *hostptr=NULL;
  float *hostptr_unaligned=NULL;

  float *crdx1=NULL, *crdy1=NULL, *crdz1=NULL;
  float *crdx2=NULL, *crdy2=NULL, *crdz2=NULL;
  int *gpuidxlist = NULL;
  int multiframeio = 0;
  if (getenv("VMDGDSMULTIFRAME"))
    multiframeio = atoi(getenv("VMDGDSMULTIFRAME"));
  if (multiframeio > VMDGDSMAXFRAMEBUF)
    multiframeio = VMDGDSMAXFRAMEBUF;

  dim3 QCPBsz = dim3(256, 1, 1);
  dim3 QCPGsz = dim3((natoms + QCPBsz.x - 1) / QCPBsz.x, 1, 1);

  if (parms->devcount > 0) {
    long gpuallocsz = (VMDGDSMAXFRAMEBUF+1) * blockpadsz;

    if (threadid == 0) {
      printf("Thr[%2d] Allocating QCP GPU timestep buf: %ld \n",
             threadid, gpuallocsz);
    }
    crc = cudaMalloc((void**) &devptr, gpuallocsz);

    if (hostfds != NULL) {
      if (pinhostiobuffer) {
        crc = cudaMallocHost((void**) &hostptr, gpuallocsz);
        CUERR
      } else {
        hostptr = (float *) alloc_aligned_ptr(gpuallocsz, 4096,
                                              (void**) &hostptr_unaligned);
      }
    }

    gpuidxlist = vmd_gpu_selection_indexlist(sel);
    long crdsz = sel->selected * sizeof(float);

    crc = cudaMalloc((void**) &pr, 10 * sizeof(double) * (QCPGsz.x + 1));

    // allocate SoA coordinate buffers for the selected atoms
    crc = cudaMalloc((void**) &crdx1, crdsz);
    crc = cudaMalloc((void**) &crdy1, crdsz);
    crc = cudaMalloc((void**) &crdz1, crdsz);
    crc = cudaMalloc((void**) &crdx2, crdsz);
    crc = cudaMalloc((void**) &crdy2, crdsz);
    crc = cudaMalloc((void**) &crdz2, crdsz);
    if (crc != cudaSuccess) {
      printf("Thr[%2d], Failed to allocate GPU buffer!\n", threadid);
      return NULL;
    }

    cudaStreamCreate(&qcpstream);

#if defined(VMDUSECUDAGDS)
    cuFileBufRegister(devptr, gpuallocsz, 0);
#endif
  }

#if 1
#define VMDGDSIOBENCHMARKONLY 1
  int verbose = (getenv("VMDGDSVERBOSE") != NULL) ? 1 : 0;

  wkf_tasktile_t tile;
  while (wkf_threadlaunch_next_tile(voidparms, VMDGDSMAXFRAMEBUF * 1, &tile) != WKF_SCHED_DONE) {
    int idx;
    for (idx=tile.start; idx<tile.end; idx++) {
      int myfileidx = (threadid / 4) % nfiles;
      int fileframeidx = idx % framesperfile;

      // look up the direct I/O offset and length for this timestep
      long startoffset, foffset, readlen;
      read_js_timestep_index_offsets(parms->jshandles[myfileidx], natoms,
                                     fileframeidx,
                                     0, natoms, NULL,
                                     &startoffset, &foffset, &readlen);
      if (multiframeio) {
        // combine several consecutive frames into one large read
        long multistartoffset, multifoffset, multireadlen;
        read_js_timestep_index_offsets(parms->jshandles[myfileidx], natoms,
                                       fileframeidx+multiframeio-1,
                                       0, natoms, NULL,
                                       &multistartoffset, &multifoffset,
                                       &multireadlen);

        multireadlen = (multifoffset + multireadlen) - foffset;

        readlen = multireadlen;
        idx+=multiframeio-1;
      }

      long ret=0;
      if (usecufile) {
        ret = cuFileRead(parms->cfh[myfileidx], (char *) devptr, readlen, foffset, 0);
      } else if (hostfds) {
        foffset=0;
        ret=fio_fseek(hostfds[myfileidx], foffset, FIO_SEEK_SET);
        if (ret<0) { printf("fio_fseek() error!\n"); return NULL; }
        ret=fio_fread(hostptr, readlen, 1, hostfds[myfileidx]);
        if (ret<0) { printf("fio_fread() error!\n"); return NULL; }
        cudaMemcpy(devptr, hostptr, readlen, cudaMemcpyHostToDevice);
        CUERR
      } else {
        printf("Inconsistent cufile/hostfds state, aborting!\n");
        return NULL;
      }

      if (ret < 0) {
        printf("Thr[%2d] Error: cuFileRead(): %ld\n", threadid, ret);
        return NULL;
      }

#if 0
      float hostbuf[64];
      cudaMemcpy(hostbuf, devptr, sizeof(hostbuf), cudaMemcpyDeviceToHost);
      int l;
      for (l=0; l<10; l++) {
        printf("%.1f ", hostbuf[l]);
      }
      printf("\n");
#endif

      float rmsd=0;
#if 0
      cudaDeviceSynchronize();
      CUERR

      // convert the incoming AoS timestep to SoA layout for the QCP kernel
      dim3 Bsz = dim3(256, 1, 1);
      dim3 Gsz = dim3((natoms + Bsz.x - 1) / Bsz.x, 1, 1);
      if (sel->selected == natoms) {
        vmd_float3_aos_to_soa<<<Gsz, Bsz, 0, qcpstream>>>(natoms, (float3 *) devptr, crdx1, crdy1, crdz1);
      } else {
        vmd_float3_aos_to_soa_selected<<<Gsz, Bsz, 0, qcpstream>>>(natoms, gpuidxlist, (float3 *) devptr, crdx1, crdy1, crdz1);
      }

      vmd_qcp_innerprod_soa_devicewide<<<QCPGsz, QCPBsz, 0, qcpstream>>>(pr, crdx1, crdy1, crdz1, crdx1, crdy1, crdz1, natoms, NULL);

      double pr_host[10];
      cudaMemcpy(pr_host, pr, 10 * sizeof(double), cudaMemcpyDeviceToHost);
#if 0
      printf("Thr[%2d] pr_host %.1f %.1f %.1f %.1f %.1f %.1f %.1f %.1f %.1f %.1f\n",
             threadid, pr_host[0], pr_host[1], pr_host[2],
             pr_host[3], pr_host[4], pr_host[5],
             pr_host[6], pr_host[7], pr_host[8], pr_host[9]);
#endif

      double E0 = pr_host[9];
      FastCalcRMSDAndRotation(NULL, pr_host, &rmsd, E0, sel->selected, -1);
#endif

      if (verbose) {
        printf("Thr[%2d]F[%d][tile: %d to %d] frame: %d RMSD: %4.1f cuFile len: %ld off: %ld\n",
               threadid, myfileidx, tile.start, tile.end, idx, rmsd,
               readlen, foffset);
      }
    }
  }

#else

  int xfileidx = -1;
  int yfileidx = -1;
  wkf_tasktile_t tile;
  long lastx = -1, lasty = -1;
  while (wkf_threadlaunch_next_tile(voidparms, 8, &tile) != WKF_SCHED_DONE) {
    int idx;
    for (idx=tile.start; idx<tile.end; idx++) {
      // map the linear work index to a frame pair (x, y)
      long x=-1, y=-1, ret=0;
      if (idx2sub_tril(framecount-1, idx, &x, &y)) {
        printf("qcpthread[%d]: work idx %d out of triangle!\n", threadid, idx);
        break;
      }

      long startoffset, foffset, readlen;

      // only re-read the Y frame when it changes between iterations
      if (lasty != y) {
        yfileidx = (threadid / 4) % nfiles;
        long fileframeidx = y % framesperfile;
        read_js_timestep_index_offsets(parms->jshandles[yfileidx],
                                       natoms, fileframeidx,
                                       0, natoms, NULL,
                                       &startoffset, &foffset, &readlen);
        printf("Thr[%2d] pair(%3ld x %3ld) cuFileRead(Y): offset %ld readlen: %ld\n",
               threadid, x, y, foffset, readlen);
        lasty = y;
      }

      // only re-read the X frame when it changes between iterations
      if (ret >= 0 && lastx != x) {
        xfileidx = (threadid / 4) % nfiles;
        long fileframeidx = x % framesperfile;
        read_js_timestep_index_offsets(parms->jshandles[xfileidx],
                                       natoms, fileframeidx,
                                       0, natoms, NULL,
                                       &startoffset, &foffset, &readlen);
        printf("Thr[%2d] pair(%3ld x %3ld) cuFileRead(X): offset %ld readlen: %ld\n",
               threadid, x, y, foffset, readlen);
        ret = cuFileRead(parms->cfh[xfileidx], (char *) devptr+blockpadsz, readlen, foffset, 0);
        lastx = x;
      }

      if (ret < 0) {
        const char *descp = "unknown error code";
#if 0
        if (cuGetErrorName(ret, &descp) != CUDA_SUCCESS)
          descp = "unknown cuda error";
#endif

        printf("Thr[%2d] Error: cuFileRead(): %ld, '%s'\n", threadid, ret, descp);
        return NULL;
      }

      // convert the incoming AoS timestep to SoA layout and run the QCP kernel
      cudaDeviceSynchronize();
      CUERR

      dim3 Bsz = dim3(256, 1, 1);
      dim3 Gsz = dim3((natoms + Bsz.x - 1) / Bsz.x, 1, 1);
      if (sel->selected == natoms) {
        vmd_float3_aos_to_soa<<<Gsz, Bsz, 0, qcpstream>>>(natoms, (float3 *) devptr, crdx1, crdy1, crdz1);
      } else {
        vmd_float3_aos_to_soa_selected<<<Gsz, Bsz, 0, qcpstream>>>(natoms, gpuidxlist, (float3 *) devptr, crdx1, crdy1, crdz1);
      }

      vmd_qcp_innerprod_soa_devicewide<<<QCPGsz, QCPBsz, 0, qcpstream>>>(pr, crdx1, crdy1, crdz1, crdx1, crdy1, crdz1, natoms, NULL);

#if 0
      double pr_host[10];
      cudaMemcpy(pr_host, pr, 10 * sizeof(double), cudaMemcpyDeviceToHost);

      float rmsd=0;
      double E0 = pr_host[9];
      FastCalcRMSDAndRotation(NULL, pr_host, &rmsd, E0, sel->selected, -1);
#endif
    }
  }
#endif

  // release GPU and host resources
  cudaFree(gpuidxlist);
  cudaFree(pr);
  cudaFree(crdx1);
  cudaFree(crdy1);
  cudaFree(crdz1);
  cudaFree(crdx2);
  cudaFree(crdy2);
  cudaFree(crdz2);

#if defined(VMDUSECUDAGDS)
  if (usecufile) {
    cuFileBufDeregister(devptr);
  }
#endif

  if (hostfds != NULL) {
    int i;
    for (i=0; i<nfiles; i++) {
      fio_fclose(hostfds[i]);
    }
    free(hostfds);
  }

  if (hostptr != NULL) {
    if (pinhostiobuffer) {
      cudaFreeHost(hostptr);
    } else {
      free(hostptr_unaligned);
    }
  }

  return NULL;
}

#endif


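// Host entry point for the out-of-core QCP RMSD matrix computation: opens the
// trajectory file set with the js plugin, registers the files with cuFile for
// GPUDirect Storage, launches the OOC worker threads over the device pool,
// and reports aggregate I/O throughput.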
int qcp_soa_gpu_ooc(wkf_threadpool_t *devpool,
                    int nfiles, const char **trjfileset,
                    const AtomSel *sel,
                    int first, int last, int step, float *rmsdmat) {
  printf("qcp_soa_gpu_ooc()\n");
  wkf_threadpool_t *bigpool = NULL;

#if defined(VMDUSECUDAGDS)
  int devcount;
  cudaError_t crc = cudaGetDeviceCount(&devcount);
  printf(" GPU device count: %d\n", devcount);
  if (devcount==0)
    printf(" No GPU devices, continuing with host only...\n");

  CUfileHandle_t * cfh = (CUfileHandle_t *) calloc(1, nfiles * sizeof(CUfileHandle_t));
  CUfileDescr_t * cfhdesc = (CUfileDescr_t *) calloc(1, nfiles * sizeof(CUfileDescr_t));
  memset(&cfh[0], 0, nfiles * sizeof(CUfileHandle_t));
  memset(&cfhdesc[0], 0, nfiles * sizeof(CUfileDescr_t));

  int natomschk = 0;
  jshandle **jshandles = (jshandle **) calloc(1, nfiles * sizeof(jshandle *));
  fio_fd *directio_fds = (fio_fd *) calloc(1, nfiles * sizeof(fio_fd));

  int i;
  for (i=0; i<nfiles; i++) {
    const char *filename = trjfileset[i];
    printf("File[%d] GDS setup, opening '%s'\n", i, filename);
    jshandles[i] = (jshandle *) open_js_read(filename, "js", &natomschk);
    if (!jshandles[i]) {
      printf("File[%d] open_js_read failed for file %s\n", i, filename);
      return -1;
    }

#if vmdplugin_ABIVERSION > 17
    long blocksz = MOLFILE_DIRECTIO_MIN_BLOCK_SIZE;
    int filepgalignsz = 1;
    read_js_timestep_pagealign_size(jshandles[i], &filepgalignsz);
    if (filepgalignsz != blocksz) {
      printf("File[%d] Plugin-returned page alignment size mismatch!\n", i);
    } else {
      printf("File[%d] Page alignment size: %d\n", i, filepgalignsz);
    }
#endif

    // obtain the underlying direct I/O file descriptor and register it with cuFile
    read_js_timestep_index_offsets(jshandles[i], natomschk, 0, 0, 0,
                                   &directio_fds[i], NULL, NULL, NULL);

    cfhdesc[i].handle.fd = directio_fds[i];
    cfhdesc[i].type = CU_FILE_HANDLE_TYPE_OPAQUE_FD;
    CUfileError_t cferr = cuFileHandleRegister(&cfh[i], &cfhdesc[i]);

    if (cferr.err != CU_FILE_SUCCESS) {
      printf("File[%d] cuFileHandleRegister on fd %d failed!\n",
             i, cfhdesc[i].handle.fd);
      return -1;
    }
  }

  // set up the out-of-core worker thread parameters
  gpuqcprmsdmatoocthreadparms parms;
  memset(&parms, 0, sizeof(parms));
  parms.devcount = devcount;
  parms.nfiles = nfiles;
  parms.trjfileset = trjfileset;
  parms.jshandles = jshandles;
  parms.cfh = cfh;
  parms.natoms = sel->num_atoms;
  parms.sel = sel;
  parms.rmsdmat = rmsdmat;
  parms.first = first;
  parms.last = last;
  parms.step = step;

  int framecount = nfiles * (last / step);

  wkf_timerhandle timer;
  timer=wkf_timer_create();

  wkf_tasktile_t tile;
  tile.start=0;
#if defined(VMDGDSIOBENCHMARKONLY)
  tile.end=framecount - 1;
#elif 1
  tile.end=framecount*16 - 1;
#else
  tile.end=(framecount-1)*(framecount-1)/2;
#endif

  printf("** qcp_soa_gpu_ooc(): tile start: %d end: %d\n", tile.start, tile.end);

  int gdsthreadspergpu = 1;
  if (getenv("VMDGDSTHREADSPERGPU") != NULL)
    gdsthreadspergpu = atoi(getenv("VMDGDSTHREADSPERGPU"));

  printf("** gdsthreadspergpu: %d\n", gdsthreadspergpu);

  if (gdsthreadspergpu > 1) {
    // create a larger device pool with multiple worker threads per GPU
    int workercount = devcount * gdsthreadspergpu;

    int *devlist = new int[workercount];
    int k;
    for (k=0; k<workercount; k++) {
      devlist[k] = k / gdsthreadspergpu;
    }

    msgInfo << "Creating Multi-worker ("
            << gdsthreadspergpu << " per-GPU) CUDA device pool..." << sendmsg;
    bigpool=wkf_threadpool_create(workercount, devlist);
    delete [] devlist;

    if (getenv("VMDCUDAVERBOSE") != NULL)
      wkf_threadpool_launch(bigpool, vmd_cuda_devpool_setdeviceonly, (void*)"VMD CUDA Dev Init", 1);
    else
      wkf_threadpool_launch(bigpool, vmd_cuda_devpool_setdeviceonly, NULL, 1);

    wkf_threadpool_launch(bigpool, vmd_cuda_devpool_clear_device_mem, NULL, 1);

    // use the expanded pool for the OOC computation
    devpool = bigpool;
  }

  wkf_threadpool_launch(devpool, vmd_cuda_affinitize_threads, NULL, 1);

  wkf_threadpool_sched_dynamic(devpool, &tile);
  wkf_timer_start(timer);
  wkf_threadpool_launch(devpool, measure_rmsdmat_qcp_ooc_thread, &parms, 1);
  wkf_timer_stop(timer);

  double runtime = wkf_timer_time(timer);
  double gbytes = sel->num_atoms * 12L * (tile.end+1) / (1024.0 * 1024.0 * 1024.0);

  printf("QCP natoms: %d, fsz: %ld, tsz: %ld\n",
         sel->num_atoms, sel->num_atoms * 12L,
         sel->num_atoms * 12L * (tile.end+1));

  int pinhostiobuffer = 1;
  if (getenv("VMDGDSHOSTNOPIN"))
    pinhostiobuffer=0;

  int hostusedirectio = 1;
  if (getenv("VMDGDSHOSTBUFFEREDIO") != NULL)
    hostusedirectio = 0;

  int usecufile=1;
  if (getenv("VMDGDSUSEHOST"))
    usecufile=0;

  if (usecufile) {
    printf("OOC I/O via GDS + cuFile\n");
  } else {
    printf("OOC I/O via host, %s APIs, %s memory buffers\n",
           (hostusedirectio) ? "Direct I/O" : "Buffered I/O",
           (pinhostiobuffer) ? "pinned" : "unpinned");
  }

  int multiframeio = 0;
  if (getenv("VMDGDSMULTIFRAME"))
    multiframeio = atoi(getenv("VMDGDSMULTIFRAME"));
  if (multiframeio > VMDGDSMAXFRAMEBUF)
    multiframeio = VMDGDSMAXFRAMEBUF;
  if (multiframeio) {
    printf("QCP GDS multi-frame read opt: %d frames per call, %ld bytes\n",
           multiframeio,
           multiframeio * sel->num_atoms * 12L);
  }

  printf("QCP runtime: %.1f, %.2fGB/sec\n", runtime, gbytes/runtime);

  for (i=0; i<nfiles; i++) {
#if defined(VMDUSECUDAGDS)
    cuFileHandleDeregister(cfh[i]);
#endif
    close_js_read(jshandles[i]);
  }
#endif

#if defined(VMDUSECUDAGDS)
  if (cfh != NULL)
    free(cfh);

  if (cfhdesc != NULL)
    free(cfhdesc);

  if (jshandles != NULL)
    free(jshandles);

  if (directio_fds != NULL)
    free(directio_fds);
#endif

  if (bigpool != NULL)
    wkf_threadpool_destroy(bigpool);

  return 0;
}