#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <cuda.h>

#include "Inform.h"
#include "utilities.h"
#include "WKFThreads.h"
#include "WKFUtils.h"
#include "CUDAKernels.h"

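/* Parameters passed to each worker thread; one worker thread drives
 * one CUDA device. */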
typedef struct {
  float* atoms;       // array of atoms, 4 floats each: x, y, z, charge
  float* grideners;   // energy grid the computed potentials are summed into
  long int numplane;  // number of grid planes (z dimension)
  long int numcol;    // number of grid columns per plane (y dimension)
  long int numpt;     // number of grid points per row (x dimension)
  long int natoms;    // total number of atoms
  float gridspacing;  // lattice spacing between grid points
} enthrparms;

static void * cudaenergythread(void *);

/* Check for and report CUDA errors; aborts the calling thread with a
 * diagnostic message if a CUDA call has failed.  Compiled out when the
 * #if below is disabled. */
#if 1
#define CUERR { cudaError_t err; \
  if ((err = cudaGetLastError()) != cudaSuccess) { \
  printf("CUDA error: %s, %s line %d\n", cudaGetErrorString(err), __FILE__, __LINE__); \
  printf("Thread aborting...\n"); \
  return NULL; }}
#else
#define CUERR
#endif

/* Atoms are processed in batches sized to fit the fixed constant
 * memory buffer below. */
#define MAXATOMS 4000
__constant__ static float4 atominfo[MAXATOMS];
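/* Each float4 in atominfo holds an atom's x and y coordinates, its
 * precomputed squared z-distance to the grid plane being evaluated,
 * and its charge (see copyatomstoconstbuf() below). */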
/* Loop unrolling factors and thread block dimensions for the potential
 * kernel; the #if selects between two alternative configurations. */
#if 1

#define UNROLLX 8
#define UNROLLY 1
#define BLOCKSIZEX 8   // make large enough to allow coalesced global mem ops
#define BLOCKSIZEY 8   // make as small as possible for finer granularity

#else

#define UNROLLX 4
#define UNROLLY 1
#define BLOCKSIZEX 16  // make large enough to allow coalesced global mem ops
#define BLOCKSIZEY 16  // make as small as possible for finer granularity

#endif

#define BLOCKSIZE (BLOCKSIZEX*BLOCKSIZEY)

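/* Approximate FLOPs performed per atom evaluation, used by the
 * performance report; the counts follow the arithmetic in the unrolled
 * kernel loop below. */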
#if UNROLLX == 8
#define FLOPSPERATOMEVAL (59.0/8.0)
#elif UNROLLX == 4
#define FLOPSPERATOMEVAL (31.0/4.0)
#endif

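/* Direct-summation potential kernel: each thread evaluates the potential
 * at UNROLLX grid points along x, spaced BLOCKSIZEX points apart so that
 * global memory reads and writes remain coalesced. */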
__global__ static void cenergy(int numatoms, float gridspacing, float * energygrid) {
  unsigned int xindex  = blockIdx.x * blockDim.x * UNROLLX + threadIdx.x;
  unsigned int yindex  = blockIdx.y * blockDim.y + threadIdx.y;
  unsigned int outaddr = gridDim.x * blockDim.x * UNROLLX * yindex + xindex;

  float coory = gridspacing * yindex;
  float coorx = gridspacing * xindex;

  float energyvalx1=0.0f;
  float energyvalx2=0.0f;
  float energyvalx3=0.0f;
  float energyvalx4=0.0f;
#if UNROLLX > 4
  float energyvalx5=0.0f;
  float energyvalx6=0.0f;
  float energyvalx7=0.0f;
  float energyvalx8=0.0f;
#endif

  // x distance between consecutive unrolled points handled by this thread
  float gridspacing_coalesce = gridspacing * BLOCKSIZEX;
  int atomid;
  for (atomid=0; atomid<numatoms; atomid++) {
    // atominfo[].z holds the precomputed squared z-distance to this plane
    float dy = coory - atominfo[atomid].y;
    float dyz2 = (dy * dy) + atominfo[atomid].z;

    float dx1 = coorx - atominfo[atomid].x;
    float dx2 = dx1 + gridspacing_coalesce;
    float dx3 = dx2 + gridspacing_coalesce;
    float dx4 = dx3 + gridspacing_coalesce;
#if UNROLLX > 4
    float dx5 = dx4 + gridspacing_coalesce;
    float dx6 = dx5 + gridspacing_coalesce;
    float dx7 = dx6 + gridspacing_coalesce;
    float dx8 = dx7 + gridspacing_coalesce;
#endif

    // accumulate charge / distance for each of the unrolled grid points
    energyvalx1 += atominfo[atomid].w * rsqrtf(dx1*dx1 + dyz2);
    energyvalx2 += atominfo[atomid].w * rsqrtf(dx2*dx2 + dyz2);
    energyvalx3 += atominfo[atomid].w * rsqrtf(dx3*dx3 + dyz2);
    energyvalx4 += atominfo[atomid].w * rsqrtf(dx4*dx4 + dyz2);
#if UNROLLX > 4
    energyvalx5 += atominfo[atomid].w * rsqrtf(dx5*dx5 + dyz2);
    energyvalx6 += atominfo[atomid].w * rsqrtf(dx6*dx6 + dyz2);
    energyvalx7 += atominfo[atomid].w * rsqrtf(dx7*dx7 + dyz2);
    energyvalx8 += atominfo[atomid].w * rsqrtf(dx8*dx8 + dyz2);
#endif
  }

  // add the accumulated potentials into the energy grid; the BLOCKSIZEX
  // stride keeps stores from adjacent threads coalesced
  energygrid[outaddr             ] += energyvalx1;
  energygrid[outaddr+1*BLOCKSIZEX] += energyvalx2;
  energygrid[outaddr+2*BLOCKSIZEX] += energyvalx3;
  energygrid[outaddr+3*BLOCKSIZEX] += energyvalx4;
#if UNROLLX > 4
  energygrid[outaddr+4*BLOCKSIZEX] += energyvalx5;
  energygrid[outaddr+5*BLOCKSIZEX] += energyvalx6;
  energygrid[outaddr+6*BLOCKSIZEX] += energyvalx7;
  energygrid[outaddr+7*BLOCKSIZEX] += energyvalx8;
#endif
}


// grid tile dimensions and the alignment masks used to pad the energy grid
#define TILESIZEX (BLOCKSIZEX*UNROLLX)
#define TILESIZEY (BLOCKSIZEY*UNROLLY)
#define GPU_X_ALIGNMASK (TILESIZEX - 1)
#define GPU_Y_ALIGNMASK (TILESIZEY - 1)

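/* Stage a batch of atoms into the constant memory buffer, precomputing
 * each atom's squared z-distance to the grid plane being evaluated. */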
static int copyatomstoconstbuf(const float *atoms, float *atompre,
                               int count, float zplane) {
  if (count > MAXATOMS) {
    printf("Atom count exceeds constant buffer storage capacity\n");
    return -1;
  }

  int i;
  for (i=0; i<count*4; i+=4) {
    atompre[i    ] = atoms[i    ];  // x
    atompre[i + 1] = atoms[i + 1];  // y
    float dz = zplane - atoms[i + 2];
    atompre[i + 2] = dz*dz;         // squared z-distance to the plane
    atompre[i + 3] = atoms[i + 3];  // charge
  }

  cudaMemcpyToSymbol(atominfo, atompre, count * 4 * sizeof(float), 0);

  return 0;
}

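/* Entry point: distribute the grid planes of the potential calculation
 * across the available CUDA devices, one worker thread per device. */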
int vmd_cuda_vol_cpotential(long int natoms, float* atoms, float* grideners,
                            long int numplane, long int numcol, long int numpt,
                            float gridspacing) {
  enthrparms parms;
  wkf_timerhandle globaltimer;
  double totalruntime;
  int rc=0;

  int numprocs = wkf_thread_numprocessors();
  int deviceCount = 0;
  if (vmd_cuda_num_devices(&deviceCount))
    return -1;
  if (deviceCount < 1)
    return -1;

  // don't launch more worker threads than there are GPUs
  if (deviceCount < numprocs) {
    numprocs = deviceCount;
  }

  printf("Using %d CUDA GPUs\n", numprocs);
  printf("GPU padded grid size: %ld x %ld x %ld\n",
         (numpt  + GPU_X_ALIGNMASK) & ~(GPU_X_ALIGNMASK),
         (numcol + GPU_Y_ALIGNMASK) & ~(GPU_Y_ALIGNMASK),
         numplane);

  parms.atoms = atoms;
  parms.grideners = grideners;
  parms.numplane = numplane;
  parms.numcol = numcol;
  parms.numpt = numpt;
  parms.natoms = natoms;
  parms.gridspacing = gridspacing;

  globaltimer = wkf_timer_create();
  wkf_timer_start(globaltimer);

  // spawn the worker threads; the tile covers all grid planes, and each
  // thread dynamically claims planes from it as it finishes work
  wkf_tasktile_t tile;
  tile.start=0;
  tile.end=numplane;
  rc = wkf_threadlaunch(numprocs, &parms, cudaenergythread, &tile);

  // report performance and clean up
  wkf_timer_stop(globaltimer);
  totalruntime = wkf_timer_time(globaltimer);
  wkf_timer_destroy(globaltimer);

  if (!rc) {
    double atomevalssec = ((double) numplane * numcol * numpt * natoms) / (totalruntime * 1000000000.0);
    printf(" %g billion atom evals/second, %g GFLOPS\n",
           atomevalssec, atomevalssec * FLOPSPERATOMEVAL);
  } else {
    msgWarn << "A GPU encountered an unrecoverable error." << sendmsg;
    msgWarn << "Calculation will continue using the main CPU." << sendmsg;
  }
  return rc;
}


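/* Worker thread: attach to one CUDA device and compute the potential
 * contributions for the grid planes handed out by the tile scheduler. */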
static void * cudaenergythread(void *voidparms) {
  dim3 volsize, Gsz, Bsz;
  float *devenergy = NULL;
  float *hostenergy = NULL;
  float *atomprebuf = NULL;
  enthrparms *parms = NULL;
  int threadid=0;
  int atomprebufpinned = 0;  // set to 1 to attempt a pinned (page-locked) DMA buffer

  wkf_threadlaunch_getid(voidparms, &threadid, NULL);
  wkf_threadlaunch_getdata(voidparms, (void **) &parms);

  // copy the shared parameter block into local variables for convenience
  const float *atoms = parms->atoms;
  float* grideners = parms->grideners;
  const long int numplane = parms->numplane;
  const long int numcol = parms->numcol;
  const long int numpt = parms->numpt;
  const long int natoms = parms->natoms;
  const float gridspacing = parms->gridspacing;
  double lasttime, totaltime;

  // attach this thread to its own CUDA device
  cudaError_t rc;
  rc = cudaSetDevice(threadid);
  if (rc != cudaSuccess) {
#if CUDART_VERSION >= 2010
    // a device may already be bound to this thread, which is harmless;
    // any other error is fatal
    rc = cudaGetLastError();
    if (rc != cudaErrorSetOnActiveProcess)
      return NULL;
#else
    cudaGetLastError();  // older CUDA revs: just clear the error state
#endif
  }

  // pad the energy grid dimensions out to the kernel tile size
  volsize.x = (numpt  + GPU_X_ALIGNMASK) & ~(GPU_X_ALIGNMASK);
  volsize.y = (numcol + GPU_Y_ALIGNMASK) & ~(GPU_Y_ALIGNMASK);
  volsize.z = 1;  // the kernel processes one plane at a time

  // set up the CUDA thread block and grid dimensions
  Bsz.x = BLOCKSIZEX;
  Bsz.y = BLOCKSIZEY;
  Bsz.z = 1;
  Gsz.x = volsize.x / (Bsz.x * UNROLLX);
  Gsz.y = volsize.y / (Bsz.y * UNROLLY);
  Gsz.z = 1;
  int volmemsz = sizeof(float) * volsize.x * volsize.y * volsize.z;

  printf("Thread %d started for CUDA device %d...\n", threadid, threadid);
  wkf_timerhandle timer = wkf_timer_create();
  wkf_timer_start(timer);
  wkfmsgtimer * msgt = wkf_msg_timer_create(5);

  // padding added to the atom staging buffer allocations below
#define DMABUFPADSIZE (32 * 1024)

  // if enabled, try to allocate the atom staging buffer in pinned
  // (page-locked) memory for faster host-GPU DMA transfers
  if (atomprebufpinned) {
    if (cudaMallocHost((void**) &atomprebuf, MAXATOMS*4*sizeof(float) + DMABUFPADSIZE) != cudaSuccess) {
      printf("Pinned atom copy buffer allocation failed!\n");
      atomprebufpinned=0;  // fall back to a pageable allocation
    }
  }

  // use a normal pageable allocation if pinned allocation is disabled or failed
  if (!atomprebufpinned) {
    atomprebuf = (float *) malloc(MAXATOMS * 4 * sizeof(float) + DMABUFPADSIZE);
    if (atomprebuf == NULL) {
      printf("Atom copy buffer allocation failed!\n");
      return NULL;
    }
  }

  // allocate the device and host energy grid buffers
  cudaMalloc((void**)&devenergy, volmemsz);
  CUERR

  hostenergy = (float *) malloc(volmemsz);
  if (hostenergy == NULL) {
    printf("Host energy buffer allocation failed!\n");
    return NULL;
  }

  // process grid planes until the tile scheduler runs out of work
  int iterations=0;
  int computedplanes=0;
  wkf_tasktile_t tile;
  while (wkf_threadlaunch_next_tile(voidparms, 1, &tile) != WKF_SCHED_DONE) {
    int k;
    for (k=tile.start; k<tile.end; k++) {
      int y;
      int atomstart;
      float zplane = k * (float) gridspacing;
      computedplanes++;

      // copy the current plane into the padded host buffer, row by row
      for (y=0; y<numcol; y++) {
        long eneraddr = k*numcol*numpt + y*numpt;
        memcpy(&hostenergy[y*volsize.x], &grideners[eneraddr], numpt * sizeof(float));
      }

      // copy the energy grid plane to the GPU
      cudaMemcpy(devenergy, hostenergy, volmemsz, cudaMemcpyHostToDevice);
      CUERR

      lasttime = wkf_timer_timenow(timer);
      for (atomstart=0; atomstart<natoms; atomstart+=MAXATOMS) {
        iterations++;
        int runatoms;
        int atomsremaining = natoms - atomstart;
        if (atomsremaining > MAXATOMS)
          runatoms = MAXATOMS;
        else
          runatoms = atomsremaining;

        // copy the next batch of atoms into GPU constant memory
        if (copyatomstoconstbuf(atoms + 4*atomstart, atomprebuf, runatoms, zplane))
          return NULL;

        // run the kernel to sum this batch's contribution to the plane
        cenergy<<<Gsz, Bsz, 0>>>(runatoms, gridspacing, devenergy);
        CUERR
      }

      // copy the updated plane back to the host
      cudaMemcpy(hostenergy, devenergy, volmemsz, cudaMemcpyDeviceToHost);
      CUERR

      // copy the result rows back into the unpadded energy grid
      for (y=0; y<numcol; y++) {
        long eneraddr = k*numcol*numpt + y*numpt;
        memcpy(&grideners[eneraddr], &hostenergy[y*volsize.x], numpt * sizeof(float));
      }

      totaltime = wkf_timer_timenow(timer);
      if (wkf_msg_timer_timeout(msgt)) {
        // emit a periodic progress message with a completion estimate
        printf("thread[%d] plane %d/%ld (%d computed) time %.2f, elapsed %.1f, est. total: %.1f\n",
               threadid, k, numplane, computedplanes,
               totaltime - lasttime, totaltime,
               totaltime * numplane / (k+1));
      }
    }
  }

  wkf_timer_destroy(timer);
  wkf_msg_timer_destroy(msgt);
  free(hostenergy);
  cudaFree(devenergy);
  CUERR

  // release the atom staging buffer with the matching deallocator
  if (atomprebufpinned) {
    if (cudaFreeHost(atomprebuf) != cudaSuccess) {
      printf("Pinned atom buffer deallocation failed!\n");
      return NULL;
    }
  } else {
    free(atomprebuf);
  }
  return NULL;
}