00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021 #include <stdio.h>
00022 #include <stdlib.h>
00023 #include <string.h>
00024 #if defined(__APPLE__)
00025 #include <OpenCL/cl.h>
00026 #else
00027 #include <CL/cl.h>
00028 #endif
00029
00030 #include "Inform.h"
00031 #include "utilities.h"
00032 #include "WKFThreads.h"
00033 #include "WKFUtils.h"
00034 #include "OpenCLKernels.h"
00035 #include "OpenCLUtils.h"
00036
00037 typedef struct {
00038 float* atoms;
00039 float* grideners;
00040 long int numplane;
00041 long int numcol;
00042 long int numpt;
00043 long int natoms;
00044 float gridspacing;
00045 } enthrparms;
00046
00047
00048 static void * openclenergythread(void *);
00049
00050 #if 1
00051 #define CLERR \
00052 if (clerr != CL_SUCCESS) { \
00053 printf("opencl error %d, %s line %d\n", clerr, __FILE__, __LINE__); \
00054 return NULL; \
00055 }
00056 #else
00057 #define CLERR
00058 #endif
00059
00060
00061
00062
00063
00064 #define MAXATOMS 4000
00065
00066
00067
00068
00069
00070
00071
00072
00073
00074
00075
00076
00077
00078
00079
00080
00081
00082
00083
00084
00085
00086
00087
00088
00089
00090 #define UNROLLX 8
00091 #define UNROLLY 1
00092 #define BLOCKSIZEX 8 // make large enough to allow coalesced global mem ops
00093 #define BLOCKSIZEY 8 // make as small as possible for finer granularity
00094 #define BLOCKSIZE (BLOCKSIZEX * BLOCKSIZEY)
00095
00096 #define V4UNROLLX 8
00097 #define V4UNROLLY 1
00098 #define V4BLOCKSIZEX 8
00099 #define V4BLOCKSIZEY 8
00100 #define V4BLOCKSIZE V4BLOCKSIZEX * V4BLOCKSIZEY
00101
00102
00103 #define FLOPSPERATOMEVAL (59.0/8.0)
00104
00105
00106 const char* clenergysrc =
00107 "__kernel __attribute__((reqd_work_group_size(BLOCKSIZEX, BLOCKSIZEY, 1))) \n"
00108 "void clenergy(int numatoms, float gridspacing, __global float *energy, __constant float4 *atominfo) { \n"
00109 " unsigned int xindex = (get_global_id(0) - get_local_id(0)) * UNROLLX + get_local_id(0); \n"
00110 " unsigned int yindex = get_global_id(1); \n"
00111 " unsigned int outaddr = get_global_size(0) * UNROLLX * yindex + xindex;\n"
00112 " \n"
00113 " float coory = gridspacing * yindex; \n"
00114 " float coorx = gridspacing * xindex; \n"
00115 " \n"
00116 " float energyvalx1 = 0.0f; \n"
00117 #if UNROLLX >= 4
00118 " float energyvalx2 = 0.0f; \n"
00119 " float energyvalx3 = 0.0f; \n"
00120 " float energyvalx4 = 0.0f; \n"
00121 #endif
00122 #if UNROLLX == 8
00123 " float energyvalx5 = 0.0f; \n"
00124 " float energyvalx6 = 0.0f; \n"
00125 " float energyvalx7 = 0.0f; \n"
00126 " float energyvalx8 = 0.0f; \n"
00127 #endif
00128 " \n"
00129 " float gridspacing_u = gridspacing * BLOCKSIZEX; \n"
00130 " \n"
00131 " int atomid; \n"
00132 " for (atomid=0; atomid<numatoms; atomid++) { \n"
00133 " float dy = coory - atominfo[atomid].y; \n"
00134 " float dyz2 = (dy * dy) + atominfo[atomid].z; \n"
00135 " \n"
00136 " float dx1 = coorx - atominfo[atomid].x; \n"
00137 #if UNROLLX >= 4
00138 " float dx2 = dx1 + gridspacing_u; \n"
00139 " float dx3 = dx2 + gridspacing_u; \n"
00140 " float dx4 = dx3 + gridspacing_u; \n"
00141 #endif
00142 #if UNROLLX == 8
00143 " float dx5 = dx4 + gridspacing_u; \n"
00144 " float dx6 = dx5 + gridspacing_u; \n"
00145 " float dx7 = dx6 + gridspacing_u; \n"
00146 " float dx8 = dx7 + gridspacing_u; \n"
00147 #endif
00148 " \n"
00149 " energyvalx1 += atominfo[atomid].w * native_rsqrt(dx1*dx1 + dyz2); \n"
00150 #if UNROLLX >= 4
00151 " energyvalx2 += atominfo[atomid].w * native_rsqrt(dx2*dx2 + dyz2); \n"
00152 " energyvalx3 += atominfo[atomid].w * native_rsqrt(dx3*dx3 + dyz2); \n"
00153 " energyvalx4 += atominfo[atomid].w * native_rsqrt(dx4*dx4 + dyz2); \n"
00154 #endif
00155 #if UNROLLX == 8
00156 " energyvalx5 += atominfo[atomid].w * native_rsqrt(dx5*dx5 + dyz2); \n"
00157 " energyvalx6 += atominfo[atomid].w * native_rsqrt(dx6*dx6 + dyz2); \n"
00158 " energyvalx7 += atominfo[atomid].w * native_rsqrt(dx7*dx7 + dyz2); \n"
00159 " energyvalx8 += atominfo[atomid].w * native_rsqrt(dx8*dx8 + dyz2); \n"
00160 #endif
00161 " } \n"
00162 " \n"
00163 " energy[outaddr ] += energyvalx1; \n"
00164 #if UNROLLX >= 4
00165 " energy[outaddr+1*BLOCKSIZEX] += energyvalx2; \n"
00166 " energy[outaddr+2*BLOCKSIZEX] += energyvalx3; \n"
00167 " energy[outaddr+3*BLOCKSIZEX] += energyvalx4; \n"
00168 #endif
00169 #if UNROLLX == 8
00170 " energy[outaddr+4*BLOCKSIZEX] += energyvalx5; \n"
00171 " energy[outaddr+5*BLOCKSIZEX] += energyvalx6; \n"
00172 " energy[outaddr+6*BLOCKSIZEX] += energyvalx7; \n"
00173 " energy[outaddr+7*BLOCKSIZEX] += energyvalx8; \n"
00174 #endif
00175 "} \n"
00176 " \n"
00177 " \n"
00178 " \n"
00179 "__kernel __attribute__((reqd_work_group_size(V4BLOCKSIZEX, V4BLOCKSIZEY, 1))) \n"
00180 "void clenergy_vec4(int numatoms, float gridspacing, __global float *energy, __constant float4 *atominfo) { \n"
00181 " unsigned int xindex = (get_global_id(0) - get_local_id(0)) * V4UNROLLX + get_local_id(0); \n"
00182 " unsigned int yindex = get_global_id(1); \n"
00183 " unsigned int outaddr = get_global_size(0) * V4UNROLLX * yindex + xindex;\n"
00184 " \n"
00185 " float coory = gridspacing * yindex; \n"
00186 " float coorx = gridspacing * xindex; \n"
00187 " \n"
00188 " float4 energyvalx = 0.f; \n"
00189 #if V4UNROLLX == 8
00190 " float4 energyvalx2 = 0.f; \n"
00191 #endif
00192 " \n"
00193 " float4 gridspacing_u4 = { 0.f, 1.f, 2.f, 3.f }; \n"
00194 " gridspacing_u4 *= gridspacing * V4BLOCKSIZEX; \n"
00195 " \n"
00196 " int atomid; \n"
00197 " for (atomid=0; atomid<numatoms; atomid++) { \n"
00198 " float dy = coory - atominfo[atomid].y; \n"
00199 " float dyz2 = (dy * dy) + atominfo[atomid].z; \n"
00200 " \n"
00201 " float4 dx = gridspacing_u4 + (coorx - atominfo[atomid].x); \n"
00202 " energyvalx += atominfo[atomid].w * native_rsqrt(dx*dx + dyz2); \n"
00203 #if V4UNROLLX == 8
00204 " dx += (4.0f * V4BLOCKSIZEX); \n"
00205 " energyvalx2 += atominfo[atomid].w * native_rsqrt(dx*dx + dyz2); \n"
00206 #endif
00207 " } \n"
00208 " \n"
00209 " energy[outaddr ] += energyvalx.x; \n"
00210 " energy[outaddr+1*V4BLOCKSIZEX] += energyvalx.y; \n"
00211 " energy[outaddr+2*V4BLOCKSIZEX] += energyvalx.z; \n"
00212 " energy[outaddr+3*V4BLOCKSIZEX] += energyvalx.w; \n"
00213 #if V4UNROLLX == 8
00214 " energy[outaddr+4*V4BLOCKSIZEX] += energyvalx2.x; \n"
00215 " energy[outaddr+5*V4BLOCKSIZEX] += energyvalx2.y; \n"
00216 " energy[outaddr+6*V4BLOCKSIZEX] += energyvalx2.z; \n"
00217 " energy[outaddr+7*V4BLOCKSIZEX] += energyvalx2.w; \n"
00218 #endif
00219 "} \n"
00220 " \n";
00221
00222
00223
00224
00225 #define TILESIZEX BLOCKSIZEX*UNROLLX
00226 #define TILESIZEY BLOCKSIZEY*UNROLLY
00227 #define GPU_X_ALIGNMASK (TILESIZEX - 1)
00228 #define GPU_Y_ALIGNMASK (TILESIZEY - 1)
00229
00230 #define V4TILESIZEX V4BLOCKSIZEX*V4UNROLLX
00231 #define V4TILESIZEY V4BLOCKSIZEY*V4UNROLLY
00232 #define V4GPU_X_ALIGNMASK (V4TILESIZEX - 1)
00233 #define V4GPU_Y_ALIGNMASK (V4TILESIZEY - 1)
00234
00235 static int copyatomstoconstbuf(cl_command_queue clcmdq, cl_mem datominfo,
00236 const float *atoms, int count, float zplane) {
00237 if (count > MAXATOMS) {
00238 printf("Atom count exceeds constant buffer storage capacity\n");
00239 return -1;
00240 }
00241
00242 float atompre[4*MAXATOMS];
00243 int i;
00244 for (i=0; i<count*4; i+=4) {
00245 atompre[i ] = atoms[i ];
00246 atompre[i + 1] = atoms[i + 1];
00247 float dz = zplane - atoms[i + 2];
00248 atompre[i + 2] = dz*dz;
00249 atompre[i + 3] = atoms[i + 3];
00250 }
00251
00252 cl_int clerr = CL_SUCCESS;
00253 clerr = clEnqueueWriteBuffer(clcmdq, datominfo, CL_TRUE, 0, count * sizeof(cl_float4), (void *) atompre, 0, NULL, NULL);
00254
00255
00256 return 0;
00257 }
00258
00259
00260 int vmd_opencl_vol_cpotential(long int natoms, float* atoms, float* grideners,
00261 long int numplane, long int numcol, long int numpt,
00262 float gridspacing) {
00263 enthrparms parms;
00264 wkf_timerhandle globaltimer;
00265 double totalruntime;
00266 int rc=0;
00267 int deviceCount = 1;
00268 int numprocs = 1;
00269
00270
00271
00272 if (deviceCount < numprocs) {
00273 numprocs = deviceCount;
00274 }
00275
00276 printf("Using %d OpenCL devices\n", numprocs);
00277 int usevec4=0;
00278 if (getenv("VMDDCSVEC4")!=NULL)
00279 usevec4=1;
00280
00281 if (usevec4) {
00282 printf("OpenCL padded grid size: %ld x %ld x %ld\n",
00283 (numpt + V4GPU_X_ALIGNMASK) & ~(V4GPU_X_ALIGNMASK),
00284 (numcol + V4GPU_Y_ALIGNMASK) & ~(V4GPU_Y_ALIGNMASK),
00285 numplane);
00286 } else {
00287 printf("OpenCL padded grid size: %ld x %ld x %ld\n",
00288 (numpt + GPU_X_ALIGNMASK) & ~(GPU_X_ALIGNMASK),
00289 (numcol + GPU_Y_ALIGNMASK) & ~(GPU_Y_ALIGNMASK),
00290 numplane);
00291 }
00292
00293 parms.atoms = atoms;
00294 parms.grideners = grideners;
00295 parms.numplane = numplane;
00296 parms.numcol = numcol;
00297 parms.numpt = numpt;
00298 parms.natoms = natoms;
00299 parms.gridspacing = gridspacing;
00300
00301 globaltimer = wkf_timer_create();
00302 wkf_timer_start(globaltimer);
00303
00304
00305 wkf_tasktile_t tile;
00306 tile.start=0;
00307 tile.end=numplane;
00308 rc = wkf_threadlaunch(numprocs, &parms, openclenergythread, &tile);
00309
00310
00311 wkf_timer_stop(globaltimer);
00312 totalruntime = wkf_timer_time(globaltimer);
00313 wkf_timer_destroy(globaltimer);
00314
00315 if (!rc) {
00316 double atomevalssec = ((double) numplane * numcol * numpt * natoms) / (totalruntime * 1000000000.0);
00317 printf(" %g billion atom evals/second, %g GFLOPS\n",
00318 atomevalssec, atomevalssec * FLOPSPERATOMEVAL);
00319 } else {
00320 msgWarn << "An OpenCL device encountered an unrecoverable error." << sendmsg;
00321 msgWarn << "Calculation will continue using the main CPU." << sendmsg;
00322 }
00323 return rc;
00324 }
00325
00326
00327 cl_program vmd_opencl_compile_volcpotential_pgm(cl_context clctx, cl_device_id *cldevs, int &clerr) {
00328 cl_program clpgm = NULL;
00329
00330 clpgm = clCreateProgramWithSource(clctx, 1, &clenergysrc, NULL, &clerr);
00331 CLERR
00332
00333 char clcompileflags[4096];
00334 sprintf(clcompileflags,
00335 "-DUNROLLX=%d -DUNROLLY=%d -DBLOCKSIZEX=%d -DBLOCKSIZEY=%d -DBLOCKSIZE=%d "
00336 "-DV4UNROLLX=%d -DV4UNROLLY=%d -DV4BLOCKSIZEX=%d -DV4BLOCKSIZEY=%d -DV4BLOCKSIZE=%d "
00337 "-cl-fast-relaxed-math -cl-single-precision-constant -cl-denorms-are-zero -cl-mad-enable -cl-no-signed-zeros",
00338 UNROLLX, UNROLLY, BLOCKSIZEX, BLOCKSIZEY, BLOCKSIZE,
00339 V4UNROLLX, V4UNROLLY, V4BLOCKSIZEX, V4BLOCKSIZEY, V4BLOCKSIZE);
00340
00341 clerr = clBuildProgram(clpgm, 0, NULL, clcompileflags, NULL, NULL);
00342 if (clerr != CL_SUCCESS)
00343 printf(" compilation failed!\n");
00344
00345 if (cldevs) {
00346 char buildlog[8192];
00347 size_t len=0;
00348 clerr = clGetProgramBuildInfo(clpgm, cldevs[0], CL_PROGRAM_BUILD_LOG, sizeof(buildlog), buildlog, &len);
00349 if (len > 1) {
00350 printf("OpenCL compilation log:\n");
00351 printf(" '%s'\n", buildlog);
00352 }
00353 CLERR
00354 }
00355
00356 return clpgm;
00357 }
00358
00359
00360
00361 static void * openclenergythread(void *voidparms) {
00362 size_t volsize[3], Gsz[3], Bsz[3];
00363 cl_int clerr = CL_SUCCESS;
00364 cl_mem devenergy = NULL;
00365 cl_mem datominfo = NULL;
00366 float *hostenergy = NULL;
00367 enthrparms *parms = NULL;
00368
00369 int threadid=0;
00370
00371 wkf_threadlaunch_getid(voidparms, &threadid, NULL);
00372 wkf_threadlaunch_getdata(voidparms, (void **) &parms);
00373
00374
00375
00376
00377 const float *atoms = parms->atoms;
00378 float* grideners = parms->grideners;
00379 const long int numplane = parms->numplane;
00380 const long int numcol = parms->numcol;
00381 const long int numpt = parms->numpt;
00382 const long int natoms = parms->natoms;
00383 const float gridspacing = parms->gridspacing;
00384 double lasttime, totaltime;
00385
00386 printf("OpenCL worker[%d] initializing...\n", threadid);
00387 cl_platform_id clplatid = vmd_cl_get_platform_index(0);
00388 cl_context_properties clctxprops[] = {(cl_context_properties) CL_CONTEXT_PLATFORM, (cl_context_properties) clplatid, (cl_context_properties) 0};
00389 #if 0
00390
00391
00392
00393
00394
00395
00396
00397
00398 cl_context clctx = clCreateContextFromType(clctxprops, CL_DEVICE_TYPE_CPU, NULL, NULL, &clerr);
00399 #else
00400 cl_context clctx = clCreateContextFromType(clctxprops, CL_DEVICE_TYPE_GPU, NULL, NULL, &clerr);
00401 #endif
00402 CLERR
00403
00404 size_t parmsz;
00405 clerr |= clGetContextInfo(clctx, CL_CONTEXT_DEVICES, 0, NULL, &parmsz);
00406 CLERR
00407
00408 cl_device_id* cldevs = (cl_device_id *) malloc(parmsz);
00409 clerr |= clGetContextInfo(clctx, CL_CONTEXT_DEVICES, parmsz, cldevs, NULL);
00410 CLERR
00411
00412 cl_command_queue clcmdq = clCreateCommandQueue(clctx, cldevs[0], 0, &clerr);
00413 CLERR
00414
00415 cl_program clpgm = vmd_opencl_compile_volcpotential_pgm(clctx, cldevs, clerr);
00416 CLERR
00417
00418 cl_kernel clenergy = clCreateKernel(clpgm, "clenergy", &clerr);
00419 cl_kernel clenergyvec4 = clCreateKernel(clpgm, "clenergy_vec4", &clerr);
00420 CLERR
00421 printf("OpenCL worker[%d] ready.\n", threadid);
00422
00423
00424 int usevec4=0;
00425 if (getenv("VMDDCSVEC4")!=NULL)
00426 usevec4=1;
00427
00428 if (usevec4) {
00429
00430 volsize[0] = (numpt + V4GPU_X_ALIGNMASK) & ~(V4GPU_X_ALIGNMASK);
00431 volsize[1] = (numcol + V4GPU_Y_ALIGNMASK) & ~(V4GPU_Y_ALIGNMASK);
00432 volsize[2] = 1;
00433 Bsz[0] = V4BLOCKSIZEX;
00434 Bsz[1] = V4BLOCKSIZEY;
00435 Bsz[2] = 1;
00436 Gsz[0] = volsize[0] / V4UNROLLX;
00437 Gsz[1] = volsize[1] / V4UNROLLY;
00438 Gsz[2] = volsize[2];
00439 } else {
00440
00441 volsize[0] = (numpt + GPU_X_ALIGNMASK) & ~(GPU_X_ALIGNMASK);
00442 volsize[1] = (numcol + GPU_Y_ALIGNMASK) & ~(GPU_Y_ALIGNMASK);
00443 volsize[2] = 1;
00444 Bsz[0] = BLOCKSIZEX;
00445 Bsz[1] = BLOCKSIZEY;
00446 Bsz[2] = 1;
00447 Gsz[0] = volsize[0] / UNROLLX;
00448 Gsz[1] = volsize[1] / UNROLLY;
00449 Gsz[2] = volsize[2];
00450 }
00451
00452
00453 int volmemsz = sizeof(float) * volsize[0] * volsize[1] * volsize[2];
00454
00455 printf("Thread %d started for OpenCL device %d...\n", threadid, threadid);
00456 wkf_timerhandle timer = wkf_timer_create();
00457 wkf_timer_start(timer);
00458 wkfmsgtimer * msgt = wkf_msg_timer_create(5);
00459
00460
00461
00462 #define DMABUFPADSIZE (32 * 1024)
00463
00464 hostenergy = (float *) malloc(volmemsz);
00465
00466 devenergy = clCreateBuffer(clctx, CL_MEM_READ_WRITE, volmemsz, NULL, NULL);
00467 CLERR
00468
00469 datominfo = clCreateBuffer(clctx, CL_MEM_READ_ONLY, MAXATOMS * sizeof(cl_float4), NULL, NULL);
00470 CLERR
00471
00472
00473
00474 int iterations=0;
00475 int computedplanes=0;
00476 wkf_tasktile_t tile;
00477 while (wkf_threadlaunch_next_tile(voidparms, 1, &tile) != WKF_SCHED_DONE) {
00478 int k;
00479 for (k=tile.start; k<tile.end; k++) {
00480 int y;
00481 int atomstart;
00482 float zplane = k * (float) gridspacing;
00483 computedplanes++;
00484
00485
00486 for (y=0; y<numcol; y++) {
00487 long eneraddr = k*numcol*numpt + y*numpt;
00488 memcpy(&hostenergy[y*volsize[0]], &grideners[eneraddr], numpt * sizeof(float));
00489 }
00490
00491
00492 clEnqueueWriteBuffer(clcmdq, devenergy, CL_TRUE, 0, volmemsz, hostenergy, 0, NULL, NULL);
00493 CLERR
00494
00495 lasttime = wkf_timer_timenow(timer);
00496 for (atomstart=0; atomstart<natoms; atomstart+=MAXATOMS) {
00497 iterations++;
00498 int runatoms;
00499 int atomsremaining = natoms - atomstart;
00500 if (atomsremaining > MAXATOMS)
00501 runatoms = MAXATOMS;
00502 else
00503 runatoms = atomsremaining;
00504
00505
00506 if (copyatomstoconstbuf(clcmdq, datominfo,
00507 atoms + 4*atomstart, runatoms, zplane))
00508 return NULL;
00509
00510 cl_kernel clkern;
00511 if (usevec4)
00512 clkern = clenergyvec4;
00513 else
00514 clkern = clenergy;
00515
00516
00517 clerr |= clSetKernelArg(clkern, 0, sizeof(int), &runatoms);
00518 clerr |= clSetKernelArg(clkern, 1, sizeof(float), &gridspacing);
00519 clerr |= clSetKernelArg(clkern, 2, sizeof(cl_mem), &devenergy);
00520 clerr |= clSetKernelArg(clkern, 3, sizeof(cl_mem), &datominfo);
00521 CLERR
00522 cl_event event;
00523 #if 0
00524 printf("Gsz: %ld %ld %ld Bsz: %ld %ld %ld\n",
00525 Gsz[0], Gsz[1], Gsz[2], Bsz[0], Bsz[1], Bsz[2]);
00526 #endif
00527 clerr |= clEnqueueNDRangeKernel(clcmdq, clkern, 2, NULL, Gsz, Bsz, 0, NULL, &event);
00528 CLERR
00529
00530 clerr |= clWaitForEvents(1, &event);
00531 clerr |= clReleaseEvent(event);
00532 CLERR
00533 }
00534 clFinish(clcmdq);
00535
00536
00537 clEnqueueReadBuffer(clcmdq, devenergy, CL_TRUE, 0, volmemsz, hostenergy,
00538 0, NULL, NULL);
00539
00540 CLERR
00541
00542
00543 for (y=0; y<numcol; y++) {
00544 long eneraddr = k*numcol*numpt + y*numpt;
00545 memcpy(&grideners[eneraddr], &hostenergy[y*volsize[0]], numpt * sizeof(float));
00546 }
00547
00548 totaltime = wkf_timer_timenow(timer);
00549 if (wkf_msg_timer_timeout(msgt)) {
00550
00551 printf("thread[%d] plane %d/%ld (%d computed) time %.2f, elapsed %.1f, est. total: %.1f\n",
00552 threadid, k, numplane, computedplanes,
00553 totaltime - lasttime, totaltime,
00554 totaltime * numplane / (k+1));
00555 }
00556 }
00557 }
00558
00559 wkf_timer_destroy(timer);
00560 wkf_msg_timer_destroy(msgt);
00561 free(hostenergy);
00562
00563 printf("destroying context, programs, etc\n");
00564 clReleaseMemObject(devenergy);
00565 clReleaseMemObject(datominfo);
00566 clReleaseKernel(clenergy);
00567 clReleaseKernel(clenergyvec4);
00568 clReleaseProgram(clpgm);
00569 clReleaseCommandQueue(clcmdq);
00570 clReleaseContext(clctx);
00571 printf("done.\n");
00572
00573 CLERR
00574
00575 return NULL;
00576 }
00577
00578
00579
00580