OpenCLVolCPotential.C Source File

00001 /***************************************************************************
00002  *cr
00003  *cr            (C) Copyright 1995-2019 The Board of Trustees of the
00004  *cr                        University of Illinois
00005  *cr                         All Rights Reserved
00006  *cr
00007  ***************************************************************************/
00008 
00009 /***************************************************************************
00010  * RCS INFORMATION:
00011  *
00012  *      $RCSfile: OpenCLVolCPotential.C,v $
00013  *      $Author: johns $        $Locker:  $             $State: Exp $
00014  *      $Revision: 1.32 $      $Date: 2019/01/17 21:38:55 $
00015  *
00016  ***************************************************************************
00017  * DESCRIPTION:
00018  *   OpenCL accelerated coulombic potential grid calculation
00019  *
00020  ***************************************************************************/
00021 #include <stdio.h>
00022 #include <stdlib.h>
00023 #include <string.h>
00024 #if defined(__APPLE__)
00025 #include <OpenCL/cl.h>
00026 #else
00027 #include <CL/cl.h>
00028 #endif
00029 
00030 #include "Inform.h"
00031 #include "utilities.h"
00032 #include "WKFThreads.h"
00033 #include "WKFUtils.h"
00034 #include "OpenCLKernels.h"
00035 #include "OpenCLUtils.h"
00036 
00037 typedef struct {
00038   float* atoms;
00039   float* grideners;
00040   long int numplane;
00041   long int numcol;
00042   long int numpt;
00043   long int natoms;
00044   float gridspacing;
00045 } enthrparms;
00046 
00047 /* thread prototype */
00048 static void * openclenergythread(void *);
00049 
00050 #if 1
00051 #define CLERR \
00052   if (clerr != CL_SUCCESS) {                     \
00053     printf("opencl error %d, %s line %d\n", clerr, __FILE__, __LINE__); \
00054     return NULL;                                   \
00055   }
00056 #else
00057 #define CLERR
00058 #endif
00059 
00060 // max constant buffer size is 64KB, minus whatever
00061 // the OpenCL runtime and compiler are using that we don't know about
00062 // At 16 bytes/atom, 4000 atoms is about the max we can store in
00063 // the constant buffer.
00064 #define MAXATOMS 4000
00065 
00066 
00067 // 
00068 // The OpenCL kernels calculate coulombic potential at each grid point and
00069 // store the results in the output array.
00070 //
00071 // These versions of the code use the 64KB constant buffer area reloaded
00072 // for each group of MAXATOMS atoms, until the contributions from all
00073 // atoms have been summed into the potential grid.
00074 //
00075 // These implementations use precomputed and unrolled loops of 
00076 // (dy^2 + dz^2) values for increased FP arithmetic intensity.
00077 // The X coordinate portion of the loop is unrolled by four or eight,
00078 // allowing the same dy^2 + dz^2 values to be reused multiple times,
00079 // increasing the ratio of FP arithmetic relative to FP loads, and
00080 // eliminating some redundant calculations.
00081 //
00082 
00083 //
00084 // Tuned global memory coalescing version, unrolled in X
00085 //
00086 
00087 //
00088 // Tunings for large potential map dimensions (e.g. 384x384x...)
00089 //
00090 #define UNROLLX       8
00091 #define UNROLLY       1
00092 #define BLOCKSIZEX    8  // make large enough to allow coalesced global mem ops
00093 #define BLOCKSIZEY    8  // make as small as possible for finer granularity
00094 #define BLOCKSIZE    (BLOCKSIZEX * BLOCKSIZEY)
00095 
00096 #define V4UNROLLX       8
00097 #define V4UNROLLY       1
00098 #define V4BLOCKSIZEX    8
00099 #define V4BLOCKSIZEY    8
00100 #define V4BLOCKSIZE    V4BLOCKSIZEX * V4BLOCKSIZEY
00101 
00102 // FLOP counting
00103 #define FLOPSPERATOMEVAL (59.0/8.0)
00104 
00105 // OpenCL source code
00106 const char* clenergysrc =
00107   "__kernel __attribute__((reqd_work_group_size(BLOCKSIZEX, BLOCKSIZEY, 1))) \n"
00108   "void clenergy(int numatoms, float gridspacing, __global float *energy, __constant float4 *atominfo) {                        \n"
00109   "  unsigned int xindex  = (get_global_id(0) - get_local_id(0)) * UNROLLX + get_local_id(0); \n"
00110   "  unsigned int yindex  = get_global_id(1);                              \n"
00111   "  unsigned int outaddr = get_global_size(0) * UNROLLX * yindex + xindex;\n"
00112   "                                                                        \n"
00113   "  float coory = gridspacing * yindex;                                   \n"
00114   "  float coorx = gridspacing * xindex;                                   \n"
00115   "                                                                        \n"
00116   "  float energyvalx1 = 0.0f;                                             \n"
00117 #if UNROLLX >= 4
00118   "  float energyvalx2 = 0.0f;                                             \n"
00119   "  float energyvalx3 = 0.0f;                                             \n"
00120   "  float energyvalx4 = 0.0f;                                             \n"
00121 #endif
00122 #if UNROLLX == 8
00123   "  float energyvalx5 = 0.0f;                                             \n"
00124   "  float energyvalx6 = 0.0f;                                             \n"
00125   "  float energyvalx7 = 0.0f;                                             \n"
00126   "  float energyvalx8 = 0.0f;                                             \n"
00127 #endif
00128   "                                                                        \n"
00129   "  float gridspacing_u = gridspacing * BLOCKSIZEX;                       \n"
00130   "                                                                        \n"
00131   "  int atomid;                                                           \n"
00132   "  for (atomid=0; atomid<numatoms; atomid++) {                           \n"
00133   "    float dy = coory - atominfo[atomid].y;                              \n"
00134   "    float dyz2 = (dy * dy) + atominfo[atomid].z;                        \n"
00135   "                                                                        \n"
00136   "    float dx1 = coorx - atominfo[atomid].x;                             \n"
00137 #if UNROLLX >= 4
00138   "    float dx2 = dx1 + gridspacing_u;                                    \n"
00139   "    float dx3 = dx2 + gridspacing_u;                                    \n"
00140   "    float dx4 = dx3 + gridspacing_u;                                    \n"
00141 #endif
00142 #if UNROLLX == 8
00143   "    float dx5 = dx4 + gridspacing_u;                                    \n"
00144   "    float dx6 = dx5 + gridspacing_u;                                    \n"
00145   "    float dx7 = dx6 + gridspacing_u;                                    \n"
00146   "    float dx8 = dx7 + gridspacing_u;                                    \n"
00147 #endif
00148   "                                                                        \n"
00149   "    energyvalx1 += atominfo[atomid].w * native_rsqrt(dx1*dx1 + dyz2);   \n"
00150 #if UNROLLX >= 4
00151   "    energyvalx2 += atominfo[atomid].w * native_rsqrt(dx2*dx2 + dyz2);   \n"
00152   "    energyvalx3 += atominfo[atomid].w * native_rsqrt(dx3*dx3 + dyz2);   \n"
00153   "    energyvalx4 += atominfo[atomid].w * native_rsqrt(dx4*dx4 + dyz2);   \n"
00154 #endif
00155 #if UNROLLX == 8
00156   "    energyvalx5 += atominfo[atomid].w * native_rsqrt(dx5*dx5 + dyz2);   \n"
00157   "    energyvalx6 += atominfo[atomid].w * native_rsqrt(dx6*dx6 + dyz2);   \n"
00158   "    energyvalx7 += atominfo[atomid].w * native_rsqrt(dx7*dx7 + dyz2);   \n"
00159   "    energyvalx8 += atominfo[atomid].w * native_rsqrt(dx8*dx8 + dyz2);   \n"
00160 #endif
00161   "  }                                                                     \n"
00162   "                                                                        \n"
00163   "  energy[outaddr             ] += energyvalx1;                          \n"
00164 #if UNROLLX >= 4
00165   "  energy[outaddr+1*BLOCKSIZEX] += energyvalx2;                          \n"
00166   "  energy[outaddr+2*BLOCKSIZEX] += energyvalx3;                          \n"
00167   "  energy[outaddr+3*BLOCKSIZEX] += energyvalx4;                          \n"
00168 #endif
00169 #if UNROLLX == 8
00170   "  energy[outaddr+4*BLOCKSIZEX] += energyvalx5;                          \n"
00171   "  energy[outaddr+5*BLOCKSIZEX] += energyvalx6;                          \n"
00172   "  energy[outaddr+6*BLOCKSIZEX] += energyvalx7;                          \n"
00173   "  energy[outaddr+7*BLOCKSIZEX] += energyvalx8;                          \n"
00174 #endif
00175   "}                                                                       \n"
00176   "                                                                        \n"
00177   "                                                                        \n"
00178   "                                                                        \n"
00179   "__kernel __attribute__((reqd_work_group_size(V4BLOCKSIZEX, V4BLOCKSIZEY, 1))) \n"
00180   "void clenergy_vec4(int numatoms, float gridspacing, __global float *energy, __constant float4 *atominfo) {                        \n"
00181   "  unsigned int xindex  = (get_global_id(0) - get_local_id(0)) * V4UNROLLX + get_local_id(0); \n"
00182   "  unsigned int yindex  = get_global_id(1);                              \n"
00183   "  unsigned int outaddr = get_global_size(0) * V4UNROLLX * yindex + xindex;\n"
00184   "                                                                        \n"
00185   "  float coory = gridspacing * yindex;                                   \n"
00186   "  float coorx = gridspacing * xindex;                                   \n"
00187   "                                                                        \n"
00188   "  float4 energyvalx = 0.f;                                              \n"
00189 #if V4UNROLLX == 8
00190   "  float4 energyvalx2 = 0.f;                                             \n"
00191 #endif
00192   "                                                                        \n"
00193   "  float4 gridspacing_u4 = { 0.f, 1.f, 2.f, 3.f };                       \n"
00194   "  gridspacing_u4 *= gridspacing * V4BLOCKSIZEX;                         \n"
00195   "                                                                        \n"
00196   "  int atomid;                                                           \n"
00197   "  for (atomid=0; atomid<numatoms; atomid++) {                           \n"
00198   "    float dy = coory - atominfo[atomid].y;                              \n"
00199   "    float dyz2 = (dy * dy) + atominfo[atomid].z;                        \n"
00200   "                                                                        \n"
00201   "    float4 dx = gridspacing_u4 + (coorx - atominfo[atomid].x);          \n"
00202   "    energyvalx += atominfo[atomid].w * native_rsqrt(dx*dx + dyz2);      \n"
00203 #if V4UNROLLX == 8
00204   "    dx += (4.0f * V4BLOCKSIZEX);                                        \n"
00205   "    energyvalx2 += atominfo[atomid].w * native_rsqrt(dx*dx + dyz2);     \n"
00206 #endif
00207   "  }                                                                     \n"
00208   "                                                                        \n"
00209   "  energy[outaddr               ] += energyvalx.x;                       \n"
00210   "  energy[outaddr+1*V4BLOCKSIZEX] += energyvalx.y;                       \n"
00211   "  energy[outaddr+2*V4BLOCKSIZEX] += energyvalx.z;                       \n"
00212   "  energy[outaddr+3*V4BLOCKSIZEX] += energyvalx.w;                       \n"
00213 #if V4UNROLLX == 8
00214   "  energy[outaddr+4*V4BLOCKSIZEX] += energyvalx2.x;                      \n"
00215   "  energy[outaddr+5*V4BLOCKSIZEX] += energyvalx2.y;                      \n"
00216   "  energy[outaddr+6*V4BLOCKSIZEX] += energyvalx2.z;                      \n"
00217   "  energy[outaddr+7*V4BLOCKSIZEX] += energyvalx2.w;                      \n"
00218 #endif
00219   "}                                                                       \n"
00220   "                                                                        \n";
00221 
00222 
00223 // required GPU array padding to match thread block size
00224 // XXX note: this code requires block size dimensions to be a power of two
00225 #define TILESIZEX BLOCKSIZEX*UNROLLX
00226 #define TILESIZEY BLOCKSIZEY*UNROLLY
00227 #define GPU_X_ALIGNMASK (TILESIZEX - 1)
00228 #define GPU_Y_ALIGNMASK (TILESIZEY - 1)
00229 
00230 #define V4TILESIZEX V4BLOCKSIZEX*V4UNROLLX
00231 #define V4TILESIZEY V4BLOCKSIZEY*V4UNROLLY
00232 #define V4GPU_X_ALIGNMASK (V4TILESIZEX - 1)
00233 #define V4GPU_Y_ALIGNMASK (V4TILESIZEY - 1)
00234 
00235 static int copyatomstoconstbuf(cl_command_queue clcmdq, cl_mem datominfo,
00236                         const float *atoms, int count, float zplane) {
00237   if (count > MAXATOMS) {
00238     printf("Atom count exceeds constant buffer storage capacity\n");
00239     return -1;
00240   }
00241 
00242   float atompre[4*MAXATOMS];
00243   int i;
00244   for (i=0; i<count*4; i+=4) {
00245     atompre[i    ] = atoms[i    ];
00246     atompre[i + 1] = atoms[i + 1];
00247     float dz = zplane - atoms[i + 2];
00248     atompre[i + 2]  = dz*dz;
00249     atompre[i + 3] = atoms[i + 3];
00250   }
00251 
00252   cl_int clerr = CL_SUCCESS;
00253   clerr = clEnqueueWriteBuffer(clcmdq, datominfo, CL_TRUE, 0, count * sizeof(cl_float4), (void *) atompre, 0, NULL, NULL);
00254 //  CLERR
00255 
00256   return 0;
00257 }
00258 
00259 
00260 int vmd_opencl_vol_cpotential(long int natoms, float* atoms, float* grideners,
00261                             long int numplane, long int numcol, long int numpt, 
00262                             float gridspacing) {
00263   enthrparms parms;
00264   wkf_timerhandle globaltimer;
00265   double totalruntime;
00266   int rc=0;
00267   int deviceCount = 1; // hard coded for now
00268   int numprocs = 1; // hard coded for now
00269 
00270   /* take the lesser of the number of CPUs and GPUs */
00271   /* and execute that many threads                  */
00272   if (deviceCount < numprocs) {
00273     numprocs = deviceCount;
00274   }
00275 
00276   printf("Using %d OpenCL devices\n", numprocs);
00277   int usevec4=0;
00278   if (getenv("VMDDCSVEC4")!=NULL)
00279     usevec4=1;
00280 
00281   if (usevec4) {
00282     printf("OpenCL padded grid size: %ld x %ld x %ld\n", 
00283       (numpt  + V4GPU_X_ALIGNMASK) & ~(V4GPU_X_ALIGNMASK),
00284       (numcol + V4GPU_Y_ALIGNMASK) & ~(V4GPU_Y_ALIGNMASK),
00285       numplane);
00286   } else {
00287     printf("OpenCL padded grid size: %ld x %ld x %ld\n", 
00288       (numpt  + GPU_X_ALIGNMASK) & ~(GPU_X_ALIGNMASK),
00289       (numcol + GPU_Y_ALIGNMASK) & ~(GPU_Y_ALIGNMASK),
00290       numplane);
00291   }
00292 
00293   parms.atoms = atoms;
00294   parms.grideners = grideners;
00295   parms.numplane = numplane;
00296   parms.numcol = numcol;
00297   parms.numpt = numpt;
00298   parms.natoms = natoms;
00299   parms.gridspacing = gridspacing;
00300 
00301   globaltimer = wkf_timer_create();
00302   wkf_timer_start(globaltimer);
00303 
00304   /* spawn child threads to do the work */
00305   wkf_tasktile_t tile;
00306   tile.start=0;
00307   tile.end=numplane;
00308   rc = wkf_threadlaunch(numprocs, &parms, openclenergythread, &tile);
00309 
00310   // Measure GFLOPS
00311   wkf_timer_stop(globaltimer);
00312   totalruntime = wkf_timer_time(globaltimer);
00313   wkf_timer_destroy(globaltimer);
00314 
00315   if (!rc) {
00316     double atomevalssec = ((double) numplane * numcol * numpt * natoms) / (totalruntime * 1000000000.0);
00317     printf("  %g billion atom evals/second, %g GFLOPS\n",
00318            atomevalssec, atomevalssec * FLOPSPERATOMEVAL);
00319   } else {
00320     msgWarn << "An OpenCL device encountered an unrecoverable error." << sendmsg;
00321     msgWarn << "Calculation will continue using the main CPU." << sendmsg;
00322   }
00323   return rc;
00324 }
00325 
00326 
00327 cl_program vmd_opencl_compile_volcpotential_pgm(cl_context clctx, cl_device_id *cldevs, int &clerr) {
00328   cl_program clpgm = NULL;
00329 
00330   clpgm = clCreateProgramWithSource(clctx, 1, &clenergysrc, NULL, &clerr);
00331   CLERR
00332 
00333   char clcompileflags[4096];
00334   sprintf(clcompileflags,
00335           "-DUNROLLX=%d -DUNROLLY=%d -DBLOCKSIZEX=%d -DBLOCKSIZEY=%d -DBLOCKSIZE=%d "
00336           "-DV4UNROLLX=%d -DV4UNROLLY=%d -DV4BLOCKSIZEX=%d -DV4BLOCKSIZEY=%d -DV4BLOCKSIZE=%d "
00337           "-cl-fast-relaxed-math -cl-single-precision-constant -cl-denorms-are-zero -cl-mad-enable -cl-no-signed-zeros",
00338           UNROLLX, UNROLLY, BLOCKSIZEX, BLOCKSIZEY, BLOCKSIZE,
00339           V4UNROLLX, V4UNROLLY, V4BLOCKSIZEX, V4BLOCKSIZEY, V4BLOCKSIZE);
00340 
00341   clerr = clBuildProgram(clpgm, 0, NULL, clcompileflags, NULL, NULL);
00342   if (clerr != CL_SUCCESS)
00343     printf("  compilation failed!\n");
00344 
00345   if (cldevs) {
00346     char buildlog[8192];
00347     size_t len=0;
00348     clerr = clGetProgramBuildInfo(clpgm, cldevs[0], CL_PROGRAM_BUILD_LOG, sizeof(buildlog), buildlog, &len);
00349     if (len > 1) {
00350       printf("OpenCL compilation log:\n");
00351       printf("  '%s'\n", buildlog);
00352     }
00353     CLERR
00354   }
00355 
00356   return clpgm;
00357 }
00358 
00359 
00360 
00361 static void * openclenergythread(void *voidparms) {
00362   size_t volsize[3], Gsz[3], Bsz[3];
00363   cl_int clerr = CL_SUCCESS;
00364   cl_mem devenergy = NULL;
00365   cl_mem datominfo = NULL;
00366   float *hostenergy = NULL;
00367   enthrparms *parms = NULL;
00368 
00369   int threadid=0;
00370 
00371   wkf_threadlaunch_getid(voidparms, &threadid, NULL);
00372   wkf_threadlaunch_getdata(voidparms, (void **) &parms);
00373 
00374   /* 
00375    * copy in per-thread parameters 
00376    */
00377   const float *atoms = parms->atoms;
00378   float* grideners = parms->grideners;
00379   const long int numplane = parms->numplane;
00380   const long int numcol = parms->numcol;
00381   const long int numpt = parms->numpt;
00382   const long int natoms = parms->natoms;
00383   const float gridspacing = parms->gridspacing;
00384   double lasttime, totaltime;
00385 
00386 printf("OpenCL worker[%d] initializing...\n", threadid);
00387   cl_platform_id clplatid = vmd_cl_get_platform_index(0);
00388   cl_context_properties clctxprops[] = {(cl_context_properties) CL_CONTEXT_PLATFORM, (cl_context_properties) clplatid, (cl_context_properties) 0};
00389 #if 0
00390   // 
00391   // On the IBM "Blue Drop" Power 775 supercomputer, there are no GPUs, but
00392   // by using OpenCL on the CPU device type, we can better exploit the 
00393   // vector units.  The final NSF/NCSA Blue Waters machine ended up being
00394   // a Cray XE6/XK7, so this code isn't relevant for production use 
00395   // currently, but there may be other cases where this strategy will 
00396   // be useful in the future.
00397   // 
00398   cl_context clctx = clCreateContextFromType(clctxprops, CL_DEVICE_TYPE_CPU, NULL, NULL, &clerr);
00399 #else
00400   cl_context clctx = clCreateContextFromType(clctxprops, CL_DEVICE_TYPE_GPU, NULL, NULL, &clerr);
00401 #endif
00402   CLERR
00403 
00404   size_t parmsz;
00405   clerr |= clGetContextInfo(clctx, CL_CONTEXT_DEVICES, 0, NULL, &parmsz);
00406   CLERR
00407 
00408   cl_device_id* cldevs = (cl_device_id *) malloc(parmsz);
00409   clerr |= clGetContextInfo(clctx, CL_CONTEXT_DEVICES, parmsz, cldevs, NULL);
00410   CLERR
00411 
00412   cl_command_queue clcmdq = clCreateCommandQueue(clctx, cldevs[0], 0, &clerr);
00413   CLERR
00414 
00415   cl_program clpgm = vmd_opencl_compile_volcpotential_pgm(clctx, cldevs, clerr);
00416   CLERR
00417 
00418   cl_kernel clenergy = clCreateKernel(clpgm, "clenergy", &clerr);
00419   cl_kernel clenergyvec4 = clCreateKernel(clpgm, "clenergy_vec4", &clerr);
00420   CLERR
00421 printf("OpenCL worker[%d] ready.\n", threadid);
00422 
00423   // setup OpenCL grid and block sizes
00424   int usevec4=0;
00425   if (getenv("VMDDCSVEC4")!=NULL)
00426     usevec4=1;
00427 
00428   if (usevec4) {
00429     // setup energy grid size, padding arrays for peak GPU memory performance
00430     volsize[0] = (numpt  + V4GPU_X_ALIGNMASK) & ~(V4GPU_X_ALIGNMASK);
00431     volsize[1] = (numcol + V4GPU_Y_ALIGNMASK) & ~(V4GPU_Y_ALIGNMASK);
00432     volsize[2] = 1;      // we only do one plane at a time
00433     Bsz[0] = V4BLOCKSIZEX;
00434     Bsz[1] = V4BLOCKSIZEY;
00435     Bsz[2] = 1;
00436     Gsz[0] = volsize[0] / V4UNROLLX;
00437     Gsz[1] = volsize[1] / V4UNROLLY;
00438     Gsz[2] = volsize[2];
00439   } else {
00440     // setup energy grid size, padding arrays for peak GPU memory performance
00441     volsize[0] = (numpt  + GPU_X_ALIGNMASK) & ~(GPU_X_ALIGNMASK);
00442     volsize[1] = (numcol + GPU_Y_ALIGNMASK) & ~(GPU_Y_ALIGNMASK);
00443     volsize[2] = 1;      // we only do one plane at a time
00444     Bsz[0] = BLOCKSIZEX;
00445     Bsz[1] = BLOCKSIZEY;
00446     Bsz[2] = 1;
00447     Gsz[0] = volsize[0] / UNROLLX;
00448     Gsz[1] = volsize[1] / UNROLLY;
00449     Gsz[2] = volsize[2];
00450   }
00451 
00452 
00453   int volmemsz = sizeof(float) * volsize[0] * volsize[1] * volsize[2];
00454 
00455   printf("Thread %d started for OpenCL device %d...\n", threadid, threadid);
00456   wkf_timerhandle timer = wkf_timer_create();
00457   wkf_timer_start(timer);
00458   wkfmsgtimer * msgt = wkf_msg_timer_create(5);
00459 
00460   // Allocate DMA buffers with some extra padding at the end so that 
00461   // multiple OpenCL devices aren't DMAing too close to each other, for NUMA..
00462 #define DMABUFPADSIZE (32 * 1024)
00463 
00464   hostenergy = (float *) malloc(volmemsz); // allocate working buffer
00465 
00466   devenergy = clCreateBuffer(clctx, CL_MEM_READ_WRITE, volmemsz, NULL, NULL);
00467   CLERR
00468 
00469   datominfo = clCreateBuffer(clctx, CL_MEM_READ_ONLY, MAXATOMS * sizeof(cl_float4), NULL, NULL);
00470   CLERR
00471 
00472 
00473   // For each point in the cube...
00474   int iterations=0;
00475   int computedplanes=0;
00476   wkf_tasktile_t tile;
00477   while (wkf_threadlaunch_next_tile(voidparms, 1, &tile) != WKF_SCHED_DONE) {
00478     int k;
00479     for (k=tile.start; k<tile.end; k++) {
00480       int y;
00481       int atomstart;
00482       float zplane = k * (float) gridspacing;
00483       computedplanes++; // track work done by this GPU for progress reporting
00484  
00485       // Copy energy grid into GPU 16-element padded input
00486       for (y=0; y<numcol; y++) {
00487         long eneraddr = k*numcol*numpt + y*numpt;
00488         memcpy(&hostenergy[y*volsize[0]], &grideners[eneraddr], numpt * sizeof(float));
00489       }
00490 
00491       // Copy the Host input data to the GPU..
00492       clEnqueueWriteBuffer(clcmdq, devenergy, CL_TRUE, 0, volmemsz, hostenergy, 0, NULL, NULL);
00493       CLERR // check and clear any existing errors
00494 
00495       lasttime = wkf_timer_timenow(timer);
00496       for (atomstart=0; atomstart<natoms; atomstart+=MAXATOMS) {
00497         iterations++;
00498         int runatoms;
00499         int atomsremaining = natoms - atomstart;
00500         if (atomsremaining > MAXATOMS)
00501           runatoms = MAXATOMS;
00502         else
00503           runatoms = atomsremaining;
00504 
00505         // copy the next group of atoms to the GPU
00506         if (copyatomstoconstbuf(clcmdq, datominfo,
00507                                 atoms + 4*atomstart, runatoms, zplane))
00508           return NULL;
00509 
00510         cl_kernel clkern;
00511         if (usevec4)
00512           clkern = clenergyvec4;
00513         else
00514           clkern = clenergy;
00515 
00516         // RUN the kernel...
00517         clerr |= clSetKernelArg(clkern, 0, sizeof(int), &runatoms);
00518         clerr |= clSetKernelArg(clkern, 1, sizeof(float), &gridspacing);
00519         clerr |= clSetKernelArg(clkern, 2, sizeof(cl_mem), &devenergy);
00520         clerr |= clSetKernelArg(clkern, 3, sizeof(cl_mem), &datominfo);
00521         CLERR
00522         cl_event event;
00523 #if 0
00524 printf("Gsz: %ld %ld %ld  Bsz: %ld %ld %ld\n",
00525   Gsz[0], Gsz[1], Gsz[2], Bsz[0], Bsz[1], Bsz[2]);
00526 #endif
00527         clerr |= clEnqueueNDRangeKernel(clcmdq, clkern, 2, NULL, Gsz, Bsz, 0, NULL, &event);
00528         CLERR
00529 
00530         clerr |= clWaitForEvents(1, &event);
00531         clerr |= clReleaseEvent(event);
00532         CLERR // check and clear any existing errors
00533       }
00534       clFinish(clcmdq);
00535 
00536       // Copy the GPU output data back to the host and use/store it..
00537       clEnqueueReadBuffer(clcmdq, devenergy, CL_TRUE, 0, volmemsz, hostenergy,
00538                            0, NULL, NULL);
00539 
00540       CLERR // check and clear any existing errors
00541 
00542       // Copy GPU blocksize padded array back down to the original size
00543       for (y=0; y<numcol; y++) {
00544         long eneraddr = k*numcol*numpt + y*numpt;
00545         memcpy(&grideners[eneraddr], &hostenergy[y*volsize[0]], numpt * sizeof(float));
00546       }
00547  
00548       totaltime = wkf_timer_timenow(timer);
00549       if (wkf_msg_timer_timeout(msgt)) {
00550         // XXX: we have to use printf here as msgInfo is not thread-safe yet.
00551         printf("thread[%d] plane %d/%ld (%d computed) time %.2f, elapsed %.1f, est. total: %.1f\n",
00552                threadid, k, numplane, computedplanes,
00553                totaltime - lasttime, totaltime,
00554                totaltime * numplane / (k+1));
00555       }
00556     }
00557   }
00558 
00559   wkf_timer_destroy(timer); // free timer
00560   wkf_msg_timer_destroy(msgt); // free timer
00561   free(hostenergy);    // free working buffer
00562 
00563 printf("destroying context, programs, etc\n");
00564   clReleaseMemObject(devenergy);
00565   clReleaseMemObject(datominfo);
00566   clReleaseKernel(clenergy);
00567   clReleaseKernel(clenergyvec4);
00568   clReleaseProgram(clpgm);
00569   clReleaseCommandQueue(clcmdq);
00570   clReleaseContext(clctx);
00571 printf("done.\n");
00572 
00573   CLERR // check and clear any existing errors
00574 
00575   return NULL;
00576 }
00577 
00578 
00579 
00580