00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022 #include <stdio.h>
00023 #include <stdlib.h>
00024 #include <string.h>
00025 #include <math.h>
00026 #if defined(__APPLE__)
00027 #include <OpenCL/cl.h>
00028 #else
00029 #include <CL/cl.h>
00030 #endif
00031
00032 #include "WKFThreads.h"
00033 #include "OpenCLKernels.h"
00034
00035 #if 1
00036 #define CLERR \
00037 if (clerr != CL_SUCCESS) { \
00038 printf("opencl error %d, %s line %d\n", clerr, __FILE__, __LINE__); \
00039 return NULL; \
00040 }
00041 #else
00042 #define CLERR
00043 #endif
00044
00045 typedef union flint_t {
00046 float f;
00047 int i;
00048 } flint;
00049
00050 typedef struct {
00051 int numatoms;
00052 const float *wave_f;
00053 int num_wave_f;
00054 const float *basis_array;
00055 int num_basis;
00056 const float *atompos;
00057 const int *atom_basis;
00058 const int *num_shells_per_atom;
00059 const int *num_prim_per_shell;
00060 const int *shell_symmetry;
00061 int num_shells;
00062 const int *numvoxels;
00063 float voxelsize;
00064 const float *origin;
00065 int density;
00066 float *orbitalgrid;
00067 vmd_opencl_orbital_handle *orbh;
00068 } orbthrparms;
00069
00070
00071 static void * openclorbitalthread(void *);
00072
00073
00074 #define UNROLLX 1
00075 #define UNROLLY 1
00076 #define BLOCKSIZEX 8
00077 #define BLOCKSIZEY 8
00078 #define BLOCKSIZE BLOCKSIZEX * BLOCKSIZEY
00079
00080
00081 #define TILESIZEX BLOCKSIZEX*UNROLLX
00082 #define TILESIZEY BLOCKSIZEY*UNROLLY
00083 #define GPU_X_ALIGNMASK (TILESIZEX - 1)
00084 #define GPU_Y_ALIGNMASK (TILESIZEY - 1)
00085
00086 #define V4UNROLLX 4
00087 #define V4UNROLLY 1
00088 #define V4BLOCKSIZEX 2
00089 #define V4BLOCKSIZEY 8
00090 #define V4BLOCKSIZE BLOCKSIZEX * BLOCKSIZEY
00091
00092
00093 #define V4TILESIZEX V4BLOCKSIZEX*V4UNROLLX
00094 #define V4TILESIZEY V4BLOCKSIZEY*V4UNROLLY
00095 #define V4GPU_X_ALIGNMASK (V4TILESIZEX - 1)
00096 #define V4GPU_Y_ALIGNMASK (V4TILESIZEY - 1)
00097
00098 #define MEMCOALESCE 384
00099
00100
00101 #define S_SHELL 0
00102 #define P_SHELL 1
00103 #define D_SHELL 2
00104 #define F_SHELL 3
00105 #define G_SHELL 4
00106 #define H_SHELL 5
00107
00108
00109
00110
00111 #define MAX_ATOM_SZ 256
00112 #define MAX_ATOMPOS_SZ (MAX_ATOM_SZ)
00113 #define MAX_ATOM_BASIS_SZ (MAX_ATOM_SZ)
00114 #define MAX_ATOMSHELL_SZ (MAX_ATOM_SZ)
00115 #define MAX_BASIS_SZ 6144
00116 #define MAX_SHELL_SZ 1024
00117 #define MAX_WAVEF_SZ 6144
00118
00119
00120
00121
00122
00123
00124
00125
00126
00127
00128
00129
00130
00131
00132 #define MAXSHELLCOUNT 15
00133
00134
00135
00136
00137
00138 #define MEMCOAMASK (~15)
00139
00140
00141
00142
00143
00144
00145 #define SHAREDSIZE 256
00146
00147
00148
00149
00150 const char *clorbitalsrc =
00151 "// unit conversion \n"
00152 "#define ANGS_TO_BOHR 1.8897259877218677f \n"
00153 " \n"
00154 "// orbital shell types \n"
00155 "#define S_SHELL 0 \n"
00156 "#define P_SHELL 1 \n"
00157 "#define D_SHELL 2 \n"
00158 "#define F_SHELL 3 \n"
00159 "#define G_SHELL 4 \n"
00160 "#define H_SHELL 5 \n"
00161 " \n"
00162 "// \n"
00163 "// OpenCL using const memory for almost all of the key arrays \n"
00164 "// \n"
00165 "__kernel __attribute__((reqd_work_group_size(BLOCKSIZEX, BLOCKSIZEY, 1))) \n"
00166 "void clorbitalconstmem(int numatoms, \n"
00167 #if defined(__APPLE__)
00168
00169
00170
00171
00172 " __global float *const_atompos, \n"
00173 " __global int *const_atom_basis, \n"
00174 " __global int *const_num_shells_per_atom, \n"
00175 " __global float *const_basis_array, \n"
00176 " __global int *const_num_prim_per_shell, \n"
00177 " __global int *const_shell_symmetry, \n"
00178 " __global float *const_wave_f, \n"
00179 #else
00180 " __constant float *const_atompos, \n"
00181 " __constant int *const_atom_basis, \n"
00182 " __constant int *const_num_shells_per_atom, \n"
00183 " __constant float *const_basis_array, \n"
00184 " __constant int *const_num_prim_per_shell, \n"
00185 " __constant int *const_shell_symmetry, \n"
00186 " __constant float *const_wave_f, \n"
00187 #endif
00188 " float voxelsize, \n"
00189 " float originx, \n"
00190 " float originy, \n"
00191 " float grid_z, \n"
00192 " int density, \n"
00193 " __global float * orbitalgrid) { \n"
00194 " unsigned int xindex = get_global_id(0); \n"
00195 " unsigned int yindex = get_global_id(1); \n"
00196 " unsigned int outaddr = get_global_size(0) * yindex + xindex; \n"
00197 " float grid_x = originx + voxelsize * xindex; \n"
00198 " float grid_y = originy + voxelsize * yindex; \n"
00199 " \n"
00200 " // similar to C version \n"
00201 " int at; \n"
00202 " int prim, shell; \n"
00203 " \n"
00204 " // initialize value of orbital at gridpoint \n"
00205 " float value = 0.0f; \n"
00206 " \n"
00207 " // initialize the wavefunction and shell counters \n"
00208 " int ifunc = 0; \n"
00209 " int shell_counter = 0; \n"
00210 " // loop over all the QM atoms \n"
00211 " for (at = 0; at < numatoms; at++) { \n"
00212 " // calculate distance between grid point and center of atom \n"
00213 " int maxshell = const_num_shells_per_atom[at]; \n"
00214 " int prim_counter = const_atom_basis[at]; \n"
00215 " \n"
00216 " float xdist = (grid_x - const_atompos[3*at ])*ANGS_TO_BOHR; \n"
00217 " float ydist = (grid_y - const_atompos[3*at+1])*ANGS_TO_BOHR; \n"
00218 " float zdist = (grid_z - const_atompos[3*at+2])*ANGS_TO_BOHR; \n"
00219 " \n"
00220 " float xdist2 = xdist*xdist; \n"
00221 " float ydist2 = ydist*ydist; \n"
00222 " float zdist2 = zdist*zdist; \n"
00223 " \n"
00224 " float dist2 = xdist2 + ydist2 + zdist2; \n"
00225 " \n"
00226 " // loop over the shells belonging to this atom (or basis function) \n"
00227 " for (shell=0; shell < maxshell; shell++) { \n"
00228 " float contracted_gto = 0.0f; \n"
00229 " \n"
00230 " // Loop over the Gaussian primitives of this contracted \n"
00231 " // basis function to build the atomic orbital \n"
00232 " int maxprim = const_num_prim_per_shell[shell_counter]; \n"
00233 " int shell_type = const_shell_symmetry[shell_counter]; \n"
00234 " for (prim=0; prim < maxprim; prim++) { \n"
00235 " float exponent = const_basis_array[prim_counter ]; \n"
00236 " float contract_coeff = const_basis_array[prim_counter + 1]; \n"
00237 " \n"
00238 " // By premultiplying the stored exponent factors etc, \n"
00239 " // we can use exp2f() rather than exp(), giving us full \n"
00240 " // precision, but with the speed of __expf() \n"
00241 " contracted_gto += contract_coeff * native_exp2(-exponent*dist2);\n"
00242 " prim_counter += 2; \n"
00243 " } \n"
00244 " \n"
00245 " /* multiply with the appropriate wavefunction coefficient */ \n"
00246 " float tmpshell=0.0f; \n"
00247 " switch (shell_type) { \n"
00248 " case S_SHELL: \n"
00249 " value += const_wave_f[ifunc++] * contracted_gto; \n"
00250 " break; \n"
00251 " \n"
00252 " case P_SHELL: \n"
00253 " tmpshell += const_wave_f[ifunc++] * xdist; \n"
00254 " tmpshell += const_wave_f[ifunc++] * ydist; \n"
00255 " tmpshell += const_wave_f[ifunc++] * zdist; \n"
00256 " value += tmpshell * contracted_gto; \n"
00257 " break; \n"
00258 " \n"
00259 " case D_SHELL: \n"
00260 " tmpshell += const_wave_f[ifunc++] * xdist2; \n"
00261 " tmpshell += const_wave_f[ifunc++] * xdist * ydist; \n"
00262 " tmpshell += const_wave_f[ifunc++] * ydist2; \n"
00263 " tmpshell += const_wave_f[ifunc++] * xdist * zdist; \n"
00264 " tmpshell += const_wave_f[ifunc++] * ydist * zdist; \n"
00265 " tmpshell += const_wave_f[ifunc++] * zdist2; \n"
00266 " value += tmpshell * contracted_gto; \n"
00267 " break; \n"
00268 " \n"
00269 " case F_SHELL: \n"
00270 " tmpshell += const_wave_f[ifunc++] * xdist2 * xdist; \n"
00271 " tmpshell += const_wave_f[ifunc++] * xdist2 * ydist; \n"
00272 " tmpshell += const_wave_f[ifunc++] * ydist2 * xdist; \n"
00273 " tmpshell += const_wave_f[ifunc++] * ydist2 * ydist; \n"
00274 " tmpshell += const_wave_f[ifunc++] * xdist2 * zdist; \n"
00275 " tmpshell += const_wave_f[ifunc++] * xdist * ydist * zdist; \n"
00276 " tmpshell += const_wave_f[ifunc++] * ydist2 * zdist; \n"
00277 " tmpshell += const_wave_f[ifunc++] * zdist2 * xdist; \n"
00278 " tmpshell += const_wave_f[ifunc++] * zdist2 * ydist; \n"
00279 " tmpshell += const_wave_f[ifunc++] * zdist2 * zdist; \n"
00280 " value += tmpshell * contracted_gto; \n"
00281 " break; \n"
00282 " \n"
00283 " case G_SHELL: \n"
00284 " tmpshell += const_wave_f[ifunc++] * xdist2 * xdist2; \n"
00285 " tmpshell += const_wave_f[ifunc++] * xdist2 * xdist * ydist; \n"
00286 " tmpshell += const_wave_f[ifunc++] * xdist2 * ydist2; \n"
00287 " tmpshell += const_wave_f[ifunc++] * ydist2 * ydist * xdist; \n"
00288 " tmpshell += const_wave_f[ifunc++] * ydist2 * ydist2; \n"
00289 " tmpshell += const_wave_f[ifunc++] * xdist2 * xdist * zdist; \n"
00290 " tmpshell += const_wave_f[ifunc++] * xdist2 * ydist * zdist; \n"
00291 " tmpshell += const_wave_f[ifunc++] * ydist2 * xdist * zdist; \n"
00292 " tmpshell += const_wave_f[ifunc++] * ydist2 * ydist * zdist; \n"
00293 " tmpshell += const_wave_f[ifunc++] * xdist2 * zdist2; \n"
00294 " tmpshell += const_wave_f[ifunc++] * zdist2 * xdist * ydist; \n"
00295 " tmpshell += const_wave_f[ifunc++] * ydist2 * zdist2; \n"
00296 " tmpshell += const_wave_f[ifunc++] * zdist2 * zdist * xdist; \n"
00297 " tmpshell += const_wave_f[ifunc++] * zdist2 * zdist * ydist; \n"
00298 " tmpshell += const_wave_f[ifunc++] * zdist2 * zdist2; \n"
00299 " value += tmpshell * contracted_gto; \n"
00300 " break; \n"
00301 " } // end switch \n"
00302 " \n"
00303 " shell_counter++; \n"
00304 " } \n"
00305 " } \n"
00306 " \n"
00307 " // return either orbital density or orbital wavefunction amplitude \n"
00308 " if (density) { \n"
00309 " orbitalgrid[outaddr] = copysign(value*value, value); \n"
00310 " } else { \n"
00311 " orbitalgrid[outaddr] = value; \n"
00312 " } \n"
00313 "} \n"
00314 " \n"
00315 " \n"
00316 "// \n"
00317 "// OpenCL using const memory for almost all of the key arrays \n"
00318 "// \n"
00319 "__kernel __attribute__((reqd_work_group_size(V4BLOCKSIZEX, V4BLOCKSIZEY, 1))) \n"
00320 "void clorbitalconstmem_vec4(int numatoms, \n"
00321 #if defined(__APPLE__)
00322
00323
00324
00325
00326 " __global float *const_atompos, \n"
00327 " __global int *const_atom_basis, \n"
00328 " __global int *const_num_shells_per_atom, \n"
00329 " __global float *const_basis_array, \n"
00330 " __global int *const_num_prim_per_shell, \n"
00331 " __global int *const_shell_symmetry, \n"
00332 " __global float *const_wave_f, \n"
00333 #else
00334 " __constant float *const_atompos, \n"
00335 " __constant int *const_atom_basis, \n"
00336 " __constant int *const_num_shells_per_atom, \n"
00337 " __constant float *const_basis_array, \n"
00338 " __constant int *const_num_prim_per_shell, \n"
00339 " __constant int *const_shell_symmetry, \n"
00340 " __constant float *const_wave_f, \n"
00341 #endif
00342 " float voxelsize, \n"
00343 " float originx, \n"
00344 " float originy, \n"
00345 " float grid_z, \n"
00346 " int density, \n"
00347 " __global float * orbitalgrid) { \n"
00348 " unsigned int xindex = (get_global_id(0) - get_local_id(0)) * V4UNROLLX + get_local_id(0); \n"
00349 " unsigned int yindex = get_global_id(1); \n"
00350 " unsigned int outaddr = get_global_size(0) * V4UNROLLX * yindex + xindex;\n"
00351 " float4 gridspacing_v4 = { 0.f, 1.f, 2.f, 3.f }; \n"
00352 " gridspacing_v4 *= (voxelsize * V4BLOCKSIZEX); \n"
00353 " float4 grid_x = originx + voxelsize * xindex + gridspacing_v4; \n"
00354 " float grid_y = originy + voxelsize * yindex; \n"
00355 " \n"
00356 " // similar to C version \n"
00357 " int at; \n"
00358 " int prim, shell; \n"
00359 " \n"
00360 " // initialize value of orbital at gridpoint \n"
00361 " float4 value = 0.0f; \n"
00362 " \n"
00363 " // initialize the wavefunction and shell counters \n"
00364 " int ifunc = 0; \n"
00365 " int shell_counter = 0; \n"
00366 " // loop over all the QM atoms \n"
00367 " for (at = 0; at < numatoms; at++) { \n"
00368 " // calculate distance between grid point and center of atom \n"
00369 " int maxshell = const_num_shells_per_atom[at]; \n"
00370 " int prim_counter = const_atom_basis[at]; \n"
00371 " \n"
00372 " float4 xdist = (grid_x - const_atompos[3*at ])*ANGS_TO_BOHR; \n"
00373 " float ydist = (grid_y - const_atompos[3*at+1])*ANGS_TO_BOHR; \n"
00374 " float zdist = (grid_z - const_atompos[3*at+2])*ANGS_TO_BOHR; \n"
00375 " \n"
00376 " float4 xdist2 = xdist*xdist; \n"
00377 " float ydist2 = ydist*ydist; \n"
00378 " float zdist2 = zdist*zdist; \n"
00379 " \n"
00380 " float4 dist2 = xdist2 + ydist2 + zdist2; \n"
00381 " \n"
00382 " // loop over the shells belonging to this atom (or basis function) \n"
00383 " for (shell=0; shell < maxshell; shell++) { \n"
00384 " float4 contracted_gto = 0.0f; \n"
00385 " \n"
00386 " // Loop over the Gaussian primitives of this contracted \n"
00387 " // basis function to build the atomic orbital \n"
00388 " int maxprim = const_num_prim_per_shell[shell_counter]; \n"
00389 " int shell_type = const_shell_symmetry[shell_counter]; \n"
00390 " for (prim=0; prim < maxprim; prim++) { \n"
00391 " float exponent = const_basis_array[prim_counter ]; \n"
00392 " float contract_coeff = const_basis_array[prim_counter + 1]; \n"
00393 " \n"
00394 " // By premultiplying the stored exponent factors etc, \n"
00395 " // we can use exp2f() rather than exp(), giving us full \n"
00396 " // precision, but with the speed of __expf() \n"
00397 " contracted_gto += contract_coeff * native_exp2(-exponent*dist2);\n"
00398 " prim_counter += 2; \n"
00399 " } \n"
00400 " \n"
00401 " /* multiply with the appropriate wavefunction coefficient */ \n"
00402 " float4 tmpshell=0.0f; \n"
00403 " switch (shell_type) { \n"
00404 " case S_SHELL: \n"
00405 " value += const_wave_f[ifunc++] * contracted_gto; \n"
00406 " break; \n"
00407 " \n"
00408 " case P_SHELL: \n"
00409 " tmpshell += const_wave_f[ifunc++] * xdist; \n"
00410 " tmpshell += const_wave_f[ifunc++] * ydist; \n"
00411 " tmpshell += const_wave_f[ifunc++] * zdist; \n"
00412 " value += tmpshell * contracted_gto; \n"
00413 " break; \n"
00414 " \n"
00415 " case D_SHELL: \n"
00416 " tmpshell += const_wave_f[ifunc++] * xdist2; \n"
00417 " tmpshell += const_wave_f[ifunc++] * xdist * ydist; \n"
00418 " tmpshell += const_wave_f[ifunc++] * ydist2; \n"
00419 " tmpshell += const_wave_f[ifunc++] * xdist * zdist; \n"
00420 " tmpshell += const_wave_f[ifunc++] * ydist * zdist; \n"
00421 " tmpshell += const_wave_f[ifunc++] * zdist2; \n"
00422 " value += tmpshell * contracted_gto; \n"
00423 " break; \n"
00424 " \n"
00425 " case F_SHELL: \n"
00426 " tmpshell += const_wave_f[ifunc++] * xdist2 * xdist; \n"
00427 " tmpshell += const_wave_f[ifunc++] * xdist2 * ydist; \n"
00428 " tmpshell += const_wave_f[ifunc++] * ydist2 * xdist; \n"
00429 " tmpshell += const_wave_f[ifunc++] * ydist2 * ydist; \n"
00430 " tmpshell += const_wave_f[ifunc++] * xdist2 * zdist; \n"
00431 " tmpshell += const_wave_f[ifunc++] * xdist * ydist * zdist; \n"
00432 " tmpshell += const_wave_f[ifunc++] * ydist2 * zdist; \n"
00433 " tmpshell += const_wave_f[ifunc++] * zdist2 * xdist; \n"
00434 " tmpshell += const_wave_f[ifunc++] * zdist2 * ydist; \n"
00435 " tmpshell += const_wave_f[ifunc++] * zdist2 * zdist; \n"
00436 " value += tmpshell * contracted_gto; \n"
00437 " break; \n"
00438 " \n"
00439 " case G_SHELL: \n"
00440 " tmpshell += const_wave_f[ifunc++] * xdist2 * xdist2; \n"
00441 " tmpshell += const_wave_f[ifunc++] * xdist2 * xdist * ydist; \n"
00442 " tmpshell += const_wave_f[ifunc++] * xdist2 * ydist2; \n"
00443 " tmpshell += const_wave_f[ifunc++] * ydist2 * ydist * xdist; \n"
00444 " tmpshell += const_wave_f[ifunc++] * ydist2 * ydist2; \n"
00445 " tmpshell += const_wave_f[ifunc++] * xdist2 * xdist * zdist; \n"
00446 " tmpshell += const_wave_f[ifunc++] * xdist2 * ydist * zdist; \n"
00447 " tmpshell += const_wave_f[ifunc++] * ydist2 * xdist * zdist; \n"
00448 " tmpshell += const_wave_f[ifunc++] * ydist2 * ydist * zdist; \n"
00449 " tmpshell += const_wave_f[ifunc++] * xdist2 * zdist2; \n"
00450 " tmpshell += const_wave_f[ifunc++] * zdist2 * xdist * ydist; \n"
00451 " tmpshell += const_wave_f[ifunc++] * ydist2 * zdist2; \n"
00452 " tmpshell += const_wave_f[ifunc++] * zdist2 * zdist * xdist; \n"
00453 " tmpshell += const_wave_f[ifunc++] * zdist2 * zdist * ydist; \n"
00454 " tmpshell += const_wave_f[ifunc++] * zdist2 * zdist2; \n"
00455 " value += tmpshell * contracted_gto; \n"
00456 " break; \n"
00457 " } // end switch \n"
00458 " \n"
00459 " shell_counter++; \n"
00460 " } \n"
00461 " } \n"
00462 " \n"
00463 " // return either orbital density or orbital wavefunction amplitude \n"
00464 " if (density) { \n"
00465 " orbitalgrid[outaddr ] = copysign(value.x*value.x, value.x); \n"
00466 " orbitalgrid[outaddr+1*V4BLOCKSIZEX] = copysign(value.y*value.y, value.y); \n"
00467 " orbitalgrid[outaddr+2*V4BLOCKSIZEX] = copysign(value.z*value.z, value.z); \n"
00468 " orbitalgrid[outaddr+3*V4BLOCKSIZEX] = copysign(value.w*value.w, value.w); \n"
00469 " } else { \n"
00470 " orbitalgrid[outaddr ] = value.x; \n"
00471 " orbitalgrid[outaddr+1*V4BLOCKSIZEX] = value.y; \n"
00472 " orbitalgrid[outaddr+2*V4BLOCKSIZEX] = value.z; \n"
00473 " orbitalgrid[outaddr+3*V4BLOCKSIZEX] = value.w; \n"
00474 " } \n"
00475 "} \n"
00476 " \n"
00477 " \n"
00478 "// \n"
00479 "// OpenCL loading from global memory into shared memory for all arrays \n"
00480 "// \n"
00481 " \n"
00482 "typedef union flint_t { \n"
00483 " float f; \n"
00484 " int i; \n"
00485 "} flint; \n"
00486 " \n"
00487 "__kernel __attribute__((reqd_work_group_size(BLOCKSIZEX, BLOCKSIZEY, 1))) \n"
00488 "void clorbitaltiledshared(int numatoms, \n"
00489 " __global float *wave_f, \n"
00490 " __global float *basis_array, \n"
00491 " __global flint *atominfo, \n"
00492 " __global int *shellinfo, \n"
00493 " float voxelsize, \n"
00494 " float originx, \n"
00495 " float originy, \n"
00496 " float grid_z, \n"
00497 " int density, \n"
00498 " __global float *orbitalgrid) { \n"
00499 " unsigned int xindex = get_global_id(0); \n"
00500 " unsigned int yindex = get_global_id(1); \n"
00501 " unsigned int outaddr = get_global_size(0) * yindex + xindex; \n"
00502 " float grid_x = originx + voxelsize * xindex; \n"
00503 " float grid_y = originy + voxelsize * yindex; \n"
00504 " \n"
00505 " // XXX NVIDIA-specific warp logic \n"
00506 " int sidx = get_local_id(1) * get_local_size(0) + get_local_id(0); \n"
00507 " \n"
00508 " __local float s_wave_f[SHAREDSIZE]; \n"
00509 " int sblock_wave_f = 0; \n"
00510 " s_wave_f[sidx ] = wave_f[sidx ]; \n"
00511 " s_wave_f[sidx + 64] = wave_f[sidx + 64]; \n"
00512 " s_wave_f[sidx + 128] = wave_f[sidx + 128]; \n"
00513 " s_wave_f[sidx + 192] = wave_f[sidx + 192]; \n"
00514 " barrier(CLK_LOCAL_MEM_FENCE); \n"
00515 " \n"
00516 " // similar to C version \n"
00517 " int at; \n"
00518 " int prim, shell; \n"
00519 " \n"
00520 " // initialize value of orbital at gridpoint \n"
00521 " float value = 0.0f; \n"
00522 " \n"
00523 " // initialize the wavefunction and primitive counters \n"
00524 " int ifunc = 0; \n"
00525 " int shell_counter = 0; \n"
00526 " int sblock_prim_counter = -1; // sentinel indicates no data loaded \n"
00527 " // loop over all the QM atoms \n"
00528 " for (at = 0; at < numatoms; at++) { \n"
00529 " __local flint s_atominfo[5]; \n"
00530 " __local float s_basis_array[SHAREDSIZE]; \n"
00531 " barrier(CLK_LOCAL_MEM_FENCE); \n"
00532 " if (sidx < 5) \n"
00533 " s_atominfo[sidx].i = atominfo[(at<<4) + sidx].i; \n"
00534 " barrier(CLK_LOCAL_MEM_FENCE); \n"
00535 " int prim_counter = s_atominfo[3].i; \n"
00536 " int maxshell = s_atominfo[4].i; \n"
00537 " int new_sblock_prim_counter = prim_counter & MEMCOAMASK; \n"
00538 " if (sblock_prim_counter != new_sblock_prim_counter) { \n"
00539 " sblock_prim_counter = new_sblock_prim_counter; \n"
00540 " s_basis_array[sidx ] = basis_array[sblock_prim_counter + sidx ]; \n"
00541 " s_basis_array[sidx + 64] = basis_array[sblock_prim_counter + sidx + 64]; \n"
00542 " s_basis_array[sidx + 128] = basis_array[sblock_prim_counter + sidx + 128]; \n"
00543 " s_basis_array[sidx + 192] = basis_array[sblock_prim_counter + sidx + 192]; \n"
00544 " prim_counter -= sblock_prim_counter; \n"
00545 " barrier(CLK_LOCAL_MEM_FENCE); \n"
00546 " } \n"
00547 " \n"
00548 " // calculate distance between grid point and center of atom \n"
00549 " float xdist = (grid_x - s_atominfo[0].f)*ANGS_TO_BOHR; \n"
00550 " float ydist = (grid_y - s_atominfo[1].f)*ANGS_TO_BOHR; \n"
00551 " float zdist = (grid_z - s_atominfo[2].f)*ANGS_TO_BOHR; \n"
00552 " \n"
00553 " float xdist2 = xdist*xdist; \n"
00554 " float ydist2 = ydist*ydist; \n"
00555 " float zdist2 = zdist*zdist; \n"
00556 " \n"
00557 " float dist2 = xdist2 + ydist2 + zdist2; \n"
00558 " \n"
00559 " // loop over the shells belonging to this atom (or basis function) \n"
00560 " for (shell=0; shell < maxshell; shell++) { \n"
00561 " float contracted_gto = 0.0f; \n"
00562 " \n"
00563 " // Loop over the Gaussian primitives of this contracted \n"
00564 " // basis function to build the atomic orbital \n"
00565 " __local int s_shellinfo[2]; \n"
00566 " \n"
00567 " barrier(CLK_LOCAL_MEM_FENCE); \n"
00568 " if (sidx < 2) \n"
00569 " s_shellinfo[sidx] = shellinfo[(shell_counter<<4) + sidx]; \n"
00570 " barrier(CLK_LOCAL_MEM_FENCE); \n"
00571 " int maxprim = s_shellinfo[0]; \n"
00572 " int shell_type = s_shellinfo[1]; \n"
00573 " \n"
00574 " if ((prim_counter + (maxprim<<1)) >= SHAREDSIZE) { \n"
00575 " prim_counter += sblock_prim_counter; \n"
00576 " sblock_prim_counter = prim_counter & MEMCOAMASK; \n"
00577 " s_basis_array[sidx ] = basis_array[sblock_prim_counter + sidx ]; \n"
00578 " s_basis_array[sidx + 64] = basis_array[sblock_prim_counter + sidx + 64]; \n"
00579 " s_basis_array[sidx + 128] = basis_array[sblock_prim_counter + sidx + 128]; \n"
00580 " s_basis_array[sidx + 192] = basis_array[sblock_prim_counter + sidx + 192]; \n"
00581 " prim_counter -= sblock_prim_counter; \n"
00582 " barrier(CLK_LOCAL_MEM_FENCE); \n"
00583 " } \n"
00584 " for (prim=0; prim < maxprim; prim++) { \n"
00585 " float exponent = s_basis_array[prim_counter ]; \n"
00586 " float contract_coeff = s_basis_array[prim_counter + 1]; \n"
00587 " \n"
00588 " // By premultiplying the stored exponent factors etc, \n"
00589 " // we can use exp2f() rather than exp(), giving us full \n"
00590 " // precision, but with the speed of __expf() \n"
00591 " contracted_gto += contract_coeff * native_exp2(-exponent*dist2);\n"
00592 " \n"
00593 " prim_counter += 2; \n"
00594 " } \n"
00595 " \n"
00596 " // XXX should use a constant memory lookup table to store \n"
00597 " // shared mem refill constants, and dynamically lookup the \n"
00598 " // number of elements referenced in the next iteration. \n"
00599 " if ((ifunc + MAXSHELLCOUNT) >= SHAREDSIZE) { \n"
00600 " ifunc += sblock_wave_f; \n"
00601 " sblock_wave_f = ifunc & MEMCOAMASK; \n"
00602 " barrier(CLK_LOCAL_MEM_FENCE); \n"
00603 " s_wave_f[sidx ] = wave_f[sblock_wave_f + sidx ]; \n"
00604 " s_wave_f[sidx + 64] = wave_f[sblock_wave_f + sidx + 64]; \n"
00605 " s_wave_f[sidx + 128] = wave_f[sblock_wave_f + sidx + 128]; \n"
00606 " s_wave_f[sidx + 192] = wave_f[sblock_wave_f + sidx + 192]; \n"
00607 " barrier(CLK_LOCAL_MEM_FENCE); \n"
00608 " ifunc -= sblock_wave_f; \n"
00609 " } \n"
00610 " \n"
00611 " /* multiply with the appropriate wavefunction coefficient */ \n"
00612 " float tmpshell=0.0f; \n"
00613 " switch (shell_type) { \n"
00614 " case S_SHELL: \n"
00615 " value += s_wave_f[ifunc++] * contracted_gto; \n"
00616 " break; \n"
00617 " \n"
00618 " case P_SHELL: \n"
00619 " tmpshell += s_wave_f[ifunc++] * xdist; \n"
00620 " tmpshell += s_wave_f[ifunc++] * ydist; \n"
00621 " tmpshell += s_wave_f[ifunc++] * zdist; \n"
00622 " value += tmpshell * contracted_gto; \n"
00623 " break; \n"
00624 " \n"
00625 " case D_SHELL: \n"
00626 " tmpshell += s_wave_f[ifunc++] * xdist2; \n"
00627 " tmpshell += s_wave_f[ifunc++] * xdist * ydist; \n"
00628 " tmpshell += s_wave_f[ifunc++] * ydist2; \n"
00629 " tmpshell += s_wave_f[ifunc++] * xdist * zdist; \n"
00630 " tmpshell += s_wave_f[ifunc++] * ydist * zdist; \n"
00631 " tmpshell += s_wave_f[ifunc++] * zdist2; \n"
00632 " value += tmpshell * contracted_gto; \n"
00633 " break; \n"
00634 " \n"
00635 " case F_SHELL: \n"
00636 " tmpshell += s_wave_f[ifunc++] * xdist2 * xdist; \n"
00637 " tmpshell += s_wave_f[ifunc++] * xdist2 * ydist; \n"
00638 " tmpshell += s_wave_f[ifunc++] * ydist2 * xdist; \n"
00639 " tmpshell += s_wave_f[ifunc++] * ydist2 * ydist; \n"
00640 " tmpshell += s_wave_f[ifunc++] * xdist2 * zdist; \n"
00641 " tmpshell += s_wave_f[ifunc++] * xdist * ydist * zdist; \n"
00642 " tmpshell += s_wave_f[ifunc++] * ydist2 * zdist; \n"
00643 " tmpshell += s_wave_f[ifunc++] * zdist2 * xdist; \n"
00644 " tmpshell += s_wave_f[ifunc++] * zdist2 * ydist; \n"
00645 " tmpshell += s_wave_f[ifunc++] * zdist2 * zdist; \n"
00646 " value += tmpshell * contracted_gto; \n"
00647 " break; \n"
00648 " \n"
00649 " case G_SHELL: \n"
00650 " tmpshell += s_wave_f[ifunc++] * xdist2 * xdist2; \n"
00651 " tmpshell += s_wave_f[ifunc++] * xdist2 * xdist * ydist; \n"
00652 " tmpshell += s_wave_f[ifunc++] * xdist2 * ydist2; \n"
00653 " tmpshell += s_wave_f[ifunc++] * ydist2 * ydist * xdist; \n"
00654 " tmpshell += s_wave_f[ifunc++] * ydist2 * ydist2; \n"
00655 " tmpshell += s_wave_f[ifunc++] * xdist2 * xdist * zdist; \n"
00656 " tmpshell += s_wave_f[ifunc++] * xdist2 * ydist * zdist; \n"
00657 " tmpshell += s_wave_f[ifunc++] * ydist2 * xdist * zdist; \n"
00658 " tmpshell += s_wave_f[ifunc++] * ydist2 * ydist * zdist; \n"
00659 " tmpshell += s_wave_f[ifunc++] * xdist2 * zdist2; \n"
00660 " tmpshell += s_wave_f[ifunc++] * zdist2 * xdist * ydist; \n"
00661 " tmpshell += s_wave_f[ifunc++] * ydist2 * zdist2; \n"
00662 " tmpshell += s_wave_f[ifunc++] * zdist2 * zdist * xdist; \n"
00663 " tmpshell += s_wave_f[ifunc++] * zdist2 * zdist * ydist; \n"
00664 " tmpshell += s_wave_f[ifunc++] * zdist2 * zdist2; \n"
00665 " value += tmpshell * contracted_gto; \n"
00666 " break; \n"
00667 " } // end switch \n"
00668 " \n"
00669 " shell_counter++; \n"
00670 " } \n"
00671 " } \n"
00672 " \n"
00673 " // return either orbital density or orbital wavefunction amplitude \n"
00674 " if (density) { \n"
00675 " orbitalgrid[outaddr] = copysign(value*value, value); \n"
00676 " } else { \n"
00677 " orbitalgrid[outaddr] = value; \n"
00678 " } \n"
00679 "} \n"
00680 " \n"
00681 " \n";
00682
00683
00684 cl_program vmd_opencl_compile_orbital_pgm(cl_context clctx, cl_device_id *cldevs, int &clerr) {
00685 cl_program clpgm = NULL;
00686
00687 clpgm = clCreateProgramWithSource(clctx, 1, &clorbitalsrc, NULL, &clerr);
00688 CLERR
00689
00690 char clcompileflags[4096];
00691 sprintf(clcompileflags, "-DUNROLLX=%d -DUNROLLY=%d -DBLOCKSIZEX=%d -DBLOCKSIZEY=%d -DBLOCKSIZE=%d -DSHAREDSIZE=%d -DMEMCOAMASK=%d -DMAXSHELLCOUNT=%d -DV4UNROLLX=%d -DV4UNROLLY=%d -DV4BLOCKSIZEX=%d -DV4BLOCKSIZEY=%d -cl-fast-relaxed-math -cl-single-precision-constant -cl-denorms-are-zero -cl-mad-enable -cl-no-signed-zeros", UNROLLX, UNROLLY, BLOCKSIZEX, BLOCKSIZEY, BLOCKSIZE, SHAREDSIZE, MEMCOAMASK, MAXSHELLCOUNT, V4UNROLLX, V4UNROLLY, V4BLOCKSIZEX, V4BLOCKSIZEY);
00692 clerr = clBuildProgram(clpgm, 0, NULL, clcompileflags, NULL, NULL);
00693 if (clerr != CL_SUCCESS)
00694 printf(" compilation failed!\n");
00695
00696 if (cldevs) {
00697 char buildlog[8192];
00698 size_t len=0;
00699 clerr = clGetProgramBuildInfo(clpgm, cldevs[0], CL_PROGRAM_BUILD_LOG, sizeof(buildlog), buildlog, &len);
00700 if (len > 1) {
00701 printf("OpenCL compilation log:\n");
00702 printf(" '%s'\n", buildlog);
00703 }
00704 CLERR
00705 }
00706
00707 return clpgm;
00708 }
00709
00710
00711 vmd_opencl_orbital_handle * vmd_opencl_create_orbital_handle(cl_context ctx, cl_command_queue cmdq, cl_device_id *devs) {
00712 vmd_opencl_orbital_handle * orbh;
00713 cl_int clerr = CL_SUCCESS;
00714
00715 orbh = (vmd_opencl_orbital_handle *) malloc(sizeof(vmd_opencl_orbital_handle));
00716 orbh->ctx = ctx;
00717 orbh->cmdq = cmdq;
00718 orbh->devs = devs;
00719
00720 orbh->pgm = vmd_opencl_compile_orbital_pgm(orbh->ctx, orbh->devs, clerr);
00721 CLERR
00722 orbh->kconstmem = clCreateKernel(orbh->pgm, "clorbitalconstmem", &clerr);
00723 CLERR
00724 orbh->kconstmem_vec4 = clCreateKernel(orbh->pgm, "clorbitalconstmem_vec4", &clerr);
00725 CLERR
00726 orbh->ktiledshared = clCreateKernel(orbh->pgm, "clorbitaltiledshared", &clerr);
00727 CLERR
00728
00729 return orbh;
00730 }
00731
00732
00733 int vmd_opencl_destroy_orbital_handle(vmd_opencl_orbital_handle * orbh) {
00734 clReleaseKernel(orbh->kconstmem);
00735 clReleaseKernel(orbh->kconstmem_vec4);
00736 clReleaseKernel(orbh->ktiledshared);
00737 clReleaseProgram(orbh->pgm);
00738 free(orbh);
00739
00740 return 0;
00741 }
00742
00743
00744
00745 static int computepaddedsize(int orig, int tilesize) {
00746 int alignmask = tilesize - 1;
00747 int paddedsz = (orig + alignmask) & ~alignmask;
00748
00749 return paddedsz;
00750 }
00751
00752 static void * openclorbitalthread(void *voidparms) {
00753 size_t volsize[3], Gsz[3], Bsz[3];
00754 cl_mem d_wave_f = NULL;
00755 cl_mem d_basis_array = NULL;
00756 cl_mem d_atominfo = NULL;
00757 cl_mem d_shellinfo = NULL;
00758 cl_mem d_origin = NULL;
00759 cl_mem d_numvoxels = NULL;
00760 cl_mem d_orbitalgrid = NULL;
00761 cl_mem const_wave_f=NULL;
00762 cl_mem const_atompos=NULL;
00763 cl_mem const_atom_basis=NULL;
00764 cl_mem const_num_shells_per_atom=NULL;
00765 cl_mem const_basis_array=NULL;
00766 cl_mem const_num_prim_per_shell=NULL;
00767 cl_mem const_shell_symmetry=NULL;
00768 float *h_orbitalgrid = NULL;
00769 float *h_basis_array_exp2f = NULL;
00770 int numvoxels[3];
00771 float origin[3];
00772 orbthrparms *parms = NULL;
00773 int usefastconstkernel=0;
00774 int threadid=0;
00775 int tilesize = 1;
00776 cl_int clerr = CL_SUCCESS;
00777
00778 #if defined(USE_OPENCLDEVPOOL)
00779 wkf_threadpool_worker_getid(voidparms, &threadid, NULL);
00780 wkf_threadpool_worker_getdata(voidparms, (void **) &parms);
00781
00782
00783 tilesize=4;
00784 wkf_threadpool_worker_devscaletile(voidparms, &tilesize);
00785 #else
00786 wkf_threadlaunch_getid(voidparms, &threadid, NULL);
00787 wkf_threadlaunch_getdata(voidparms, (void **) &parms);
00788 #endif
00789
00790
00791 vmd_opencl_orbital_handle * orbh = parms->orbh;
00792 cl_context clctx = parms->orbh->ctx;
00793 cl_command_queue clcmdq = parms->orbh->cmdq;
00794
00795 numvoxels[0] = parms->numvoxels[0];
00796 numvoxels[1] = parms->numvoxels[1];
00797 numvoxels[2] = 1;
00798
00799 origin[0] = parms->origin[0];
00800 origin[1] = parms->origin[1];
00801
00802 int density = parms->density;
00803
00804
00805 if (getenv("VMDMOVEC4")) {
00806
00807 volsize[0] = (parms->numvoxels[0] + V4GPU_X_ALIGNMASK) & ~(V4GPU_X_ALIGNMASK);
00808 volsize[1] = (parms->numvoxels[1] + V4GPU_Y_ALIGNMASK) & ~(V4GPU_Y_ALIGNMASK);
00809 volsize[2] = 1;
00810 Bsz[0] = V4BLOCKSIZEX;
00811 Bsz[1] = V4BLOCKSIZEY;
00812 Bsz[2] = 1;
00813 Gsz[0] = volsize[0] / V4UNROLLX;
00814 Gsz[1] = volsize[1] / V4UNROLLY;
00815 Gsz[2] = 1;
00816 } else {
00817 volsize[0] = (parms->numvoxels[0] + GPU_X_ALIGNMASK) & ~(GPU_X_ALIGNMASK);
00818 volsize[1] = (parms->numvoxels[1] + GPU_Y_ALIGNMASK) & ~(GPU_Y_ALIGNMASK);
00819 volsize[2] = 1;
00820 Bsz[0] = BLOCKSIZEX;
00821 Bsz[1] = BLOCKSIZEY;
00822 Bsz[2] = 1;
00823 Gsz[0] = volsize[0] / UNROLLX;
00824 Gsz[1] = volsize[1] / UNROLLY;
00825 Gsz[2] = 1;
00826 }
00827 int volmemsz = sizeof(float) * volsize[0] * volsize[1] * volsize[2];
00828
00829
00830
00831 if ((parms->num_wave_f < MAX_WAVEF_SZ) &&
00832 (parms->numatoms < MAX_ATOM_SZ) &&
00833 (parms->numatoms < MAX_ATOMSHELL_SZ) &&
00834 (2*parms->num_basis < MAX_BASIS_SZ) &&
00835 (parms->num_shells < MAX_SHELL_SZ)) {
00836 usefastconstkernel=1;
00837 }
00838
00839
00840 if (getenv("VMDFORCEMOTILEDSHARED") != NULL) {
00841 usefastconstkernel=0;
00842 }
00843
00844
00845 int padsz;
00846 padsz = computepaddedsize(2 * parms->num_basis, MEMCOALESCE);
00847 h_basis_array_exp2f = (float *) malloc(padsz * sizeof(float));
00848 float log2e = log2(2.718281828);
00849 for (int ll=0; ll<(2*parms->num_basis); ll+=2) {
00850 #if 1
00851
00852 h_basis_array_exp2f[ll ] = parms->basis_array[ll ] * log2e;
00853 #else
00854 h_basis_array_exp2f[ll ] = parms->basis_array[ll ];
00855 #endif
00856 h_basis_array_exp2f[ll+1] = parms->basis_array[ll+1];
00857 }
00858
00859 if (usefastconstkernel) {
00860 const_wave_f=clCreateBuffer(clctx, CL_MEM_READ_ONLY, parms->num_wave_f * sizeof(float), NULL, NULL);
00861 clEnqueueWriteBuffer(clcmdq, const_wave_f, CL_TRUE, 0, parms->num_wave_f * sizeof(float), parms->wave_f, 0, NULL, NULL);
00862
00863 const_atompos=clCreateBuffer(clctx, CL_MEM_READ_ONLY, 3 * parms->numatoms * sizeof(float), NULL, NULL);
00864 clEnqueueWriteBuffer(clcmdq, const_atompos, CL_TRUE, 0, 3 * parms->numatoms * sizeof(float), parms->atompos, 0, NULL, NULL);
00865
00866 const_atom_basis=clCreateBuffer(clctx, CL_MEM_READ_ONLY, parms->numatoms * sizeof(int), NULL, NULL);
00867 clEnqueueWriteBuffer(clcmdq, const_atom_basis, CL_TRUE, 0, parms->numatoms * sizeof(int), parms->atom_basis, 0, NULL, NULL);
00868
00869 const_num_shells_per_atom=clCreateBuffer(clctx, CL_MEM_READ_ONLY, parms->numatoms * sizeof(int), NULL, NULL);
00870 clEnqueueWriteBuffer(clcmdq, const_num_shells_per_atom, CL_TRUE, 0, parms->numatoms * sizeof(int), parms->num_shells_per_atom, 0, NULL, NULL);
00871
00872 const_basis_array=clCreateBuffer(clctx, CL_MEM_READ_ONLY, 2 * parms->num_basis * sizeof(float), NULL, NULL);
00873 clEnqueueWriteBuffer(clcmdq, const_basis_array, CL_TRUE, 0, 2 * parms->num_basis * sizeof(float), h_basis_array_exp2f, 0, NULL, NULL);
00874
00875 const_num_prim_per_shell=clCreateBuffer(clctx, CL_MEM_READ_ONLY, parms->num_shells * sizeof(int), NULL, NULL);
00876 clEnqueueWriteBuffer(clcmdq, const_num_prim_per_shell, CL_TRUE, 0, parms->num_shells * sizeof(int), parms->num_prim_per_shell, 0, NULL, NULL);
00877
00878 const_shell_symmetry=clCreateBuffer(clctx, CL_MEM_READ_ONLY, parms->num_shells * sizeof(int), NULL, NULL);
00879 clEnqueueWriteBuffer(clcmdq, const_shell_symmetry, CL_TRUE, 0, parms->num_shells * sizeof(int), parms->shell_symmetry, 0, NULL, NULL);
00880 } else {
00881 padsz = computepaddedsize(parms->num_wave_f, MEMCOALESCE);
00882 d_wave_f = clCreateBuffer(clctx, CL_MEM_READ_ONLY, padsz * sizeof(float), NULL, NULL);
00883 clEnqueueWriteBuffer(clcmdq, d_wave_f, CL_TRUE, 0, parms->num_wave_f * sizeof(float), parms->wave_f, 0, NULL, NULL);
00884
00885
00886 padsz = computepaddedsize(16 * parms->numatoms, MEMCOALESCE);
00887 flint * h_atominfo = (flint *) calloc(1, padsz * sizeof(flint));
00888 d_atominfo = clCreateBuffer(clctx, CL_MEM_READ_ONLY, padsz * sizeof(flint), NULL, NULL);
00889 int ll;
00890 for (ll=0; ll<parms->numatoms; ll++) {
00891 int addr = ll * 16;
00892 h_atominfo[addr ].f = parms->atompos[ll*3 ];
00893 h_atominfo[addr + 1].f = parms->atompos[ll*3 + 1];
00894 h_atominfo[addr + 2].f = parms->atompos[ll*3 + 2];
00895 h_atominfo[addr + 3].i = parms->atom_basis[ll];
00896 h_atominfo[addr + 4].i = parms->num_shells_per_atom[ll];
00897 }
00898 clEnqueueWriteBuffer(clcmdq, d_atominfo, CL_TRUE, 0, padsz * sizeof(flint), h_atominfo, 0, NULL, NULL);
00899 free(h_atominfo);
00900
00901 padsz = computepaddedsize(16 * parms->num_shells, MEMCOALESCE);
00902 int * h_shellinfo = (int *) calloc(1, padsz * sizeof(int));
00903 d_shellinfo = clCreateBuffer(clctx, CL_MEM_READ_ONLY, padsz * sizeof(int), NULL, NULL);
00904 for (ll=0; ll<parms->num_shells; ll++) {
00905 h_shellinfo[ll*16 ] = parms->num_prim_per_shell[ll];
00906 h_shellinfo[ll*16 + 1] = parms->shell_symmetry[ll];
00907 }
00908 clEnqueueWriteBuffer(clcmdq, d_shellinfo, CL_TRUE, 0, padsz * sizeof(int), h_shellinfo, 0, NULL, NULL);
00909 free(h_shellinfo);
00910
00911 d_basis_array = clCreateBuffer(clctx, CL_MEM_READ_ONLY, padsz * sizeof(float), NULL, NULL);
00912 clEnqueueWriteBuffer(clcmdq, d_basis_array, CL_TRUE, 0, 2 * parms->num_basis * sizeof(float), h_basis_array_exp2f, 0, NULL, NULL);
00913
00914 padsz = computepaddedsize(3, MEMCOALESCE);
00915 d_origin = clCreateBuffer(clctx, CL_MEM_READ_ONLY, padsz * sizeof(float), NULL, NULL);
00916 clEnqueueWriteBuffer(clcmdq, d_origin, CL_TRUE, 0, 3 * sizeof(float), origin, 0, NULL, NULL);
00917
00918 d_numvoxels = clCreateBuffer(clctx, CL_MEM_READ_ONLY, padsz * sizeof(int), NULL, NULL);
00919 clEnqueueWriteBuffer(clcmdq, d_numvoxels, CL_TRUE, 0, 3 * sizeof(int), numvoxels, 0, NULL, NULL);
00920 }
00921
00922
00923 d_orbitalgrid = clCreateBuffer(clctx, CL_MEM_READ_WRITE, volmemsz, NULL, NULL);
00924 CLERR
00925
00926
00927 h_orbitalgrid = (float *) malloc(volmemsz);
00928
00929 #if 0
00930 if (threadid == 0) {
00931
00932 printf("atoms[%d] ", parms->numatoms);
00933 printf("wavef[%d] ", parms->num_wave_f);
00934 printf("basis[%d] ", parms->num_basis);
00935 printf("shell[%d] ", parms->num_shells);
00936 if (usefastconstkernel) {
00937 printf("GPU constant memory");
00938 } else {
00939 printf("GPU tiled shared memory:");
00940 }
00941 printf(" Gsz:%dx%d\n", (int) Gsz[0], (int) Gsz[1]);
00942 }
00943 #endif
00944
00945
00946 wkf_tasktile_t tile;
00947 int planesize = numvoxels[0] * numvoxels[1];
00948
00949 #if defined(USE_OPENCLDEVPOOL)
00950 while (wkf_threadpool_next_tile(voidparms, tilesize, &tile) != WKF_SCHED_DONE) {
00951 #else
00952 while (wkf_threadlaunch_next_tile(voidparms, tilesize, &tile) != WKF_SCHED_DONE) {
00953 #endif
00954 int k;
00955 for (k=tile.start; k<tile.end; k++) {
00956 origin[2] = parms->origin[2] + parms->voxelsize * k;
00957
00958
00959
00960 if (usefastconstkernel) {
00961 cl_event event;
00962 if (getenv("VMDMOVEC4")) {
00963 clerr |= clSetKernelArg(orbh->kconstmem_vec4, 0, sizeof(int), &parms->numatoms);
00964 clerr |= clSetKernelArg(orbh->kconstmem_vec4, 1, sizeof(cl_mem), &const_atompos);
00965 clerr |= clSetKernelArg(orbh->kconstmem_vec4, 2, sizeof(cl_mem), &const_atom_basis);
00966 clerr |= clSetKernelArg(orbh->kconstmem_vec4, 3, sizeof(cl_mem), &const_num_shells_per_atom);
00967 clerr |= clSetKernelArg(orbh->kconstmem_vec4, 4, sizeof(cl_mem), &const_basis_array);
00968 clerr |= clSetKernelArg(orbh->kconstmem_vec4, 5, sizeof(cl_mem), &const_num_prim_per_shell);
00969 clerr |= clSetKernelArg(orbh->kconstmem_vec4, 6, sizeof(cl_mem), &const_shell_symmetry);
00970 clerr |= clSetKernelArg(orbh->kconstmem_vec4, 7, sizeof(cl_mem), &const_wave_f);
00971 clerr |= clSetKernelArg(orbh->kconstmem_vec4, 8, sizeof(float), &parms->voxelsize);
00972 clerr |= clSetKernelArg(orbh->kconstmem_vec4, 9, sizeof(float), &origin[0]);
00973 clerr |= clSetKernelArg(orbh->kconstmem_vec4, 10, sizeof(float), &origin[1]);
00974 clerr |= clSetKernelArg(orbh->kconstmem_vec4, 11, sizeof(float), &origin[2]);
00975 clerr |= clSetKernelArg(orbh->kconstmem_vec4, 12, sizeof(cl_int), &density);
00976 clerr |= clSetKernelArg(orbh->kconstmem_vec4, 13, sizeof(cl_mem), &d_orbitalgrid);
00977 clerr = clEnqueueNDRangeKernel(clcmdq, orbh->kconstmem_vec4, 2, NULL, Gsz, Bsz, 0, NULL, &event);
00978 } else {
00979 clerr |= clSetKernelArg(orbh->kconstmem, 0, sizeof(int), &parms->numatoms);
00980 clerr |= clSetKernelArg(orbh->kconstmem, 1, sizeof(cl_mem), &const_atompos);
00981 clerr |= clSetKernelArg(orbh->kconstmem, 2, sizeof(cl_mem), &const_atom_basis);
00982 clerr |= clSetKernelArg(orbh->kconstmem, 3, sizeof(cl_mem), &const_num_shells_per_atom);
00983 clerr |= clSetKernelArg(orbh->kconstmem, 4, sizeof(cl_mem), &const_basis_array);
00984 clerr |= clSetKernelArg(orbh->kconstmem, 5, sizeof(cl_mem), &const_num_prim_per_shell);
00985 clerr |= clSetKernelArg(orbh->kconstmem, 6, sizeof(cl_mem), &const_shell_symmetry);
00986 clerr |= clSetKernelArg(orbh->kconstmem, 7, sizeof(cl_mem), &const_wave_f);
00987 clerr |= clSetKernelArg(orbh->kconstmem, 8, sizeof(float), &parms->voxelsize);
00988 clerr |= clSetKernelArg(orbh->kconstmem, 9, sizeof(float), &origin[0]);
00989 clerr |= clSetKernelArg(orbh->kconstmem, 10, sizeof(float), &origin[1]);
00990 clerr |= clSetKernelArg(orbh->kconstmem, 11, sizeof(float), &origin[2]);
00991 clerr |= clSetKernelArg(orbh->kconstmem, 12, sizeof(cl_int), &density);
00992 clerr |= clSetKernelArg(orbh->kconstmem, 13, sizeof(cl_mem), &d_orbitalgrid);
00993 clerr = clEnqueueNDRangeKernel(clcmdq, orbh->kconstmem, 2, NULL, Gsz, Bsz, 0, NULL, &event);
00994 }
00995 CLERR
00996 clerr |= clWaitForEvents(1, &event);
00997 clerr |= clReleaseEvent(event);
00998 CLERR
00999 } else {
01000 clerr |= clSetKernelArg(orbh->ktiledshared, 0, sizeof(int), &parms->numatoms);
01001 clerr |= clSetKernelArg(orbh->ktiledshared, 1, sizeof(cl_mem), &d_wave_f);
01002 clerr |= clSetKernelArg(orbh->ktiledshared, 2, sizeof(cl_mem), &d_basis_array);
01003 clerr |= clSetKernelArg(orbh->ktiledshared, 3, sizeof(cl_mem), &d_atominfo);
01004 clerr |= clSetKernelArg(orbh->ktiledshared, 4, sizeof(cl_mem), &d_shellinfo);
01005 clerr |= clSetKernelArg(orbh->ktiledshared, 5, sizeof(float), &parms->voxelsize);
01006 clerr |= clSetKernelArg(orbh->ktiledshared, 6, sizeof(float), &origin[0]);
01007 clerr |= clSetKernelArg(orbh->ktiledshared, 7, sizeof(float), &origin[1]);
01008 clerr |= clSetKernelArg(orbh->ktiledshared, 8, sizeof(float), &origin[2]);
01009 clerr |= clSetKernelArg(orbh->ktiledshared, 9, sizeof(cl_int), &density);
01010 clerr |= clSetKernelArg(orbh->ktiledshared, 10, sizeof(cl_mem), &d_orbitalgrid);
01011 cl_event event;
01012 clerr = clEnqueueNDRangeKernel(clcmdq, orbh->ktiledshared, 2, NULL, Gsz, Bsz, 0, NULL, &event);
01013 CLERR
01014 clerr |= clWaitForEvents(1, &event);
01015 clerr |= clReleaseEvent(event);
01016 CLERR
01017 }
01018 CLERR
01019
01020
01021 clEnqueueReadBuffer(clcmdq, d_orbitalgrid, CL_TRUE, 0, volmemsz, h_orbitalgrid, 0, NULL, NULL);
01022 CLERR
01023 clFinish(clcmdq);
01024
01025
01026 int y;
01027 for (y=0; y<numvoxels[1]; y++) {
01028 long orbaddr = k*planesize + y*numvoxels[0];
01029 memcpy(&parms->orbitalgrid[orbaddr], &h_orbitalgrid[y*volsize[0]], numvoxels[0] * sizeof(float));
01030 }
01031 }
01032 }
01033 clFinish(clcmdq);
01034
01035
01036
01037
01038 free(h_basis_array_exp2f);
01039 free(h_orbitalgrid);
01040
01041 if (usefastconstkernel) {
01042
01043 clReleaseMemObject(const_wave_f);
01044 clReleaseMemObject(const_atompos);
01045 clReleaseMemObject(const_atom_basis);
01046 clReleaseMemObject(const_num_shells_per_atom);
01047 clReleaseMemObject(const_basis_array);
01048 clReleaseMemObject(const_num_prim_per_shell);
01049 clReleaseMemObject(const_shell_symmetry);
01050 } else {
01051
01052 clReleaseMemObject(d_wave_f);
01053 clReleaseMemObject(d_basis_array);
01054 clReleaseMemObject(d_atominfo);
01055 clReleaseMemObject(d_shellinfo);
01056 clReleaseMemObject(d_numvoxels);
01057 clReleaseMemObject(d_origin);
01058 }
01059 clReleaseMemObject(d_orbitalgrid);
01060 CLERR
01061
01062 return NULL;
01063 }
01064
01065
01066
01067 int vmd_opencl_evaluate_orbital_grid(wkf_threadpool_t *devpool,
01068 vmd_opencl_orbital_handle *orbh,
01069 int numatoms,
01070 const float *wave_f, int num_wave_f,
01071 const float *basis_array, int num_basis,
01072 const float *atompos,
01073 const int *atom_basis,
01074 const int *num_shells_per_atom,
01075 const int *num_prim_per_shell,
01076 const int *shell_symmetry,
01077 int num_shells,
01078 const int *numvoxels,
01079 float voxelsize,
01080 const float *origin,
01081 int density,
01082 float *orbitalgrid) {
01083 int rc=0;
01084 orbthrparms parms;
01085
01086 parms.numatoms = numatoms;
01087 parms.wave_f = wave_f;
01088 parms.num_wave_f = num_wave_f;
01089 parms.basis_array = basis_array;
01090 parms.num_basis = num_basis;
01091 parms.atompos = atompos;
01092 parms.atom_basis = atom_basis;
01093 parms.num_shells_per_atom = num_shells_per_atom;
01094 parms.num_prim_per_shell = num_prim_per_shell;
01095 parms.shell_symmetry = shell_symmetry;
01096 parms.num_shells = num_shells;
01097 parms.numvoxels = numvoxels;
01098 parms.voxelsize = voxelsize;
01099 parms.origin = origin;
01100 parms.density = density;
01101 parms.orbitalgrid = orbitalgrid;
01102 parms.orbh = orbh;
01103
01104 wkf_tasktile_t tile;
01105 tile.start=0;
01106 tile.end=numvoxels[2];
01107
01108
01109
01110
01111
01112 #if defined(USE_OPENCLDEVPOOL)
01113 wkf_threadpool_sched_dynamic(devpool, &tile);
01114 wkf_threadpool_launch(devpool, openclorbitalthread, &parms, 1);
01115 #else
01116
01117 rc = wkf_threadlaunch(1, &parms, openclorbitalthread, &tile);
01118 #endif
01119
01120 return rc;
01121 }
01122
01123