00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00021
00022
00023
00024
00025
00026
00027
00028
00029
00030 #if defined(VMDCPUDISPATCH) && defined(VMDUSENEON)
00031 #include <arm_neon.h>
00032
00033 #include <math.h>
00034 #include <stdio.h>
00035 #include "Orbital.h"
00036 #include "DrawMolecule.h"
00037 #include "utilities.h"
00038 #include "Inform.h"
00039 #include "WKFThreads.h"
00040 #include "WKFUtils.h"
00041 #include "ProfileHooks.h"
00042
00043 #define ANGS_TO_BOHR 1.88972612478289694072f
00044
00045
00046 #define __align(X) __attribute__((aligned(X) ))
00047
00048
00049 #define MLOG2EF -1.44269504088896f
00050
00051 #if 0
00052 static void print_float32x4_t(float32x4_t v) {
00053 __attribute__((aligned(16))) float tmp[4];
00054 vst1q_f32(&tmp[0], v);
00055
00056 printf("print_float32x4_t: ");
00057 int i;
00058 for (i=0; i<4; i++)
00059 printf("%g ", tmp[i]);
00060 printf("\n");
00061 }
00062
00063 static void print_int32x4_t(int32x4_t v) {
00064 __attribute__((aligned(16))) int tmp[4];
00065 vst1q_s32(&tmp[0], v);
00066
00067 printf("print_int32x4_t: ");
00068 int i;
00069 for (i=0; i<4; i++)
00070 printf("%d ", tmp[i]);
00071 printf("\n");
00072 }
00073
00074 static void print_hex32x4_t(int32x4_t v) {
00075 __attribute__((aligned(16))) int tmp[4];
00076 vst1q_s32(&tmp[0], v);
00077
00078 printf("print_hex32x4_t: ");
00079 int i;
00080 for (i=0; i<4; i++)
00081 printf("%08x ", tmp[i]);
00082 printf("\n");
00083 }
00084 #endif
00085
00086
00087
00088
00089
00090
00091
00092
00093
00094
00095
00096
00097 #define SCEXP0 1.0000000000000000f
00098 #define SCEXP1 0.6987082824680118f
00099 #define SCEXP2 0.2633174272827404f
00100 #define SCEXP3 0.0923611991471395f
00101 #define SCEXP4 0.0277520543324108f
00102
00103
00104 #define EXPOBIAS 127
00105 #define EXPOSHIFT 23
00106
00107
00108 #define ACUTOFF -10
00109
00110 typedef union NEONreg_t {
00111 float32x4_t f;
00112 int32x4_t i;
00113 } NEONreg;
00114
00115 float32x4_t aexpfnxneon(float32x4_t x) {
00116 __align(16) NEONreg scal;
00117
00118 #if 1
00119
00120
00121
00122
00123
00124
00125
00126
00127 float32x2_t tmp;
00128 tmp = vpmax_f32(vget_low_f32(x), vget_high_f32(x));
00129 tmp = vpmax_f32(tmp, tmp);
00130 float vmax = vget_lane_f32(tmp, 0);
00131 if (vmax < ACUTOFF) {
00132 return vdupq_n_f32(0.0f);
00133 }
00134 #endif
00135
00136 scal.f = vcvtq_f32_u32(vcgeq_f32(x, vdupq_n_f32(ACUTOFF)));
00137
00138
00139
00140
00141
00142
00143
00144 __align(16) NEONreg n;
00145 float32x4_t mb = vmulq_f32(x, vdupq_n_f32(MLOG2EF));
00146 n.i = vcvtq_s32_f32(mb);
00147 float32x4_t mbflr = vcvtq_f32_s32(n.i);
00148 float32x4_t d = vsubq_f32(mbflr, mb);
00149
00150
00151
00152 float32x4_t y;
00153 #if __ARM_FEATURE_FMA
00154 y = vfmaq_f32(vdupq_n_f32(SCEXP3), vdupq_n_f32(SCEXP4), d);
00155 y = vfmaq_f32(vdupq_n_f32(SCEXP2), d, y);
00156 y = vfmaq_f32(vdupq_n_f32(SCEXP1), d, y);
00157 y = vfmaq_f32(vdupq_n_f32(SCEXP0), d, y);
00158 #else
00159 y = vmulq_f32(d, vdupq_n_f32(SCEXP4));
00160 y = vaddq_f32(y, vdupq_n_f32(SCEXP3));
00161 y = vmulq_f32(y, d);
00162 y = vaddq_f32(y, vdupq_n_f32(SCEXP2));
00163 y = vmulq_f32(y, d);
00164 y = vaddq_f32(y, vdupq_n_f32(SCEXP1));
00165 y = vmulq_f32(y, d);
00166 y = vaddq_f32(y, vdupq_n_f32(SCEXP0));
00167 #endif
00168
00169
00170
00171 n.i = vsubq_s32(vdupq_n_s32(EXPOBIAS), n.i);
00172 n.i = vshlq_s32(n.i, vdupq_n_s32(EXPOSHIFT));
00173 scal.i = vandq_s32(scal.i, n.i);
00174 y = vmulq_f32(y, scal.f);
00175
00176 return y;
00177 }
00178
00179
00180
00181
00182
00183 int evaluate_grid_neon(int numatoms,
00184 const float *wave_f, const float *basis_array,
00185 const float *atompos,
00186 const int *atom_basis,
00187 const int *num_shells_per_atom,
00188 const int *num_prim_per_shell,
00189 const int *shell_types,
00190 const int *numvoxels,
00191 float voxelsize,
00192 const float *origin,
00193 int density,
00194 float * orbitalgrid) {
00195 if (!orbitalgrid)
00196 return -1;
00197
00198 int nx, ny, nz;
00199 __attribute__((aligned(16))) float sxdelta[4];
00200 for (nx=0; nx<4; nx++)
00201 sxdelta[nx] = ((float) nx) * voxelsize * ANGS_TO_BOHR;
00202
00203
00204
00205 int numgridxy = numvoxels[0]*numvoxels[1];
00206 for (nz=0; nz<numvoxels[2]; nz++) {
00207 float grid_x, grid_y, grid_z;
00208 grid_z = origin[2] + nz * voxelsize;
00209 for (ny=0; ny<numvoxels[1]; ny++) {
00210 grid_y = origin[1] + ny * voxelsize;
00211 int gaddrzy = ny*numvoxels[0] + nz*numgridxy;
00212 for (nx=0; nx<numvoxels[0]; nx+=4) {
00213 grid_x = origin[0] + nx * voxelsize;
00214
00215
00216
00217 int at;
00218 int prim, shell;
00219
00220
00221 float32x4_t value = vdupq_n_f32(0.0f);
00222
00223
00224 int ifunc = 0;
00225 int shell_counter = 0;
00226
00227
00228 for (at=0; at<numatoms; at++) {
00229 int maxshell = num_shells_per_atom[at];
00230 int prim_counter = atom_basis[at];
00231
00232
00233 float sxdist = (grid_x - atompos[3*at ])*ANGS_TO_BOHR;
00234 float sydist = (grid_y - atompos[3*at+1])*ANGS_TO_BOHR;
00235 float szdist = (grid_z - atompos[3*at+2])*ANGS_TO_BOHR;
00236
00237 float sydist2 = sydist*sydist;
00238 float szdist2 = szdist*szdist;
00239 float yzdist2 = sydist2 + szdist2;
00240
00241 float32x4_t xdelta = vld1q_f32(&sxdelta[0]);
00242 float32x4_t xdist = vdupq_n_f32(sxdist);
00243 xdist = vaddq_f32(xdist, xdelta);
00244 float32x4_t ydist = vdupq_n_f32(sydist);
00245 float32x4_t zdist = vdupq_n_f32(szdist);
00246 float32x4_t xdist2 = vmulq_f32(xdist, xdist);
00247 float32x4_t ydist2 = vmulq_f32(ydist, ydist);
00248 float32x4_t zdist2 = vmulq_f32(zdist, zdist);
00249 float32x4_t dist2 = vdupq_n_f32(yzdist2);
00250 dist2 = vaddq_f32(dist2, xdist2);
00251
00252
00253
00254
00255
00256
00257 for (shell=0; shell < maxshell; shell++) {
00258 float32x4_t contracted_gto = vdupq_n_f32(0.0f);
00259
00260
00261
00262
00263
00264
00265
00266
00267
00268 int maxprim = num_prim_per_shell[shell_counter];
00269 int shelltype = shell_types[shell_counter];
00270 for (prim=0; prim<maxprim; prim++) {
00271
00272 float exponent = -basis_array[prim_counter ];
00273 float contract_coeff = basis_array[prim_counter + 1];
00274
00275
00276 float32x4_t expval = vmulq_f32(vdupq_n_f32(exponent), dist2);
00277
00278 float32x4_t retval = aexpfnxneon(expval);
00279 contracted_gto = vfmaq_f32(contracted_gto, retval, vdupq_n_f32(contract_coeff));
00280
00281 prim_counter += 2;
00282 }
00283
00284
00285 float32x4_t tmpshell = vdupq_n_f32(0.0f);
00286 switch (shelltype) {
00287
00288 case S_SHELL:
00289 value = vfmaq_f32(value, contracted_gto, vdupq_n_f32(wave_f[ifunc++]));
00290 break;
00291
00292 case P_SHELL:
00293 tmpshell = vfmaq_f32(tmpshell, xdist, vdupq_n_f32(wave_f[ifunc++]));
00294 tmpshell = vfmaq_f32(tmpshell, ydist, vdupq_n_f32(wave_f[ifunc++]));
00295 tmpshell = vfmaq_f32(tmpshell, zdist, vdupq_n_f32(wave_f[ifunc++]));
00296 value = vfmaq_f32(value, contracted_gto, tmpshell);
00297 break;
00298
00299 case D_SHELL:
00300 tmpshell = vfmaq_f32(tmpshell, xdist2, vdupq_n_f32(wave_f[ifunc++]));
00301 tmpshell = vfmaq_f32(tmpshell, vmulq_f32(xdist, ydist), vdupq_n_f32(wave_f[ifunc++]));
00302 tmpshell = vfmaq_f32(tmpshell, ydist2, vdupq_n_f32(wave_f[ifunc++]));
00303 tmpshell = vfmaq_f32(tmpshell, vmulq_f32(xdist, zdist), vdupq_n_f32(wave_f[ifunc++]));
00304 tmpshell = vfmaq_f32(tmpshell, vmulq_f32(ydist, zdist), vdupq_n_f32(wave_f[ifunc++]));
00305 tmpshell = vfmaq_f32(tmpshell, zdist2, vdupq_n_f32(wave_f[ifunc++]));
00306 value = vfmaq_f32(value, contracted_gto, tmpshell);
00307 break;
00308
00309 case F_SHELL:
00310 tmpshell = vfmaq_f32(tmpshell, vmulq_f32(xdist2, xdist), vdupq_n_f32(wave_f[ifunc++]));
00311 tmpshell = vfmaq_f32(tmpshell, vmulq_f32(xdist2, ydist), vdupq_n_f32(wave_f[ifunc++]));
00312 tmpshell = vfmaq_f32(tmpshell, vmulq_f32(ydist2, xdist), vdupq_n_f32(wave_f[ifunc++]));
00313 tmpshell = vfmaq_f32(tmpshell, vmulq_f32(ydist2, ydist), vdupq_n_f32(wave_f[ifunc++]));
00314 tmpshell = vfmaq_f32(tmpshell, vmulq_f32(xdist2, zdist), vdupq_n_f32(wave_f[ifunc++]));
00315 tmpshell = vfmaq_f32(tmpshell, vmulq_f32(vmulq_f32(xdist, ydist), zdist), vdupq_n_f32(wave_f[ifunc++]));
00316 tmpshell = vfmaq_f32(tmpshell, vmulq_f32(ydist2, zdist), vdupq_n_f32(wave_f[ifunc++]));
00317 tmpshell = vfmaq_f32(tmpshell, vmulq_f32(zdist2, xdist), vdupq_n_f32(wave_f[ifunc++]));
00318 tmpshell = vfmaq_f32(tmpshell, vmulq_f32(zdist2, ydist), vdupq_n_f32(wave_f[ifunc++]));
00319 tmpshell = vfmaq_f32(tmpshell, vmulq_f32(zdist2, zdist), vdupq_n_f32(wave_f[ifunc++]));
00320 value = vfmaq_f32(value, contracted_gto, tmpshell);
00321 break;
00322
00323 #if 0
00324 default:
00325
00326 int i, j;
00327 float xdp, ydp, zdp;
00328 float xdiv = 1.0f / xdist;
00329 for (j=0, zdp=1.0f; j<=shelltype; j++, zdp*=zdist) {
00330 int imax = shelltype - j;
00331 for (i=0, ydp=1.0f, xdp=pow(xdist, imax); i<=imax; i++, ydp*=ydist, xdp*=xdiv) {
00332 tmpshell += wave_f[ifunc++] * xdp * ydp * zdp;
00333 }
00334 }
00335 value += tmpshell * contracted_gto;
00336 #endif
00337 }
00338
00339 shell_counter++;
00340 }
00341 }
00342
00343
00344 if (density) {
00345 float32x4_t mask = vcvtq_f32_u32(vcltq_f32(value, vdupq_n_f32(0.0f)));
00346 float32x4_t sqdensity = vmulq_f32(value, value);
00347 float32x4_t orbdensity = sqdensity;
00348 float32x4_t nsqdensity = vmulq_f32(sqdensity, mask);
00349 orbdensity = vsubq_f32(orbdensity, nsqdensity);
00350 orbdensity = vsubq_f32(orbdensity, nsqdensity);
00351 vst1q_f32(&orbitalgrid[gaddrzy + nx], orbdensity);
00352 } else {
00353 vst1q_f32(&orbitalgrid[gaddrzy + nx], value);
00354 }
00355 }
00356 }
00357 }
00358
00359 return 0;
00360 }
00361
00362 #endif
00363
00364