00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00022
00023
00024
00025 #if defined(VMDCPUDISPATCH) && defined(VMDUSENEON)
00026 #include <arm_neon.h>
00027
00028 #include <stdio.h>
00029 #include <stdlib.h>
00030 #include <string.h>
00031 #include <math.h>
00032 #include "QuickSurf.h"
00033 #include "Inform.h"
00034 #include "utilities.h"
00035 #include "WKFUtils.h"
00036 #include "VolumetricData.h"
00037
00038 #include "VMDDisplayList.h"
00039 #include "Displayable.h"
00040 #include "DispCmds.h"
00041
00042 #define MIN(X,Y) (((X)<(Y))? (X) : (Y))
00043 #define MAX(X,Y) (((X)>(Y))? (X) : (Y))
00044
00045
00046
00047
00048
00049
00050
00051
00052
00053
00054
00055
00056
00057
00058
00059
00060
00061
00062
00063
00064
00065
00066
00067
00068
00069
00070
00071
00072
00073
00074
00075
00076
00077
00078
00079
00080
00081
00082
00083
00084
00085 #if defined(__GNUC__) && ! defined(__INTEL_COMPILER)
00086 #define __align(X) __attribute__((aligned(X) ))
00087 #else
00088 #define __align(X) __declspec(align(X) )
00089 #endif
00090
00091 #define MLOG2EF -1.44269504088896f
00092
00093
00094
00095
00096
00097 #define SCEXP0 1.0000000000000000f
00098 #define SCEXP1 0.6987082824680118f
00099 #define SCEXP2 0.2633174272827404f
00100 #define SCEXP3 0.0923611991471395f
00101 #define SCEXP4 0.0277520543324108f
00102
00103
00104 #define EXPOBIAS 127
00105 #define EXPOSHIFT 23
00106
00107
00108
00109 #define ACUTOFF -10
00110
00111 typedef union flint_t {
00112 float f;
00113 int n;
00114 } flint;
00115
00116
00117 typedef union NEONreg_t {
00118 float32x4_t f;
00119 int32x4_t i;
00120 struct {
00121 float r0, r1, r2, r3;
00122 } floatreg;
00123 } NEONreg;
00124
00125 void vmd_gaussdensity_neon(int verbose,
00126 int natoms, const float *xyzr,
00127 const float *atomicnum,
00128 const float *colors,
00129 float *densitymap, float *voltexmap,
00130 const int *numvoxels,
00131 float radscale, float gridspacing,
00132 float isovalue, float gausslim) {
00133 int i, x, y, z;
00134 int maxvoxel[3];
00135 maxvoxel[0] = numvoxels[0]-1;
00136 maxvoxel[1] = numvoxels[1]-1;
00137 maxvoxel[2] = numvoxels[2]-1;
00138 const float invgridspacing = 1.0f / gridspacing;
00139
00140
00141 float32x4_t gridspacing4_4;
00142 __attribute__((aligned(16))) float sxdelta4[4];
00143
00144 gridspacing4_4 = vdupq_n_f32(gridspacing * 4.0f);
00145 for (x=0; x<4; x++)
00146 sxdelta4[x] = ((float) x) * gridspacing;
00147
00148
00149 if (voltexmap != NULL) {
00150 float invisovalue = 1.0f / isovalue;
00151
00152 for (i=0; i<natoms; i++) {
00153 if (verbose && ((i & 0x3fff) == 0)) {
00154 printf(".");
00155 fflush(stdout);
00156 }
00157
00158 ptrdiff_t ind = i*4L;
00159 float scaledrad = xyzr[ind + 3] * radscale;
00160
00161
00162 float atomicnumfactor = 1.0f;
00163 if (atomicnum != NULL) {
00164 atomicnumfactor = atomicnum[i];
00165 }
00166
00167
00168 float arinv = -(1.0f/(2.0f*scaledrad*scaledrad)) * MLOG2EF;
00169 float radlim = gausslim * scaledrad;
00170 float radlim2 = radlim * radlim;
00171 radlim *= invgridspacing;
00172
00173 float32x4_t atomicnumfactor_4;
00174 float32x4_t arinv_4;
00175 atomicnumfactor_4 = vdupq_n_f32(atomicnumfactor);
00176
00177
00178 arinv_4 = vdupq_n_f32(arinv);
00179
00180 float tmp;
00181 tmp = xyzr[ind ] * invgridspacing;
00182 int xmin = MAX((int) (tmp - radlim), 0);
00183 int xmax = MIN((int) (tmp + radlim), maxvoxel[0]);
00184 tmp = xyzr[ind+1] * invgridspacing;
00185 int ymin = MAX((int) (tmp - radlim), 0);
00186 int ymax = MIN((int) (tmp + radlim), maxvoxel[1]);
00187 tmp = xyzr[ind+2] * invgridspacing;
00188 int zmin = MAX((int) (tmp - radlim), 0);
00189 int zmax = MIN((int) (tmp + radlim), maxvoxel[2]);
00190
00191 float dz = zmin*gridspacing - xyzr[ind+2];
00192 for (z=zmin; z<=zmax; z++,dz+=gridspacing) {
00193 float dy = ymin*gridspacing - xyzr[ind+1];
00194 for (y=ymin; y<=ymax; y++,dy+=gridspacing) {
00195 float dy2dz2 = dy*dy + dz*dz;
00196
00197
00198 if (dy2dz2 >= radlim2)
00199 continue;
00200
00201 ptrdiff_t addr = ptrdiff_t(z * numvoxels[0]) * ptrdiff_t(numvoxels[1]) + ptrdiff_t(y * numvoxels[0]);
00202 float dx = xmin*gridspacing - xyzr[ind];
00203 x=xmin;
00204
00205
00206
00207 {
00208 __align(16) NEONreg n;
00209 __align(16) NEONreg y;
00210 float32x4_t dy2dz2_4 = vdupq_n_f32(dy2dz2);
00211 float32x4_t dx_4 = vaddq_f32(vdupq_n_f32(dx), vld1q_f32(&sxdelta4[0]));
00212
00213 for (; (x+3)<=xmax; x+=4,dx_4=vaddq_f32(dx_4, gridspacing4_4)) {
00214 float32x4_t r2 = vfmaq_f32(dy2dz2_4, dx_4, dx_4);
00215 float32x4_t d;
00216
00217
00218 y.f = vmulq_f32(r2, arinv_4);
00219 n.i = vcvtq_s32_f32(y.f);
00220 d = vcvtq_f32_s32(n.i);
00221 d = vsubq_f32(d, y.f);
00222
00223
00224
00225 y.f = vfmaq_f32(vdupq_n_f32(SCEXP3), vdupq_n_f32(SCEXP4), d);
00226 y.f = vfmaq_f32(vdupq_n_f32(SCEXP2), d, y.f);
00227 y.f = vfmaq_f32(vdupq_n_f32(SCEXP1), d, y.f);
00228 y.f = vfmaq_f32(vdupq_n_f32(SCEXP0), d, y.f);
00229
00230
00231
00232 n.i = vsubq_s32(vdupq_n_s32(EXPOBIAS), n.i);
00233 n.i = vshlq_s32(n.i, vdupq_n_s32(EXPOSHIFT));
00234 y.f = vmulq_f32(y.f, n.f);
00235 y.f = vmulq_f32(y.f, atomicnumfactor_4);
00236
00237 float *ufptr = &densitymap[addr + x];
00238 d = vld1q_f32(ufptr);
00239 vst1q_f32(ufptr, vaddq_f32(d, y.f));
00240
00241
00242
00243
00244 d = vmulq_f32(y.f, vdupq_n_f32(invisovalue));
00245 ptrdiff_t caddr = (addr + x) * 3L;
00246
00247 #if 0
00248
00249 #else
00250
00251 float r, g, b;
00252 r = colors[ind ];
00253 g = colors[ind + 1];
00254 b = colors[ind + 2];
00255
00256 NEONreg tmp;
00257 tmp.f = d;
00258 float density;
00259 density = tmp.floatreg.r0;
00260 voltexmap[caddr ] += density * r;
00261 voltexmap[caddr + 1] += density * g;
00262 voltexmap[caddr + 2] += density * b;
00263
00264 density = tmp.floatreg.r1;
00265 voltexmap[caddr + 3] += density * r;
00266 voltexmap[caddr + 4] += density * g;
00267 voltexmap[caddr + 5] += density * b;
00268
00269 density = tmp.floatreg.r2;
00270 voltexmap[caddr + 6] += density * r;
00271 voltexmap[caddr + 7] += density * g;
00272 voltexmap[caddr + 8] += density * b;
00273
00274 density = tmp.floatreg.r3;
00275 voltexmap[caddr + 9] += density * r;
00276 voltexmap[caddr + 10] += density * g;
00277 voltexmap[caddr + 11] += density * b;
00278 #endif
00279 }
00280 }
00281
00282
00283 for (; x<=xmax; x++,dx+=gridspacing) {
00284 float r2 = dx*dx + dy2dz2;
00285
00286
00287 float mb = r2 * arinv;
00288 int mbflr = (int) mb;
00289 float d = mbflr - mb;
00290
00291
00292 float sy = SCEXP0 + d*(SCEXP1 + d*(SCEXP2 + d*(SCEXP3 + d*SCEXP4)));
00293
00294
00295 flint scalfac;
00296 scalfac.n = (EXPOBIAS - mbflr) << EXPOSHIFT;
00297
00298
00299 float density = (sy * scalfac.f);
00300
00301 density *= atomicnumfactor;
00302
00303
00304 densitymap[addr + x] += density;
00305
00306
00307
00308
00309 density *= invisovalue;
00310 ptrdiff_t caddr = (addr + x) * 3L;
00311
00312
00313 voltexmap[caddr ] += density * colors[ind ];
00314 voltexmap[caddr + 1] += density * colors[ind + 1];
00315 voltexmap[caddr + 2] += density * colors[ind + 2];
00316 }
00317 }
00318 }
00319 }
00320 } else {
00321
00322 for (i=0; i<natoms; i++) {
00323 if (verbose && ((i & 0x3fff) == 0)) {
00324 printf(".");
00325 fflush(stdout);
00326 }
00327
00328 ptrdiff_t ind = i*4L;
00329 float scaledrad = xyzr[ind+3] * radscale;
00330
00331
00332 float atomicnumfactor = 1.0f;
00333 if (atomicnum != NULL) {
00334 atomicnumfactor = atomicnum[i];
00335 }
00336
00337
00338 float arinv = -(1.0f/(2.0f*scaledrad*scaledrad)) * MLOG2EF;
00339 float radlim = gausslim * scaledrad;
00340 float radlim2 = radlim * radlim;
00341 radlim *= invgridspacing;
00342
00343 float32x4_t atomicnumfactor_4 = vdupq_n_f32(atomicnumfactor);
00344 float32x4_t arinv_4= vdupq_n_f32(arinv);
00345
00346 float tmp;
00347 tmp = xyzr[ind ] * invgridspacing;
00348 int xmin = MAX((int) (tmp - radlim), 0);
00349 int xmax = MIN((int) (tmp + radlim), maxvoxel[0]);
00350 tmp = xyzr[ind+1] * invgridspacing;
00351 int ymin = MAX((int) (tmp - radlim), 0);
00352 int ymax = MIN((int) (tmp + radlim), maxvoxel[1]);
00353 tmp = xyzr[ind+2] * invgridspacing;
00354 int zmin = MAX((int) (tmp - radlim), 0);
00355 int zmax = MIN((int) (tmp + radlim), maxvoxel[2]);
00356
00357 float dz = zmin*gridspacing - xyzr[ind+2];
00358 for (z=zmin; z<=zmax; z++,dz+=gridspacing) {
00359 float dy = ymin*gridspacing - xyzr[ind+1];
00360 for (y=ymin; y<=ymax; y++,dy+=gridspacing) {
00361 float dy2dz2 = dy*dy + dz*dz;
00362
00363
00364 if (dy2dz2 >= radlim2)
00365 continue;
00366
00367 ptrdiff_t addr = ptrdiff_t(z * numvoxels[0]) * ptrdiff_t(numvoxels[1]) + ptrdiff_t(y * numvoxels[0]);
00368 float dx = xmin*gridspacing - xyzr[ind];
00369 x=xmin;
00370
00371
00372
00373 {
00374 __align(16) NEONreg n;
00375 __align(16) NEONreg y;
00376 float32x4_t dy2dz2_4 = vdupq_n_f32(dy2dz2);
00377 float32x4_t dx_4 = vaddq_f32(vdupq_n_f32(dx), vld1q_f32(&sxdelta4[0]));
00378
00379 for (; (x+3)<=xmax; x+=4,dx_4=vaddq_f32(dx_4, gridspacing4_4)) {
00380 float32x4_t r2 = vfmaq_f32(dy2dz2_4, dx_4, dx_4);
00381 float32x4_t d;
00382
00383
00384 y.f = vmulq_f32(r2, arinv_4);
00385 n.i = vcvtq_s32_f32(y.f);
00386 d = vcvtq_f32_s32(n.i);
00387 d = vsubq_f32(d, y.f);
00388
00389
00390
00391 #if __ARM_FEATURE_FMA
00392
00393 y.f = vfmaq_f32(vdupq_n_f32(SCEXP3), vdupq_n_f32(SCEXP4), d);
00394 y.f = vfmaq_f32(vdupq_n_f32(SCEXP2), d, y.f);
00395 y.f = vfmaq_f32(vdupq_n_f32(SCEXP1), d, y.f);
00396 y.f = vfmaq_f32(vdupq_n_f32(SCEXP0), d, y.f);
00397 #else
00398 y.f = vmulq_f32(d, vdupq_n_f32(SCEXP4));
00399 y.f = vaddq_f32(y.f, vdupq_n_f32(SCEXP3));
00400 y.f = vmulq_f32(y.f, d);
00401 y.f = vaddq_f32(y.f, vdupq_n_f32(SCEXP2));
00402 y.f = vmulq_f32(y.f, d);
00403 y.f = vaddq_f32(y.f, vdupq_n_f32(SCEXP1));
00404 y.f = vmulq_f32(y.f, d);
00405 y.f = vaddq_f32(y.f, vdupq_n_f32(SCEXP0));
00406 #endif
00407
00408
00409
00410 n.i = vsubq_s32(vdupq_n_s32(EXPOBIAS), n.i);
00411 n.i = vshlq_s32(n.i, vdupq_n_s32(EXPOSHIFT));
00412 y.f = vmulq_f32(y.f, n.f);
00413 y.f = vmulq_f32(y.f, atomicnumfactor_4);
00414
00415 float *ufptr = &densitymap[addr + x];
00416 d = vld1q_f32(ufptr);
00417 vst1q_f32(ufptr, vaddq_f32(d, y.f));
00418 }
00419 }
00420
00421
00422 for (; x<=xmax; x++,dx+=gridspacing) {
00423 float r2 = dx*dx + dy2dz2;
00424
00425
00426 float mb = r2 * arinv;
00427 int mbflr = (int) mb;
00428 float d = mbflr - mb;
00429
00430
00431 float sy = SCEXP0 + d*(SCEXP1 + d*(SCEXP2 + d*(SCEXP3 + d*SCEXP4)));
00432
00433
00434 flint scalfac;
00435 scalfac.n = (EXPOBIAS - mbflr) << EXPOSHIFT;
00436
00437
00438 float density = (sy * scalfac.f);
00439
00440 density *= atomicnumfactor;
00441
00442 densitymap[addr + x] += density;
00443 }
00444 }
00445 }
00446 }
00447 }
00448 }
00449
00450
00451 #endif
00452