00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00022
00023
00024 #if defined(VMDCPUDISPATCH) && defined(VMDUSENEON)
00025 #include <arm_neon.h>
00026
00027 #include "WKFThreads.h"
00028
00029
00030 #include <math.h>
00031 #include <stdio.h>
00032 #include <stdlib.h>
00033 #include <stddef.h>
00034
00035
00036
00037
00038
00039
00040
00041
00042
00043
00044
// Unsigned integer type used to hold a pointer value for alignment
// arithmetic.  The "#elif 1" branch unconditionally selects unsigned long
// on every non-Win64 build, so the uintptr_t fallback is never reached.
#if defined(_WIN64)
#define myintptrtype size_t
#elif 1
#define myintptrtype unsigned long
#else
#define myintptrtype uintptr_t
#endif

// Test whether a pointer is aligned to a given power-of-two byte boundary.
//   ptr     - address to test
//   alignsz - alignment in bytes; the mask arithmetic is only meaningful
//             when this is a power of two
// Returns 1 when the low bits selected by (alignsz - 1) are all zero,
// otherwise 0.
static int test_alignment_Nbyte_powertwo(const void *ptr, unsigned int alignsz) {
  // Build the mask in the pointer-sized type.  Complementing a 32-bit
  // unsigned mask and then widening it zero-extends, which would clear the
  // upper address bits and misreport every high address as unaligned.
  const myintptrtype alignmask = ((myintptrtype) alignsz) - 1;
  return ((((myintptrtype) ptr) & alignmask) == 0);
}
00061
00062
00063
// Horizontal minimum: return the smallest of the four lanes of min4.
// Lanes are compared with '<' starting from lane 0.
static float fmin_f32x4(float32x4_t min4) {
  float lanes[4];
  int k;

  // spill the vector to memory so the lanes can be compared as scalars
  vst1q_f32(lanes, min4);

  float smallest = lanes[0];
  for (k = 1; k < 4; k++) {
    if (lanes[k] < smallest)
      smallest = lanes[k];
  }
  return smallest;
}
00072
// Horizontal maximum: return the largest of the four lanes of max4.
// Lanes are compared with '>' starting from lane 0.
static float fmax_f32x4(float32x4_t max4) {
  float lanes[4];
  int k;

  // spill the vector to memory so the lanes can be compared as scalars
  vst1q_f32(lanes, max4);

  float largest = lanes[0];
  for (k = 1; k < 4; k++) {
    if (lanes[k] > largest)
      largest = lanes[k];
  }
  return largest;
}
00081
// Horizontal add: return lane 0 + lane 1 of a two-lane double vector.
static double hadd_f64x2(float64x2_t sum2) {
  const double lane0 = vgetq_lane_f64(sum2, 0);
  const double lane1 = vgetq_lane_f64(sum2, 1);
  return lane0 + lane1;
}
00086
00087
00088
// Compute the minimum, maximum and arithmetic mean of an array of floats.
//   f     - input array of n floats
//   n     - element count; when n < 1 all three outputs are set to 0.0f
//   fmin  - receives the smallest element
//   fmax  - receives the largest element
//   fmean - receives the mean, accumulated in double precision
void minmaxmean_1fv_aligned_neon(const float *f, ptrdiff_t n,
                                 float *fmin, float *fmax, float *fmean) {
  if (n < 1) {
    *fmin = 0.0f;
    *fmax = 0.0f;
    *fmean = 0.0f;
    return;
  }

  float32x4_t minv = vdupq_n_f32(f[0]);
  float32x4_t maxv = minv;
  float64x2_t meanv = vdupq_n_f64(0.0);
  ptrdiff_t i = 0;

  // Full vectors only: stop while at least four elements remain so the
  // 128-bit load never reads past f[n-1].
  for (; i < (n - 3); i += 4) {
    float32x4_t tmp = vld1q_f32(&f[i]);
    minv = vminq_f32(minv, tmp);
    maxv = vmaxq_f32(maxv, tmp);
    // widen both halves to double so all four lanes contribute exactly
    // once to the running sum
    meanv = vaddq_f64(meanv, vcvt_f64_f32(vget_low_f32(tmp)));
    meanv = vaddq_f64(meanv, vcvt_f64_f32(vget_high_f32(tmp)));
  }

  // Remaining 0..3 elements: broadcast each one so min/max go through the
  // same vector operations, and sum them separately in double precision.
  double tailsum = 0.0;
  for (; i < n; i++) {
    float32x4_t tmp = vdupq_n_f32(f[i]);
    minv = vminq_f32(minv, tmp);
    maxv = vmaxq_f32(maxv, tmp);
    tailsum += (double) f[i];
  }

  *fmin = fmin_f32x4(minv);
  *fmax = fmax_f32x4(maxv);
  *fmean = (float) ((hadd_f64x2(meanv) + tailsum) / (double) n);
}
00114
00115
00116
00117
// Find the minimum and maximum of an array of floats.
//   f    - input array of n floats
//   n    - element count; when n < 1 the outputs are left untouched
//   fmin - receives the smallest element
//   fmax - receives the largest element
void minmax_1fv_aligned_neon(const float *f, ptrdiff_t n, float *fmin, float *fmax) {
  if (n < 1)
    return;

  float lo = f[0];
  float hi = f[0];
  ptrdiff_t pos = 0;
  int k;

  // Scalar prologue: consume elements one at a time until the current
  // address is reported 16-byte aligned (or the array is exhausted).
  while ((pos < n) && !test_alignment_Nbyte_powertwo(&f[pos], 16)) {
    if (f[pos] < lo) lo = f[pos];
    if (f[pos] > hi) hi = f[pos];
    pos++;
  }

  // Seed the vector accumulators with the scalar results so far.
  float32x4_t vlo = vdupq_n_f32(lo);
  float32x4_t vhi = vdupq_n_f32(hi);

  // Main loop: 32 floats (eight 4-lane vectors) per trip.  The inner loop
  // has a fixed trip count, so the compiler is free to unroll it.
  while (pos < (n - 31)) {
    for (k = 0; k < 32; k += 4) {
      const float32x4_t v = vld1q_f32(&f[pos + k]);
      vlo = vminq_f32(vlo, v);
      vhi = vmaxq_f32(vhi, v);
    }
    pos += 32;
  }

  // Leftover whole vectors, four floats at a time.
  while (pos < (n - 3)) {
    const float32x4_t v = vld1q_f32(&f[pos]);
    vlo = vminq_f32(vlo, v);
    vhi = vmaxq_f32(vhi, v);
    pos += 4;
  }

  // Final 0..3 elements: broadcast each into all lanes and fold it in.
  while (pos < n) {
    const float32x4_t v = vdupq_n_f32(f[pos]);
    vlo = vminq_f32(vlo, v);
    vhi = vmaxq_f32(vhi, v);
    pos++;
  }

  // Collapse the four lanes of each accumulator to a single value.
  *fmin = fmin_f32x4(vlo);
  *fmax = fmax_f32x4(vhi);
}
00185
00186
00187
00188
// Find the per-component minimum and maximum of an array of packed
// 3-float records (f[3*k], f[3*k+1], f[3*k+2]).
//   f    - input array holding n3 records, i.e. 3*n3 floats
//   n3   - record count; when n3 < 1 the outputs are left untouched
//   fmin - receives three values: the minimum of each component
//   fmax - receives three values: the maximum of each component
void minmax_3fv_aligned_neon(const float *f, const ptrdiff_t n3,
                             float *fmin, float *fmax) {
  if (n3 < 1)
    return;

  const ptrdiff_t total = n3 * 3L;  // number of floats in f
  ptrdiff_t pos = 0;
  int c;

  // Running scalar bounds, one slot per component, seeded from record 0.
  float lo[3], hi[3];
  for (c = 0; c < 3; c++)
    lo[c] = hi[c] = f[c];

  // Scalar prologue: walk whole records until the current record's
  // address is reported 16-byte aligned (or the array is exhausted).
  while ((pos < total) && !test_alignment_Nbyte_powertwo(&f[pos], 16)) {
    for (c = 0; c < 3; c++) {
      const float v = f[pos + c];
      if (v < lo[c]) lo[c] = v;
      if (v > hi[c]) hi[c] = v;
    }
    pos += 3;
  }

  // Seed one vector accumulator pair per component.
  float32x4_t vlo[3], vhi[3];
  for (c = 0; c < 3; c++) {
    vlo[c] = vdupq_n_f32(lo[c]);
    vhi[c] = vdupq_n_f32(hi[c]);
  }

  // Main loop: four records (12 floats) per trip.  vld3q_f32 performs a
  // de-interleaving load, so val[0], val[1] and val[2] each hold one
  // component of the four records.
  for (; pos < (total - 11); pos += 12) {
    const float32x4x3_t xyz = vld3q_f32(&f[pos]);
    for (c = 0; c < 3; c++) {
      vlo[c] = vminq_f32(vlo[c], xyz.val[c]);
      vhi[c] = vmaxq_f32(vhi[c], xyz.val[c]);
    }
  }

  // Collapse each accumulator back to a scalar bound.
  for (c = 0; c < 3; c++)
    lo[c] = fmin_f32x4(vlo[c]);
  for (c = 0; c < 3; c++)
    hi[c] = fmax_f32x4(vhi[c]);

  // Scalar epilogue: records that did not fill a whole 12-float trip.
  for (; pos < total; pos += 3) {
    for (c = 0; c < 3; c++) {
      const float v = f[pos + c];
      if (v < lo[c]) lo[c] = v;
      if (v > hi[c]) hi[c] = v;
    }
  }

  for (c = 0; c < 3; c++) {
    fmin[c] = lo[c];
    fmax[c] = hi[c];
  }
}
00275
00276 #endif // CPUDISPATCH+NEON
00277