#include <stddef.h>
#include "WKFThreads.h"

#if !defined(__PGIC__)
#define VMDUSESSE 1   // enable SSE in combination with target macros
#endif

#if (defined(__i386__) || defined(__x86_64__) || defined(_M_IX86) || defined(_M_AMD64))
#if !defined(__SSE2__) && defined(_WIN64)
#define __SSE2__ 1    // MSVC doesn't define __SSE2__, but x64 targets always support it
#endif
#endif

#if defined(VMDUSESSE) && defined(__SSE2__)
#include <emmintrin.h>
#endif
#if defined(VMDUSEAVX) && defined(__AVX__)
#include <immintrin.h>
#endif
#if defined(VMDUSENEON) && defined(__ARM_NEON__)
#include <arm_neon.h>
#endif
#if (defined(VMDUSEVSX) && defined(__VSX__))
#if defined(__GNUC__) && defined(__VEC__)
#include <altivec.h>
#endif
#endif

#include <math.h>
#include <stdio.h>
#include <stdlib.h>

#if defined(_MSC_VER)
#include <windows.h>
#include <conio.h>
#else
#include <unistd.h>
#endif // _MSC_VER

// AVX/AVX2 implementations compiled in separate, specially-flagged
// translation units and selected at runtime by the dispatch routine below
int analyze_selection_aligned_avx(int n, const int *on,
                                  int *firstsel, int *lastsel, int *selected);
int analyze_selection_aligned_avx2(int n, const int *on,
                                   int *firstsel, int *lastsel, int *selected);

#if 0
// Unfinished helper left disabled in the source: fill an int array with a
// constant value.  Completed here as a minimal sketch; the SSE store needs
// a writable (non-const) destination array.
void set_1fv_aligned(int *iv, int n, const int val) {
  int i=0;

#if defined(VMDUSESSE) && defined(__SSE2__)
  __m128i val4 = _mm_set1_epi32(val);

  // store groups of four elements at a time
  for (; i<(n-3); i+=4) {
    _mm_store_si128((__m128i*) &iv[i], val4);
  }
#endif

  // finish off any remaining elements
  for (; i<n; i++)
    iv[i] = val;
}
#endif

#if defined(VMDUSESSE) || defined(VMDUSEAVX) || defined(VMDUSEVSX) || defined(VMDUSENEON)

// Integer type wide enough to hold a pointer value, used by the
// alignment test helpers below.
#if defined(_WIN64)
#define myintptrtype size_t
#elif 1
#define myintptrtype unsigned long
#else
#define myintptrtype uintptr_t
#endif

#if 0
// general N-byte pointer alignment test, currently unused
static int is_Nbyte_aligned(const void *ptr, int N) {
  return ((((myintptrtype) ptr) % N) == 0);
}
#endif

#if (defined(VMDUSESSE) && defined(__SSE2__)) || (defined(VMDUSEVSX) && defined(__VSX__)) || (defined(VMDUSEAVX) && defined(__AVX__)) || (defined(VMDUSENEON) && defined(__ARM_NEON__))
// test for 16-byte pointer alignment (SSE/VSX/NEON vector loads)
static int is_16byte_aligned(const void *ptr) {
  return (((myintptrtype) ptr) == (((myintptrtype) ptr) & (~0xf)));
}
#endif

#if defined(VMDUSEAVX)
// test for 32-byte pointer alignment (AVX vector loads)
static int is_32byte_aligned(const void *ptr) {
  return (((myintptrtype) ptr) == (((myintptrtype) ptr) & (~0x1f)));
}
#endif

#if 0
// test for 64-byte pointer alignment, currently unused
static int is_64byte_aligned(const void *ptr) {
  return (((myintptrtype) ptr) == (((myintptrtype) ptr) & (~0x3f)));
}
#endif
#endif
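
// A hypothetical sketch (not part of the original build) illustrating what
// the predicates above test: a pointer is N-byte aligned when the low-order
// bits of its address are all zero.
#if 0
static void example_alignment_checks(void) {
  float buf[8];
  printf("&buf[0] 16-byte aligned: %d\n", is_16byte_aligned(&buf[0]));
  printf("&buf[1] 16-byte aligned: %d\n", is_16byte_aligned(&buf[1]));  // 4 bytes past buf[0]
}
#endif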

#if defined(VMDUSESSE) && defined(__SSE2__)

#if 0
// helper routine to print an SSE integer vector for debugging
static void print_m128i(__m128i mask4) {
  int * iv = (int *) &mask4;
  printf("vec: %08x %08x %08x %08x\n", iv[0], iv[1], iv[2], iv[3]);
}

// horizontal AND of the four 32-bit lanes of an SSE integer vector
static int hand_m128i(__m128i mask4) {
  __m128i tmp = mask4;
  tmp = _mm_shuffle_epi32(tmp, _MM_SHUFFLE(2, 3, 0, 1));
  tmp = _mm_and_si128(mask4, tmp);
  mask4 = tmp;
  tmp = _mm_shuffle_epi32(tmp, _MM_SHUFFLE(1, 0, 3, 2));
  tmp = _mm_and_si128(mask4, tmp);
  mask4 = tmp;

  int mask = _mm_cvtsi128_si32(mask4);
  return mask;
}
#endif

// horizontal OR of the four 32-bit lanes of an SSE integer vector
static int hor_m128i(__m128i mask4) {
#if 0
  int mask = _mm_movemask_epi8(_mm_cmpeq_epi32(mask4, _mm_set1_epi32(1)));
#else
  __m128i tmp = mask4;
  tmp = _mm_shuffle_epi32(tmp, _MM_SHUFFLE(2, 3, 0, 1));
  tmp = _mm_or_si128(mask4, tmp);
  mask4 = tmp;
  tmp = _mm_shuffle_epi32(tmp, _MM_SHUFFLE(1, 0, 3, 2));
  tmp = _mm_or_si128(mask4, tmp);
  mask4 = tmp;

  int mask = _mm_cvtsi128_si32(mask4);
#endif
  return mask;
}

// horizontal add of the four 32-bit lanes of an SSE integer vector
static int hadd_m128i(__m128i sum4) {
  __m128i tmp = sum4;
  tmp = _mm_shuffle_epi32(tmp, _MM_SHUFFLE(2, 3, 0, 1));
  tmp = _mm_add_epi32(sum4, tmp);
  sum4 = tmp;
  tmp = _mm_shuffle_epi32(tmp, _MM_SHUFFLE(1, 0, 3, 2));
  tmp = _mm_add_epi32(sum4, tmp);
  sum4 = tmp;

  int sum = _mm_cvtsi128_si32(sum4);
  return sum;
}

#if 0
// branchless select: returns b where mask bits are set, a elsewhere
static __m128i _mm_sel_m128i(const __m128i &a, const __m128i &b, const __m128i &mask) {
  // (((b ^ a) & mask) ^ a)
  return _mm_xor_si128(a, _mm_and_si128(mask, _mm_xor_si128(b, a)));
}
#endif

// branchless select: returns b where mask bits are set, a elsewhere
static __m128 _mm_sel_ps(const __m128 &a, const __m128 &b, const __m128 &mask) {
  // (((b ^ a) & mask) ^ a)
  return _mm_xor_ps(a, _mm_and_ps(mask, _mm_xor_ps(b, a)));
}

// horizontal min of the four lanes of an SSE float vector
static float fmin_m128(__m128 min4) {
  __m128 tmp;
  tmp = min4;
  tmp = _mm_shuffle_ps(tmp, tmp, _MM_SHUFFLE(2, 3, 0, 1));
  tmp = _mm_min_ps(min4, tmp);
  min4 = tmp;
  tmp = _mm_shuffle_ps(tmp, tmp, _MM_SHUFFLE(1, 0, 3, 2));
  tmp = _mm_min_ps(min4, tmp);
  min4 = tmp;

  float fmin;
  _mm_store_ss(&fmin, min4);
  return fmin;
}

// horizontal max of the four lanes of an SSE float vector
static float fmax_m128(__m128 max4) {
  __m128 tmp = max4;
  tmp = _mm_shuffle_ps(tmp, tmp, _MM_SHUFFLE(2, 3, 0, 1));
  tmp = _mm_max_ps(max4, tmp);
  max4 = tmp;
  tmp = _mm_shuffle_ps(tmp, tmp, _MM_SHUFFLE(1, 0, 3, 2));
  tmp = _mm_max_ps(max4, tmp);
  max4 = tmp;

  float fmax;
  _mm_store_ss(&fmax, max4);
  return fmax;
}
#endif
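
// A hypothetical sketch (not part of the original build) showing how the SSE
// reduction helpers above are meant to be used: fold eight floats into one
// vector with packed min/max operations, then collapse each result to a
// scalar with fmin_m128() and fmax_m128().
#if 0
static void example_sse_reductions(void) {
  float vals[8] = { 3.0f, -1.5f, 7.25f, 0.0f, 2.0f, 9.5f, -4.0f, 1.0f };
  __m128 lo = _mm_loadu_ps(&vals[0]);   // unaligned loads keep the example simple
  __m128 hi = _mm_loadu_ps(&vals[4]);
  printf("min=%g max=%g\n",
         fmin_m128(_mm_min_ps(lo, hi)),
         fmax_m128(_mm_max_ps(lo, hi)));
}
#endif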

#if defined(VMDUSENEON) && defined(__ARM_NEON__)

// horizontal min of a NEON float vector
static float fmin_f32x4(float32x4_t min4) {
  float *f1 = (float *) &min4;
  float min1 = f1[0];
  if (f1[1] < min1) min1 = f1[1];
  if (f1[2] < min1) min1 = f1[2];
  if (f1[3] < min1) min1 = f1[3];
  return min1;
}

// horizontal max of a NEON float vector
static float fmax_f32x4(float32x4_t max4) {
  float *f1 = (float *) &max4;
  float max1 = f1[0];
  if (f1[1] > max1) max1 = f1[1];
  if (f1[2] > max1) max1 = f1[2];
  if (f1[3] > max1) max1 = f1[3];
  return max1;
}

#endif


// Find the index of the first set flag in the on[] array, scanning with
// SIMD loads once a 16-byte-aligned starting index is reached.
// Returns 0 on success, -1 if no flag is set.
int find_first_selection_aligned(int n, const int *on, int *firstsel) {
  int i;
  *firstsel = 0;

#if defined(VMDUSESSE) && defined(__SSE2__)
  // scan unaligned leading elements one at a time
  for (i=0; ((i<n) && !is_16byte_aligned(&on[i])); i++) {
    if (on[i]) {
      *firstsel = i;
      return 0;
    }
  }

  // scan aligned groups of four flags at a time
  for (; i<(n-3); i+=4) {
    __m128i on4 = _mm_load_si128((__m128i*) &on[i]);
    if (hor_m128i(on4))
      break;
  }

  // scan the remaining elements one at a time
  for (; i<n; i++) {
    if (on[i]) {
      *firstsel = i;
      return 0;
    }
  }
#elif 0 && (defined(VMDUSEVSX) && defined(__VSX__))
  // scan unaligned leading elements one at a time
  for (i=0; ((i<n) && !is_16byte_aligned(&on[i])); i++) {
    if (on[i]) {
      *firstsel = i;
      return 0;
    }
  }

  // scan aligned groups of four flags at a time
  for (; i<(n-3); i+=4) {
    __vector signed int on4 = *((__vector signed int *) &on[i]);
    if (vec_extract(vec_max(on4, on4), 0))
      break;
  }

  // scan the remaining elements one at a time
  for (; i<n; i++) {
    if (on[i]) {
      *firstsel = i;
      return 0;
    }
  }
#else
  // plain scalar scan
  for (i=0; i<n; i++) {
    if (on[i]) {
      *firstsel = i;
      return 0;
    }
  }
#endif

  // no selected elements were found
  *firstsel = 0;
  return -1;
}


// Find the index of the last set flag in the on[] array, scanning backward
// with SIMD loads once a 16-byte-aligned index is reached.
// Returns 0 on success, -1 if no flag is set.
int find_last_selection_aligned(int n, const int *on, int *lastsel) {
  int i;
  *lastsel = -1;

#if defined(VMDUSESSE) && defined(__SSE2__)
  // scan trailing elements one at a time until a 16-byte boundary is hit
  for (i=n-1; i>=0; i--) {
    if (on[i]) {
      *lastsel = i;
      return 0;
    }

    if (is_16byte_aligned(&on[i]))
      break;
  }

  // scan aligned groups of four flags at a time, moving backward
  for (i-=4; i>=0; i-=4) {
    __m128i on4 = _mm_load_si128((__m128i*) &on[i]);
    if (hor_m128i(on4))
      break;
  }

  // the matching group starts at i; find the last set flag within it
  int last4=i;
  for (i=last4+3; i>=last4; i--) {
    if (on[i]) {
      *lastsel = i;
      return 0;
    }
  }
#elif 0 && (defined(VMDUSEVSX) && defined(__VSX__))
  // scan trailing elements one at a time until a 16-byte boundary is hit
  for (i=n-1; i>=0; i--) {
    if (on[i]) {
      *lastsel = i;
      return 0;
    }

    if (is_16byte_aligned(&on[i]))
      break;
  }

  // scan aligned groups of four flags at a time, moving backward
  for (i-=4; i>=0; i-=4) {
    __vector signed int on4 = *((__vector signed int *) &on[i]);
    if (vec_extract(vec_max(on4, on4), 0))
      break;
  }

  // the matching group starts at i; find the last set flag within it
  int last4=i;
  for (i=last4+3; i>=last4; i--) {
    if (on[i]) {
      *lastsel = i;
      return 0;
    }
  }
#else
  // plain scalar backward scan
  for (i=n-1; i>=0; i--) {
    if (on[i]) {
      *lastsel = i;
      return 0;
    }
  }
#endif

  // no selected elements were found
  *lastsel = -1;
  return -1;
}
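
// A hypothetical usage sketch (not part of the original build) for the two
// scan routines above: locate the index range spanned by a selection flag
// array, where each entry of on[] is 1 for a selected atom and 0 otherwise.
#if 0
static void example_find_selection_range(void) {
  int on[16] = { 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0 };
  int first, last;
  if (!find_first_selection_aligned(16, on, &first) &&
      !find_last_selection_aligned(16, on, &last))
    printf("selection spans indices %d..%d\n", first, last);   // prints 3..9
}
#endif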



// Compute the first selected index, last selected index, and total count of
// set flags in on[].  Any of the output pointers may be NULL.
// Returns 0 on success, -1 if no flags are set.
int analyze_selection_aligned(int n, const int *on,
                              int *firstsel, int *lastsel, int *selected) {
  int sel   = 0;
  int first = 0;
  int last  = -1;
  int i;

  // set results to sane defaults in case of an early return
  if (selected != NULL)
    *selected = sel;

  if (firstsel != NULL)
    *firstsel = first;

  if (lastsel != NULL)
    *lastsel = last;

  // find the first selected atom, if any
  if ((firstsel != NULL) || (selected != NULL)) {
    if (find_first_selection_aligned(n, on, &first)) {
      return -1; // no selection found
    }
  }

  // find the last selected atom, if any
  if ((lastsel != NULL) || (selected != NULL)) {
    if (find_last_selection_aligned(n, on, &last)) {
      return -1; // no selection found
    }
  }

  // count the number of selected atoms between first and last
  if (selected != NULL) {
#if !defined(__INTEL_COMPILER) && defined(VMDUSESSE) && defined(__SSE2__)
    // add unaligned leading elements one at a time
    for (i=first; ((i<=last) && (!is_16byte_aligned(&on[i]))); i++) {
      sel += on[i];
    }

#if 1
    // accumulate aligned groups of four flags at a time
    __m128i sum4 = _mm_setzero_si128();
    for (; i<=(last-3); i+=4) {
      __m128i on4 = _mm_load_si128((__m128i*) &on[i]);
      sum4 = _mm_add_epi32(sum4, on4);
    }
    sel += hadd_m128i(sum4);
#else
    // reduce each aligned group to a scalar immediately
    for (; i<=(last-3); i+=4) {
      __m128i on4 = _mm_load_si128((__m128i*) &on[i]);
      sel += hadd_m128i(on4);
    }
#endif

    // add remaining elements one at a time
    for (; i<=last; i++) {
      sel += on[i];
    }
#elif 1 && (defined(VMDUSEVSX) && defined(__VSX__))
    // add unaligned leading elements one at a time
    for (i=first; ((i<=last) && (!is_16byte_aligned(&on[i]))); i++) {
      sel += on[i];
    }

    // accumulate aligned groups of four flags at a time
    vector signed int cnt4 = vec_splat_s32(0);
    for (; i<=(last-3); i+=4) {
      vector signed int on4 = *((__vector signed int *) &on[i]);
      cnt4 = vec_add(cnt4, on4);
    }
    sel += vec_extract(cnt4, 0) + vec_extract(cnt4, 1) +
           vec_extract(cnt4, 2) + vec_extract(cnt4, 3);

    // add remaining elements one at a time
    for (; i<=last; i++) {
      sel += on[i];
    }
#else
    // plain scalar count
    for (i=first; i<=last; i++) {
      sel += on[i];
    }
#endif
  }

  if (selected != NULL)
    *selected = sel;

  if (firstsel != NULL)
    *firstsel = first;

  if (lastsel != NULL)
    *lastsel = last;

  return 0;
}
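
// A hypothetical usage sketch (not part of the original build): compute the
// first index, last index, and population count of a selection flag array in
// one call; any output pointer that is not needed may be passed as NULL.
#if 0
static void example_analyze_selection(void) {
  int on[64];
  for (int j=0; j<64; j++)
    on[j] = (j >= 10 && j < 50) ? 1 : 0;   // mark atoms 10..49 as selected

  int first, last, count;
  if (analyze_selection_aligned(64, on, &first, &last, &count) == 0)
    printf("first=%d last=%d selected=%d\n", first, last, count);
  else
    printf("no atoms selected\n");
}
#endif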



// CPU-dispatched variant: if AVX/AVX2 are available at runtime, call the
// specialized implementations, otherwise fall back to the code below
// (identical to analyze_selection_aligned()).
int analyze_selection_aligned_dispatch(wkf_cpu_caps_t *cpucaps,
                                       int n, const int *on,
                                       int *firstsel, int *lastsel,
                                       int *selected) {
  int sel   = 0;
  int first = 0;
  int last  = -1;
  int i;

#if defined(VMDCPUDISPATCH)
  if (cpucaps != NULL) {
#if (defined(__i386__) || defined(__x86_64__) || defined(_M_IX86) || defined(_M_AMD64))
    if (cpucaps->flags & CPU_AVX2) {
      return analyze_selection_aligned_avx2(n, on, firstsel, lastsel, selected);
    }

    if (cpucaps->flags & CPU_AVX) {
      return analyze_selection_aligned_avx(n, on, firstsel, lastsel, selected);
    }
#endif
  }
#endif

  // set results to sane defaults in case of an early return
  if (selected != NULL)
    *selected = sel;

  if (firstsel != NULL)
    *firstsel = first;

  if (lastsel != NULL)
    *lastsel = last;

  // find the first selected atom, if any
  if ((firstsel != NULL) || (selected != NULL)) {
    if (find_first_selection_aligned(n, on, &first)) {
      return -1; // no selection found
    }
  }

  // find the last selected atom, if any
  if ((lastsel != NULL) || (selected != NULL)) {
    if (find_last_selection_aligned(n, on, &last)) {
      return -1; // no selection found
    }
  }

  // count the number of selected atoms between first and last
  if (selected != NULL) {
#if !defined(__INTEL_COMPILER) && defined(VMDUSESSE) && defined(__SSE2__)
    // add unaligned leading elements one at a time
    for (i=first; ((i<=last) && (!is_16byte_aligned(&on[i]))); i++) {
      sel += on[i];
    }

    // accumulate aligned groups of four flags at a time
    __m128i sum4 = _mm_setzero_si128();
    for (; i<=(last-3); i+=4) {
      __m128i on4 = _mm_load_si128((__m128i*) &on[i]);
      sum4 = _mm_add_epi32(sum4, on4);
    }
    sel += hadd_m128i(sum4);

    // add remaining elements one at a time
    for (; i<=last; i++) {
      sel += on[i];
    }
#elif 1 && (defined(VMDUSEVSX) && defined(__VSX__))
    // add unaligned leading elements one at a time
    for (i=first; ((i<=last) && (!is_16byte_aligned(&on[i]))); i++) {
      sel += on[i];
    }

    // accumulate aligned groups of four flags at a time
    vector signed int cnt4 = vec_splat_s32(0);
    for (; i<=(last-3); i+=4) {
      vector signed int on4 = *((__vector signed int *) &on[i]);
      cnt4 = vec_add(cnt4, on4);
    }
    sel += vec_extract(cnt4, 0) + vec_extract(cnt4, 1) +
           vec_extract(cnt4, 2) + vec_extract(cnt4, 3);

    // add remaining elements one at a time
    for (; i<=last; i++) {
      sel += on[i];
    }
#else
    // plain scalar count
    for (i=first; i<=last; i++) {
      sel += on[i];
    }
#endif
  }

  if (selected != NULL)
    *selected = sel;

  if (firstsel != NULL)
    *firstsel = first;

  if (lastsel != NULL)
    *lastsel = last;

  return 0;
}
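
// A hypothetical usage sketch (not part of the original build): passing NULL
// for cpucaps makes the dispatcher fall back to the built-in SSE/VSX/scalar
// path above; a real caller would pass the CPU capability record filled in by
// the WKFThreads CPU-detection code so the AVX/AVX2 variants can be selected.
#if 0
static void example_analyze_selection_dispatch(void) {
  int on[32];
  for (int j=0; j<32; j++)
    on[j] = (j & 1);                       // every other flag set

  int first, last, count;
  analyze_selection_aligned_dispatch(NULL, 32, on, &first, &last, &count);
  printf("first=%d last=%d selected=%d\n", first, last, count);
}
#endif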


// Compute the min, max, and mean of an array of floats in a single pass.
void minmaxmean_1fv_aligned(const float *f, ptrdiff_t n,
                            float *fmin, float *fmax, float *fmean) {
  if (n < 1) {
    *fmin  = 0.0f;
    *fmax  = 0.0f;
    *fmean = 0.0f;
    return;
  }

#if defined(VMDUSEAVX) && defined(__AVX__)
  ptrdiff_t i=0;
  float min1 = f[0];            // seed min/max from the first element
  float max1 = f[0];
  double mean1 = 0.0;           // mean accumulates from zero, in double precision,
                                // so each element is counted exactly once

  // handle unaligned leading elements one at a time
  for (i=0; ((i<n) && !is_32byte_aligned(&f[i])); i++) {
    if (f[i] < min1) min1 = f[i];
    if (f[i] > max1) max1 = f[i];
    mean1 += f[i];
  }

  // initialize AVX registers with the results so far
  __m256 min8 = _mm256_set1_ps(min1);
  __m256 max8 = _mm256_set1_ps(max1);
  __m256d mean4d = _mm256_set1_pd(0.0);

  // loop over groups of 64 elements, 8 at a time
  for (; i<(n-63); i+=64) {
    __m256 f8 = _mm256_load_ps(&f[i]);
    min8 = _mm256_min_ps(min8, f8);
    max8 = _mm256_max_ps(max8, f8);
    __m256 mean8 = f8;
    f8 = _mm256_load_ps(&f[i+8]);
    min8 = _mm256_min_ps(min8, f8);
    max8 = _mm256_max_ps(max8, f8);
    mean8 = _mm256_add_ps(mean8, f8);
    f8 = _mm256_load_ps(&f[i+16]);
    min8 = _mm256_min_ps(min8, f8);
    max8 = _mm256_max_ps(max8, f8);
    mean8 = _mm256_add_ps(mean8, f8);
    f8 = _mm256_load_ps(&f[i+24]);
    min8 = _mm256_min_ps(min8, f8);
    max8 = _mm256_max_ps(max8, f8);
    mean8 = _mm256_add_ps(mean8, f8);

    f8 = _mm256_load_ps(&f[i+32]);
    min8 = _mm256_min_ps(min8, f8);
    max8 = _mm256_max_ps(max8, f8);
    mean8 = _mm256_add_ps(mean8, f8);
    f8 = _mm256_load_ps(&f[i+40]);
    min8 = _mm256_min_ps(min8, f8);
    max8 = _mm256_max_ps(max8, f8);
    mean8 = _mm256_add_ps(mean8, f8);
    f8 = _mm256_load_ps(&f[i+48]);
    min8 = _mm256_min_ps(min8, f8);
    max8 = _mm256_max_ps(max8, f8);
    mean8 = _mm256_add_ps(mean8, f8);
    f8 = _mm256_load_ps(&f[i+56]);
    min8 = _mm256_min_ps(min8, f8);
    max8 = _mm256_max_ps(max8, f8);
    mean8 = _mm256_add_ps(mean8, f8);

    // convert the float partial sums to double and accumulate, limiting
    // the roundoff error that builds up over long arrays
    mean4d = _mm256_add_pd(mean4d, _mm256_cvtps_pd(_mm256_extractf128_ps(mean8, 0)));
    mean4d = _mm256_add_pd(mean4d, _mm256_cvtps_pd(_mm256_extractf128_ps(mean8, 1)));
  }

  // horizontal sum of the four double-precision partial sums
  __m256d pairs4d = _mm256_hadd_pd(mean4d, mean4d);
  mean4d = _mm256_add_pd(pairs4d,
                         _mm256_permute2f128_pd(pairs4d, pairs4d, 0x01));
#if defined(__AVX2__)
  mean1 += _mm256_cvtsd_f64(mean4d);
#else
  double tmp;
  _mm_storel_pd(&tmp, _mm256_castpd256_pd128(mean4d));
  mean1 += tmp;
#endif

  // handle remaining elements one at a time
  for (; i<n; i++) {
    __m256 f8 = _mm256_set1_ps(f[i]);
    min8 = _mm256_min_ps(min8, f8);
    max8 = _mm256_max_ps(max8, f8);
    mean1 += f[i];
  }

  // reduce the two 128-bit halves of the min/max vectors to scalars
  float t0, t1;
  t0 = fmin_m128(_mm256_extractf128_ps(min8, 0));
  t1 = fmin_m128(_mm256_extractf128_ps(min8, 1));
  *fmin = (t0 < t1) ? t0 : t1;

  t0 = fmax_m128(_mm256_extractf128_ps(max8, 0));
  t1 = fmax_m128(_mm256_extractf128_ps(max8, 1));
  *fmax = (t0 > t1) ? t0 : t1;
  *fmean = (float) (mean1 / n);
#elif defined(VMDUSESSE) && defined(__SSE2__) && (defined(__GNUC__) || defined(__INTEL_COMPILER))
  ptrdiff_t i=0;
  float min1 = f[0];            // seed min/max from the first element
  float max1 = f[0];
  double mean1 = 0.0;           // mean accumulates from zero, in double precision,
                                // so each element is counted exactly once

  // handle unaligned leading elements one at a time
  for (i=0; ((i<n) && !is_16byte_aligned(&f[i])); i++) {
    if (f[i] < min1) min1 = f[i];
    if (f[i] > max1) max1 = f[i];
    mean1 += f[i];
  }

  // initialize SSE registers with the results so far
  __m128 min4 = _mm_set_ps1(min1);
  __m128 max4 = _mm_set_ps1(max1);
#if defined(__clang__) && (__clang_major__ < 5)
  // workaround for older clang versions
  __m128d mean2d = _mm_cvtps_pd(_mm_set_ps1(0.0f));
#else
  __m128d mean2d = _mm_set1_pd(0.0);
#endif

  // loop over groups of 32 elements, 4 at a time
  for (; i<(n-31); i+=32) {
    __m128 f4 = _mm_load_ps(&f[i]);
    min4 = _mm_min_ps(min4, f4);
    max4 = _mm_max_ps(max4, f4);
    __m128 mean4 = f4;
    f4 = _mm_load_ps(&f[i+4]);
    min4 = _mm_min_ps(min4, f4);
    max4 = _mm_max_ps(max4, f4);
    mean4 = _mm_add_ps(mean4, f4);
    f4 = _mm_load_ps(&f[i+8]);
    min4 = _mm_min_ps(min4, f4);
    max4 = _mm_max_ps(max4, f4);
    mean4 = _mm_add_ps(mean4, f4);
    f4 = _mm_load_ps(&f[i+12]);
    min4 = _mm_min_ps(min4, f4);
    max4 = _mm_max_ps(max4, f4);
    mean4 = _mm_add_ps(mean4, f4);

    f4 = _mm_load_ps(&f[i+16]);
    min4 = _mm_min_ps(min4, f4);
    max4 = _mm_max_ps(max4, f4);
    mean4 = _mm_add_ps(mean4, f4);
    f4 = _mm_load_ps(&f[i+20]);
    min4 = _mm_min_ps(min4, f4);
    max4 = _mm_max_ps(max4, f4);
    mean4 = _mm_add_ps(mean4, f4);
    f4 = _mm_load_ps(&f[i+24]);
    min4 = _mm_min_ps(min4, f4);
    max4 = _mm_max_ps(max4, f4);
    mean4 = _mm_add_ps(mean4, f4);
    f4 = _mm_load_ps(&f[i+28]);
    min4 = _mm_min_ps(min4, f4);
    max4 = _mm_max_ps(max4, f4);
    mean4 = _mm_add_ps(mean4, f4);

    // convert the float partial sums to double and accumulate, limiting
    // the roundoff error that builds up over long arrays
    mean2d = _mm_add_pd(mean2d, _mm_cvtps_pd(mean4));
    mean2d = _mm_add_pd(mean2d, _mm_cvtps_pd(_mm_shuffle_ps(mean4, mean4, _MM_SHUFFLE(3, 2, 3, 2))));
  }

  // loop over remaining groups of 4 elements
  for (; i<(n-3); i+=4) {
    __m128 f4 = _mm_load_ps(&f[i]);
    min4 = _mm_min_ps(min4, f4);
    max4 = _mm_max_ps(max4, f4);

    mean2d = _mm_add_pd(mean2d, _mm_cvtps_pd(f4));
    mean2d = _mm_add_pd(mean2d, _mm_cvtps_pd(_mm_shuffle_ps(f4, f4, _MM_SHUFFLE(3, 2, 3, 2))));
  }

  // horizontal sum of the two double-precision partial sums
  mean2d = _mm_add_pd(mean2d, _mm_shuffle_pd(mean2d, mean2d, 1));
  double tmp;
  _mm_storel_pd(&tmp, mean2d);
  mean1 += tmp;

  // handle remaining elements one at a time
  for (; i<n; i++) {
    __m128 f4 = _mm_set_ps1(f[i]);
    min4 = _mm_min_ps(min4, f4);
    max4 = _mm_max_ps(max4, f4);
    mean1 += f[i];
  }

  // reduce the min/max vectors to scalars
  *fmin = fmin_m128(min4);
  *fmax = fmax_m128(max4);
  *fmean = (float) (mean1 / n);
#else
  // plain scalar loop
  float min1 = f[0];
  float max1 = f[0];
  double mean1 = f[0];
  for (ptrdiff_t i=1; i<n; i++) {
    if (f[i] < min1) min1 = f[i];
    if (f[i] > max1) max1 = f[i];
    mean1 += f[i];
  }
  *fmin = min1;
  *fmax = max1;
  *fmean = (float) (mean1 / n);
#endif
}
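
// A hypothetical usage sketch (not part of the original build): compute the
// minimum, maximum, and mean of a float array in a single pass.
#if 0
static void example_minmaxmean(void) {
  float data[1000];
  for (int j=0; j<1000; j++)
    data[j] = (float) sin(j * 0.01);

  float fmin, fmax, fmean;
  minmaxmean_1fv_aligned(data, 1000, &fmin, &fmax, &fmean);
  printf("min=%g max=%g mean=%g\n", fmin, fmax, fmean);
}
#endif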


// Compute the min and max of an array of floats.
void minmax_1fv_aligned(const float *f, ptrdiff_t n, float *fmin, float *fmax) {
  if (n < 1)
    return;

#if defined(VMDUSESSE) && defined(__SSE2__)
  ptrdiff_t i=0;
  float min1 = f[0];
  float max1 = f[0];

  // handle unaligned leading elements one at a time
  for (i=0; ((i<n) && !is_16byte_aligned(&f[i])); i++) {
    if (f[i] < min1) min1 = f[i];
    if (f[i] > max1) max1 = f[i];
  }

  // initialize SSE registers with the results so far
  __m128 min4 = _mm_set_ps1(min1);
  __m128 max4 = _mm_set_ps1(max1);

  // loop over groups of 32 elements, 4 at a time
  for (; i<(n-31); i+=32) {
    __m128 f4 = _mm_load_ps(&f[i]);
    min4 = _mm_min_ps(min4, f4);
    max4 = _mm_max_ps(max4, f4);
    f4 = _mm_load_ps(&f[i+4]);
    min4 = _mm_min_ps(min4, f4);
    max4 = _mm_max_ps(max4, f4);
    f4 = _mm_load_ps(&f[i+8]);
    min4 = _mm_min_ps(min4, f4);
    max4 = _mm_max_ps(max4, f4);
    f4 = _mm_load_ps(&f[i+12]);
    min4 = _mm_min_ps(min4, f4);
    max4 = _mm_max_ps(max4, f4);

    f4 = _mm_load_ps(&f[i+16]);
    min4 = _mm_min_ps(min4, f4);
    max4 = _mm_max_ps(max4, f4);
    f4 = _mm_load_ps(&f[i+20]);
    min4 = _mm_min_ps(min4, f4);
    max4 = _mm_max_ps(max4, f4);
    f4 = _mm_load_ps(&f[i+24]);
    min4 = _mm_min_ps(min4, f4);
    max4 = _mm_max_ps(max4, f4);
    f4 = _mm_load_ps(&f[i+28]);
    min4 = _mm_min_ps(min4, f4);
    max4 = _mm_max_ps(max4, f4);
  }

  // loop over remaining groups of 4 elements
  for (; i<(n-3); i+=4) {
    __m128 f4 = _mm_load_ps(&f[i]);
    min4 = _mm_min_ps(min4, f4);
    max4 = _mm_max_ps(max4, f4);
  }

  // handle remaining elements one at a time
  for (; i<n; i++) {
    __m128 f4 = _mm_set_ps1(f[i]);
    min4 = _mm_min_ps(min4, f4);
    max4 = _mm_max_ps(max4, f4);
  }

  // reduce the min/max vectors to scalars
  *fmin = fmin_m128(min4);
  *fmax = fmax_m128(max4);
#elif defined(VMDUSEVSX) && defined(__VSX__)
  ptrdiff_t i=0;
  float min1 = f[0];
  float max1 = f[0];

  // handle unaligned leading elements one at a time
  for (i=0; ((i<n) && !is_16byte_aligned(&f[i])); i++) {
    if (f[i] < min1) min1 = f[i];
    if (f[i] > max1) max1 = f[i];
  }

  // initialize VSX registers with the results so far
  vector float min4 = vec_splats(min1);
  vector float max4 = vec_splats(max1);

  // loop over groups of 4 elements
  for (; i<(n-3); i+=4) {
    vector float f4 = *((vector float *) &f[i]);
    min4 = vec_min(min4, f4);
    max4 = vec_max(max4, f4);
  }

  // handle remaining elements one at a time
  for (; i<n; i++) {
    vector float f4 = vec_splats(f[i]);
    min4 = vec_min(min4, f4);
    max4 = vec_max(max4, f4);
  }

  // reduce the min/max vectors to scalars
  min1 = min4[0];
  min1 = (min1 < min4[1]) ? min1 : min4[1];
  min1 = (min1 < min4[2]) ? min1 : min4[2];
  min1 = (min1 < min4[3]) ? min1 : min4[3];

  max1 = max4[0];
  max1 = (max1 > max4[1]) ? max1 : max4[1];
  max1 = (max1 > max4[2]) ? max1 : max4[2];
  max1 = (max1 > max4[3]) ? max1 : max4[3];

  *fmin = min1;
  *fmax = max1;
#elif defined(VMDUSENEON) && defined(__ARM_NEON__)
  ptrdiff_t i=0;
  float min1 = f[0];
  float max1 = f[0];

  // handle unaligned leading elements one at a time
  for (i=0; ((i<n) && !is_16byte_aligned(&f[i])); i++) {
    if (f[i] < min1) min1 = f[i];
    if (f[i] > max1) max1 = f[i];
  }

  // initialize NEON registers with the results so far
  float32x4_t min4 = vdupq_n_f32(min1);
  float32x4_t max4 = vdupq_n_f32(max1);

  // loop over groups of 32 elements, 4 at a time
  for (; i<(n-31); i+=32) {
    float32x4_t f4;
    f4 = vld1q_f32(&f[i   ]);
    min4 = vminq_f32(min4, f4);
    max4 = vmaxq_f32(max4, f4);
    f4 = vld1q_f32(&f[i+ 4]);
    min4 = vminq_f32(min4, f4);
    max4 = vmaxq_f32(max4, f4);
    f4 = vld1q_f32(&f[i+ 8]);
    min4 = vminq_f32(min4, f4);
    max4 = vmaxq_f32(max4, f4);
    f4 = vld1q_f32(&f[i+12]);
    min4 = vminq_f32(min4, f4);
    max4 = vmaxq_f32(max4, f4);

    f4 = vld1q_f32(&f[i+16]);
    min4 = vminq_f32(min4, f4);
    max4 = vmaxq_f32(max4, f4);
    f4 = vld1q_f32(&f[i+20]);
    min4 = vminq_f32(min4, f4);
    max4 = vmaxq_f32(max4, f4);
    f4 = vld1q_f32(&f[i+24]);
    min4 = vminq_f32(min4, f4);
    max4 = vmaxq_f32(max4, f4);
    f4 = vld1q_f32(&f[i+28]);
    min4 = vminq_f32(min4, f4);
    max4 = vmaxq_f32(max4, f4);
  }

  // loop over remaining groups of 4 elements
  for (; i<(n-3); i+=4) {
    float32x4_t f4 = vld1q_f32(&f[i]);
    min4 = vminq_f32(min4, f4);
    max4 = vmaxq_f32(max4, f4);
  }

  // handle remaining elements one at a time
  for (; i<n; i++) {
    float32x4_t f4 = vdupq_n_f32(f[i]);
    min4 = vminq_f32(min4, f4);
    max4 = vmaxq_f32(max4, f4);
  }

  // reduce the min/max vectors to scalars
  *fmin = fmin_f32x4(min4);
  *fmax = fmax_f32x4(max4);
#else
  // plain scalar loop
  float min1 = f[0];
  float max1 = f[0];
  for (ptrdiff_t i=1; i<n; i++) {
    if (f[i] < min1) min1 = f[i];
    if (f[i] > max1) max1 = f[i];
  }
  *fmin = min1;
  *fmax = max1;
#endif
}
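
// A hypothetical usage sketch (not part of the original build): the routine
// tolerates an arbitrarily aligned starting address because it peels scalar
// elements until a 16-byte boundary is reached before entering the vector
// loops, so a deliberately offset subarray still yields correct bounds.
#if 0
static void example_minmax_unaligned_start(void) {
  float data[101];
  for (int j=0; j<101; j++)
    data[j] = (float) (j % 17);

  float fmin, fmax;
  minmax_1fv_aligned(&data[1], 100, &fmin, &fmax);   // offset, likely unaligned start
  printf("min=%g max=%g\n", fmin, fmax);
}
#endif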


// Compute the componentwise min and max of an array of n3 xyz triplets
// stored contiguously as x0 y0 z0 x1 y1 z1 ...
void minmax_3fv_aligned(const float *f, const ptrdiff_t n3,
                        float *fmin, float *fmax) {
  float minx, maxx, miny, maxy, minz, maxz;
  const ptrdiff_t end = n3*3L;

  if (n3 < 1)
    return;

  ptrdiff_t i=0;
  minx=maxx=f[i  ];
  miny=maxy=f[i+1];
  minz=maxz=f[i+2];

#if defined(VMDUSESSE) && defined(__SSE2__)
  // handle unaligned leading triplets one at a time until a
  // 16-byte-aligned array index is reached
  for (; i<end; i+=3L) {
    if (is_16byte_aligned(&f[i])) {
      break;
    }

    float tmpx = f[i  ];
    if (tmpx < minx) minx = tmpx;
    if (tmpx > maxx) maxx = tmpx;

    float tmpy = f[i+1];
    if (tmpy < miny) miny = tmpy;
    if (tmpy > maxy) maxy = tmpy;

    float tmpz = f[i+2];
    if (tmpz < minz) minz = tmpz;
    if (tmpz > maxz) maxz = tmpz;
  }

  // initialize SSE registers with the results so far
  __m128 xmin4 = _mm_set_ps1(minx);
  __m128 xmax4 = _mm_set_ps1(maxx);
  __m128 ymin4 = _mm_set_ps1(miny);
  __m128 ymax4 = _mm_set_ps1(maxy);
  __m128 zmin4 = _mm_set_ps1(minz);
  __m128 zmax4 = _mm_set_ps1(maxz);

  // loop over four xyz triplets (12 floats) per iteration
  for (; i<(end-11); i+=12) {
    __m128 x0y0z0x1 = _mm_load_ps(&f[i  ]);
    __m128 y1z1x2y2 = _mm_load_ps(&f[i+4]);
    __m128 z2x3y3z3 = _mm_load_ps(&f[i+8]);

    // shuffle the interleaved AoS triplets into separate x, y, and z vectors
    __m128 x2y2x3y3 = _mm_shuffle_ps(y1z1x2y2, z2x3y3z3, _MM_SHUFFLE(2, 1, 3, 2));
    __m128 y0z0y1z1 = _mm_shuffle_ps(x0y0z0x1, y1z1x2y2, _MM_SHUFFLE(1, 0, 2, 1));
    __m128 x = _mm_shuffle_ps(x0y0z0x1, x2y2x3y3, _MM_SHUFFLE(2, 0, 3, 0));
    __m128 y = _mm_shuffle_ps(y0z0y1z1, x2y2x3y3, _MM_SHUFFLE(3, 1, 2, 0));
    __m128 z = _mm_shuffle_ps(y0z0y1z1, z2x3y3z3, _MM_SHUFFLE(3, 0, 3, 1));

    // update the per-component min/max vectors
    xmin4 = _mm_min_ps(xmin4, x);
    xmax4 = _mm_max_ps(xmax4, x);
    ymin4 = _mm_min_ps(ymin4, y);
    ymax4 = _mm_max_ps(ymax4, y);
    zmin4 = _mm_min_ps(zmin4, z);
    zmax4 = _mm_max_ps(zmax4, z);
  }

  // reduce the per-component vectors to scalars
  minx = fmin_m128(xmin4);
  miny = fmin_m128(ymin4);
  minz = fmin_m128(zmin4);

  maxx = fmax_m128(xmax4);
  maxy = fmax_m128(ymax4);
  maxz = fmax_m128(zmax4);
#endif

  // handle remaining triplets one at a time
  for (; i<end; i+=3) {
    float tmpx = f[i  ];
    if (tmpx < minx) minx = tmpx;
    if (tmpx > maxx) maxx = tmpx;

    float tmpy = f[i+1];
    if (tmpy < miny) miny = tmpy;
    if (tmpy > maxy) maxy = tmpy;

    float tmpz = f[i+2];
    if (tmpz < minz) minz = tmpz;
    if (tmpz > maxz) maxz = tmpz;
  }

  fmin[0] = minx;
  fmax[0] = maxx;
  fmin[1] = miny;
  fmax[1] = maxy;
  fmin[2] = minz;
  fmax[2] = maxz;
}
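
// A hypothetical usage sketch (not part of the original build): f holds n3
// xyz triplets stored contiguously (x0 y0 z0 x1 y1 z1 ...); fmin and fmax
// each receive three per-component bounds.
#if 0
static void example_minmax_3fv(void) {
  float xyz[300];                          // 100 atoms * 3 components
  for (int j=0; j<300; j++)
    xyz[j] = (float) cos(j * 0.1);

  float fmin[3], fmax[3];
  minmax_3fv_aligned(xyz, 100, fmin, fmax);
  printf("x:[%g %g] y:[%g %g] z:[%g %g]\n",
         fmin[0], fmax[0], fmin[1], fmax[1], fmin[2], fmax[2]);
}
#endif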


// Compute the componentwise min and max of the xyz triplets whose selection
// flag is set, restricted to the index range [firstsel, lastsel].
// Returns 0 on success, -1 if the parameters are out of range.
int minmax_selected_3fv_aligned(const float *f, const int *on,
                                const ptrdiff_t n3,
                                const ptrdiff_t firstsel,
                                const ptrdiff_t lastsel,
                                float *fmin, float *fmax) {
  float minx, maxx, miny, maxy, minz, maxz;

  if ((n3 < 1) || (firstsel < 0) || (lastsel < firstsel) || (lastsel >= n3))
    return -1;

  // initialize min/max from the first selected atom
  ptrdiff_t i=firstsel;
  minx=maxx=f[i*3L  ];
  miny=maxy=f[i*3L+1];
  minz=maxz=f[i*3L+2];

  ptrdiff_t end=lastsel+1;

#if defined(VMDUSESSE) && defined(__SSE2__)
  // handle leading atoms one at a time until the coordinate array index
  // reaches a 16-byte boundary
  for (; i<end; i++) {
    ptrdiff_t ind3 = i * 3L;

#if 1
    // Since firstsel can land anywhere, only the coordinate array alignment
    // is tested here; the flag array is loaded with unaligned loads below.
    if (is_16byte_aligned(&f[ind3])) {
      break;
    }
#else
    // require both the flag array and the coordinate array to be aligned
    if (is_16byte_aligned(&on[i]) && is_16byte_aligned(&f[ind3])) {
      break;
    }
#endif

    if (on[i]) {
      float tmpx = f[ind3  ];
      if (tmpx < minx) minx = tmpx;
      if (tmpx > maxx) maxx = tmpx;

      float tmpy = f[ind3+1];
      if (tmpy < miny) miny = tmpy;
      if (tmpy > maxy) maxy = tmpy;

      float tmpz = f[ind3+2];
      if (tmpz < minz) minz = tmpz;
      if (tmpz > maxz) maxz = tmpz;
    }
  }

  // initialize SSE registers with the results so far
  __m128 xmin4 = _mm_set_ps1(minx);
  __m128 xmax4 = _mm_set_ps1(maxx);
  __m128 ymin4 = _mm_set_ps1(miny);
  __m128 ymax4 = _mm_set_ps1(maxy);
  __m128 zmin4 = _mm_set_ps1(minz);
  __m128 zmax4 = _mm_set_ps1(maxz);

  // loop over four atoms (12 coordinate floats) per iteration
  for (; i<(end-3); i+=4) {
#if 1
    // unaligned load of four selection flags
    __m128i on4 = _mm_loadu_si128((__m128i*) &on[i]);
#else
    // aligned load of four selection flags
    __m128i on4 = _mm_load_si128((__m128i*) &on[i]);
#endif

    // build a lane mask from the flags (flags are expected to be 0 or 1)
    // and skip the group entirely if nothing is selected
    __m128i mask = _mm_cmpeq_epi32(_mm_set1_epi32(1), on4);
    if (!hor_m128i(mask))
      continue;

    // aligned load of the four xyz triplets
    ptrdiff_t ind3 = i * 3L;
    __m128 x0y0z0x1 = _mm_load_ps(&f[ind3+0]);
    __m128 y1z1x2y2 = _mm_load_ps(&f[ind3+4]);
    __m128 z2x3y3z3 = _mm_load_ps(&f[ind3+8]);

    // shuffle the interleaved AoS triplets into separate x, y, and z vectors
    __m128 x2y2x3y3 = _mm_shuffle_ps(y1z1x2y2, z2x3y3z3, _MM_SHUFFLE(2, 1, 3, 2));
    __m128 y0z0y1z1 = _mm_shuffle_ps(x0y0z0x1, y1z1x2y2, _MM_SHUFFLE(1, 0, 2, 1));
    __m128 x = _mm_shuffle_ps(x0y0z0x1, x2y2x3y3, _MM_SHUFFLE(2, 0, 3, 0));
    __m128 y = _mm_shuffle_ps(y0z0y1z1, x2y2x3y3, _MM_SHUFFLE(3, 1, 2, 0));
    __m128 z = _mm_shuffle_ps(y0z0y1z1, z2x3y3z3, _MM_SHUFFLE(3, 0, 3, 1));

    // update the min/max vectors only in the selected lanes, using the
    // branchless select helper
    xmin4 = _mm_sel_ps(xmin4, _mm_min_ps(xmin4, x), _mm_castsi128_ps(mask));
    xmax4 = _mm_sel_ps(xmax4, _mm_max_ps(xmax4, x), _mm_castsi128_ps(mask));
    ymin4 = _mm_sel_ps(ymin4, _mm_min_ps(ymin4, y), _mm_castsi128_ps(mask));
    ymax4 = _mm_sel_ps(ymax4, _mm_max_ps(ymax4, y), _mm_castsi128_ps(mask));
    zmin4 = _mm_sel_ps(zmin4, _mm_min_ps(zmin4, z), _mm_castsi128_ps(mask));
    zmax4 = _mm_sel_ps(zmax4, _mm_max_ps(zmax4, z), _mm_castsi128_ps(mask));
  }

  // reduce the per-component vectors to scalars
  minx = fmin_m128(xmin4);
  miny = fmin_m128(ymin4);
  minz = fmin_m128(zmin4);

  maxx = fmax_m128(xmax4);
  maxy = fmax_m128(ymax4);
  maxz = fmax_m128(zmax4);
#endif

  // handle remaining atoms one at a time
  for (; i<end; i++) {
    if (on[i]) {
      ptrdiff_t ind3 = i * 3L;
      float tmpx = f[ind3  ];
      if (tmpx < minx) minx = tmpx;
      if (tmpx > maxx) maxx = tmpx;

      float tmpy = f[ind3+1];
      if (tmpy < miny) miny = tmpy;
      if (tmpy > maxy) maxy = tmpy;

      float tmpz = f[ind3+2];
      if (tmpz < minz) minz = tmpz;
      if (tmpz > maxz) maxz = tmpz;
    }
  }

  fmin[0] = minx;
  fmax[0] = maxx;
  fmin[1] = miny;
  fmax[1] = maxy;
  fmin[2] = minz;
  fmax[2] = maxz;

  return 0;
}
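
// A hypothetical usage sketch (not part of the original build): compute
// per-component bounds over the selected atoms only.  The firstsel/lastsel
// indices would normally come from analyze_selection_aligned(), and the
// on[] flags are expected to be exactly 0 or 1.
#if 0
static void example_minmax_selected_3fv(void) {
  float xyz[300];
  int on[100];
  for (int j=0; j<100; j++) {
    on[j] = (j >= 25 && j < 75) ? 1 : 0;   // select the middle 50 atoms
    xyz[3*j  ] = (float) j;
    xyz[3*j+1] = (float) -j;
    xyz[3*j+2] = 0.5f * j;
  }

  float fmin[3], fmax[3];
  if (minmax_selected_3fv_aligned(xyz, on, 100, 25, 74, fmin, fmax) == 0)
    printf("x:[%g %g] y:[%g %g] z:[%g %g]\n",
           fmin[0], fmax[0], fmin[1], fmax[1], fmin[2], fmax[2]);
}
#endif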