Ignore:
Timestamp:
Aug 2, 2010, 9:27:30 PM (15 years ago)
Author:
Dmitry A. Kuminov
Message:

trunk: Merged in Qt 4.6.3 sources from branches/vendor/nokia/qt.

Location:
trunk
Files:
2 edited

Legend:

Unmodified
Added
Removed
  • trunk

  • trunk/src/gui/painting/qdrawhelper_neon.cpp

    r651 r769  
    4949QT_BEGIN_NAMESPACE
    5050
    51 static inline int16x8_t qvdiv_255_s16(int16x8_t x, int16x8_t half)
     51static inline uint16x8_t qvdiv_255_u16(uint16x8_t x, uint16x8_t half)
    5252{
    5353    // result = (x + (x >> 8) + 0x80) >> 8
    5454
    55     const int16x8_t temp = vshrq_n_s16(x, 8); // x >> 8
    56     const int16x8_t sum_part = vaddq_s16(x, half); // x + 0x80
    57     const int16x8_t sum = vaddq_s16(temp, sum_part);
    58 
    59     return vreinterpretq_s16_u16(vshrq_n_u16(vreinterpretq_u16_s16(sum), 8));
    60 }
    61 
    62 static inline int16x8_t qvbyte_mul_s16(int16x8_t x, int16x8_t alpha, int16x8_t half)
     55    const uint16x8_t temp = vshrq_n_u16(x, 8); // x >> 8
     56    const uint16x8_t sum_part = vaddq_u16(x, half); // x + 0x80
     57    const uint16x8_t sum = vaddq_u16(temp, sum_part);
     58
     59    return vshrq_n_u16(sum, 8);
     60}
     61
     62static inline uint16x8_t qvbyte_mul_u16(uint16x8_t x, uint16x8_t alpha, uint16x8_t half)
    6363{
    6464    // t = qRound(x * alpha / 255.0)
    6565
    66     const int16x8_t t = vmulq_s16(x, alpha); // t
    67     return qvdiv_255_s16(t, half);
    68 }
    69 
    70 static inline int16x8_t qvinterpolate_pixel_255(int16x8_t x, int16x8_t a, int16x8_t y, int16x8_t b, int16x8_t half)
     66    const uint16x8_t t = vmulq_u16(x, alpha); // t
     67    return qvdiv_255_u16(t, half);
     68}
     69
     70static inline uint16x8_t qvinterpolate_pixel_255(uint16x8_t x, uint16x8_t a, uint16x8_t y, uint16x8_t b, uint16x8_t half)
    7171{
    7272    // t = x * a + y * b
    7373
    74     const int16x8_t ta = vmulq_s16(x, a);
    75     const int16x8_t tb = vmulq_s16(y, b);
    76 
    77     return qvdiv_255_s16(vaddq_s16(ta, tb), half);
    78 }
    79 
    80 static inline int16x8_t qvsource_over_s16(int16x8_t src16, int16x8_t dst16, int16x8_t half, int16x8_t full)
    81 {
    82     const int16x4_t alpha16_high = vdup_lane_s16(vget_high_s16(src16), 3);
    83     const int16x4_t alpha16_low = vdup_lane_s16(vget_low_s16(src16), 3);
    84 
    85     const int16x8_t alpha16 = vsubq_s16(full, vcombine_s16(alpha16_low, alpha16_high));
    86 
    87     return vaddq_s16(src16, qvbyte_mul_s16(dst16, alpha16, half));
     74    const uint16x8_t ta = vmulq_u16(x, a);
     75    const uint16x8_t tb = vmulq_u16(y, b);
     76
     77    return qvdiv_255_u16(vaddq_u16(ta, tb), half);
     78}
     79
     80static inline uint16x8_t qvsource_over_u16(uint16x8_t src16, uint16x8_t dst16, uint16x8_t half, uint16x8_t full)
     81{
     82    const uint16x4_t alpha16_high = vdup_lane_u16(vget_high_u16(src16), 3);
     83    const uint16x4_t alpha16_low = vdup_lane_u16(vget_low_u16(src16), 3);
     84
     85    const uint16x8_t alpha16 = vsubq_u16(full, vcombine_u16(alpha16_low, alpha16_high));
     86
     87    return vaddq_u16(src16, qvbyte_mul_u16(dst16, alpha16, half));
    8888}
    8989
     
    9595    const uint *src = (const uint *) srcPixels;
    9696    uint *dst = (uint *) destPixels;
    97     int16x8_t half = vdupq_n_s16(0x80);
    98     int16x8_t full = vdupq_n_s16(0xff);
     97    uint16x8_t half = vdupq_n_u16(0x80);
     98    uint16x8_t full = vdupq_n_u16(0xff);
    9999    if (const_alpha == 256) {
    100100        for (int y = 0; y < h; ++y) {
    101101            int x = 0;
    102102            for (; x < w-3; x += 4) {
    103                 int32x4_t src32 = vld1q_s32((int32_t *)&src[x]);
     103                uint32x4_t src32 = vld1q_u32((uint32_t *)&src[x]);
    104104                if ((src[x] & src[x+1] & src[x+2] & src[x+3]) >= 0xff000000) {
    105105                    // all opaque
    106                     vst1q_s32((int32_t *)&dst[x], src32);
     106                    vst1q_u32((uint32_t *)&dst[x], src32);
    107107                } else if (src[x] | src[x+1] | src[x+2] | src[x+3]) {
    108                     int32x4_t dst32 = vld1q_s32((int32_t *)&dst[x]);
    109 
    110                     const uint8x16_t src8 = vreinterpretq_u8_s32(src32);
    111                     const uint8x16_t dst8 = vreinterpretq_u8_s32(dst32);
     108                    uint32x4_t dst32 = vld1q_u32((uint32_t *)&dst[x]);
     109
     110                    const uint8x16_t src8 = vreinterpretq_u8_u32(src32);
     111                    const uint8x16_t dst8 = vreinterpretq_u8_u32(dst32);
    112112
    113113                    const uint8x8_t src8_low = vget_low_u8(src8);
     
    117117                    const uint8x8_t dst8_high = vget_high_u8(dst8);
    118118
    119                     const int16x8_t src16_low = vreinterpretq_s16_u16(vmovl_u8(src8_low));
    120                     const int16x8_t dst16_low = vreinterpretq_s16_u16(vmovl_u8(dst8_low));
    121 
    122                     const int16x8_t src16_high = vreinterpretq_s16_u16(vmovl_u8(src8_high));
    123                     const int16x8_t dst16_high = vreinterpretq_s16_u16(vmovl_u8(dst8_high));
    124 
    125                     const int16x8_t result16_low = qvsource_over_s16(src16_low, dst16_low, half, full);
    126                     const int16x8_t result16_high = qvsource_over_s16(src16_high, dst16_high, half, full);
    127 
    128                     const int32x2_t result32_low = vreinterpret_s32_s8(vmovn_s16(result16_low));
    129                     const int32x2_t result32_high = vreinterpret_s32_s8(vmovn_s16(result16_high));
    130 
    131                     vst1q_s32((int32_t *)&dst[x], vcombine_s32(result32_low, result32_high));
     119                    const uint16x8_t src16_low = vmovl_u8(src8_low);
     120                    const uint16x8_t dst16_low = vmovl_u8(dst8_low);
     121
     122                    const uint16x8_t src16_high = vmovl_u8(src8_high);
     123                    const uint16x8_t dst16_high = vmovl_u8(dst8_high);
     124
     125                    const uint16x8_t result16_low = qvsource_over_u16(src16_low, dst16_low, half, full);
     126                    const uint16x8_t result16_high = qvsource_over_u16(src16_high, dst16_high, half, full);
     127
     128                    const uint32x2_t result32_low = vreinterpret_u32_u8(vmovn_u16(result16_low));
     129                    const uint32x2_t result32_high = vreinterpret_u32_u8(vmovn_u16(result16_high));
     130
     131                    vst1q_u32((uint32_t *)&dst[x], vcombine_u32(result32_low, result32_high));
    132132                }
    133133            }
     
    144144    } else if (const_alpha != 0) {
    145145        const_alpha = (const_alpha * 255) >> 8;
    146         int16x8_t const_alpha16 = vdupq_n_s16(const_alpha);
     146        uint16x8_t const_alpha16 = vdupq_n_u16(const_alpha);
    147147        for (int y = 0; y < h; ++y) {
    148148            int x = 0;
    149149            for (; x < w-3; x += 4) {
    150150                if (src[x] | src[x+1] | src[x+2] | src[x+3]) {
    151                     int32x4_t src32 = vld1q_s32((int32_t *)&src[x]);
    152                     int32x4_t dst32 = vld1q_s32((int32_t *)&dst[x]);
    153 
    154                     const uint8x16_t src8 = vreinterpretq_u8_s32(src32);
    155                     const uint8x16_t dst8 = vreinterpretq_u8_s32(dst32);
     151                    uint32x4_t src32 = vld1q_u32((uint32_t *)&src[x]);
     152                    uint32x4_t dst32 = vld1q_u32((uint32_t *)&dst[x]);
     153
     154                    const uint8x16_t src8 = vreinterpretq_u8_u32(src32);
     155                    const uint8x16_t dst8 = vreinterpretq_u8_u32(dst32);
    156156
    157157                    const uint8x8_t src8_low = vget_low_u8(src8);
     
    161161                    const uint8x8_t dst8_high = vget_high_u8(dst8);
    162162
    163                     const int16x8_t src16_low = vreinterpretq_s16_u16(vmovl_u8(src8_low));
    164                     const int16x8_t dst16_low = vreinterpretq_s16_u16(vmovl_u8(dst8_low));
    165 
    166                     const int16x8_t src16_high = vreinterpretq_s16_u16(vmovl_u8(src8_high));
    167                     const int16x8_t dst16_high = vreinterpretq_s16_u16(vmovl_u8(dst8_high));
    168 
    169                     const int16x8_t srcalpha16_low = qvbyte_mul_s16(src16_low, const_alpha16, half);
    170                     const int16x8_t srcalpha16_high = qvbyte_mul_s16(src16_high, const_alpha16, half);
    171 
    172                     const int16x8_t result16_low = qvsource_over_s16(srcalpha16_low, dst16_low, half, full);
    173                     const int16x8_t result16_high = qvsource_over_s16(srcalpha16_high, dst16_high, half, full);
    174 
    175                     const int32x2_t result32_low = vreinterpret_s32_s8(vmovn_s16(result16_low));
    176                     const int32x2_t result32_high = vreinterpret_s32_s8(vmovn_s16(result16_high));
    177 
    178                     vst1q_s32((int32_t *)&dst[x], vcombine_s32(result32_low, result32_high));
     163                    const uint16x8_t src16_low = vmovl_u8(src8_low);
     164                    const uint16x8_t dst16_low = vmovl_u8(dst8_low);
     165
     166                    const uint16x8_t src16_high = vmovl_u8(src8_high);
     167                    const uint16x8_t dst16_high = vmovl_u8(dst8_high);
     168
     169                    const uint16x8_t srcalpha16_low = qvbyte_mul_u16(src16_low, const_alpha16, half);
     170                    const uint16x8_t srcalpha16_high = qvbyte_mul_u16(src16_high, const_alpha16, half);
     171
     172                    const uint16x8_t result16_low = qvsource_over_u16(srcalpha16_low, dst16_low, half, full);
     173                    const uint16x8_t result16_high = qvsource_over_u16(srcalpha16_high, dst16_high, half, full);
     174
     175                    const uint32x2_t result32_low = vreinterpret_u32_u8(vmovn_u16(result16_low));
     176                    const uint32x2_t result32_high = vreinterpret_u32_u8(vmovn_u16(result16_high));
     177
     178                    vst1q_u32((uint32_t *)&dst[x], vcombine_u32(result32_low, result32_high));
    179179                }
    180180            }
     
    207207            const uint *src = (const uint *) srcPixels;
    208208            uint *dst = (uint *) destPixels;
    209             int16x8_t half = vdupq_n_s16(0x80);
     209            uint16x8_t half = vdupq_n_u16(0x80);
    210210            const_alpha = (const_alpha * 255) >> 8;
    211211            int one_minus_const_alpha = 255 - const_alpha;
    212             int16x8_t const_alpha16 = vdupq_n_s16(const_alpha);
    213             int16x8_t one_minus_const_alpha16 = vdupq_n_s16(255 - const_alpha);
     212            uint16x8_t const_alpha16 = vdupq_n_u16(const_alpha);
     213            uint16x8_t one_minus_const_alpha16 = vdupq_n_u16(255 - const_alpha);
    214214            for (int y = 0; y < h; ++y) {
    215215                int x = 0;
    216216                for (; x < w-3; x += 4) {
    217                     int32x4_t src32 = vld1q_s32((int32_t *)&src[x]);
    218                     int32x4_t dst32 = vld1q_s32((int32_t *)&dst[x]);
    219 
    220                     const uint8x16_t src8 = vreinterpretq_u8_s32(src32);
    221                     const uint8x16_t dst8 = vreinterpretq_u8_s32(dst32);
     217                    uint32x4_t src32 = vld1q_u32((uint32_t *)&src[x]);
     218                    uint32x4_t dst32 = vld1q_u32((uint32_t *)&dst[x]);
     219
     220                    const uint8x16_t src8 = vreinterpretq_u8_u32(src32);
     221                    const uint8x16_t dst8 = vreinterpretq_u8_u32(dst32);
    222222
    223223                    const uint8x8_t src8_low = vget_low_u8(src8);
     
    227227                    const uint8x8_t dst8_high = vget_high_u8(dst8);
    228228
    229                     const int16x8_t src16_low = vreinterpretq_s16_u16(vmovl_u8(src8_low));
    230                     const int16x8_t dst16_low = vreinterpretq_s16_u16(vmovl_u8(dst8_low));
    231 
    232                     const int16x8_t src16_high = vreinterpretq_s16_u16(vmovl_u8(src8_high));
    233                     const int16x8_t dst16_high = vreinterpretq_s16_u16(vmovl_u8(dst8_high));
    234 
    235                     const int16x8_t result16_low = qvinterpolate_pixel_255(src16_low, const_alpha16, dst16_low, one_minus_const_alpha16, half);
    236                     const int16x8_t result16_high = qvinterpolate_pixel_255(src16_high, const_alpha16, dst16_high, one_minus_const_alpha16, half);
    237 
    238                     const int32x2_t result32_low = vreinterpret_s32_s8(vmovn_s16(result16_low));
    239                     const int32x2_t result32_high = vreinterpret_s32_s8(vmovn_s16(result16_high));
    240 
    241                     vst1q_s32((int32_t *)&dst[x], vcombine_s32(result32_low, result32_high));
     229                    const uint16x8_t src16_low = vmovl_u8(src8_low);
     230                    const uint16x8_t dst16_low = vmovl_u8(dst8_low);
     231
     232                    const uint16x8_t src16_high = vmovl_u8(src8_high);
     233                    const uint16x8_t dst16_high = vmovl_u8(dst8_high);
     234
     235                    const uint16x8_t result16_low = qvinterpolate_pixel_255(src16_low, const_alpha16, dst16_low, one_minus_const_alpha16, half);
     236                    const uint16x8_t result16_high = qvinterpolate_pixel_255(src16_high, const_alpha16, dst16_high, one_minus_const_alpha16, half);
     237
     238                    const uint32x2_t result32_low = vreinterpret_u32_u8(vmovn_u16(result16_low));
     239                    const uint32x2_t result32_high = vreinterpret_u32_u8(vmovn_u16(result16_high));
     240
     241                    vst1q_u32((uint32_t *)&dst[x], vcombine_u32(result32_low, result32_high));
    242242                }
    243243                for (; x<w; ++x) {
Note: See TracChangeset for help on using the changeset viewer.