Timestamp:
May 5, 2011, 5:36:53 AM
Author:
Dmitry A. Kuminov
Message:

trunk: Merged in qt 4.7.2 sources from branches/vendor/nokia/qt.

Location:
trunk
Files:
2 edited

Legend:

  (space)  Unmodified
  +        Added
  -        Removed
  • trunk

  • trunk/src/gui/painting/qdrawhelper_sse2.cpp

    r769 → r846
 /****************************************************************************
 **
-** Copyright (C) 2010 Nokia Corporation and/or its subsidiary(-ies).
+** Copyright (C) 2011 Nokia Corporation and/or its subsidiary(-ies).
 ** All rights reserved.
 ** Contact: Nokia Corporation (qt-info@nokia.com)
     
 #ifdef QT_HAVE_SSE2
 
+#include <private/qdrawingprimitive_sse2_p.h>
 #include <private/qpaintengine_raster_p.h>
 
-#ifdef QT_LINUXBASE
-// this is an evil hack - the posix_memalign declaration in LSB
-// is wrong - see http://bugs.linuxbase.org/show_bug.cgi?id=2431
-#  define posix_memalign _lsb_hack_posix_memalign
-#  include <emmintrin.h>
-#  undef posix_memalign
-#else
-#  include <emmintrin.h>
-#endif
-
 QT_BEGIN_NAMESPACE
-
-/*
- * Multiply the components of pixelVector by alphaChannel
- * Each 32bits components of alphaChannel must be in the form 0x00AA00AA
- * colorMask must have 0x00ff00ff on each 32 bits component
- * half must have the value 128 (0x80) for each 32 bits compnent
- */
-#define BYTE_MUL_SSE2(result, pixelVector, alphaChannel, colorMask, half) \
-{ \
-    /* 1. separate the colors in 2 vectors so each color is on 16 bits \
-       (in order to be multiplied by the alpha \
-       each 32 bit of dstVectorAG are in the form 0x00AA00GG \
-       each 32 bit of dstVectorRB are in the form 0x00RR00BB */\
-    __m128i pixelVectorAG = _mm_srli_epi16(pixelVector, 8); \
-    __m128i pixelVectorRB = _mm_and_si128(pixelVector, colorMask); \
- \
-    /* 2. multiply the vectors by the alpha channel */\
-    pixelVectorAG = _mm_mullo_epi16(pixelVectorAG, alphaChannel); \
-    pixelVectorRB = _mm_mullo_epi16(pixelVectorRB, alphaChannel); \
- \
-    /* 3. devide by 255, that's the tricky part. \
-       we do it like for BYTE_MUL(), with bit shift: X/255 ~= (X + X/256 + rounding)/256 */ \
-    /** so first (X + X/256 + rounding) */\
-    pixelVectorRB = _mm_add_epi16(pixelVectorRB, _mm_srli_epi16(pixelVectorRB, 8)); \
-    pixelVectorRB = _mm_add_epi16(pixelVectorRB, half); \
-    pixelVectorAG = _mm_add_epi16(pixelVectorAG, _mm_srli_epi16(pixelVectorAG, 8)); \
-    pixelVectorAG = _mm_add_epi16(pixelVectorAG, half); \
- \
-    /** second devide by 256 */\
-    pixelVectorRB = _mm_srli_epi16(pixelVectorRB, 8); \
-    /** for AG, we could >> 8 to divide followed by << 8 to put the \
-        bytes in the correct position. By masking instead, we execute \
-        only one instruction */\
-    pixelVectorAG = _mm_andnot_si128(colorMask, pixelVectorAG); \
- \
-    /* 4. combine the 2 pairs of colors */ \
-    result = _mm_or_si128(pixelVectorAG, pixelVectorRB); \
-}
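
The macro removed above (it now lives in the shared private header) vectorizes Qt's byte-wise multiply by relying on the integer approximation X/255 ≈ (X + X/256 + 0x80) / 256, so a shift can replace a real division. A minimal scalar sketch of that trick, with an illustrative helper name that is not part of the patch:

    // Scalar sketch of the divide-by-255 trick used in BYTE_MUL_SSE2.
    // For X = a * b with a, b in [0, 255], (X + (X >> 8) + 0x80) >> 8
    // matches X / 255 up to rounding, using only adds and shifts.
    static inline unsigned div255_sketch(unsigned x)
    {
        x += (x >> 8) + 0x80;   // X + X/256 + rounding
        return x >> 8;          // final division by 256
    }
    // Example: div255_sketch(200 * 128) == 100, and 200 * 128 / 255 == 100.
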
-
-/*
- * Each 32bits components of alphaChannel must be in the form 0x00AA00AA
- * oneMinusAlphaChannel must be 255 - alpha for each 32 bits component
- * colorMask must have 0x00ff00ff on each 32 bits component
- * half must have the value 128 (0x80) for each 32 bits compnent
- */
-#define INTERPOLATE_PIXEL_255_SSE2(result, srcVector, dstVector, alphaChannel, oneMinusAlphaChannel, colorMask, half) { \
-    /* interpolate AG */\
-    __m128i srcVectorAG = _mm_srli_epi16(srcVector, 8); \
-    __m128i dstVectorAG = _mm_srli_epi16(dstVector, 8); \
-    __m128i srcVectorAGalpha = _mm_mullo_epi16(srcVectorAG, alphaChannel); \
-    __m128i dstVectorAGoneMinusAlphalpha = _mm_mullo_epi16(dstVectorAG, oneMinusAlphaChannel); \
-    __m128i finalAG = _mm_add_epi16(srcVectorAGalpha, dstVectorAGoneMinusAlphalpha); \
-    finalAG = _mm_add_epi16(finalAG, _mm_srli_epi16(finalAG, 8)); \
-    finalAG = _mm_add_epi16(finalAG, half); \
-    finalAG = _mm_andnot_si128(colorMask, finalAG); \
- \
-    /* interpolate RB */\
-    __m128i srcVectorRB = _mm_and_si128(srcVector, colorMask); \
-    __m128i dstVectorRB = _mm_and_si128(dstVector, colorMask); \
-    __m128i srcVectorRBalpha = _mm_mullo_epi16(srcVectorRB, alphaChannel); \
-    __m128i dstVectorRBoneMinusAlphalpha = _mm_mullo_epi16(dstVectorRB, oneMinusAlphaChannel); \
-    __m128i finalRB = _mm_add_epi16(srcVectorRBalpha, dstVectorRBoneMinusAlphalpha); \
-    finalRB = _mm_add_epi16(finalRB, _mm_srli_epi16(finalRB, 8)); \
-    finalRB = _mm_add_epi16(finalRB, half); \
-    finalRB = _mm_srli_epi16(finalRB, 8); \
- \
-    /* combine */\
-    result = _mm_or_si128(finalAG, finalRB); \
-}
 
 void qt_blend_argb32_on_argb32_sse2(uchar *destPixels, int dbpl,
     
 {
     const quint32 *src = (const quint32 *) srcPixels;
-    quint32 *dst = (uint *) destPixels;
+    quint32 *dst = (quint32 *) destPixels;
     if (const_alpha == 256) {
         const __m128i alphaMask = _mm_set1_epi32(0xff000000);
     
         const __m128i colorMask = _mm_set1_epi32(0x00ff00ff);
         for (int y = 0; y < h; ++y) {
-            int x = 0;
-            for (; x < w-3; x += 4) {
-                const __m128i srcVector = _mm_loadu_si128((__m128i *)&src[x]);
-                const __m128i srcVectorAlpha = _mm_and_si128(srcVector, alphaMask);
-                if (_mm_movemask_epi8(_mm_cmpeq_epi32(srcVectorAlpha, alphaMask)) == 0xffff) {
-                    // all opaque
-                    _mm_storeu_si128((__m128i *)&dst[x], srcVector);
-                } else if (_mm_movemask_epi8(_mm_cmpeq_epi32(srcVectorAlpha, nullVector)) != 0xffff) {
-                    // not fully transparent
-                    // result = s + d * (1-alpha)
-
-                    // extract the alpha channel on 2 x 16 bits
-                    // so we have room for the multiplication
-                    // each 32 bits will be in the form 0x00AA00AA
-                    // with A being the 1 - alpha
-                    __m128i alphaChannel = _mm_srli_epi32(srcVector, 24);
-                    alphaChannel = _mm_or_si128(alphaChannel, _mm_slli_epi32(alphaChannel, 16));
-                    alphaChannel = _mm_sub_epi16(one, alphaChannel);
-
-                    const __m128i dstVector = _mm_loadu_si128((__m128i *)&dst[x]);
-                    __m128i destMultipliedByOneMinusAlpha;
-                    BYTE_MUL_SSE2(destMultipliedByOneMinusAlpha, dstVector, alphaChannel, colorMask, half);
-
-                    // result = s + d * (1-alpha)
-                    const __m128i result = _mm_add_epi8(srcVector, destMultipliedByOneMinusAlpha);
-                    _mm_storeu_si128((__m128i *)&dst[x], result);
-                }
-            }
-            for (; x<w; ++x) {
-                uint s = src[x];
-                if (s >= 0xff000000)
-                    dst[x] = s;
-                else if (s != 0)
-                    dst[x] = s + BYTE_MUL(dst[x], qAlpha(~s));
-            }
+            BLEND_SOURCE_OVER_ARGB32_SSE2(dst, src, w, nullVector, half, one, colorMask, alphaMask);
             dst = (quint32 *)(((uchar *) dst) + dbpl);
             src = (const quint32 *)(((const uchar *) src) + sbpl);
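
The loop body removed here (and its const-alpha variant in the next hunk) has been factored into the BLEND_SOURCE_OVER_ARGB32_SSE2 / BLEND_SOURCE_OVER_ARGB32_WITH_CONST_ALPHA_SSE2 macros in the newly included qdrawingprimitive_sse2_p.h, so the same code can also back comp_func_SourceOver_sse2 further down. Per pixel the operation is premultiplied source-over; a scalar reference sketch (helper name illustrative) using the same BYTE_MUL and qAlpha helpers the old epilogue loop used:

    // Premultiplied ARGB32 source-over for one pixel, the operation the
    // SSE2 macro applies four pixels at a time. The vector version also
    // skips whole groups that are fully opaque or fully transparent.
    static inline quint32 source_over_sketch(quint32 s, quint32 d)
    {
        if (s >= 0xff000000)                  // opaque source wins
            return s;
        if (s == 0)                           // transparent source: keep dest
            return d;
        return s + BYTE_MUL(d, qAlpha(~s));   // s + d * (1 - alpha)
    }
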
     
         const __m128i constAlphaVector = _mm_set1_epi16(const_alpha);
         for (int y = 0; y < h; ++y) {
-            int x = 0;
-            for (; x < w-3; x += 4) {
-                __m128i srcVector = _mm_loadu_si128((__m128i *)&src[x]);
-                if (_mm_movemask_epi8(_mm_cmpeq_epi32(srcVector, nullVector)) != 0xffff) {
-                    BYTE_MUL_SSE2(srcVector, srcVector, constAlphaVector, colorMask, half);
-
-                    __m128i alphaChannel = _mm_srli_epi32(srcVector, 24);
-                    alphaChannel = _mm_or_si128(alphaChannel, _mm_slli_epi32(alphaChannel, 16));
-                    alphaChannel = _mm_sub_epi16(one, alphaChannel);
-
-                    const __m128i dstVector = _mm_loadu_si128((__m128i *)&dst[x]);
-                    __m128i destMultipliedByOneMinusAlpha;
-                    BYTE_MUL_SSE2(destMultipliedByOneMinusAlpha, dstVector, alphaChannel, colorMask, half);
-
-                    const __m128i result = _mm_add_epi8(srcVector, destMultipliedByOneMinusAlpha);
-                    _mm_storeu_si128((__m128i *)&dst[x], result);
-                }
-            }
-            for (; x<w; ++x) {
-                quint32 s = src[x];
-                if (s != 0) {
-                    s = BYTE_MUL(s, const_alpha);
-                    dst[x] = s + BYTE_MUL(dst[x], qAlpha(~s));
-                }
-            }
+            BLEND_SOURCE_OVER_ARGB32_WITH_CONST_ALPHA_SSE2(dst, src, w, nullVector, half, one, colorMask, constAlphaVector)
             dst = (quint32 *)(((uchar *) dst) + dbpl);
             src = (const quint32 *)(((const uchar *) src) + sbpl);
     
 {
     const quint32 *src = (const quint32 *) srcPixels;
-    quint32 *dst = (uint *) destPixels;
+    quint32 *dst = (quint32 *) destPixels;
     if (const_alpha != 256) {
         if (const_alpha != 0) {
     
             for (int y = 0; y < h; ++y) {
                 int x = 0;
+
+                // First, align dest to 16 bytes:
+                ALIGNMENT_PROLOGUE_16BYTES(dst, x, w) {
+                    quint32 s = src[x];
+                    s = BYTE_MUL(s, const_alpha);
+                    dst[x] = INTERPOLATE_PIXEL_255(src[x], const_alpha, dst[x], one_minus_const_alpha);
+                }
+
                 for (; x < w-3; x += 4) {
                     __m128i srcVector = _mm_loadu_si128((__m128i *)&src[x]);
                     if (_mm_movemask_epi8(_mm_cmpeq_epi32(srcVector, nullVector)) != 0xffff) {
-                        const __m128i dstVector = _mm_loadu_si128((__m128i *)&dst[x]);
+                        const __m128i dstVector = _mm_load_si128((__m128i *)&dst[x]);
                         __m128i result;
                         INTERPOLATE_PIXEL_255_SSE2(result, srcVector, dstVector, constAlphaVector, oneMinusConstAlpha, colorMask, half);
-                        _mm_storeu_si128((__m128i *)&dst[x], result);
+                        _mm_store_si128((__m128i *)&dst[x], result);
                     }
                 }
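
The new prologue is what lets the vector loop switch from _mm_loadu_si128/_mm_storeu_si128 to the aligned _mm_load_si128/_mm_store_si128 on the destination. ALIGNMENT_PROLOGUE_16BYTES comes from qdrawingprimitive_sse2_p.h; the sketch below only illustrates the idea and is not the actual macro: handle pixels one at a time until dst + x sits on a 16-byte boundary.

    // Illustrative alignment prologue (assumed behaviour, not Qt's macro):
    // run the per-pixel body until the destination pointer is 16-byte
    // aligned, so the following SSE2 loop may use aligned loads/stores.
    #define ALIGNMENT_PROLOGUE_16BYTES_SKETCH(dest, index, length) \
        for (; (index) < (length) && \
               (reinterpret_cast<quintptr>((dest) + (index)) & 0xf); ++(index))
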
     
 }
 
+void QT_FASTCALL comp_func_SourceOver_sse2(uint *destPixels, const uint *srcPixels, int length, uint const_alpha)
+{
+    Q_ASSERT(const_alpha < 256);
+
+    const quint32 *src = (const quint32 *) srcPixels;
+    quint32 *dst = (quint32 *) destPixels;
+
+    const __m128i nullVector = _mm_set1_epi32(0);
+    const __m128i half = _mm_set1_epi16(0x80);
+    const __m128i one = _mm_set1_epi16(0xff);
+    const __m128i colorMask = _mm_set1_epi32(0x00ff00ff);
+    if (const_alpha == 255) {
+        const __m128i alphaMask = _mm_set1_epi32(0xff000000);
+        BLEND_SOURCE_OVER_ARGB32_SSE2(dst, src, length, nullVector, half, one, colorMask, alphaMask);
+    } else {
+        const __m128i constAlphaVector = _mm_set1_epi16(const_alpha);
+        BLEND_SOURCE_OVER_ARGB32_WITH_CONST_ALPHA_SSE2(dst, src, length, nullVector, half, one, colorMask, constAlphaVector);
+    }
+}
+
+void QT_FASTCALL comp_func_Plus_sse2(uint *dst, const uint *src, int length, uint const_alpha)
+{
+    int x = 0;
+
+    if (const_alpha == 255) {
+        // 1) Prologue: align destination on 16 bytes
+        ALIGNMENT_PROLOGUE_16BYTES(dst, x, length)
+            dst[x] = comp_func_Plus_one_pixel(dst[x], src[x]);
+
+        // 2) composition with SSE2
+        for (; x < length - 3; x += 4) {
+            const __m128i srcVector = _mm_loadu_si128((__m128i *)&src[x]);
+            const __m128i dstVector = _mm_load_si128((__m128i *)&dst[x]);
+
+            const __m128i result = _mm_adds_epu8(srcVector, dstVector);
+            _mm_store_si128((__m128i *)&dst[x], result);
+        }
+
+        // 3) Epilogue:
+        for (; x < length; ++x)
+            dst[x] = comp_func_Plus_one_pixel(dst[x], src[x]);
+    } else {
+        const int one_minus_const_alpha = 255 - const_alpha;
+        const __m128i constAlphaVector = _mm_set1_epi16(const_alpha);
+        const __m128i oneMinusConstAlpha =  _mm_set1_epi16(one_minus_const_alpha);
+
+        // 1) Prologue: align destination on 16 bytes
+        ALIGNMENT_PROLOGUE_16BYTES(dst, x, length)
+            dst[x] = comp_func_Plus_one_pixel_const_alpha(dst[x], src[x], const_alpha, one_minus_const_alpha);
+
+        const __m128i half = _mm_set1_epi16(0x80);
+        const __m128i colorMask = _mm_set1_epi32(0x00ff00ff);
+        // 2) composition with SSE2
+        for (; x < length - 3; x += 4) {
+            const __m128i srcVector = _mm_loadu_si128((__m128i *)&src[x]);
+            const __m128i dstVector = _mm_load_si128((__m128i *)&dst[x]);
+
+            __m128i result = _mm_adds_epu8(srcVector, dstVector);
+            INTERPOLATE_PIXEL_255_SSE2(result, result, dstVector, constAlphaVector, oneMinusConstAlpha, colorMask, half)
+            _mm_store_si128((__m128i *)&dst[x], result);
+        }
+
+        // 3) Epilogue:
+        for (; x < length; ++x)
+            dst[x] = comp_func_Plus_one_pixel_const_alpha(dst[x], src[x], const_alpha, one_minus_const_alpha);
+    }
+}
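
comp_func_Plus_sse2 maps the Plus composition mode onto _mm_adds_epu8, which adds 16 bytes at a time with unsigned saturation, while the scalar helpers comp_func_Plus_one_pixel / comp_func_Plus_one_pixel_const_alpha cover the unaligned head and the tail. Assuming those helpers clamp each channel at 255 (which is what the saturating SSE2 add does), a per-pixel sketch looks like this:

    // Illustrative scalar Plus: add the four 8-bit channels of two
    // premultiplied pixels and clamp each one at 255, mirroring what
    // _mm_adds_epu8 does for sixteen bytes at once.
    static inline quint32 plus_one_pixel_sketch(quint32 d, quint32 s)
    {
        quint32 result = 0;
        for (int shift = 0; shift < 32; shift += 8) {
            quint32 sum = ((d >> shift) & 0xff) + ((s >> shift) & 0xff);
            result |= (sum > 255 ? 255 : sum) << shift;
        }
        return result;
    }
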
+
+void QT_FASTCALL comp_func_Source_sse2(uint *dst, const uint *src, int length, uint const_alpha)
+{
+    if (const_alpha == 255) {
+        ::memcpy(dst, src, length * sizeof(uint));
+    } else {
+        const int ialpha = 255 - const_alpha;
+
+        int x = 0;
+
+        // 1) prologue, align on 16 bytes
+        ALIGNMENT_PROLOGUE_16BYTES(dst, x, length)
+            dst[x] = INTERPOLATE_PIXEL_255(src[x], const_alpha, dst[x], ialpha);
+
+        // 2) interpolate pixels with SSE2
+        const __m128i half = _mm_set1_epi16(0x80);
+        const __m128i colorMask = _mm_set1_epi32(0x00ff00ff);
+        const __m128i constAlphaVector = _mm_set1_epi16(const_alpha);
+        const __m128i oneMinusConstAlpha =  _mm_set1_epi16(ialpha);
+        for (; x < length - 3; x += 4) {
+            const __m128i srcVector = _mm_loadu_si128((__m128i *)&src[x]);
+            __m128i dstVector = _mm_load_si128((__m128i *)&dst[x]);
+            INTERPOLATE_PIXEL_255_SSE2(dstVector, srcVector, dstVector, constAlphaVector, oneMinusConstAlpha, colorMask, half)
+            _mm_store_si128((__m128i *)&dst[x], dstVector);
+        }
+
+        // 3) Epilogue
+        for (; x < length; ++x)
+            dst[x] = INTERPOLATE_PIXEL_255(src[x], const_alpha, dst[x], ialpha);
+    }
+}
+
 void qt_memfill32_sse2(quint32 *dest, quint32 value, int count)
 {
     
     int n = (count128 + 3) / 4;
     switch (count128 & 0x3) {
-    case 0: do { _mm_store_si128(dst128++, value128);
-    case 3:      _mm_store_si128(dst128++, value128);
-    case 2:      _mm_store_si128(dst128++, value128);
-    case 1:      _mm_store_si128(dst128++, value128);
+    case 0: do { _mm_stream_si128(dst128++, value128);
+    case 3:      _mm_stream_si128(dst128++, value128);
+    case 2:      _mm_stream_si128(dst128++, value128);
+    case 1:      _mm_stream_si128(dst128++, value128);
     } while (--n > 0);
     }
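
The fill loop now issues _mm_stream_si128 instead of _mm_store_si128. Streaming (non-temporal) stores write around the cache, which helps large fills whose destination is not read back immediately; such stores are normally followed by a store fence. A minimal sketch under those assumptions (this is not the full qt_memfill32_sse2):

    #include <emmintrin.h>

    // Non-temporal fill of count128 16-byte blocks. Assumes dst128 is
    // 16-byte aligned, as qt_memfill32_sse2 arranges before its main loop.
    static void stream_fill_sketch(__m128i *dst128, __m128i value128, int count128)
    {
        for (int i = 0; i < count128; ++i)
            _mm_stream_si128(dst128 + i, value128);  // bypass the cache
        _mm_sfence();  // order the streaming stores before later reads/writes
    }
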
     
     }
 }
+
+void QT_FASTCALL comp_func_solid_SourceOver_sse2(uint *destPixels, int length, uint color, uint const_alpha)
+{
+    if ((const_alpha & qAlpha(color)) == 255) {
+        qt_memfill32_sse2(destPixels, color, length);
+    } else {
+        if (const_alpha != 255)
+            color = BYTE_MUL(color, const_alpha);
+
+        const quint32 minusAlphaOfColor = qAlpha(~color);
+        int x = 0;
+
+        quint32 *dst = (quint32 *) destPixels;
+        const __m128i colorVector = _mm_set1_epi32(color);
+        const __m128i colorMask = _mm_set1_epi32(0x00ff00ff);
+        const __m128i half = _mm_set1_epi16(0x80);
+        const __m128i minusAlphaOfColorVector = _mm_set1_epi16(minusAlphaOfColor);
+
+        ALIGNMENT_PROLOGUE_16BYTES(dst, x, length)
+            destPixels[x] = color + BYTE_MUL(destPixels[x], minusAlphaOfColor);
+
+        for (; x < length-3; x += 4) {
+            __m128i dstVector = _mm_load_si128((__m128i *)&dst[x]);
+            BYTE_MUL_SSE2(dstVector, dstVector, minusAlphaOfColorVector, colorMask, half);
+            dstVector = _mm_add_epi8(colorVector, dstVector);
+            _mm_store_si128((__m128i *)&dst[x], dstVector);
+        }
+        for (;x < length; ++x)
+            destPixels[x] = color + BYTE_MUL(destPixels[x], minusAlphaOfColor);
+    }
+}
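
comp_func_solid_SourceOver_sse2 blends one premultiplied solid color over the whole span. The test (const_alpha & qAlpha(color)) == 255 can only hold when both bytes are 0xff, i.e. the fill is fully opaque, in which case the blend collapses to qt_memfill32_sse2; otherwise every destination pixel becomes color + dst * (255 - alpha(color)) / 255. A scalar view of that per-pixel step (sketch, not the Qt code):

    static inline quint32 solid_source_over_sketch(quint32 d, quint32 premultipliedColor)
    {
        // The color is constant across the span, so qAlpha(~color) can be
        // hoisted out of the loop, as the SSE2 version does with
        // minusAlphaOfColorVector.
        return premultipliedColor + BYTE_MUL(d, qAlpha(~premultipliedColor));
    }
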
+
+CompositionFunctionSolid qt_functionForModeSolid_onlySSE2[numCompositionFunctions] = {
+    comp_func_solid_SourceOver_sse2,
+    comp_func_solid_DestinationOver,
+    comp_func_solid_Clear,
+    comp_func_solid_Source,
+    comp_func_solid_Destination,
+    comp_func_solid_SourceIn,
+    comp_func_solid_DestinationIn,
+    comp_func_solid_SourceOut,
+    comp_func_solid_DestinationOut,
+    comp_func_solid_SourceAtop,
+    comp_func_solid_DestinationAtop,
+    comp_func_solid_XOR,
+    comp_func_solid_Plus,
+    comp_func_solid_Multiply,
+    comp_func_solid_Screen,
+    comp_func_solid_Overlay,
+    comp_func_solid_Darken,
+    comp_func_solid_Lighten,
+    comp_func_solid_ColorDodge,
+    comp_func_solid_ColorBurn,
+    comp_func_solid_HardLight,
+    comp_func_solid_SoftLight,
+    comp_func_solid_Difference,
+    comp_func_solid_Exclusion,
+    rasterop_solid_SourceOrDestination,
+    rasterop_solid_SourceAndDestination,
+    rasterop_solid_SourceXorDestination,
+    rasterop_solid_NotSourceAndNotDestination,
+    rasterop_solid_NotSourceOrNotDestination,
+    rasterop_solid_NotSourceXorDestination,
+    rasterop_solid_NotSource,
+    rasterop_solid_NotSourceAndDestination,
+    rasterop_solid_SourceAndNotDestination
+};
+
+CompositionFunction qt_functionForMode_onlySSE2[numCompositionFunctions] = {
+    comp_func_SourceOver_sse2,
+    comp_func_DestinationOver,
+    comp_func_Clear,
+    comp_func_Source_sse2,
+    comp_func_Destination,
+    comp_func_SourceIn,
+    comp_func_DestinationIn,
+    comp_func_SourceOut,
+    comp_func_DestinationOut,
+    comp_func_SourceAtop,
+    comp_func_DestinationAtop,
+    comp_func_XOR,
+    comp_func_Plus_sse2,
+    comp_func_Multiply,
+    comp_func_Screen,
+    comp_func_Overlay,
+    comp_func_Darken,
+    comp_func_Lighten,
+    comp_func_ColorDodge,
+    comp_func_ColorBurn,
+    comp_func_HardLight,
+    comp_func_SoftLight,
+    comp_func_Difference,
+    comp_func_Exclusion,
+    rasterop_SourceOrDestination,
+    rasterop_SourceAndDestination,
+    rasterop_SourceXorDestination,
+    rasterop_NotSourceAndNotDestination,
+    rasterop_NotSourceOrNotDestination,
+    rasterop_NotSourceXorDestination,
+    rasterop_NotSource,
+    rasterop_NotSourceAndDestination,
+    rasterop_SourceAndNotDestination
+};
 
 void qt_memfill16_sse2(quint16 *dest, quint16 value, int count)