Changeset 769 for trunk/src/gui/painting/qdrawhelper_sse2.cpp
- Timestamp:
- Aug 2, 2010, 9:27:30 PM (15 years ago)
- Location:
- trunk
- Files:
-
- 2 edited
Legend:
- Unmodified
- Added
- Removed
-
trunk
- Property svn:mergeinfo changed
/branches/vendor/nokia/qt/4.6.3 (added) merged: 768 /branches/vendor/nokia/qt/current merged: 767 /branches/vendor/nokia/qt/4.6.2 removed
- Property svn:mergeinfo changed
-
trunk/src/gui/painting/qdrawhelper_sse2.cpp
r651 r769 57 57 58 58 QT_BEGIN_NAMESPACE 59 60 /* 61 * Multiply the components of pixelVector by alphaChannel 62 * Each 32bits components of alphaChannel must be in the form 0x00AA00AA 63 * colorMask must have 0x00ff00ff on each 32 bits component 64 * half must have the value 128 (0x80) for each 32 bits compnent 65 */ 66 #define BYTE_MUL_SSE2(result, pixelVector, alphaChannel, colorMask, half) \ 67 { \ 68 /* 1. separate the colors in 2 vectors so each color is on 16 bits \ 69 (in order to be multiplied by the alpha \ 70 each 32 bit of dstVectorAG are in the form 0x00AA00GG \ 71 each 32 bit of dstVectorRB are in the form 0x00RR00BB */\ 72 __m128i pixelVectorAG = _mm_srli_epi16(pixelVector, 8); \ 73 __m128i pixelVectorRB = _mm_and_si128(pixelVector, colorMask); \ 74 \ 75 /* 2. multiply the vectors by the alpha channel */\ 76 pixelVectorAG = _mm_mullo_epi16(pixelVectorAG, alphaChannel); \ 77 pixelVectorRB = _mm_mullo_epi16(pixelVectorRB, alphaChannel); \ 78 \ 79 /* 3. devide by 255, that's the tricky part. \ 80 we do it like for BYTE_MUL(), with bit shift: X/255 ~= (X + X/256 + rounding)/256 */ \ 81 /** so first (X + X/256 + rounding) */\ 82 pixelVectorRB = _mm_add_epi16(pixelVectorRB, _mm_srli_epi16(pixelVectorRB, 8)); \ 83 pixelVectorRB = _mm_add_epi16(pixelVectorRB, half); \ 84 pixelVectorAG = _mm_add_epi16(pixelVectorAG, _mm_srli_epi16(pixelVectorAG, 8)); \ 85 pixelVectorAG = _mm_add_epi16(pixelVectorAG, half); \ 86 \ 87 /** second devide by 256 */\ 88 pixelVectorRB = _mm_srli_epi16(pixelVectorRB, 8); \ 89 /** for AG, we could >> 8 to divide followed by << 8 to put the \ 90 bytes in the correct position. By masking instead, we execute \ 91 only one instruction */\ 92 pixelVectorAG = _mm_andnot_si128(colorMask, pixelVectorAG); \ 93 \ 94 /* 4. combine the 2 pairs of colors */ \ 95 result = _mm_or_si128(pixelVectorAG, pixelVectorRB); \ 96 } 97 98 /* 99 * Each 32bits components of alphaChannel must be in the form 0x00AA00AA 100 * oneMinusAlphaChannel must be 255 - alpha for each 32 bits component 101 * colorMask must have 0x00ff00ff on each 32 bits component 102 * half must have the value 128 (0x80) for each 32 bits compnent 103 */ 104 #define INTERPOLATE_PIXEL_255_SSE2(result, srcVector, dstVector, alphaChannel, oneMinusAlphaChannel, colorMask, half) { \ 105 /* interpolate AG */\ 106 __m128i srcVectorAG = _mm_srli_epi16(srcVector, 8); \ 107 __m128i dstVectorAG = _mm_srli_epi16(dstVector, 8); \ 108 __m128i srcVectorAGalpha = _mm_mullo_epi16(srcVectorAG, alphaChannel); \ 109 __m128i dstVectorAGoneMinusAlphalpha = _mm_mullo_epi16(dstVectorAG, oneMinusAlphaChannel); \ 110 __m128i finalAG = _mm_add_epi16(srcVectorAGalpha, dstVectorAGoneMinusAlphalpha); \ 111 finalAG = _mm_add_epi16(finalAG, _mm_srli_epi16(finalAG, 8)); \ 112 finalAG = _mm_add_epi16(finalAG, half); \ 113 finalAG = _mm_andnot_si128(colorMask, finalAG); \ 114 \ 115 /* interpolate RB */\ 116 __m128i srcVectorRB = _mm_and_si128(srcVector, colorMask); \ 117 __m128i dstVectorRB = _mm_and_si128(dstVector, colorMask); \ 118 __m128i srcVectorRBalpha = _mm_mullo_epi16(srcVectorRB, alphaChannel); \ 119 __m128i dstVectorRBoneMinusAlphalpha = _mm_mullo_epi16(dstVectorRB, oneMinusAlphaChannel); \ 120 __m128i finalRB = _mm_add_epi16(srcVectorRBalpha, dstVectorRBoneMinusAlphalpha); \ 121 finalRB = _mm_add_epi16(finalRB, _mm_srli_epi16(finalRB, 8)); \ 122 finalRB = _mm_add_epi16(finalRB, half); \ 123 finalRB = _mm_srli_epi16(finalRB, 8); \ 124 \ 125 /* combine */\ 126 result = _mm_or_si128(finalAG, finalRB); \ 127 } 128 129 void qt_blend_argb32_on_argb32_sse2(uchar *destPixels, int dbpl, 130 const uchar *srcPixels, int sbpl, 131 int w, int h, 132 int const_alpha) 133 { 134 const quint32 *src = (const quint32 *) srcPixels; 135 quint32 *dst = (uint *) destPixels; 136 if (const_alpha == 256) { 137 const __m128i alphaMask = _mm_set1_epi32(0xff000000); 138 const __m128i nullVector = _mm_set1_epi32(0); 139 const __m128i half = _mm_set1_epi16(0x80); 140 const __m128i one = _mm_set1_epi16(0xff); 141 const __m128i colorMask = _mm_set1_epi32(0x00ff00ff); 142 for (int y = 0; y < h; ++y) { 143 int x = 0; 144 for (; x < w-3; x += 4) { 145 const __m128i srcVector = _mm_loadu_si128((__m128i *)&src[x]); 146 const __m128i srcVectorAlpha = _mm_and_si128(srcVector, alphaMask); 147 if (_mm_movemask_epi8(_mm_cmpeq_epi32(srcVectorAlpha, alphaMask)) == 0xffff) { 148 // all opaque 149 _mm_storeu_si128((__m128i *)&dst[x], srcVector); 150 } else if (_mm_movemask_epi8(_mm_cmpeq_epi32(srcVectorAlpha, nullVector)) != 0xffff) { 151 // not fully transparent 152 // result = s + d * (1-alpha) 153 154 // extract the alpha channel on 2 x 16 bits 155 // so we have room for the multiplication 156 // each 32 bits will be in the form 0x00AA00AA 157 // with A being the 1 - alpha 158 __m128i alphaChannel = _mm_srli_epi32(srcVector, 24); 159 alphaChannel = _mm_or_si128(alphaChannel, _mm_slli_epi32(alphaChannel, 16)); 160 alphaChannel = _mm_sub_epi16(one, alphaChannel); 161 162 const __m128i dstVector = _mm_loadu_si128((__m128i *)&dst[x]); 163 __m128i destMultipliedByOneMinusAlpha; 164 BYTE_MUL_SSE2(destMultipliedByOneMinusAlpha, dstVector, alphaChannel, colorMask, half); 165 166 // result = s + d * (1-alpha) 167 const __m128i result = _mm_add_epi8(srcVector, destMultipliedByOneMinusAlpha); 168 _mm_storeu_si128((__m128i *)&dst[x], result); 169 } 170 } 171 for (; x<w; ++x) { 172 uint s = src[x]; 173 if (s >= 0xff000000) 174 dst[x] = s; 175 else if (s != 0) 176 dst[x] = s + BYTE_MUL(dst[x], qAlpha(~s)); 177 } 178 dst = (quint32 *)(((uchar *) dst) + dbpl); 179 src = (const quint32 *)(((const uchar *) src) + sbpl); 180 } 181 } else if (const_alpha != 0) { 182 // dest = (s + d * sia) * ca + d * cia 183 // = s * ca + d * (sia * ca + cia) 184 // = s * ca + d * (1 - sa*ca) 185 const_alpha = (const_alpha * 255) >> 8; 186 const __m128i nullVector = _mm_set1_epi32(0); 187 const __m128i half = _mm_set1_epi16(0x80); 188 const __m128i one = _mm_set1_epi16(0xff); 189 const __m128i colorMask = _mm_set1_epi32(0x00ff00ff); 190 const __m128i constAlphaVector = _mm_set1_epi16(const_alpha); 191 for (int y = 0; y < h; ++y) { 192 int x = 0; 193 for (; x < w-3; x += 4) { 194 __m128i srcVector = _mm_loadu_si128((__m128i *)&src[x]); 195 if (_mm_movemask_epi8(_mm_cmpeq_epi32(srcVector, nullVector)) != 0xffff) { 196 BYTE_MUL_SSE2(srcVector, srcVector, constAlphaVector, colorMask, half); 197 198 __m128i alphaChannel = _mm_srli_epi32(srcVector, 24); 199 alphaChannel = _mm_or_si128(alphaChannel, _mm_slli_epi32(alphaChannel, 16)); 200 alphaChannel = _mm_sub_epi16(one, alphaChannel); 201 202 const __m128i dstVector = _mm_loadu_si128((__m128i *)&dst[x]); 203 __m128i destMultipliedByOneMinusAlpha; 204 BYTE_MUL_SSE2(destMultipliedByOneMinusAlpha, dstVector, alphaChannel, colorMask, half); 205 206 const __m128i result = _mm_add_epi8(srcVector, destMultipliedByOneMinusAlpha); 207 _mm_storeu_si128((__m128i *)&dst[x], result); 208 } 209 } 210 for (; x<w; ++x) { 211 quint32 s = src[x]; 212 if (s != 0) { 213 s = BYTE_MUL(s, const_alpha); 214 dst[x] = s + BYTE_MUL(dst[x], qAlpha(~s)); 215 } 216 } 217 dst = (quint32 *)(((uchar *) dst) + dbpl); 218 src = (const quint32 *)(((const uchar *) src) + sbpl); 219 } 220 } 221 } 222 223 // qblendfunctions.cpp 224 void qt_blend_rgb32_on_rgb32(uchar *destPixels, int dbpl, 225 const uchar *srcPixels, int sbpl, 226 int w, int h, 227 int const_alpha); 228 229 void qt_blend_rgb32_on_rgb32_sse2(uchar *destPixels, int dbpl, 230 const uchar *srcPixels, int sbpl, 231 int w, int h, 232 int const_alpha) 233 { 234 const quint32 *src = (const quint32 *) srcPixels; 235 quint32 *dst = (uint *) destPixels; 236 if (const_alpha != 256) { 237 if (const_alpha != 0) { 238 const __m128i nullVector = _mm_set1_epi32(0); 239 const __m128i half = _mm_set1_epi16(0x80); 240 const __m128i colorMask = _mm_set1_epi32(0x00ff00ff); 241 242 const_alpha = (const_alpha * 255) >> 8; 243 int one_minus_const_alpha = 255 - const_alpha; 244 const __m128i constAlphaVector = _mm_set1_epi16(const_alpha); 245 const __m128i oneMinusConstAlpha = _mm_set1_epi16(one_minus_const_alpha); 246 for (int y = 0; y < h; ++y) { 247 int x = 0; 248 for (; x < w-3; x += 4) { 249 __m128i srcVector = _mm_loadu_si128((__m128i *)&src[x]); 250 if (_mm_movemask_epi8(_mm_cmpeq_epi32(srcVector, nullVector)) != 0xffff) { 251 const __m128i dstVector = _mm_loadu_si128((__m128i *)&dst[x]); 252 __m128i result; 253 INTERPOLATE_PIXEL_255_SSE2(result, srcVector, dstVector, constAlphaVector, oneMinusConstAlpha, colorMask, half); 254 _mm_storeu_si128((__m128i *)&dst[x], result); 255 } 256 } 257 for (; x<w; ++x) { 258 quint32 s = src[x]; 259 s = BYTE_MUL(s, const_alpha); 260 dst[x] = INTERPOLATE_PIXEL_255(src[x], const_alpha, dst[x], one_minus_const_alpha); 261 } 262 dst = (quint32 *)(((uchar *) dst) + dbpl); 263 src = (const quint32 *)(((const uchar *) src) + sbpl); 264 } 265 } 266 } else { 267 qt_blend_rgb32_on_rgb32(destPixels, dbpl, srcPixels, sbpl, w, h, const_alpha); 268 } 269 } 59 270 60 271 void qt_memfill32_sse2(quint32 *dest, quint32 value, int count)
Note:
See TracChangeset
for help on using the changeset viewer.