Changeset 846 for trunk/src/gui/painting/qdrawhelper_sse2.cpp
- Timestamp: May 5, 2011, 5:36:53 AM (14 years ago)
- Location: trunk
- Files: 2 edited
Legend:
- Unmodified
- Added
- Removed
trunk
- Property svn:mergeinfo changed:
  - /branches/vendor/nokia/qt/4.7.2 (added) merged: 845
  - /branches/vendor/nokia/qt/current merged: 844
  - /branches/vendor/nokia/qt/4.6.3 removed
trunk/src/gui/painting/qdrawhelper_sse2.cpp
r769 r846 1 1 /**************************************************************************** 2 2 ** 3 ** Copyright (C) 201 0Nokia Corporation and/or its subsidiary(-ies).3 ** Copyright (C) 2011 Nokia Corporation and/or its subsidiary(-ies). 4 4 ** All rights reserved. 5 5 ** Contact: Nokia Corporation (qt-info@nokia.com) … … 44 44 #ifdef QT_HAVE_SSE2 45 45 46 #include <private/qdrawingprimitive_sse2_p.h> 46 47 #include <private/qpaintengine_raster_p.h> 47 48 48 #ifdef QT_LINUXBASE49 // this is an evil hack - the posix_memalign declaration in LSB50 // is wrong - see http://bugs.linuxbase.org/show_bug.cgi?id=243151 # define posix_memalign _lsb_hack_posix_memalign52 # include <emmintrin.h>53 # undef posix_memalign54 #else55 # include <emmintrin.h>56 #endif57 58 49 QT_BEGIN_NAMESPACE 59 60 /*61 * Multiply the components of pixelVector by alphaChannel62 * Each 32bits components of alphaChannel must be in the form 0x00AA00AA63 * colorMask must have 0x00ff00ff on each 32 bits component64 * half must have the value 128 (0x80) for each 32 bits compnent65 */66 #define BYTE_MUL_SSE2(result, pixelVector, alphaChannel, colorMask, half) \67 { \68 /* 1. separate the colors in 2 vectors so each color is on 16 bits \69 (in order to be multiplied by the alpha \70 each 32 bit of dstVectorAG are in the form 0x00AA00GG \71 each 32 bit of dstVectorRB are in the form 0x00RR00BB */\72 __m128i pixelVectorAG = _mm_srli_epi16(pixelVector, 8); \73 __m128i pixelVectorRB = _mm_and_si128(pixelVector, colorMask); \74 \75 /* 2. multiply the vectors by the alpha channel */\76 pixelVectorAG = _mm_mullo_epi16(pixelVectorAG, alphaChannel); \77 pixelVectorRB = _mm_mullo_epi16(pixelVectorRB, alphaChannel); \78 \79 /* 3. devide by 255, that's the tricky part. 
\80 we do it like for BYTE_MUL(), with bit shift: X/255 ~= (X + X/256 + rounding)/256 */ \81 /** so first (X + X/256 + rounding) */\82 pixelVectorRB = _mm_add_epi16(pixelVectorRB, _mm_srli_epi16(pixelVectorRB, 8)); \83 pixelVectorRB = _mm_add_epi16(pixelVectorRB, half); \84 pixelVectorAG = _mm_add_epi16(pixelVectorAG, _mm_srli_epi16(pixelVectorAG, 8)); \85 pixelVectorAG = _mm_add_epi16(pixelVectorAG, half); \86 \87 /** second devide by 256 */\88 pixelVectorRB = _mm_srli_epi16(pixelVectorRB, 8); \89 /** for AG, we could >> 8 to divide followed by << 8 to put the \90 bytes in the correct position. By masking instead, we execute \91 only one instruction */\92 pixelVectorAG = _mm_andnot_si128(colorMask, pixelVectorAG); \93 \94 /* 4. combine the 2 pairs of colors */ \95 result = _mm_or_si128(pixelVectorAG, pixelVectorRB); \96 }97 98 /*99 * Each 32bits components of alphaChannel must be in the form 0x00AA00AA100 * oneMinusAlphaChannel must be 255 - alpha for each 32 bits component101 * colorMask must have 0x00ff00ff on each 32 bits component102 * half must have the value 128 (0x80) for each 32 bits compnent103 */104 #define INTERPOLATE_PIXEL_255_SSE2(result, srcVector, dstVector, alphaChannel, oneMinusAlphaChannel, colorMask, half) { \105 /* interpolate AG */\106 __m128i srcVectorAG = _mm_srli_epi16(srcVector, 8); \107 __m128i dstVectorAG = _mm_srli_epi16(dstVector, 8); \108 __m128i srcVectorAGalpha = _mm_mullo_epi16(srcVectorAG, alphaChannel); \109 __m128i dstVectorAGoneMinusAlphalpha = _mm_mullo_epi16(dstVectorAG, oneMinusAlphaChannel); \110 __m128i finalAG = _mm_add_epi16(srcVectorAGalpha, dstVectorAGoneMinusAlphalpha); \111 finalAG = _mm_add_epi16(finalAG, _mm_srli_epi16(finalAG, 8)); \112 finalAG = _mm_add_epi16(finalAG, half); \113 finalAG = _mm_andnot_si128(colorMask, finalAG); \114 \115 /* interpolate RB */\116 __m128i srcVectorRB = _mm_and_si128(srcVector, colorMask); \117 __m128i dstVectorRB = _mm_and_si128(dstVector, colorMask); \118 __m128i srcVectorRBalpha = 
_mm_mullo_epi16(srcVectorRB, alphaChannel); \119 __m128i dstVectorRBoneMinusAlphalpha = _mm_mullo_epi16(dstVectorRB, oneMinusAlphaChannel); \120 __m128i finalRB = _mm_add_epi16(srcVectorRBalpha, dstVectorRBoneMinusAlphalpha); \121 finalRB = _mm_add_epi16(finalRB, _mm_srli_epi16(finalRB, 8)); \122 finalRB = _mm_add_epi16(finalRB, half); \123 finalRB = _mm_srli_epi16(finalRB, 8); \124 \125 /* combine */\126 result = _mm_or_si128(finalAG, finalRB); \127 }128 50 129 51 void qt_blend_argb32_on_argb32_sse2(uchar *destPixels, int dbpl, … … 133 55 { 134 56 const quint32 *src = (const quint32 *) srcPixels; 135 quint32 *dst = ( uint*) destPixels;57 quint32 *dst = (quint32 *) destPixels; 136 58 if (const_alpha == 256) { 137 59 const __m128i alphaMask = _mm_set1_epi32(0xff000000); … … 141 63 const __m128i colorMask = _mm_set1_epi32(0x00ff00ff); 142 64 for (int y = 0; y < h; ++y) { 143 int x = 0; 144 for (; x < w-3; x += 4) { 145 const __m128i srcVector = _mm_loadu_si128((__m128i *)&src[x]); 146 const __m128i srcVectorAlpha = _mm_and_si128(srcVector, alphaMask); 147 if (_mm_movemask_epi8(_mm_cmpeq_epi32(srcVectorAlpha, alphaMask)) == 0xffff) { 148 // all opaque 149 _mm_storeu_si128((__m128i *)&dst[x], srcVector); 150 } else if (_mm_movemask_epi8(_mm_cmpeq_epi32(srcVectorAlpha, nullVector)) != 0xffff) { 151 // not fully transparent 152 // result = s + d * (1-alpha) 153 154 // extract the alpha channel on 2 x 16 bits 155 // so we have room for the multiplication 156 // each 32 bits will be in the form 0x00AA00AA 157 // with A being the 1 - alpha 158 __m128i alphaChannel = _mm_srli_epi32(srcVector, 24); 159 alphaChannel = _mm_or_si128(alphaChannel, _mm_slli_epi32(alphaChannel, 16)); 160 alphaChannel = _mm_sub_epi16(one, alphaChannel); 161 162 const __m128i dstVector = _mm_loadu_si128((__m128i *)&dst[x]); 163 __m128i destMultipliedByOneMinusAlpha; 164 BYTE_MUL_SSE2(destMultipliedByOneMinusAlpha, dstVector, alphaChannel, colorMask, half); 165 166 // result = s + d * (1-alpha) 167 
const __m128i result = _mm_add_epi8(srcVector, destMultipliedByOneMinusAlpha); 168 _mm_storeu_si128((__m128i *)&dst[x], result); 169 } 170 } 171 for (; x<w; ++x) { 172 uint s = src[x]; 173 if (s >= 0xff000000) 174 dst[x] = s; 175 else if (s != 0) 176 dst[x] = s + BYTE_MUL(dst[x], qAlpha(~s)); 177 } 65 BLEND_SOURCE_OVER_ARGB32_SSE2(dst, src, w, nullVector, half, one, colorMask, alphaMask); 178 66 dst = (quint32 *)(((uchar *) dst) + dbpl); 179 67 src = (const quint32 *)(((const uchar *) src) + sbpl); … … 190 78 const __m128i constAlphaVector = _mm_set1_epi16(const_alpha); 191 79 for (int y = 0; y < h; ++y) { 192 int x = 0; 193 for (; x < w-3; x += 4) { 194 __m128i srcVector = _mm_loadu_si128((__m128i *)&src[x]); 195 if (_mm_movemask_epi8(_mm_cmpeq_epi32(srcVector, nullVector)) != 0xffff) { 196 BYTE_MUL_SSE2(srcVector, srcVector, constAlphaVector, colorMask, half); 197 198 __m128i alphaChannel = _mm_srli_epi32(srcVector, 24); 199 alphaChannel = _mm_or_si128(alphaChannel, _mm_slli_epi32(alphaChannel, 16)); 200 alphaChannel = _mm_sub_epi16(one, alphaChannel); 201 202 const __m128i dstVector = _mm_loadu_si128((__m128i *)&dst[x]); 203 __m128i destMultipliedByOneMinusAlpha; 204 BYTE_MUL_SSE2(destMultipliedByOneMinusAlpha, dstVector, alphaChannel, colorMask, half); 205 206 const __m128i result = _mm_add_epi8(srcVector, destMultipliedByOneMinusAlpha); 207 _mm_storeu_si128((__m128i *)&dst[x], result); 208 } 209 } 210 for (; x<w; ++x) { 211 quint32 s = src[x]; 212 if (s != 0) { 213 s = BYTE_MUL(s, const_alpha); 214 dst[x] = s + BYTE_MUL(dst[x], qAlpha(~s)); 215 } 216 } 80 BLEND_SOURCE_OVER_ARGB32_WITH_CONST_ALPHA_SSE2(dst, src, w, nullVector, half, one, colorMask, constAlphaVector) 217 81 dst = (quint32 *)(((uchar *) dst) + dbpl); 218 82 src = (const quint32 *)(((const uchar *) src) + sbpl); … … 233 97 { 234 98 const quint32 *src = (const quint32 *) srcPixels; 235 quint32 *dst = ( uint*) destPixels;99 quint32 *dst = (quint32 *) destPixels; 236 100 if (const_alpha != 256) { 237 
101 if (const_alpha != 0) { … … 246 110 for (int y = 0; y < h; ++y) { 247 111 int x = 0; 112 113 // First, align dest to 16 bytes: 114 ALIGNMENT_PROLOGUE_16BYTES(dst, x, w) { 115 quint32 s = src[x]; 116 s = BYTE_MUL(s, const_alpha); 117 dst[x] = INTERPOLATE_PIXEL_255(src[x], const_alpha, dst[x], one_minus_const_alpha); 118 } 119 248 120 for (; x < w-3; x += 4) { 249 121 __m128i srcVector = _mm_loadu_si128((__m128i *)&src[x]); 250 122 if (_mm_movemask_epi8(_mm_cmpeq_epi32(srcVector, nullVector)) != 0xffff) { 251 const __m128i dstVector = _mm_load u_si128((__m128i *)&dst[x]);123 const __m128i dstVector = _mm_load_si128((__m128i *)&dst[x]); 252 124 __m128i result; 253 125 INTERPOLATE_PIXEL_255_SSE2(result, srcVector, dstVector, constAlphaVector, oneMinusConstAlpha, colorMask, half); 254 _mm_store u_si128((__m128i *)&dst[x], result);126 _mm_store_si128((__m128i *)&dst[x], result); 255 127 } 256 128 } … … 269 141 } 270 142 143 void QT_FASTCALL comp_func_SourceOver_sse2(uint *destPixels, const uint *srcPixels, int length, uint const_alpha) 144 { 145 Q_ASSERT(const_alpha < 256); 146 147 const quint32 *src = (const quint32 *) srcPixels; 148 quint32 *dst = (quint32 *) destPixels; 149 150 const __m128i nullVector = _mm_set1_epi32(0); 151 const __m128i half = _mm_set1_epi16(0x80); 152 const __m128i one = _mm_set1_epi16(0xff); 153 const __m128i colorMask = _mm_set1_epi32(0x00ff00ff); 154 if (const_alpha == 255) { 155 const __m128i alphaMask = _mm_set1_epi32(0xff000000); 156 BLEND_SOURCE_OVER_ARGB32_SSE2(dst, src, length, nullVector, half, one, colorMask, alphaMask); 157 } else { 158 const __m128i constAlphaVector = _mm_set1_epi16(const_alpha); 159 BLEND_SOURCE_OVER_ARGB32_WITH_CONST_ALPHA_SSE2(dst, src, length, nullVector, half, one, colorMask, constAlphaVector); 160 } 161 } 162 163 void QT_FASTCALL comp_func_Plus_sse2(uint *dst, const uint *src, int length, uint const_alpha) 164 { 165 int x = 0; 166 167 if (const_alpha == 255) { 168 // 1) Prologue: align destination on 16 
bytes 169 ALIGNMENT_PROLOGUE_16BYTES(dst, x, length) 170 dst[x] = comp_func_Plus_one_pixel(dst[x], src[x]); 171 172 // 2) composition with SSE2 173 for (; x < length - 3; x += 4) { 174 const __m128i srcVector = _mm_loadu_si128((__m128i *)&src[x]); 175 const __m128i dstVector = _mm_load_si128((__m128i *)&dst[x]); 176 177 const __m128i result = _mm_adds_epu8(srcVector, dstVector); 178 _mm_store_si128((__m128i *)&dst[x], result); 179 } 180 181 // 3) Epilogue: 182 for (; x < length; ++x) 183 dst[x] = comp_func_Plus_one_pixel(dst[x], src[x]); 184 } else { 185 const int one_minus_const_alpha = 255 - const_alpha; 186 const __m128i constAlphaVector = _mm_set1_epi16(const_alpha); 187 const __m128i oneMinusConstAlpha = _mm_set1_epi16(one_minus_const_alpha); 188 189 // 1) Prologue: align destination on 16 bytes 190 ALIGNMENT_PROLOGUE_16BYTES(dst, x, length) 191 dst[x] = comp_func_Plus_one_pixel_const_alpha(dst[x], src[x], const_alpha, one_minus_const_alpha); 192 193 const __m128i half = _mm_set1_epi16(0x80); 194 const __m128i colorMask = _mm_set1_epi32(0x00ff00ff); 195 // 2) composition with SSE2 196 for (; x < length - 3; x += 4) { 197 const __m128i srcVector = _mm_loadu_si128((__m128i *)&src[x]); 198 const __m128i dstVector = _mm_load_si128((__m128i *)&dst[x]); 199 200 __m128i result = _mm_adds_epu8(srcVector, dstVector); 201 INTERPOLATE_PIXEL_255_SSE2(result, result, dstVector, constAlphaVector, oneMinusConstAlpha, colorMask, half) 202 _mm_store_si128((__m128i *)&dst[x], result); 203 } 204 205 // 3) Epilogue: 206 for (; x < length; ++x) 207 dst[x] = comp_func_Plus_one_pixel_const_alpha(dst[x], src[x], const_alpha, one_minus_const_alpha); 208 } 209 } 210 211 void QT_FASTCALL comp_func_Source_sse2(uint *dst, const uint *src, int length, uint const_alpha) 212 { 213 if (const_alpha == 255) { 214 ::memcpy(dst, src, length * sizeof(uint)); 215 } else { 216 const int ialpha = 255 - const_alpha; 217 218 int x = 0; 219 220 // 1) prologue, align on 16 bytes 221 
ALIGNMENT_PROLOGUE_16BYTES(dst, x, length) 222 dst[x] = INTERPOLATE_PIXEL_255(src[x], const_alpha, dst[x], ialpha); 223 224 // 2) interpolate pixels with SSE2 225 const __m128i half = _mm_set1_epi16(0x80); 226 const __m128i colorMask = _mm_set1_epi32(0x00ff00ff); 227 const __m128i constAlphaVector = _mm_set1_epi16(const_alpha); 228 const __m128i oneMinusConstAlpha = _mm_set1_epi16(ialpha); 229 for (; x < length - 3; x += 4) { 230 const __m128i srcVector = _mm_loadu_si128((__m128i *)&src[x]); 231 __m128i dstVector = _mm_load_si128((__m128i *)&dst[x]); 232 INTERPOLATE_PIXEL_255_SSE2(dstVector, srcVector, dstVector, constAlphaVector, oneMinusConstAlpha, colorMask, half) 233 _mm_store_si128((__m128i *)&dst[x], dstVector); 234 } 235 236 // 3) Epilogue 237 for (; x < length; ++x) 238 dst[x] = INTERPOLATE_PIXEL_255(src[x], const_alpha, dst[x], ialpha); 239 } 240 } 241 271 242 void qt_memfill32_sse2(quint32 *dest, quint32 value, int count) 272 243 { … … 296 267 int n = (count128 + 3) / 4; 297 268 switch (count128 & 0x3) { 298 case 0: do { _mm_st ore_si128(dst128++, value128);299 case 3: _mm_st ore_si128(dst128++, value128);300 case 2: _mm_st ore_si128(dst128++, value128);301 case 1: _mm_st ore_si128(dst128++, value128);269 case 0: do { _mm_stream_si128(dst128++, value128); 270 case 3: _mm_stream_si128(dst128++, value128); 271 case 2: _mm_stream_si128(dst128++, value128); 272 case 1: _mm_stream_si128(dst128++, value128); 302 273 } while (--n > 0); 303 274 } … … 312 283 } 313 284 } 285 286 void QT_FASTCALL comp_func_solid_SourceOver_sse2(uint *destPixels, int length, uint color, uint const_alpha) 287 { 288 if ((const_alpha & qAlpha(color)) == 255) { 289 qt_memfill32_sse2(destPixels, color, length); 290 } else { 291 if (const_alpha != 255) 292 color = BYTE_MUL(color, const_alpha); 293 294 const quint32 minusAlphaOfColor = qAlpha(~color); 295 int x = 0; 296 297 quint32 *dst = (quint32 *) destPixels; 298 const __m128i colorVector = _mm_set1_epi32(color); 299 const __m128i 
colorMask = _mm_set1_epi32(0x00ff00ff); 300 const __m128i half = _mm_set1_epi16(0x80); 301 const __m128i minusAlphaOfColorVector = _mm_set1_epi16(minusAlphaOfColor); 302 303 ALIGNMENT_PROLOGUE_16BYTES(dst, x, length) 304 destPixels[x] = color + BYTE_MUL(destPixels[x], minusAlphaOfColor); 305 306 for (; x < length-3; x += 4) { 307 __m128i dstVector = _mm_load_si128((__m128i *)&dst[x]); 308 BYTE_MUL_SSE2(dstVector, dstVector, minusAlphaOfColorVector, colorMask, half); 309 dstVector = _mm_add_epi8(colorVector, dstVector); 310 _mm_store_si128((__m128i *)&dst[x], dstVector); 311 } 312 for (;x < length; ++x) 313 destPixels[x] = color + BYTE_MUL(destPixels[x], minusAlphaOfColor); 314 } 315 } 316 317 CompositionFunctionSolid qt_functionForModeSolid_onlySSE2[numCompositionFunctions] = { 318 comp_func_solid_SourceOver_sse2, 319 comp_func_solid_DestinationOver, 320 comp_func_solid_Clear, 321 comp_func_solid_Source, 322 comp_func_solid_Destination, 323 comp_func_solid_SourceIn, 324 comp_func_solid_DestinationIn, 325 comp_func_solid_SourceOut, 326 comp_func_solid_DestinationOut, 327 comp_func_solid_SourceAtop, 328 comp_func_solid_DestinationAtop, 329 comp_func_solid_XOR, 330 comp_func_solid_Plus, 331 comp_func_solid_Multiply, 332 comp_func_solid_Screen, 333 comp_func_solid_Overlay, 334 comp_func_solid_Darken, 335 comp_func_solid_Lighten, 336 comp_func_solid_ColorDodge, 337 comp_func_solid_ColorBurn, 338 comp_func_solid_HardLight, 339 comp_func_solid_SoftLight, 340 comp_func_solid_Difference, 341 comp_func_solid_Exclusion, 342 rasterop_solid_SourceOrDestination, 343 rasterop_solid_SourceAndDestination, 344 rasterop_solid_SourceXorDestination, 345 rasterop_solid_NotSourceAndNotDestination, 346 rasterop_solid_NotSourceOrNotDestination, 347 rasterop_solid_NotSourceXorDestination, 348 rasterop_solid_NotSource, 349 rasterop_solid_NotSourceAndDestination, 350 rasterop_solid_SourceAndNotDestination 351 }; 352 353 CompositionFunction qt_functionForMode_onlySSE2[numCompositionFunctions] 
= { 354 comp_func_SourceOver_sse2, 355 comp_func_DestinationOver, 356 comp_func_Clear, 357 comp_func_Source_sse2, 358 comp_func_Destination, 359 comp_func_SourceIn, 360 comp_func_DestinationIn, 361 comp_func_SourceOut, 362 comp_func_DestinationOut, 363 comp_func_SourceAtop, 364 comp_func_DestinationAtop, 365 comp_func_XOR, 366 comp_func_Plus_sse2, 367 comp_func_Multiply, 368 comp_func_Screen, 369 comp_func_Overlay, 370 comp_func_Darken, 371 comp_func_Lighten, 372 comp_func_ColorDodge, 373 comp_func_ColorBurn, 374 comp_func_HardLight, 375 comp_func_SoftLight, 376 comp_func_Difference, 377 comp_func_Exclusion, 378 rasterop_SourceOrDestination, 379 rasterop_SourceAndDestination, 380 rasterop_SourceXorDestination, 381 rasterop_NotSourceAndNotDestination, 382 rasterop_NotSourceOrNotDestination, 383 rasterop_NotSourceXorDestination, 384 rasterop_NotSource, 385 rasterop_NotSourceAndDestination, 386 rasterop_SourceAndNotDestination 387 }; 314 388 315 389 void qt_memfill16_sse2(quint16 *dest, quint16 value, int count)
Note: See TracChangeset for help on using the changeset viewer.