source: trunk/src/gui/painting/qdrawingprimitive_sse2_p.h

Last change on this file was 846, checked in by Dmitry A. Kuminov, 15 years ago

trunk: Merged in qt 4.7.2 sources from branches/vendor/nokia/qt.

File size: 10.2 KB
Line 
1/****************************************************************************
2**
3** Copyright (C) 2011 Nokia Corporation and/or its subsidiary(-ies).
4** All rights reserved.
5** Contact: Nokia Corporation (qt-info@nokia.com)
6**
7** This file is part of the QtGui module of the Qt Toolkit.
8**
9** $QT_BEGIN_LICENSE:LGPL$
10** Commercial Usage
11** Licensees holding valid Qt Commercial licenses may use this file in
12** accordance with the Qt Commercial License Agreement provided with the
13** Software or, alternatively, in accordance with the terms contained in
14** a written agreement between you and Nokia.
15**
16** GNU Lesser General Public License Usage
17** Alternatively, this file may be used under the terms of the GNU Lesser
18** General Public License version 2.1 as published by the Free Software
19** Foundation and appearing in the file LICENSE.LGPL included in the
20** packaging of this file. Please review the following information to
21** ensure the GNU Lesser General Public License version 2.1 requirements
22** will be met: http://www.gnu.org/licenses/old-licenses/lgpl-2.1.html.
23**
24** In addition, as a special exception, Nokia gives you certain additional
25** rights. These rights are described in the Nokia Qt LGPL Exception
26** version 1.1, included in the file LGPL_EXCEPTION.txt in this package.
27**
28** GNU General Public License Usage
29** Alternatively, this file may be used under the terms of the GNU
30** General Public License version 3.0 as published by the Free Software
31** Foundation and appearing in the file LICENSE.GPL included in the
32** packaging of this file. Please review the following information to
33** ensure the GNU General Public License version 3.0 requirements will be
34** met: http://www.gnu.org/copyleft/gpl.html.
35**
36** If you have questions regarding the use of this file, please contact
37** Nokia at qt-info@nokia.com.
38** $QT_END_LICENSE$
39**
40****************************************************************************/
41
42#ifndef QDRAWINGPRIMITIVE_SSE2_P_H
43#define QDRAWINGPRIMITIVE_SSE2_P_H
44
45#include <private/qsimd_p.h>
46
47#ifdef QT_HAVE_SSE2
48
49//
50// W A R N I N G
51// -------------
52//
53// This file is not part of the Qt API. It exists purely as an
54// implementation detail. This header file may change from version to
55// version without notice, or even be removed.
56//
57// We mean it.
58//
59
60QT_BEGIN_NAMESPACE
61
/*
 * Multiply each 8-bit component of the four 32-bit pixels in pixelVector by
 * alphaChannel and divide the product by 255 (with rounding).
 *
 * result may be the same variable as pixelVector: it is only assigned in the
 * final statement, after pixelVector has been read.
 *
 * Each 32-bit component of alphaChannel must be in the form 0x00AA00AA.
 * colorMask must have 0x00ff00ff in each 32-bit component.
 * half must have the value 128 (0x80) in each 16-bit lane, i.e. 0x00800080
 * in each 32-bit component, because it is added with _mm_add_epi16.
 */
#define BYTE_MUL_SSE2(result, pixelVector, alphaChannel, colorMask, half) \
{ \
    /* 1. Separate the colors into 2 vectors so each color is on 16 bits \
          (leaving room for the multiplication by alpha): \
          each 32 bits of pixelVectorAG are in the form 0x00AA00GG, \
          each 32 bits of pixelVectorRB are in the form 0x00RR00BB. */ \
    __m128i pixelVectorAG = _mm_srli_epi16(pixelVector, 8); \
    __m128i pixelVectorRB = _mm_and_si128(pixelVector, colorMask); \
    \
    /* 2. Multiply the vectors by the alpha channel. */ \
    pixelVectorAG = _mm_mullo_epi16(pixelVectorAG, alphaChannel); \
    pixelVectorRB = _mm_mullo_epi16(pixelVectorRB, alphaChannel); \
    \
    /* 3. Divide by 255, that's the tricky part. \
          We do it like BYTE_MUL(), with bit shifts: \
          X/255 ~= (X + X/256 + rounding)/256. \
          So first compute (X + X/256 + rounding)... */ \
    pixelVectorRB = _mm_add_epi16(pixelVectorRB, _mm_srli_epi16(pixelVectorRB, 8)); \
    pixelVectorRB = _mm_add_epi16(pixelVectorRB, half); \
    pixelVectorAG = _mm_add_epi16(pixelVectorAG, _mm_srli_epi16(pixelVectorAG, 8)); \
    pixelVectorAG = _mm_add_epi16(pixelVectorAG, half); \
    \
    /* ...then divide by 256. */ \
    pixelVectorRB = _mm_srli_epi16(pixelVectorRB, 8); \
    /* For AG, we could >> 8 to divide followed by << 8 to put the bytes \
       back in the correct position. By masking instead, we execute only \
       one instruction. */ \
    pixelVectorAG = _mm_andnot_si128(colorMask, pixelVectorAG); \
    \
    /* 4. Combine the 2 pairs of colors. */ \
    result = _mm_or_si128(pixelVectorAG, pixelVectorRB); \
}
99
/*
 * Interpolate between the four 32-bit pixels of srcVector and dstVector,
 * component by component:
 *     result = (src * alpha + dst * oneMinusAlpha) / 255
 * where the division by 255 is approximated as (X + X/256 + 128) / 256.
 *
 * Each 32-bit component of alphaChannel must be in the form 0x00AA00AA.
 * oneMinusAlphaChannel must hold 255 - alpha in each 16-bit lane (it is a
 * 16-bit multiplicand, like alphaChannel).
 * colorMask must have 0x00ff00ff in each 32-bit component.
 * half must have the value 128 (0x80) in each 16-bit lane, i.e. 0x00800080
 * in each 32-bit component, because it is added with _mm_add_epi16.
 */
#define INTERPOLATE_PIXEL_255_SSE2(result, srcVector, dstVector, alphaChannel, oneMinusAlphaChannel, colorMask, half) { \
    /* Interpolate AG: move bytes 3 and 1 of every pixel into the low byte \
       of their 16-bit lane so the products fit. */ \
    __m128i srcVectorAG = _mm_srli_epi16(srcVector, 8); \
    __m128i dstVectorAG = _mm_srli_epi16(dstVector, 8); \
    __m128i srcVectorAGalpha = _mm_mullo_epi16(srcVectorAG, alphaChannel); \
    __m128i dstVectorAGoneMinusAlpha = _mm_mullo_epi16(dstVectorAG, oneMinusAlphaChannel); \
    __m128i finalAG = _mm_add_epi16(srcVectorAGalpha, dstVectorAGoneMinusAlpha); \
    finalAG = _mm_add_epi16(finalAG, _mm_srli_epi16(finalAG, 8)); \
    finalAG = _mm_add_epi16(finalAG, half); \
    /* Keeping only the high byte of each lane both divides by 256 and \
       leaves A and G in their final byte positions. */ \
    finalAG = _mm_andnot_si128(colorMask, finalAG); \
    \
    /* Interpolate RB: bytes 2 and 0 are already in the low byte of their \
       16-bit lane once the other bytes are masked off. */ \
    __m128i srcVectorRB = _mm_and_si128(srcVector, colorMask); \
    __m128i dstVectorRB = _mm_and_si128(dstVector, colorMask); \
    __m128i srcVectorRBalpha = _mm_mullo_epi16(srcVectorRB, alphaChannel); \
    __m128i dstVectorRBoneMinusAlpha = _mm_mullo_epi16(dstVectorRB, oneMinusAlphaChannel); \
    __m128i finalRB = _mm_add_epi16(srcVectorRBalpha, dstVectorRBoneMinusAlpha); \
    finalRB = _mm_add_epi16(finalRB, _mm_srli_epi16(finalRB, 8)); \
    finalRB = _mm_add_epi16(finalRB, half); \
    finalRB = _mm_srli_epi16(finalRB, 8); \
    \
    /* Combine the two halves back into packed 32-bit pixels. */ \
    result = _mm_or_si128(finalAG, finalRB); \
}
130
131// Basically blend src over dst with the const alpha defined as constAlphaVector.
132// nullVector, half, one, colorMask are constant across the whole image/texture, and should be defined as:
133//const __m128i nullVector = _mm_set1_epi32(0);
134//const __m128i half = _mm_set1_epi16(0x80);
135//const __m128i one = _mm_set1_epi16(0xff);
136//const __m128i colorMask = _mm_set1_epi32(0x00ff00ff);
137//const __m128i alphaMask = _mm_set1_epi32(0xff000000);
138//
139// The computation being done is:
140// result = s + d * (1-alpha)
141// with shortcuts if fully opaque or fully transparent.
142#define BLEND_SOURCE_OVER_ARGB32_SSE2(dst, src, length, nullVector, half, one, colorMask, alphaMask) { \
143 int x = 0; \
144\
145 /* First, get dst aligned. */ \
146 ALIGNMENT_PROLOGUE_16BYTES(dst, x, length) { \
147 uint s = src[x]; \
148 if (s >= 0xff000000) \
149 dst[x] = s; \
150 else if (s != 0) \
151 dst[x] = s + BYTE_MUL(dst[x], qAlpha(~s)); \
152 } \
153\
154 for (; x < length-3; x += 4) { \
155 const __m128i srcVector = _mm_loadu_si128((__m128i *)&src[x]); \
156 const __m128i srcVectorAlpha = _mm_and_si128(srcVector, alphaMask); \
157 if (_mm_movemask_epi8(_mm_cmpeq_epi32(srcVectorAlpha, alphaMask)) == 0xffff) { \
158 /* all opaque */ \
159 _mm_store_si128((__m128i *)&dst[x], srcVector); \
160 } else if (_mm_movemask_epi8(_mm_cmpeq_epi32(srcVectorAlpha, nullVector)) != 0xffff) { \
161 /* not fully transparent */ \
162 /* extract the alpha channel on 2 x 16 bits */ \
163 /* so we have room for the multiplication */ \
164 /* each 32 bits will be in the form 0x00AA00AA */ \
165 /* with A being the 1 - alpha */ \
166 __m128i alphaChannel = _mm_srli_epi32(srcVector, 24); \
167 alphaChannel = _mm_or_si128(alphaChannel, _mm_slli_epi32(alphaChannel, 16)); \
168 alphaChannel = _mm_sub_epi16(one, alphaChannel); \
169 \
170 const __m128i dstVector = _mm_load_si128((__m128i *)&dst[x]); \
171 __m128i destMultipliedByOneMinusAlpha; \
172 BYTE_MUL_SSE2(destMultipliedByOneMinusAlpha, dstVector, alphaChannel, colorMask, half); \
173 \
174 /* result = s + d * (1-alpha) */\
175 const __m128i result = _mm_add_epi8(srcVector, destMultipliedByOneMinusAlpha); \
176 _mm_store_si128((__m128i *)&dst[x], result); \
177 } \
178 } \
179 for (; x < length; ++x) { \
180 uint s = src[x]; \
181 if (s >= 0xff000000) \
182 dst[x] = s; \
183 else if (s != 0) \
184 dst[x] = s + BYTE_MUL(dst[x], qAlpha(~s)); \
185 } \
186}
187
188// Basically blend src over dst with the const alpha defined as constAlphaVector.
189// nullVector, half, one, colorMask are constant across the whole image/texture, and should be defined as:
190//const __m128i nullVector = _mm_set1_epi32(0);
191//const __m128i half = _mm_set1_epi16(0x80);
192//const __m128i one = _mm_set1_epi16(0xff);
193//const __m128i colorMask = _mm_set1_epi32(0x00ff00ff);
194//
195// The computation being done is:
196// dest = (s + d * sia) * ca + d * cia
197// = s * ca + d * (sia * ca + cia)
198// = s * ca + d * (1 - sa*ca)
199#define BLEND_SOURCE_OVER_ARGB32_WITH_CONST_ALPHA_SSE2(dst, src, length, nullVector, half, one, colorMask, constAlphaVector) \
200{ \
201 int x = 0; \
202\
203 ALIGNMENT_PROLOGUE_16BYTES(dst, x, length) { \
204 quint32 s = src[x]; \
205 if (s != 0) { \
206 s = BYTE_MUL(s, const_alpha); \
207 dst[x] = s + BYTE_MUL(dst[x], qAlpha(~s)); \
208 } \
209 } \
210\
211 for (; x < length-3; x += 4) { \
212 __m128i srcVector = _mm_loadu_si128((__m128i *)&src[x]); \
213 if (_mm_movemask_epi8(_mm_cmpeq_epi32(srcVector, nullVector)) != 0xffff) { \
214 BYTE_MUL_SSE2(srcVector, srcVector, constAlphaVector, colorMask, half); \
215\
216 __m128i alphaChannel = _mm_srli_epi32(srcVector, 24); \
217 alphaChannel = _mm_or_si128(alphaChannel, _mm_slli_epi32(alphaChannel, 16)); \
218 alphaChannel = _mm_sub_epi16(one, alphaChannel); \
219 \
220 const __m128i dstVector = _mm_load_si128((__m128i *)&dst[x]); \
221 __m128i destMultipliedByOneMinusAlpha; \
222 BYTE_MUL_SSE2(destMultipliedByOneMinusAlpha, dstVector, alphaChannel, colorMask, half); \
223 \
224 const __m128i result = _mm_add_epi8(srcVector, destMultipliedByOneMinusAlpha); \
225 _mm_store_si128((__m128i *)&dst[x], result); \
226 } \
227 } \
228 for (; x < length; ++x) { \
229 quint32 s = src[x]; \
230 if (s != 0) { \
231 s = BYTE_MUL(s, const_alpha); \
232 dst[x] = s + BYTE_MUL(dst[x], qAlpha(~s)); \
233 } \
234 } \
235}
236
237QT_END_NAMESPACE
238
239#endif // QT_HAVE_SSE2
240
241#endif // QDRAWINGPRIMITIVE_SSE2_P_H
Note: See TracBrowser for help on using the repository browser.