source: trunk/src/3rdparty/pixman/pixman-arm-neon-asm.S@ 908

Last change on this file since 908 was 846, checked in by Dmitry A. Kuminov, 15 years ago

trunk: Merged in qt 4.7.2 sources from branches/vendor/nokia/qt.

File size: 61.3 KB
Line 
1/*
2 * Copyright © 2009 Nokia Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
21 * DEALINGS IN THE SOFTWARE.
22 *
23 * Author: Siarhei Siamashka (siarhei.siamashka@nokia.com)
24 */
25
26/*
27 * This file contains implementations of NEON optimized pixel processing
28 * functions. There is no full and detailed tutorial, but some functions
29 * (those which are exposing some new or interesting features) are
30 * extensively commented and can be used as examples.
31 *
32 * You may want to have a look at the comments for following functions:
33 * - pixman_composite_over_8888_0565_asm_neon
34 * - pixman_composite_over_n_8_0565_asm_neon
35 */
36
37/* Prevent the stack from becoming executable for no reason... */
38#if defined(__linux__) && defined(__ELF__)
39.section .note.GNU-stack,"",%progbits
40#endif
41
42 .text
43 .fpu neon
44 .arch armv7a
45 .altmacro
46
47#include "pixman-arm-neon-asm.h"
48
49/* Global configuration options and preferences */
50
51/*
52 * The code can optionally make use of unaligned memory accesses to improve
53 * performance of handling leading/trailing pixels for each scanline.
54 * Configuration variable RESPECT_STRICT_ALIGNMENT can be set to 0 for
55 * example in linux if unaligned memory accesses are not configured to
 56 * example in linux if unaligned memory accesses are not configured to
 57 * generate exceptions.
 58 */
58.set RESPECT_STRICT_ALIGNMENT, 1
59
60/*
61 * Set default prefetch type. There is a choice between the following options:
62 *
63 * PREFETCH_TYPE_NONE (may be useful for the ARM cores where PLD is set to work
64 * as NOP to workaround some HW bugs or for whatever other reason)
65 *
66 * PREFETCH_TYPE_SIMPLE (may be useful for simple single-issue ARM cores where
 67 * advanced prefetch introduces heavy overhead)
68 *
69 * PREFETCH_TYPE_ADVANCED (useful for superscalar cores such as ARM Cortex-A8
70 * which can run ARM and NEON instructions simultaneously so that extra ARM
71 * instructions do not add (many) extra cycles, but improve prefetch efficiency)
72 *
73 * Note: some types of function can't support advanced prefetch and fallback
74 * to simple one (those which handle 24bpp pixels)
75 */
76.set PREFETCH_TYPE_DEFAULT, PREFETCH_TYPE_ADVANCED
77
78/* Prefetch distance in pixels for simple prefetch */
79.set PREFETCH_DISTANCE_SIMPLE, 64
80
81/*
82 * Implementation of pixman_composite_over_8888_0565_asm_neon
83 *
84 * This function takes a8r8g8b8 source buffer, r5g6b5 destination buffer and
85 * performs OVER compositing operation. Function fast_composite_over_8888_0565
86 * from pixman-fast-path.c does the same in C and can be used as a reference.
87 *
88 * First we need to have some NEON assembly code which can do the actual
89 * operation on the pixels and provide it to the template macro.
90 *
91 * Template macro quite conveniently takes care of emitting all the necessary
92 * code for memory reading and writing (including quite tricky cases of
93 * handling unaligned leading/trailing pixels), so we only need to deal with
94 * the data in NEON registers.
95 *
 96 * NEON register allocation in general is recommended to be the following:
97 * d0, d1, d2, d3 - contain loaded source pixel data
98 * d4, d5, d6, d7 - contain loaded destination pixels (if they are needed)
99 * d24, d25, d26, d27 - contain loading mask pixel data (if mask is used)
100 * d28, d29, d30, d31 - place for storing the result (destination pixels)
101 *
102 * As can be seen above, four 64-bit NEON registers are used for keeping
103 * intermediate pixel data and up to 8 pixels can be processed in one step
104 * for 32bpp formats (16 pixels for 16bpp, 32 pixels for 8bpp).
105 *
106 * This particular function uses the following registers allocation:
107 * d0, d1, d2, d3 - contain loaded source pixel data
108 * d4, d5 - contain loaded destination pixels (they are needed)
109 * d28, d29 - place for storing the result (destination pixels)
110 */
111
112/*
113 * Step one. We need to have some code to do some arithmetics on pixel data.
114 * This is implemented as a pair of macros: '*_head' and '*_tail'. When used
115 * back-to-back, they take pixel data from {d0, d1, d2, d3} and {d4, d5},
116 * perform all the needed calculations and write the result to {d28, d29}.
117 * The rationale for having two macros and not just one will be explained
 118 * later. In practice, any single monolithic function which does the work can
119 * be split into two parts in any arbitrary way without affecting correctness.
120 *
121 * There is one special trick here too. Common template macro can optionally
122 * make our life a bit easier by doing R, G, B, A color components
123 * deinterleaving for 32bpp pixel formats (and this feature is used in
124 * 'pixman_composite_over_8888_0565_asm_neon' function). So it means that
125 * instead of having 8 packed pixels in {d0, d1, d2, d3} registers, we
126 * actually use d0 register for blue channel (a vector of eight 8-bit
127 * values), d1 register for green, d2 for red and d3 for alpha. This
128 * simple conversion can be also done with a few NEON instructions:
129 *
130 * Packed to planar conversion:
131 * vuzp.8 d0, d1
132 * vuzp.8 d2, d3
133 * vuzp.8 d1, d3
134 * vuzp.8 d0, d2
135 *
136 * Planar to packed conversion:
137 * vzip.8 d0, d2
138 * vzip.8 d1, d3
139 * vzip.8 d2, d3
140 * vzip.8 d0, d1
141 *
142 * But pixel can be loaded directly in planar format using VLD4.8 NEON
143 * instruction. It is 1 cycle slower than VLD1.32, so this is not always
144 * desirable, that's why deinterleaving is optional.
145 *
146 * But anyway, here is the code:
147 */
148.macro pixman_composite_over_8888_0565_process_pixblock_head
/* Head: source is planar a8r8g8b8 in {d0=b, d1=g, d2=r, d3=a}, destination
   r5g6b5 pixels in {d4, d5} (= q2). Expands the destination channels to
   8 bits, multiplies them by the inverted source alpha (vmull + vrshr +
   vraddhn approximates a rounded division by 255) and leaves the partial
   blend results in d20 (red), d23 (green), d22 (blue) for the tail. */
149 /* convert 8 r5g6b5 pixel data from {d4, d5} to planar 8-bit format
150 and put data into d6 - red, d7 - green, d30 - blue */
151 vshrn.u16 d6, q2, #8
152 vshrn.u16 d7, q2, #3
153 vsli.u16 q2, q2, #5
154 vsri.u8 d6, d6, #5
155 vmvn.8 d3, d3 /* invert source alpha */
156 vsri.u8 d7, d7, #6
157 vshrn.u16 d30, q2, #2
158 /* now do alpha blending, storing results in 8-bit planar format
159 into d16 - red, d19 - green, d18 - blue */
160 vmull.u8 q10, d3, d6
161 vmull.u8 q11, d3, d7
162 vmull.u8 q12, d3, d30
163 vrshr.u16 q13, q10, #8
164 vrshr.u16 q3, q11, #8
165 vrshr.u16 q15, q12, #8
166 vraddhn.u16 d20, q10, q13
167 vraddhn.u16 d23, q11, q3
168 vraddhn.u16 d22, q12, q15
169.endm
170
171.macro pixman_composite_over_8888_0565_process_pixblock_tail
/* Tail: adds the source channels (saturating) to the blended destination
   from the head, then repacks the planar result back to r5g6b5 by shifting
   each channel up and inserting green/blue with vsri. Result in {d28, d29}. */
172 /* ... continue alpha blending */
173 vqadd.u8 d16, d2, d20
174 vqadd.u8 q9, q0, q11
175 /* convert the result to r5g6b5 and store it into {d28, d29} */
176 vshll.u8 q14, d16, #8
177 vshll.u8 q8, d19, #8
178 vshll.u8 q9, d18, #8
179 vsri.u16 q14, q8, #5
180 vsri.u16 q14, q9, #11
181.endm
182
183/*
184 * OK, now we got almost everything that we need. Using the above two
185 * macros, the work can be done right. But now we want to optimize
186 * it a bit. ARM Cortex-A8 is an in-order core, and benefits really
187 * a lot from good code scheduling and software pipelining.
188 *
189 * Let's construct some code, which will run in the core main loop.
190 * Some pseudo-code of the main loop will look like this:
191 * head
192 * while (...) {
193 * tail
194 * head
195 * }
196 * tail
197 *
198 * It may look a bit weird, but this setup allows to hide instruction
199 * latencies better and also utilize dual-issue capability more
200 * efficiently (make pairs of load-store and ALU instructions).
201 *
202 * So what we need now is a '*_tail_head' macro, which will be used
203 * in the core main loop. A trivial straightforward implementation
204 * of this macro would look like this:
205 *
206 * pixman_composite_over_8888_0565_process_pixblock_tail
207 * vst1.16 {d28, d29}, [DST_W, :128]!
208 * vld1.16 {d4, d5}, [DST_R, :128]!
 209 * vld4.8 {d0, d1, d2, d3}, [SRC]!
210 * pixman_composite_over_8888_0565_process_pixblock_head
211 * cache_preload 8, 8
212 *
213 * Now it also got some VLD/VST instructions. We simply can't move from
214 * processing one block of pixels to the other one with just arithmetics.
215 * The previously processed data needs to be written to memory and new
216 * data needs to be fetched. Fortunately, this main loop does not deal
217 * with partial leading/trailing pixels and can load/store a full block
218 * of pixels in a bulk. Additionally, destination buffer is already
219 * 16 bytes aligned here (which is good for performance).
220 *
221 * New things here are DST_R, DST_W, SRC and MASK identifiers. These
222 * are the aliases for ARM registers which are used as pointers for
223 * accessing data. We maintain separate pointers for reading and writing
224 * destination buffer (DST_R and DST_W).
225 *
226 * Another new thing is 'cache_preload' macro. It is used for prefetching
227 * data into CPU L2 cache and improve performance when dealing with large
228 * images which are far larger than cache size. It uses one argument
229 * (actually two, but they need to be the same here) - number of pixels
230 * in a block. Looking into 'pixman-arm-neon-asm.h' can provide some
231 * details about this macro. Moreover, if good performance is needed
232 * the code from this macro needs to be copied into '*_tail_head' macro
233 * and mixed with the rest of code for optimal instructions scheduling.
234 * We are actually doing it below.
235 *
236 * Now after all the explanations, here is the optimized code.
 237 * Different instruction streams (originating from '*_head', '*_tail'
238 * and 'cache_preload' macro) use different indentation levels for
239 * better readability. Actually taking the code from one of these
240 * indentation levels and ignoring a few VLD/VST instructions would
241 * result in exactly the code from '*_head', '*_tail' or 'cache_preload'
242 * macro!
243 */
244
245#if 1
246
247.macro pixman_composite_over_8888_0565_process_pixblock_tail_head
/* Software-pipelined main-loop body: the '*_tail' of the previous pixel
   block, the '*_head' of the next one, the block load/store, and the
   'cache_preload' prefetch logic (the 'PF' lines) are interleaved by hand
   for Cortex-A8 dual issue. Do not reorder instructions here without
   re-checking register lifetimes across the tail/head boundary. */
248 vqadd.u8 d16, d2, d20
249 vld1.16 {d4, d5}, [DST_R, :128]!
250 vqadd.u8 q9, q0, q11
251 vshrn.u16 d6, q2, #8
252 vld4.8 {d0, d1, d2, d3}, [SRC]!
253 vshrn.u16 d7, q2, #3
254 vsli.u16 q2, q2, #5
255 vshll.u8 q14, d16, #8
256 PF add PF_X, PF_X, #8
257 vshll.u8 q8, d19, #8
258 PF tst PF_CTL, #0xF
259 vsri.u8 d6, d6, #5
260 PF addne PF_X, PF_X, #8
261 vmvn.8 d3, d3
262 PF subne PF_CTL, PF_CTL, #1
263 vsri.u8 d7, d7, #6
264 vshrn.u16 d30, q2, #2
265 vmull.u8 q10, d3, d6
266 PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
267 vmull.u8 q11, d3, d7
268 vmull.u8 q12, d3, d30
269 PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
270 vsri.u16 q14, q8, #5
271 PF cmp PF_X, ORIG_W
272 vshll.u8 q9, d18, #8
273 vrshr.u16 q13, q10, #8
274 PF subge PF_X, PF_X, ORIG_W
275 vrshr.u16 q3, q11, #8
276 vrshr.u16 q15, q12, #8
277 PF subges PF_CTL, PF_CTL, #0x10
278 vsri.u16 q14, q9, #11
279 PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
280 vraddhn.u16 d20, q10, q13
281 vraddhn.u16 d23, q11, q3
282 PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
283 vraddhn.u16 d22, q12, q15
284 vst1.16 {d28, d29}, [DST_W, :128]!
285.endm
286
287#else
288
289/* If we did not care much about the performance, we would just use this... */
290.macro pixman_composite_over_8888_0565_process_pixblock_tail_head
/* Unscheduled reference version of the loop body: finish the previous
   block, store it, load the next 8 destination/source pixels, start the
   next block. */
291 pixman_composite_over_8888_0565_process_pixblock_tail
292 vst1.16 {d28, d29}, [DST_W, :128]!
293 vld1.16 {d4, d5}, [DST_R, :128]!
/* BUGFIX: was 'vld4.32', which lane-interleaves whole 32-bit pixels across
   d0-d3. The head macro (and FLAG_DEINTERLEAVE_32BPP) require the source
   deinterleaved into planar b/g/r/a byte vectors, i.e. 'vld4.8' — exactly
   as the optimized tail_head variant above does. */
294 vld4.8 {d0, d1, d2, d3}, [SRC]!
295 pixman_composite_over_8888_0565_process_pixblock_head
296 cache_preload 8, 8
297.endm
298
299#endif
300
301/*
302 * And now the final part. We are using 'generate_composite_function' macro
303 * to put all the stuff together. We are specifying the name of the function
304 * which we want to get, number of bits per pixel for the source, mask and
305 * destination (0 if unused, like mask in this case). Next come some bit
306 * flags:
307 * FLAG_DST_READWRITE - tells that the destination buffer is both read
308 * and written, for write-only buffer we would use
309 * FLAG_DST_WRITEONLY flag instead
310 * FLAG_DEINTERLEAVE_32BPP - tells that we prefer to work with planar data
311 * and separate color channels for 32bpp format.
312 * The next things are:
313 * - the number of pixels processed per iteration (8 in this case, because
314 * that's the maximum what can fit into four 64-bit NEON registers).
315 * - prefetch distance, measured in pixel blocks. In this case it is 5 times
316 * by 8 pixels. That would be 40 pixels, or up to 160 bytes. Optimal
317 * prefetch distance can be selected by running some benchmarks.
318 *
319 * After that we specify some macros, these are 'default_init',
320 * 'default_cleanup' here which are empty (but it is possible to have custom
321 * init/cleanup macros to be able to save/restore some extra NEON registers
322 * like d8-d15 or do anything else) followed by
323 * 'pixman_composite_over_8888_0565_process_pixblock_head',
324 * 'pixman_composite_over_8888_0565_process_pixblock_tail' and
325 * 'pixman_composite_over_8888_0565_process_pixblock_tail_head'
326 * which we got implemented above.
327 *
328 * The last part is the NEON registers allocation scheme.
329 */
/* Instantiate the full a8r8g8b8-OVER-r5g6b5 compositing function from the
   three macros above (see the long explanation in the preceding comment). */
330generate_composite_function \
331 pixman_composite_over_8888_0565_asm_neon, 32, 0, 16, \
332 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
333 8, /* number of pixels, processed in a single block */ \
334 5, /* prefetch distance */ \
335 default_init, \
336 default_cleanup, \
337 pixman_composite_over_8888_0565_process_pixblock_head, \
338 pixman_composite_over_8888_0565_process_pixblock_tail, \
339 pixman_composite_over_8888_0565_process_pixblock_tail_head, \
340 28, /* dst_w_basereg */ \
341 4, /* dst_r_basereg */ \
342 0, /* src_basereg */ \
343 24 /* mask_basereg */
344
345/******************************************************************************/
346
347.macro pixman_composite_over_n_0565_process_pixblock_head
/* Solid-source variant of over_8888_0565: the replicated source color sits
   in d0-d3 for the whole scanline (set up by the init macro below, with d3
   already inverted there), so unlike the 8888_0565 head there is no
   per-block source load and no vmvn here. */
348 /* convert 8 r5g6b5 pixel data from {d4, d5} to planar 8-bit format
349 and put data into d6 - red, d7 - green, d30 - blue */
350 vshrn.u16 d6, q2, #8
351 vshrn.u16 d7, q2, #3
352 vsli.u16 q2, q2, #5
353 vsri.u8 d6, d6, #5
354 vsri.u8 d7, d7, #6
355 vshrn.u16 d30, q2, #2
356 /* now do alpha blending, storing results in 8-bit planar format
357 into d16 - red, d19 - green, d18 - blue */
358 vmull.u8 q10, d3, d6
359 vmull.u8 q11, d3, d7
360 vmull.u8 q12, d3, d30
361 vrshr.u16 q13, q10, #8
362 vrshr.u16 q3, q11, #8
363 vrshr.u16 q15, q12, #8
364 vraddhn.u16 d20, q10, q13
365 vraddhn.u16 d23, q11, q3
366 vraddhn.u16 d22, q12, q15
367.endm
368
369.macro pixman_composite_over_n_0565_process_pixblock_tail
/* Add source color (saturating) and repack planar result to r5g6b5. */
370 /* ... continue alpha blending */
371 vqadd.u8 d16, d2, d20
372 vqadd.u8 q9, q0, q11
373 /* convert the result to r5g6b5 and store it into {d28, d29} */
374 vshll.u8 q14, d16, #8
375 vshll.u8 q8, d19, #8
376 vshll.u8 q9, d18, #8
377 vsri.u16 q14, q8, #5
378 vsri.u16 q14, q9, #11
379.endm
380
381/* TODO: expand macros and do better instructions scheduling */
382.macro pixman_composite_over_n_0565_process_pixblock_tail_head
/* Simple loop body: finish previous block, load next dest, store previous
   result (DST_R/DST_W are separate pointers), start next block. */
383 pixman_composite_over_n_0565_process_pixblock_tail
384 vld1.16 {d4, d5}, [DST_R, :128]!
385 vst1.16 {d28, d29}, [DST_W, :128]!
386 pixman_composite_over_n_0565_process_pixblock_head
387.endm
388
389.macro pixman_composite_over_n_0565_init
/* Fetch the solid a8r8g8b8 source from the stack, replicate each of its
   four bytes across d0-d3 (b, g, r, a) and pre-invert the alpha once. */
390 add DUMMY, sp, #ARGS_STACK_OFFSET
391 vld1.32 {d3[0]}, [DUMMY]
392 vdup.8 d0, d3[0]
393 vdup.8 d1, d3[1]
394 vdup.8 d2, d3[2]
395 vdup.8 d3, d3[3]
396 vmvn.8 d3, d3 /* invert source alpha */
397.endm
398
/* Solid color OVER r5g6b5 (src bpp 0 = solid, no mask). */
399generate_composite_function \
400 pixman_composite_over_n_0565_asm_neon, 0, 0, 16, \
401 FLAG_DST_READWRITE, \
402 8, /* number of pixels, processed in a single block */ \
403 5, /* prefetch distance */ \
404 pixman_composite_over_n_0565_init, \
405 default_cleanup, \
406 pixman_composite_over_n_0565_process_pixblock_head, \
407 pixman_composite_over_n_0565_process_pixblock_tail, \
408 pixman_composite_over_n_0565_process_pixblock_tail_head, \
409 28, /* dst_w_basereg */ \
410 4, /* dst_r_basereg */ \
411 0, /* src_basereg */ \
412 24 /* mask_basereg */
413
414/******************************************************************************/
415
416.macro pixman_composite_src_8888_0565_process_pixblock_head
/* SRC conversion a8r8g8b8 -> r5g6b5: widen the planar g/r/b channels
   (d1/d2/d0) into the high byte of 16-bit lanes; alpha (d3) is dropped. */
417 vshll.u8 q8, d1, #8
418 vshll.u8 q14, d2, #8
419 vshll.u8 q9, d0, #8
420.endm
421
422.macro pixman_composite_src_8888_0565_process_pixblock_tail
/* Insert green below red (5 bits used) and blue below that, forming the
   packed r5g6b5 result in q14 = {d28, d29}. */
423 vsri.u16 q14, q8, #5
424 vsri.u16 q14, q9, #11
425.endm
426
427.macro pixman_composite_src_8888_0565_process_pixblock_tail_head
/* Scheduled loop body with simple-prefetch bookkeeping interleaved. */
428 vsri.u16 q14, q8, #5
429 PF add PF_X, PF_X, #8
430 PF tst PF_CTL, #0xF
431 vld4.8 {d0, d1, d2, d3}, [SRC]!
432 PF addne PF_X, PF_X, #8
433 PF subne PF_CTL, PF_CTL, #1
434 vsri.u16 q14, q9, #11
435 PF cmp PF_X, ORIG_W
436 PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
437 vshll.u8 q8, d1, #8
438 vst1.16 {d28, d29}, [DST_W, :128]!
439 PF subge PF_X, PF_X, ORIG_W
440 PF subges PF_CTL, PF_CTL, #0x10
441 vshll.u8 q14, d2, #8
442 PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
443 vshll.u8 q9, d0, #8
444.endm
445
/* a8r8g8b8 -> r5g6b5 copy/convert; destination is write-only. */
446generate_composite_function \
447 pixman_composite_src_8888_0565_asm_neon, 32, 0, 16, \
448 FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
449 8, /* number of pixels, processed in a single block */ \
450 10, /* prefetch distance */ \
451 default_init, \
452 default_cleanup, \
453 pixman_composite_src_8888_0565_process_pixblock_head, \
454 pixman_composite_src_8888_0565_process_pixblock_tail, \
455 pixman_composite_src_8888_0565_process_pixblock_tail_head
456
457/******************************************************************************/
458
459.macro pixman_composite_src_0565_8888_process_pixblock_head
/* r5g6b5 -> planar a8r8g8b8: extract/replicate each channel into d30 (r),
   d29 (g), d28 (b) and force alpha d31 to opaque 255. The whole conversion
   happens in the head; the tail is empty. */
460 vshrn.u16 d30, q0, #8
461 vshrn.u16 d29, q0, #3
462 vsli.u16 q0, q0, #5
463 vmov.u8 d31, #255
464 vsri.u8 d30, d30, #5
465 vsri.u8 d29, d29, #6
466 vshrn.u16 d28, q0, #2
467.endm
468
469.macro pixman_composite_src_0565_8888_process_pixblock_tail
/* Intentionally empty: all work is done in the head macro. */
470.endm
471
472/* TODO: expand macros and do better instructions scheduling */
473.macro pixman_composite_src_0565_8888_process_pixblock_tail_head
/* Simple loop body: store previous (re-interleaved by vst4.8), load next
   16bpp block, convert, prefetch. */
474 pixman_composite_src_0565_8888_process_pixblock_tail
475 vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
476 vld1.16 {d0, d1}, [SRC]!
477 pixman_composite_src_0565_8888_process_pixblock_head
478 cache_preload 8, 8
479.endm
480
/* r5g6b5 -> a8r8g8b8 copy/convert; destination is write-only. */
481generate_composite_function \
482 pixman_composite_src_0565_8888_asm_neon, 16, 0, 32, \
483 FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
484 8, /* number of pixels, processed in a single block */ \
485 10, /* prefetch distance */ \
486 default_init, \
487 default_cleanup, \
488 pixman_composite_src_0565_8888_process_pixblock_head, \
489 pixman_composite_src_0565_8888_process_pixblock_tail, \
490 pixman_composite_src_0565_8888_process_pixblock_tail_head
491
492/******************************************************************************/
493
494.macro pixman_composite_add_8000_8000_process_pixblock_head
/* ADD operator on 8bpp data: saturating add of 32 source bytes (q0, q1)
   and 32 destination bytes (q2, q3) into q14, q15. Tail is empty. */
495 vqadd.u8 q14, q0, q2
496 vqadd.u8 q15, q1, q3
497.endm
498
499.macro pixman_composite_add_8000_8000_process_pixblock_tail
/* Intentionally empty: the head does all the work. */
500.endm
501
502.macro pixman_composite_add_8000_8000_process_pixblock_tail_head
/* Scheduled loop body: loads/stores interleaved with prefetch control.
   Note the prefetch step is #32 because a block is 32 one-byte pixels. */
503 vld1.8 {d0, d1, d2, d3}, [SRC]!
504 PF add PF_X, PF_X, #32
505 PF tst PF_CTL, #0xF
506 vld1.8 {d4, d5, d6, d7}, [DST_R, :128]!
507 PF addne PF_X, PF_X, #32
508 PF subne PF_CTL, PF_CTL, #1
509 vst1.8 {d28, d29, d30, d31}, [DST_W, :128]!
510 PF cmp PF_X, ORIG_W
511 PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
512 PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
513 PF subge PF_X, PF_X, ORIG_W
514 PF subges PF_CTL, PF_CTL, #0x10
515 vqadd.u8 q14, q0, q2
516 PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
517 PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
518 vqadd.u8 q15, q1, q3
519.endm
520
/* a8 ADD a8, 32 pixels per block. */
521generate_composite_function \
522 pixman_composite_add_8000_8000_asm_neon, 8, 0, 8, \
523 FLAG_DST_READWRITE, \
524 32, /* number of pixels, processed in a single block */ \
525 10, /* prefetch distance */ \
526 default_init, \
527 default_cleanup, \
528 pixman_composite_add_8000_8000_process_pixblock_head, \
529 pixman_composite_add_8000_8000_process_pixblock_tail, \
530 pixman_composite_add_8000_8000_process_pixblock_tail_head
531
532/******************************************************************************/
533
534.macro pixman_composite_add_8888_8888_process_pixblock_tail_head
/* Same saturating-add loop body as add_8000_8000, but the prefetch step
   is #8 because a block is 8 four-byte pixels; head/tail macros are
   reused from add_8000_8000 in the generators below. */
535 vld1.8 {d0, d1, d2, d3}, [SRC]!
536 PF add PF_X, PF_X, #8
537 PF tst PF_CTL, #0xF
538 vld1.8 {d4, d5, d6, d7}, [DST_R, :128]!
539 PF addne PF_X, PF_X, #8
540 PF subne PF_CTL, PF_CTL, #1
541 vst1.8 {d28, d29, d30, d31}, [DST_W, :128]!
542 PF cmp PF_X, ORIG_W
543 PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
544 PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
545 PF subge PF_X, PF_X, ORIG_W
546 PF subges PF_CTL, PF_CTL, #0x10
547 vqadd.u8 q14, q0, q2
548 PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
549 PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
550 vqadd.u8 q15, q1, q3
551.endm
552
/* a8r8g8b8 ADD a8r8g8b8 (bytewise saturating add works per channel). */
553generate_composite_function \
554 pixman_composite_add_8888_8888_asm_neon, 32, 0, 32, \
555 FLAG_DST_READWRITE, \
556 8, /* number of pixels, processed in a single block */ \
557 10, /* prefetch distance */ \
558 default_init, \
559 default_cleanup, \
560 pixman_composite_add_8000_8000_process_pixblock_head, \
561 pixman_composite_add_8000_8000_process_pixblock_tail, \
562 pixman_composite_add_8888_8888_process_pixblock_tail_head
563
/* Single-scanline ADD variant (no prefetch distance parameter). */
564generate_composite_function_single_scanline \
565 pixman_composite_scanline_add_asm_neon, 32, 0, 32, \
566 FLAG_DST_READWRITE, \
567 8, /* number of pixels, processed in a single block */ \
568 default_init, \
569 default_cleanup, \
570 pixman_composite_add_8000_8000_process_pixblock_head, \
571 pixman_composite_add_8000_8000_process_pixblock_tail, \
572 pixman_composite_add_8888_8888_process_pixblock_tail_head
573
574/******************************************************************************/
575
576.macro pixman_composite_over_8888_8888_process_pixblock_head
/* OVER for 32bpp planar data: multiply each destination channel
   (d4-d7) by the inverted source alpha; products left in q8-q11. */
577 vmvn.8 d24, d3 /* get inverted alpha */
578 /* do alpha blending */
579 vmull.u8 q8, d24, d4
580 vmull.u8 q9, d24, d5
581 vmull.u8 q10, d24, d6
582 vmull.u8 q11, d24, d7
583.endm
584
585.macro pixman_composite_over_8888_8888_process_pixblock_tail
/* Round/narrow the products (approximate division by 255), then add the
   source channels with saturation. Result planar in q14, q15. */
586 vrshr.u16 q14, q8, #8
587 vrshr.u16 q15, q9, #8
588 vrshr.u16 q12, q10, #8
589 vrshr.u16 q13, q11, #8
590 vraddhn.u16 d28, q14, q8
591 vraddhn.u16 d29, q15, q9
592 vraddhn.u16 d30, q12, q10
593 vraddhn.u16 d31, q13, q11
594 vqadd.u8 q14, q0, q14
595 vqadd.u8 q15, q1, q15
596.endm
597
598.macro pixman_composite_over_8888_8888_process_pixblock_tail_head
/* Scheduled loop body; note the head portion here keeps inverted alpha in
   d22 (not d24 as in the standalone head macro) and feeds q8-q11. */
599 vld4.8 {d4, d5, d6, d7}, [DST_R, :128]!
600 vrshr.u16 q14, q8, #8
601 PF add PF_X, PF_X, #8
602 PF tst PF_CTL, #0xF
603 vrshr.u16 q15, q9, #8
604 vrshr.u16 q12, q10, #8
605 vrshr.u16 q13, q11, #8
606 PF addne PF_X, PF_X, #8
607 PF subne PF_CTL, PF_CTL, #1
608 vraddhn.u16 d28, q14, q8
609 vraddhn.u16 d29, q15, q9
610 PF cmp PF_X, ORIG_W
611 vraddhn.u16 d30, q12, q10
612 vraddhn.u16 d31, q13, q11
613 vqadd.u8 q14, q0, q14
614 vqadd.u8 q15, q1, q15
615 vld4.8 {d0, d1, d2, d3}, [SRC]!
616 PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
617 vmvn.8 d22, d3
618 PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
619 vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
620 PF subge PF_X, PF_X, ORIG_W
621 vmull.u8 q8, d22, d4
622 PF subges PF_CTL, PF_CTL, #0x10
623 vmull.u8 q9, d22, d5
624 PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
625 vmull.u8 q10, d22, d6
626 PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
627 vmull.u8 q11, d22, d7
628.endm
629
/* a8r8g8b8 OVER a8r8g8b8. */
630generate_composite_function \
631 pixman_composite_over_8888_8888_asm_neon, 32, 0, 32, \
632 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
633 8, /* number of pixels, processed in a single block */ \
634 5, /* prefetch distance */ \
635 default_init, \
636 default_cleanup, \
637 pixman_composite_over_8888_8888_process_pixblock_head, \
638 pixman_composite_over_8888_8888_process_pixblock_tail, \
639 pixman_composite_over_8888_8888_process_pixblock_tail_head
640
/* Single-scanline OVER variant. */
641generate_composite_function_single_scanline \
642 pixman_composite_scanline_over_asm_neon, 32, 0, 32, \
643 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
644 8, /* number of pixels, processed in a single block */ \
645 default_init, \
646 default_cleanup, \
647 pixman_composite_over_8888_8888_process_pixblock_head, \
648 pixman_composite_over_8888_8888_process_pixblock_tail, \
649 pixman_composite_over_8888_8888_process_pixblock_tail_head
650
651/******************************************************************************/
652
653/* TODO: expand macros and do better instructions scheduling */
653/* TODO: expand macros and do better instructions scheduling */
654.macro pixman_composite_over_n_8888_process_pixblock_tail_head
/* Solid source OVER 32bpp dest: reuses the 8888_8888 head/tail; the solid
   color stays in d0-d3 (set up by init below), so only dest is reloaded. */
655 pixman_composite_over_8888_8888_process_pixblock_tail
656 vld4.8 {d4, d5, d6, d7}, [DST_R, :128]!
657 vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
658 pixman_composite_over_8888_8888_process_pixblock_head
659.endm
660
661.macro pixman_composite_over_n_8888_init
/* Load the solid a8r8g8b8 source from the stack and replicate its four
   bytes into d0 (b), d1 (g), d2 (r), d3 (a). Alpha is NOT pre-inverted
   here — the 8888_8888 head macro does the vmvn itself. */
662 add DUMMY, sp, #ARGS_STACK_OFFSET
663 vld1.32 {d3[0]}, [DUMMY]
664 vdup.8 d0, d3[0]
665 vdup.8 d1, d3[1]
666 vdup.8 d2, d3[2]
667 vdup.8 d3, d3[3]
668.endm
669
/* Solid color OVER a8r8g8b8. */
670generate_composite_function \
671 pixman_composite_over_n_8888_asm_neon, 0, 0, 32, \
672 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
673 8, /* number of pixels, processed in a single block */ \
674 5, /* prefetch distance */ \
675 pixman_composite_over_n_8888_init, \
676 default_cleanup, \
677 pixman_composite_over_8888_8888_process_pixblock_head, \
678 pixman_composite_over_8888_8888_process_pixblock_tail, \
679 pixman_composite_over_n_8888_process_pixblock_tail_head
680
681/******************************************************************************/
682
683.macro pixman_composite_over_reverse_n_8888_process_pixblock_tail_head
/* OVER-reverse with solid source: the destination acts as the 'source'
   of the OVER equation. The solid color lives in d4-d7 (init below) and
   the destination is loaded into d0-d3; hence src/dst base registers are
   swapped in the generator arguments. Only destination prefetch is done. */
684 vrshr.u16 q14, q8, #8
685 PF add PF_X, PF_X, #8
686 PF tst PF_CTL, #0xF
687 vrshr.u16 q15, q9, #8
688 vrshr.u16 q12, q10, #8
689 vrshr.u16 q13, q11, #8
690 PF addne PF_X, PF_X, #8
691 PF subne PF_CTL, PF_CTL, #1
692 vraddhn.u16 d28, q14, q8
693 vraddhn.u16 d29, q15, q9
694 PF cmp PF_X, ORIG_W
695 vraddhn.u16 d30, q12, q10
696 vraddhn.u16 d31, q13, q11
697 vqadd.u8 q14, q0, q14
698 vqadd.u8 q15, q1, q15
699 vld4.8 {d0, d1, d2, d3}, [DST_R, :128]!
700 vmvn.8 d22, d3
701 PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
702 vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
703 PF subge PF_X, PF_X, ORIG_W
704 vmull.u8 q8, d22, d4
705 PF subges PF_CTL, PF_CTL, #0x10
706 vmull.u8 q9, d22, d5
707 vmull.u8 q10, d22, d6
708 PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
709 vmull.u8 q11, d22, d7
710.endm
711
712.macro pixman_composite_over_reverse_n_8888_init
/* Replicate the solid a8r8g8b8 source into d4 (b), d5 (g), d6 (r), d7 (a)
   — the register slots the reused 8888_8888 head treats as 'destination'. */
713 add DUMMY, sp, #ARGS_STACK_OFFSET
714 vld1.32 {d7[0]}, [DUMMY]
715 vdup.8 d4, d7[0]
716 vdup.8 d5, d7[1]
717 vdup.8 d6, d7[2]
718 vdup.8 d7, d7[3]
719.endm
720
/* Solid color OVER-reverse a8r8g8b8. Note dst_r/src base registers are
   swapped (0/4) relative to the normal OVER functions. */
721generate_composite_function \
722 pixman_composite_over_reverse_n_8888_asm_neon, 0, 0, 32, \
723 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
724 8, /* number of pixels, processed in a single block */ \
725 5, /* prefetch distance */ \
726 pixman_composite_over_reverse_n_8888_init, \
727 default_cleanup, \
728 pixman_composite_over_8888_8888_process_pixblock_head, \
729 pixman_composite_over_8888_8888_process_pixblock_tail, \
730 pixman_composite_over_reverse_n_8888_process_pixblock_tail_head, \
731 28, /* dst_w_basereg */ \
732 0, /* dst_r_basereg */ \
733 4, /* src_basereg */ \
734 24 /* mask_basereg */
735
736/******************************************************************************/
737
738.macro pixman_composite_over_n_8_0565_process_pixblock_head
/* Solid source with a8 mask OVER r5g6b5. First the 'in' step: multiply the
   replicated solid color (d8-d11, set up by init) by the mask bytes (d24),
   rounding via vrshr+vraddhn, leaving masked source in d0-d3. Then the
   same r5g6b5 expand + alpha-blend sequence as over_8888_0565. */
739 /* in */
740 vmull.u8 q0, d24, d8
741 vmull.u8 q1, d24, d9
742 vmull.u8 q6, d24, d10
743 vmull.u8 q7, d24, d11
744 vrshr.u16 q10, q0, #8
745 vrshr.u16 q11, q1, #8
746 vrshr.u16 q12, q6, #8
747 vrshr.u16 q13, q7, #8
748 vraddhn.u16 d0, q0, q10
749 vraddhn.u16 d1, q1, q11
750 vraddhn.u16 d2, q6, q12
751 vraddhn.u16 d3, q7, q13
752
753 vshrn.u16 d6, q2, #8
754 vshrn.u16 d7, q2, #3
755 vsli.u16 q2, q2, #5
756 vsri.u8 d6, d6, #5
757 vmvn.8 d3, d3
758 vsri.u8 d7, d7, #6
759 vshrn.u16 d30, q2, #2
760 /* now do alpha blending */
761 vmull.u8 q10, d3, d6
762 vmull.u8 q11, d3, d7
763 vmull.u8 q12, d3, d30
764 vrshr.u16 q13, q10, #8
765 vrshr.u16 q3, q11, #8
766 vrshr.u16 q15, q12, #8
767 vraddhn.u16 d20, q10, q13
768 vraddhn.u16 d23, q11, q3
769 vraddhn.u16 d22, q12, q15
770.endm
771
772.macro pixman_composite_over_n_8_0565_process_pixblock_tail
/* Add masked source channels (saturating) and repack to r5g6b5 in
   {d28, d29}, same as the over_8888_0565 tail. */
773 vqadd.u8 d16, d2, d20
774 vqadd.u8 q9, q0, q11
775 /* convert to r5g6b5 */
776 vshll.u8 q14, d16, #8
777 vshll.u8 q8, d19, #8
778 vshll.u8 q9, d18, #8
779 vsri.u16 q14, q8, #5
780 vsri.u16 q14, q9, #11
781.endm
782
783/* TODO: expand macros and do better instructions scheduling */
784.macro pixman_composite_over_n_8_0565_process_pixblock_tail_head
/* Simple loop body: finish/store previous block, load next 8 dest pixels
   and 8 mask bytes, prefetch, start next block. */
785 pixman_composite_over_n_8_0565_process_pixblock_tail
786 vst1.16 {d28, d29}, [DST_W, :128]!
787 vld1.16 {d4, d5}, [DST_R, :128]!
788 vld1.8 {d24}, [MASK]!
789 cache_preload 8, 8
790 pixman_composite_over_n_8_0565_process_pixblock_head
791.endm
792
793/*
794 * This function needs a special initialization of solid mask.
795 * Solid source pixel data is fetched from stack at ARGS_STACK_OFFSET
796 * offset, split into color components and replicated in d8-d11
797 * registers. Additionally, this function needs all the NEON registers,
798 * so it has to save d8-d15 registers which are callee saved according
799 * to ABI. These registers are restored from 'cleanup' macro. All the
800 * other NEON registers are caller saved, so can be clobbered freely
801 * without introducing any problems.
802 */
803.macro pixman_composite_over_n_8_0565_init
/* Save callee-saved d8-d15, then replicate the solid source color into
   d8 (b), d9 (g), d10 (r), d11 (a). */
804 add DUMMY, sp, #ARGS_STACK_OFFSET
805 vpush {d8-d15}
806 vld1.32 {d11[0]}, [DUMMY]
807 vdup.8 d8, d11[0]
808 vdup.8 d9, d11[1]
809 vdup.8 d10, d11[2]
810 vdup.8 d11, d11[3]
811.endm
812
813.macro pixman_composite_over_n_8_0565_cleanup
/* Restore the callee-saved NEON registers pushed by the init macro. */
814 vpop {d8-d15}
815.endm
816
/* Solid color, a8 mask, OVER r5g6b5. */
817generate_composite_function \
818 pixman_composite_over_n_8_0565_asm_neon, 0, 8, 16, \
819 FLAG_DST_READWRITE, \
820 8, /* number of pixels, processed in a single block */ \
821 5, /* prefetch distance */ \
822 pixman_composite_over_n_8_0565_init, \
823 pixman_composite_over_n_8_0565_cleanup, \
824 pixman_composite_over_n_8_0565_process_pixblock_head, \
825 pixman_composite_over_n_8_0565_process_pixblock_tail, \
826 pixman_composite_over_n_8_0565_process_pixblock_tail_head
827
828/******************************************************************************/
829
830.macro pixman_composite_src_0565_0565_process_pixblock_head
/* Plain 16bpp copy: no per-pixel arithmetic, so head is empty. */
831.endm
832
833.macro pixman_composite_src_0565_0565_process_pixblock_tail
/* Intentionally empty — see head. */
834.endm
835
836.macro pixman_composite_src_0565_0565_process_pixblock_tail_head
/* Copy loop body: store the previously loaded 16 pixels, load the next 16,
   prefetch. Data just passes through d0-d3. */
837 vst1.16 {d0, d1, d2, d3}, [DST_W, :128]!
838 vld1.16 {d0, d1, d2, d3}, [SRC]!
839 cache_preload 16, 16
840.endm
841
/* r5g6b5 SRC copy; note all base registers are 0 since data stays in d0-d3. */
842generate_composite_function \
843 pixman_composite_src_0565_0565_asm_neon, 16, 0, 16, \
844 FLAG_DST_WRITEONLY, \
845 16, /* number of pixels, processed in a single block */ \
846 10, /* prefetch distance */ \
847 default_init, \
848 default_cleanup, \
849 pixman_composite_src_0565_0565_process_pixblock_head, \
850 pixman_composite_src_0565_0565_process_pixblock_tail, \
851 pixman_composite_src_0565_0565_process_pixblock_tail_head, \
852 0, /* dst_w_basereg */ \
853 0, /* dst_r_basereg */ \
854 0, /* src_basereg */ \
855 0 /* mask_basereg */
856
857/******************************************************************************/
858
/* src_n_8: solid fill of an a8 destination. */
.macro pixman_composite_src_n_8_process_pixblock_head
.endm

.macro pixman_composite_src_n_8_process_pixblock_tail
.endm

/* q0/q1 already hold the replicated fill byte; just store 32 pixels. */
.macro pixman_composite_src_n_8_process_pixblock_tail_head
    vst1.8      {d0, d1, d2, d3}, [DST_W, :128]!
.endm

/*
 * Load the 32-bit solid color and replicate its low byte across q0 and q1
 * with shift-left-and-insert steps (each VSLI ORs in a shifted copy).
 */
.macro pixman_composite_src_n_8_init
    add         DUMMY, sp, #ARGS_STACK_OFFSET
    vld1.32     {d0[0]}, [DUMMY]
    vsli.u64    d0, d0, #8      /* d0 |= d0 << 8  */
    vsli.u64    d0, d0, #16     /* d0 |= d0 << 16 */
    vsli.u64    d0, d0, #32     /* d0 |= d0 << 32 */
    vmov        d1, d0
    vmov        q1, q0
.endm

.macro pixman_composite_src_n_8_cleanup
.endm

generate_composite_function \
    pixman_composite_src_n_8_asm_neon, 0, 0, 8, \
    FLAG_DST_WRITEONLY, \
    32, /* number of pixels, processed in a single block */ \
    0, /* prefetch distance */ \
    pixman_composite_src_n_8_init, \
    pixman_composite_src_n_8_cleanup, \
    pixman_composite_src_n_8_process_pixblock_head, \
    pixman_composite_src_n_8_process_pixblock_tail, \
    pixman_composite_src_n_8_process_pixblock_tail_head, \
    0, /* dst_w_basereg */ \
    0, /* dst_r_basereg */ \
    0, /* src_basereg */ \
    0 /* mask_basereg */
896
897/******************************************************************************/
898
/* src_n_0565: solid fill of an r5g6b5 destination. */
.macro pixman_composite_src_n_0565_process_pixblock_head
.endm

.macro pixman_composite_src_n_0565_process_pixblock_tail
.endm

/* q0/q1 already hold the replicated 16-bit pixel; just store 16 pixels. */
.macro pixman_composite_src_n_0565_process_pixblock_tail_head
    vst1.16     {d0, d1, d2, d3}, [DST_W, :128]!
.endm

/* Replicate the low 16 bits of the solid color across q0 and q1. */
.macro pixman_composite_src_n_0565_init
    add         DUMMY, sp, #ARGS_STACK_OFFSET
    vld1.32     {d0[0]}, [DUMMY]
    vsli.u64    d0, d0, #16     /* d0 |= d0 << 16 */
    vsli.u64    d0, d0, #32     /* d0 |= d0 << 32 */
    vmov        d1, d0
    vmov        q1, q0
.endm

.macro pixman_composite_src_n_0565_cleanup
.endm

generate_composite_function \
    pixman_composite_src_n_0565_asm_neon, 0, 0, 16, \
    FLAG_DST_WRITEONLY, \
    16, /* number of pixels, processed in a single block */ \
    0, /* prefetch distance */ \
    pixman_composite_src_n_0565_init, \
    pixman_composite_src_n_0565_cleanup, \
    pixman_composite_src_n_0565_process_pixblock_head, \
    pixman_composite_src_n_0565_process_pixblock_tail, \
    pixman_composite_src_n_0565_process_pixblock_tail_head, \
    0, /* dst_w_basereg */ \
    0, /* dst_r_basereg */ \
    0, /* src_basereg */ \
    0 /* mask_basereg */
935
936/******************************************************************************/
937
/* src_n_8888: solid fill of a 32bpp destination. */
.macro pixman_composite_src_n_8888_process_pixblock_head
.endm

.macro pixman_composite_src_n_8888_process_pixblock_tail
.endm

/* q0/q1 already hold the replicated 32-bit pixel; just store 8 pixels. */
.macro pixman_composite_src_n_8888_process_pixblock_tail_head
    vst1.32     {d0, d1, d2, d3}, [DST_W, :128]!
.endm

/* Replicate the 32-bit solid color across q0 and q1. */
.macro pixman_composite_src_n_8888_init
    add         DUMMY, sp, #ARGS_STACK_OFFSET
    vld1.32     {d0[0]}, [DUMMY]
    vsli.u64    d0, d0, #32     /* d0 |= d0 << 32 */
    vmov        d1, d0
    vmov        q1, q0
.endm

.macro pixman_composite_src_n_8888_cleanup
.endm

generate_composite_function \
    pixman_composite_src_n_8888_asm_neon, 0, 0, 32, \
    FLAG_DST_WRITEONLY, \
    8, /* number of pixels, processed in a single block */ \
    0, /* prefetch distance */ \
    pixman_composite_src_n_8888_init, \
    pixman_composite_src_n_8888_cleanup, \
    pixman_composite_src_n_8888_process_pixblock_head, \
    pixman_composite_src_n_8888_process_pixblock_tail, \
    pixman_composite_src_n_8888_process_pixblock_tail_head, \
    0, /* dst_w_basereg */ \
    0, /* dst_r_basereg */ \
    0, /* src_basereg */ \
    0 /* mask_basereg */
973
974/******************************************************************************/
975
/* src_8888_8888: straight 32bpp -> 32bpp copy. */
.macro pixman_composite_src_8888_8888_process_pixblock_head
.endm

.macro pixman_composite_src_8888_8888_process_pixblock_tail
.endm

/* Store the previous 8-pixel block while loading the next one
 * (software-pipelined copy loop). */
.macro pixman_composite_src_8888_8888_process_pixblock_tail_head
    vst1.32     {d0, d1, d2, d3}, [DST_W, :128]!
    vld1.32     {d0, d1, d2, d3}, [SRC]!
    cache_preload 8, 8
.endm

generate_composite_function \
    pixman_composite_src_8888_8888_asm_neon, 32, 0, 32, \
    FLAG_DST_WRITEONLY, \
    8, /* number of pixels, processed in a single block */ \
    10, /* prefetch distance */ \
    default_init, \
    default_cleanup, \
    pixman_composite_src_8888_8888_process_pixblock_head, \
    pixman_composite_src_8888_8888_process_pixblock_tail, \
    pixman_composite_src_8888_8888_process_pixblock_tail_head, \
    0, /* dst_w_basereg */ \
    0, /* dst_r_basereg */ \
    0, /* src_basereg */ \
    0 /* mask_basereg */
1002
1003/******************************************************************************/
1004
/* src_x888_8888: 32bpp copy which forces the alpha byte of every pixel
 * to 0xFF by OR-ing with q2 (each 32-bit lane of q2 holds 0xFF000000). */
.macro pixman_composite_src_x888_8888_process_pixblock_head
    vorr        q0, q0, q2
    vorr        q1, q1, q2
.endm

.macro pixman_composite_src_x888_8888_process_pixblock_tail
.endm

.macro pixman_composite_src_x888_8888_process_pixblock_tail_head
    vst1.32     {d0, d1, d2, d3}, [DST_W, :128]!
    vld1.32     {d0, d1, d2, d3}, [SRC]!
    vorr        q0, q0, q2
    vorr        q1, q1, q2
    cache_preload 8, 8
.endm

/* Build the 0xFF000000 per-lane alpha mask in q2. */
.macro pixman_composite_src_x888_8888_init
    vmov.u8     q2, #0xFF           /* every byte = 0xFF */
    vshl.u32    q2, q2, #24         /* keep only the top (alpha) byte per lane */
.endm

generate_composite_function \
    pixman_composite_src_x888_8888_asm_neon, 32, 0, 32, \
    FLAG_DST_WRITEONLY, \
    8, /* number of pixels, processed in a single block */ \
    10, /* prefetch distance */ \
    pixman_composite_src_x888_8888_init, \
    default_cleanup, \
    pixman_composite_src_x888_8888_process_pixblock_head, \
    pixman_composite_src_x888_8888_process_pixblock_tail, \
    pixman_composite_src_x888_8888_process_pixblock_tail_head, \
    0, /* dst_w_basereg */ \
    0, /* dst_r_basereg */ \
    0, /* src_basereg */ \
    0 /* mask_basereg */
1040
1041/******************************************************************************/
1042
.macro pixman_composite_over_n_8_8888_process_pixblock_head
    /* expecting deinterleaved source data in {d8, d9, d10, d11} */
    /* d8 - blue, d9 - green, d10 - red, d11 - alpha */
    /* and destination data in {d4, d5, d6, d7} */
    /* mask is in d24 (d25, d26, d27 are unused) */

    /* in: src *= mask (rounding 8-bit multiply: x*m + 0.5 via vrshr+vraddhn) */
    vmull.u8    q0, d24, d8
    vmull.u8    q1, d24, d9
    vmull.u8    q6, d24, d10
    vmull.u8    q7, d24, d11
    vrshr.u16   q10, q0, #8
    vrshr.u16   q11, q1, #8
    vrshr.u16   q12, q6, #8
    vrshr.u16   q13, q7, #8
    vraddhn.u16 d0, q0, q10
    vraddhn.u16 d1, q1, q11
    vraddhn.u16 d2, q6, q12
    vraddhn.u16 d3, q7, q13
    vmvn.8      d24, d3  /* get inverted alpha */
    /* source:      d0 - blue, d1 - green, d2 - red, d3 - alpha */
    /* destination: d4 - blue, d5 - green, d6 - red, d7 - alpha */
    /* now do alpha blending: dest *= (255 - src.alpha) */
    vmull.u8    q8, d24, d4
    vmull.u8    q9, d24, d5
    vmull.u8    q10, d24, d6
    vmull.u8    q11, d24, d7
.endm

/* Finish the OVER: narrow the widened dest products with rounding and
 * saturating-add the masked source, leaving the result in q14/q15. */
.macro pixman_composite_over_n_8_8888_process_pixblock_tail
    vrshr.u16   q14, q8, #8
    vrshr.u16   q15, q9, #8
    vrshr.u16   q12, q10, #8
    vrshr.u16   q13, q11, #8
    vraddhn.u16 d28, q14, q8
    vraddhn.u16 d29, q15, q9
    vraddhn.u16 d30, q12, q10
    vraddhn.u16 d31, q13, q11
    vqadd.u8    q14, q0, q14
    vqadd.u8    q15, q1, q15
.endm

/* TODO: expand macros and do better instructions scheduling */
.macro pixman_composite_over_n_8_8888_process_pixblock_tail_head
    pixman_composite_over_n_8_8888_process_pixblock_tail
    vst4.8      {d28, d29, d30, d31}, [DST_W, :128]!
    vld4.8      {d4, d5, d6, d7}, [DST_R, :128]!
    vld1.8      {d24}, [MASK]!
    cache_preload 8, 8
    pixman_composite_over_n_8_8888_process_pixblock_head
.endm

/* Same init as over_n_8_0565: save d8-d15, then split the solid color
 * into per-component registers d8-d11. */
.macro pixman_composite_over_n_8_8888_init
    add         DUMMY, sp, #ARGS_STACK_OFFSET
    vpush       {d8-d15}
    vld1.32     {d11[0]}, [DUMMY]
    vdup.8      d8, d11[0]
    vdup.8      d9, d11[1]
    vdup.8      d10, d11[2]
    vdup.8      d11, d11[3]
.endm

/* Restore the callee-saved NEON registers pushed by the init macro. */
.macro pixman_composite_over_n_8_8888_cleanup
    vpop        {d8-d15}
.endm

/* OVER, solid source ('n'), a8 mask, 32bpp destination. */
generate_composite_function \
    pixman_composite_over_n_8_8888_asm_neon, 0, 8, 32, \
    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
    8, /* number of pixels, processed in a single block */ \
    5, /* prefetch distance */ \
    pixman_composite_over_n_8_8888_init, \
    pixman_composite_over_n_8_8888_cleanup, \
    pixman_composite_over_n_8_8888_process_pixblock_head, \
    pixman_composite_over_n_8_8888_process_pixblock_tail, \
    pixman_composite_over_n_8_8888_process_pixblock_tail_head
1119
1120/******************************************************************************/
1121
.macro pixman_composite_over_n_8888_8888_ca_process_pixblock_head
    /*
     * 'combine_mask_ca' replacement
     *
     * input:  solid src (n) in {d8,  d9,  d10, d11}
     *         dest in          {d4,  d5,  d6,  d7 }
     *         mask in          {d24, d25, d26, d27}
     * output: updated src in   {d0,  d1,  d2,  d3 }
     *         updated mask in  {d24, d25, d26, d3 }
     *
     * Component alpha: each src channel is multiplied by the matching
     * mask channel, and the mask channels are multiplied by src alpha
     * (d11).  All multiplies use the rounding vrshr+vraddhn idiom.
     */
    vmull.u8    q0, d24, d8
    vmull.u8    q1, d25, d9
    vmull.u8    q6, d26, d10
    vmull.u8    q7, d27, d11
    vmull.u8    q9, d11, d25
    vmull.u8    q12, d11, d24
    vmull.u8    q13, d11, d26
    vrshr.u16   q8, q0, #8
    vrshr.u16   q10, q1, #8
    vrshr.u16   q11, q6, #8
    vraddhn.u16 d0, q0, q8
    vraddhn.u16 d1, q1, q10
    vraddhn.u16 d2, q6, q11
    vrshr.u16   q11, q12, #8
    vrshr.u16   q8, q9, #8
    vrshr.u16   q6, q13, #8
    vrshr.u16   q10, q7, #8
    vraddhn.u16 d24, q12, q11
    vraddhn.u16 d25, q9, q8
    vraddhn.u16 d26, q13, q6
    vraddhn.u16 d3, q7, q10
    /*
     * 'combine_over_ca' replacement
     *
     * output: updated dest in {d28, d29, d30, d31}
     *
     * Invert the per-channel mask and multiply dest by it (start of the
     * per-channel OVER; finished in the tail macro).
     */
    vmvn.8      d24, d24
    vmvn.8      d25, d25
    vmull.u8    q8, d24, d4
    vmull.u8    q9, d25, d5
    vmvn.8      d26, d26
    vmvn.8      d27, d3
    vmull.u8    q10, d26, d6
    vmull.u8    q11, d27, d7
.endm

.macro pixman_composite_over_n_8888_8888_ca_process_pixblock_tail
    /* ... continue 'combine_over_ca' replacement: narrow with rounding
     * and saturating-add the masked source; result lands in q14/q15. */
    vrshr.u16   q14, q8, #8
    vrshr.u16   q15, q9, #8
    vrshr.u16   q6, q10, #8
    vrshr.u16   q7, q11, #8
    vraddhn.u16 d28, q14, q8
    vraddhn.u16 d29, q15, q9
    vraddhn.u16 d30, q6, q10
    vraddhn.u16 d31, q7, q11
    vqadd.u8    q14, q0, q14
    vqadd.u8    q15, q1, q15
.endm

/* Hand-interleaved version of tail followed by head, with the dest/mask
 * loads and the result store scheduled between the arithmetic. */
.macro pixman_composite_over_n_8888_8888_ca_process_pixblock_tail_head
    vrshr.u16   q14, q8, #8
    vrshr.u16   q15, q9, #8
    vld4.8      {d4, d5, d6, d7}, [DST_R, :128]!
    vrshr.u16   q6, q10, #8
    vrshr.u16   q7, q11, #8
    vraddhn.u16 d28, q14, q8
    vraddhn.u16 d29, q15, q9
    vraddhn.u16 d30, q6, q10
    vraddhn.u16 d31, q7, q11
    vld4.8      {d24, d25, d26, d27}, [MASK]!
    vqadd.u8    q14, q0, q14
    vqadd.u8    q15, q1, q15
    cache_preload 8, 8
    pixman_composite_over_n_8888_8888_ca_process_pixblock_head
    vst4.8      {d28, d29, d30, d31}, [DST_W, :128]!
.endm

/* Save d8-d15 and split the solid source color into d8-d11. */
.macro pixman_composite_over_n_8888_8888_ca_init
    add         DUMMY, sp, #ARGS_STACK_OFFSET
    vpush       {d8-d15}
    vld1.32     {d11[0]}, [DUMMY]
    vdup.8      d8, d11[0]
    vdup.8      d9, d11[1]
    vdup.8      d10, d11[2]
    vdup.8      d11, d11[3]
.endm

/* Restore the callee-saved NEON registers pushed by the init macro. */
.macro pixman_composite_over_n_8888_8888_ca_cleanup
    vpop        {d8-d15}
.endm

/* OVER with component-alpha ('ca') 32bpp mask, solid source, 32bpp dest. */
generate_composite_function \
    pixman_composite_over_n_8888_8888_ca_asm_neon, 0, 32, 32, \
    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
    8, /* number of pixels, processed in a single block */ \
    5, /* prefetch distance */ \
    pixman_composite_over_n_8888_8888_ca_init, \
    pixman_composite_over_n_8888_8888_ca_cleanup, \
    pixman_composite_over_n_8888_8888_ca_process_pixblock_head, \
    pixman_composite_over_n_8888_8888_ca_process_pixblock_tail, \
    pixman_composite_over_n_8888_8888_ca_process_pixblock_tail_head
1224
1225/******************************************************************************/
1226
.macro pixman_composite_add_n_8_8_process_pixblock_head
    /* expecting source data in {d8, d9, d10, d11} */
    /* d8 - blue, d9 - green, d10 - red, d11 - alpha */
    /* NOTE(review): only d11 (the replicated solid alpha) is actually
     * used below; d8-d10 are never referenced by this operation. */
    /* and destination data in {d4, d5, d6, d7} */
    /* mask is in d24, d25, d26, d27 */
    /* mask *= src.alpha (rounding multiply), then dest = sat(dest + result) */
    vmull.u8    q0, d24, d11
    vmull.u8    q1, d25, d11
    vmull.u8    q6, d26, d11
    vmull.u8    q7, d27, d11
    vrshr.u16   q10, q0, #8
    vrshr.u16   q11, q1, #8
    vrshr.u16   q12, q6, #8
    vrshr.u16   q13, q7, #8
    vraddhn.u16 d0, q0, q10
    vraddhn.u16 d1, q1, q11
    vraddhn.u16 d2, q6, q12
    vraddhn.u16 d3, q7, q13
    vqadd.u8    q14, q0, q2    /* q2/q3 alias d4-d7 (destination) */
    vqadd.u8    q15, q1, q3
.endm

.macro pixman_composite_add_n_8_8_process_pixblock_tail
.endm

/* TODO: expand macros and do better instructions scheduling */
.macro pixman_composite_add_n_8_8_process_pixblock_tail_head
    pixman_composite_add_n_8_8_process_pixblock_tail
    vst1.8      {d28, d29, d30, d31}, [DST_W, :128]!
    vld1.8      {d4, d5, d6, d7}, [DST_R, :128]!
    vld1.8      {d24, d25, d26, d27}, [MASK]!
    cache_preload 32, 32
    pixman_composite_add_n_8_8_process_pixblock_head
.endm

/* Save d8-d15 and replicate the solid color's alpha byte into d11. */
.macro pixman_composite_add_n_8_8_init
    add         DUMMY, sp, #ARGS_STACK_OFFSET
    vpush       {d8-d15}
    vld1.32     {d11[0]}, [DUMMY]
    vdup.8      d11, d11[3]
.endm

/* Restore the callee-saved NEON registers pushed by the init macro. */
.macro pixman_composite_add_n_8_8_cleanup
    vpop        {d8-d15}
.endm

/* ADD, solid source ('n'), a8 mask, a8 destination. */
generate_composite_function \
    pixman_composite_add_n_8_8_asm_neon, 0, 8, 8, \
    FLAG_DST_READWRITE, \
    32, /* number of pixels, processed in a single block */ \
    5, /* prefetch distance */ \
    pixman_composite_add_n_8_8_init, \
    pixman_composite_add_n_8_8_cleanup, \
    pixman_composite_add_n_8_8_process_pixblock_head, \
    pixman_composite_add_n_8_8_process_pixblock_tail, \
    pixman_composite_add_n_8_8_process_pixblock_tail_head
1282
1283/******************************************************************************/
1284
.macro pixman_composite_add_8_8_8_process_pixblock_head
    /* expecting source data in {d0, d1, d2, d3} */
    /* destination data in {d4, d5, d6, d7} */
    /* mask in {d24, d25, d26, d27} */
    /* src *= mask (rounding multiply), then dest = sat(dest + result) */
    vmull.u8    q8, d24, d0
    vmull.u8    q9, d25, d1
    vmull.u8    q10, d26, d2
    vmull.u8    q11, d27, d3
    vrshr.u16   q0, q8, #8
    vrshr.u16   q1, q9, #8
    vrshr.u16   q12, q10, #8
    vrshr.u16   q13, q11, #8
    vraddhn.u16 d0, q0, q8
    vraddhn.u16 d1, q1, q9
    vraddhn.u16 d2, q12, q10
    vraddhn.u16 d3, q13, q11
    vqadd.u8    q14, q0, q2    /* q2/q3 alias d4-d7 (destination) */
    vqadd.u8    q15, q1, q3
.endm

.macro pixman_composite_add_8_8_8_process_pixblock_tail
.endm

/* TODO: expand macros and do better instructions scheduling */
.macro pixman_composite_add_8_8_8_process_pixblock_tail_head
    pixman_composite_add_8_8_8_process_pixblock_tail
    vst1.8      {d28, d29, d30, d31}, [DST_W, :128]!
    vld1.8      {d4, d5, d6, d7}, [DST_R, :128]!
    vld1.8      {d24, d25, d26, d27}, [MASK]!
    vld1.8      {d0, d1, d2, d3}, [SRC]!
    cache_preload 32, 32
    pixman_composite_add_8_8_8_process_pixblock_head
.endm

.macro pixman_composite_add_8_8_8_init
.endm

.macro pixman_composite_add_8_8_8_cleanup
.endm

/* ADD, a8 source, a8 mask, a8 destination. */
generate_composite_function \
    pixman_composite_add_8_8_8_asm_neon, 8, 8, 8, \
    FLAG_DST_READWRITE, \
    32, /* number of pixels, processed in a single block */ \
    5, /* prefetch distance */ \
    pixman_composite_add_8_8_8_init, \
    pixman_composite_add_8_8_8_cleanup, \
    pixman_composite_add_8_8_8_process_pixblock_head, \
    pixman_composite_add_8_8_8_process_pixblock_tail, \
    pixman_composite_add_8_8_8_process_pixblock_tail_head
1335
1336/******************************************************************************/
1337
.macro pixman_composite_add_8888_8888_8888_process_pixblock_head
    /* expecting source data in {d0, d1, d2, d3} */
    /* destination data in {d4, d5, d6, d7} */
    /* mask in {d24, d25, d26, d27} */
    /* Every src channel is multiplied by the mask *alpha* channel only
     * (d27); then dest = sat(dest + result). */
    vmull.u8    q8, d27, d0
    vmull.u8    q9, d27, d1
    vmull.u8    q10, d27, d2
    vmull.u8    q11, d27, d3
    vrshr.u16   q0, q8, #8
    vrshr.u16   q1, q9, #8
    vrshr.u16   q12, q10, #8
    vrshr.u16   q13, q11, #8
    vraddhn.u16 d0, q0, q8
    vraddhn.u16 d1, q1, q9
    vraddhn.u16 d2, q12, q10
    vraddhn.u16 d3, q13, q11
    vqadd.u8    q14, q0, q2    /* q2/q3 alias d4-d7 (destination) */
    vqadd.u8    q15, q1, q3
.endm

.macro pixman_composite_add_8888_8888_8888_process_pixblock_tail
.endm

/* TODO: expand macros and do better instructions scheduling */
.macro pixman_composite_add_8888_8888_8888_process_pixblock_tail_head
    pixman_composite_add_8888_8888_8888_process_pixblock_tail
    vst4.8      {d28, d29, d30, d31}, [DST_W, :128]!
    vld4.8      {d4, d5, d6, d7}, [DST_R, :128]!
    vld4.8      {d24, d25, d26, d27}, [MASK]!
    vld4.8      {d0, d1, d2, d3}, [SRC]!
    cache_preload 8, 8
    pixman_composite_add_8888_8888_8888_process_pixblock_head
.endm

/* ADD, 32bpp source, 32bpp mask, 32bpp destination. */
generate_composite_function \
    pixman_composite_add_8888_8888_8888_asm_neon, 32, 32, 32, \
    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
    8, /* number of pixels, processed in a single block */ \
    10, /* prefetch distance */ \
    default_init, \
    default_cleanup, \
    pixman_composite_add_8888_8888_8888_process_pixblock_head, \
    pixman_composite_add_8888_8888_8888_process_pixblock_tail, \
    pixman_composite_add_8888_8888_8888_process_pixblock_tail_head

/* Single-scanline variant (no prefetch distance argument) sharing the
 * same pixblock macros. */
generate_composite_function_single_scanline \
    pixman_composite_scanline_add_mask_asm_neon, 32, 32, 32, \
    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
    8, /* number of pixels, processed in a single block */ \
    default_init, \
    default_cleanup, \
    pixman_composite_add_8888_8888_8888_process_pixblock_head, \
    pixman_composite_add_8888_8888_8888_process_pixblock_tail, \
    pixman_composite_add_8888_8888_8888_process_pixblock_tail_head
1392
1393/******************************************************************************/
1394
.macro pixman_composite_over_8888_n_8888_process_pixblock_head
    /* expecting source data in {d0, d1, d2, d3} */
    /* destination data in {d4, d5, d6, d7} */
    /* solid mask is in d15 */

    /* 'in': src *= mask alpha (rounding 8-bit multiply) */
    vmull.u8    q8, d15, d3
    vmull.u8    q6, d15, d2
    vmull.u8    q5, d15, d1
    vmull.u8    q4, d15, d0
    vrshr.u16   q13, q8, #8
    vrshr.u16   q12, q6, #8
    vrshr.u16   q11, q5, #8
    vrshr.u16   q10, q4, #8
    vraddhn.u16 d3, q8, q13
    vraddhn.u16 d2, q6, q12
    vraddhn.u16 d1, q5, q11
    vraddhn.u16 d0, q4, q10
    vmvn.8      d24, d3  /* get inverted alpha */
    /* now do alpha blending: dest *= (255 - src.alpha) */
    vmull.u8    q8, d24, d4
    vmull.u8    q9, d24, d5
    vmull.u8    q10, d24, d6
    vmull.u8    q11, d24, d7
.endm

/* Finish the OVER: narrow with rounding, then saturating-add the masked
 * source; final pixels end up in q14/q15 (d28-d31). */
.macro pixman_composite_over_8888_n_8888_process_pixblock_tail
    vrshr.u16   q14, q8, #8
    vrshr.u16   q15, q9, #8
    vrshr.u16   q12, q10, #8
    vrshr.u16   q13, q11, #8
    vraddhn.u16 d28, q14, q8
    vraddhn.u16 d29, q15, q9
    vraddhn.u16 d30, q12, q10
    vraddhn.u16 d31, q13, q11
    vqadd.u8    q14, q0, q14
    vqadd.u8    q15, q1, q15
.endm

/* TODO: expand macros and do better instructions scheduling */
.macro pixman_composite_over_8888_n_8888_process_pixblock_tail_head
    vld4.8     {d4, d5, d6, d7}, [DST_R, :128]!
    pixman_composite_over_8888_n_8888_process_pixblock_tail
    vld4.8     {d0, d1, d2, d3}, [SRC]!
    cache_preload 8, 8
    pixman_composite_over_8888_n_8888_process_pixblock_head
    vst4.8     {d28, d29, d30, d31}, [DST_W, :128]!
.endm

/* Load the solid *mask* argument and replicate its alpha byte into d15.
 * NOTE(review): the offset is hard-coded as #48 instead of using
 * ARGS_STACK_OFFSET like the other init macros — presumably because the
 * solid value here is the mask (a later stack argument), not the source;
 * confirm against the prologue in pixman-arm-neon-asm.h before changing. */
.macro pixman_composite_over_8888_n_8888_init
    add         DUMMY, sp, #48
    vpush       {d8-d15}
    vld1.32     {d15[0]}, [DUMMY]
    vdup.8      d15, d15[3]
.endm

/* Restore the callee-saved NEON registers pushed by the init macro. */
.macro pixman_composite_over_8888_n_8888_cleanup
    vpop        {d8-d15}
.endm

/* OVER, 32bpp source, solid mask ('n'), 32bpp destination. */
generate_composite_function \
    pixman_composite_over_8888_n_8888_asm_neon, 32, 0, 32, \
    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
    8, /* number of pixels, processed in a single block */ \
    5, /* prefetch distance */ \
    pixman_composite_over_8888_n_8888_init, \
    pixman_composite_over_8888_n_8888_cleanup, \
    pixman_composite_over_8888_n_8888_process_pixblock_head, \
    pixman_composite_over_8888_n_8888_process_pixblock_tail, \
    pixman_composite_over_8888_n_8888_process_pixblock_tail_head
1465
1466/******************************************************************************/
1467
/* TODO: expand macros and do better instructions scheduling */
/*
 * over_8888_8888_8888 reuses the over_8888_n_8888 head/tail macros: the
 * per-pixel mask is deinterleaved into d12-d15, so d15 receives the mask
 * alpha bytes that the shared head macro multiplies the source by.
 */
.macro pixman_composite_over_8888_8888_8888_process_pixblock_tail_head
    vld4.8     {d4, d5, d6, d7}, [DST_R, :128]!
    pixman_composite_over_8888_n_8888_process_pixblock_tail
    vld4.8     {d0, d1, d2, d3}, [SRC]!
    cache_preload 8, 8
    vld4.8     {d12, d13, d14, d15}, [MASK]!
    pixman_composite_over_8888_n_8888_process_pixblock_head
    vst4.8     {d28, d29, d30, d31}, [DST_W, :128]!
.endm

/* The shared head/tail macros use q4-q7 and d15, so d8-d15 must be saved. */
.macro pixman_composite_over_8888_8888_8888_init
    vpush       {d8-d15}
.endm

.macro pixman_composite_over_8888_8888_8888_cleanup
    vpop        {d8-d15}
.endm

/* OVER, 32bpp source, 32bpp mask (alpha channel used), 32bpp destination.
 * NOTE(review): there is no comma after the tail_head argument below;
 * gas also treats blanks as macro-argument separators, so this appears
 * to parse as intended — confirm before reformatting. */
generate_composite_function \
    pixman_composite_over_8888_8888_8888_asm_neon, 32, 32, 32, \
    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
    8, /* number of pixels, processed in a single block */ \
    5, /* prefetch distance */ \
    pixman_composite_over_8888_8888_8888_init, \
    pixman_composite_over_8888_8888_8888_cleanup, \
    pixman_composite_over_8888_n_8888_process_pixblock_head, \
    pixman_composite_over_8888_n_8888_process_pixblock_tail, \
    pixman_composite_over_8888_8888_8888_process_pixblock_tail_head \
    28, /* dst_w_basereg */ \
    4, /* dst_r_basereg */ \
    0, /* src_basereg */ \
    12 /* mask_basereg */

/* Single-scanline variant sharing the same init/cleanup and macros. */
generate_composite_function_single_scanline \
    pixman_composite_scanline_over_mask_asm_neon, 32, 32, 32, \
    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
    8, /* number of pixels, processed in a single block */ \
    pixman_composite_over_8888_8888_8888_init, \
    pixman_composite_over_8888_8888_8888_cleanup, \
    pixman_composite_over_8888_n_8888_process_pixblock_head, \
    pixman_composite_over_8888_n_8888_process_pixblock_tail, \
    pixman_composite_over_8888_8888_8888_process_pixblock_tail_head \
    28, /* dst_w_basereg */ \
    4, /* dst_r_basereg */ \
    0, /* src_basereg */ \
    12 /* mask_basereg */
1515
1516/******************************************************************************/
1517
/* TODO: expand macros and do better instructions scheduling */
/*
 * over_8888_8_8888 also reuses the over_8888_n_8888 head/tail macros:
 * the a8 mask is loaded straight into d15, the register the shared head
 * macro multiplies the source by.
 */
.macro pixman_composite_over_8888_8_8888_process_pixblock_tail_head
    vld4.8     {d4, d5, d6, d7}, [DST_R, :128]!
    pixman_composite_over_8888_n_8888_process_pixblock_tail
    vld4.8     {d0, d1, d2, d3}, [SRC]!
    cache_preload 8, 8
    vld1.8     {d15}, [MASK]!
    pixman_composite_over_8888_n_8888_process_pixblock_head
    vst4.8     {d28, d29, d30, d31}, [DST_W, :128]!
.endm

/* The shared head/tail macros use q4-q7 and d15, so d8-d15 must be saved. */
.macro pixman_composite_over_8888_8_8888_init
    vpush       {d8-d15}
.endm

.macro pixman_composite_over_8888_8_8888_cleanup
    vpop        {d8-d15}
.endm

/* OVER, 32bpp source, a8 mask, 32bpp destination.
 * NOTE(review): as above, no comma after the tail_head argument; gas
 * accepts blank-separated macro arguments — confirm before reformatting. */
generate_composite_function \
    pixman_composite_over_8888_8_8888_asm_neon, 32, 8, 32, \
    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
    8, /* number of pixels, processed in a single block */ \
    5, /* prefetch distance */ \
    pixman_composite_over_8888_8_8888_init, \
    pixman_composite_over_8888_8_8888_cleanup, \
    pixman_composite_over_8888_n_8888_process_pixblock_head, \
    pixman_composite_over_8888_n_8888_process_pixblock_tail, \
    pixman_composite_over_8888_8_8888_process_pixblock_tail_head \
    28, /* dst_w_basereg */ \
    4, /* dst_r_basereg */ \
    0, /* src_basereg */ \
    15 /* mask_basereg */
1551
1552/******************************************************************************/
1553
/* src_0888_0888: plain 24bpp -> 24bpp copy using 3-element structure
 * load/store. */
.macro pixman_composite_src_0888_0888_process_pixblock_head
.endm

.macro pixman_composite_src_0888_0888_process_pixblock_tail
.endm

/* Note: no :128 alignment hint on DST_W here — presumably because 3-byte
 * pixels cannot guarantee 16-byte aligned rows. */
.macro pixman_composite_src_0888_0888_process_pixblock_tail_head
    vst3.8      {d0, d1, d2}, [DST_W]!
    vld3.8      {d0, d1, d2}, [SRC]!
    cache_preload 8, 8
.endm

generate_composite_function \
    pixman_composite_src_0888_0888_asm_neon, 24, 0, 24, \
    FLAG_DST_WRITEONLY, \
    8, /* number of pixels, processed in a single block */ \
    10, /* prefetch distance */ \
    default_init, \
    default_cleanup, \
    pixman_composite_src_0888_0888_process_pixblock_head, \
    pixman_composite_src_0888_0888_process_pixblock_tail, \
    pixman_composite_src_0888_0888_process_pixblock_tail_head, \
    0, /* dst_w_basereg */ \
    0, /* dst_r_basereg */ \
    0, /* src_basereg */ \
    0 /* mask_basereg */
1580
1581/******************************************************************************/
1582
/* src_0888_8888_rev: expand 24bpp to 32bpp while swapping the first and
 * third color channels (the '_rev' part); the 4th output byte comes from
 * d3, which init zeroes once and nothing below ever writes. */
.macro pixman_composite_src_0888_8888_rev_process_pixblock_head
    vswp        d0, d2          /* swap 1st and 3rd deinterleaved channels */
.endm

.macro pixman_composite_src_0888_8888_rev_process_pixblock_tail
.endm

.macro pixman_composite_src_0888_8888_rev_process_pixblock_tail_head
    vst4.8      {d0, d1, d2, d3}, [DST_W]!
    vld3.8      {d0, d1, d2}, [SRC]!
    vswp        d0, d2
    cache_preload 8, 8
.endm

/* Zero d3 once; it supplies the constant 4th byte of every output pixel. */
.macro pixman_composite_src_0888_8888_rev_init
    veor        d3, d3, d3
.endm

generate_composite_function \
    pixman_composite_src_0888_8888_rev_asm_neon, 24, 0, 32, \
    FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
    8, /* number of pixels, processed in a single block */ \
    10, /* prefetch distance */ \
    pixman_composite_src_0888_8888_rev_init, \
    default_cleanup, \
    pixman_composite_src_0888_8888_rev_process_pixblock_head, \
    pixman_composite_src_0888_8888_rev_process_pixblock_tail, \
    pixman_composite_src_0888_8888_rev_process_pixblock_tail_head, \
    0, /* dst_w_basereg */ \
    0, /* dst_r_basereg */ \
    0, /* src_basereg */ \
    0 /* mask_basereg */
1615
1616/******************************************************************************/
1617
/* src_0888_0565_rev: pack 24bpp (channels d0/d1/d2) into r5g6b5.
 * Each channel is widened to 16 bits in the high byte (vshll #8), then
 * vsri shift-right-inserts stack the three fields into one 16-bit pixel:
 * d0 keeps its top 5 bits, d1 contributes 6 bits, d2 contributes 5. */
.macro pixman_composite_src_0888_0565_rev_process_pixblock_head
    vshll.u8    q8, d1, #8
    vshll.u8    q9, d2, #8
.endm

.macro pixman_composite_src_0888_0565_rev_process_pixblock_tail
    vshll.u8    q14, d0, #8
    vsri.u16    q14, q8, #5     /* insert 2nd channel below the top 5 bits */
    vsri.u16    q14, q9, #11    /* insert 3rd channel in the low 5 bits */
.endm

/* Interleaved tail+head with the next block's load and the store of the
 * packed pixels scheduled in between. */
.macro pixman_composite_src_0888_0565_rev_process_pixblock_tail_head
    vshll.u8    q14, d0, #8
    vld3.8      {d0, d1, d2}, [SRC]!
    vsri.u16    q14, q8, #5
    vsri.u16    q14, q9, #11
    vshll.u8    q8, d1, #8
    vst1.16     {d28, d29}, [DST_W, :128]!
    vshll.u8    q9, d2, #8
.endm

generate_composite_function \
    pixman_composite_src_0888_0565_rev_asm_neon, 24, 0, 16, \
    FLAG_DST_WRITEONLY, \
    8, /* number of pixels, processed in a single block */ \
    10, /* prefetch distance */ \
    default_init, \
    default_cleanup, \
    pixman_composite_src_0888_0565_rev_process_pixblock_head, \
    pixman_composite_src_0888_0565_rev_process_pixblock_tail, \
    pixman_composite_src_0888_0565_rev_process_pixblock_tail_head, \
    28, /* dst_w_basereg */ \
    0, /* dst_r_basereg */ \
    0, /* src_basereg */ \
    0 /* mask_basereg */
1653
1654/******************************************************************************/
1655
/* src_pixbuf_8888: premultiply the three color channels (d0-d2) by the
 * alpha channel (d3). */
.macro pixman_composite_src_pixbuf_8888_process_pixblock_head
    vmull.u8    q8, d3, d0
    vmull.u8    q9, d3, d1
    vmull.u8    q10, d3, d2
.endm

/* Narrow the products with rounding into d30/d29/d28 — note the reversed
 * channel order relative to the input, and alpha moved to d31 via vswp. */
.macro pixman_composite_src_pixbuf_8888_process_pixblock_tail
    vrshr.u16   q11, q8, #8
    vswp        d3, d31         /* move alpha into the output register set */
    vrshr.u16   q12, q9, #8
    vrshr.u16   q13, q10, #8
    vraddhn.u16 d30, q11, q8
    vraddhn.u16 d29, q12, q9
    vraddhn.u16 d28, q13, q10
.endm

/* Hand-scheduled tail+head with explicit PF prefetcher bookkeeping
 * (PF_X advance, PF_CTL countdown, pld, and stride advance) interleaved
 * with the arithmetic. */
.macro pixman_composite_src_pixbuf_8888_process_pixblock_tail_head
    vrshr.u16   q11, q8, #8
    vswp        d3, d31
    vrshr.u16   q12, q9, #8
    vrshr.u16   q13, q10, #8
    vld4.8      {d0, d1, d2, d3}, [SRC]!
    vraddhn.u16 d30, q11, q8
    PF add PF_X, PF_X, #8
    PF tst PF_CTL, #0xF
    PF addne PF_X, PF_X, #8
    PF subne PF_CTL, PF_CTL, #1
    vraddhn.u16 d29, q12, q9
    vraddhn.u16 d28, q13, q10
    vmull.u8    q8, d3, d0
    vmull.u8    q9, d3, d1
    vmull.u8    q10, d3, d2
    vst4.8      {d28, d29, d30, d31}, [DST_W, :128]!
    PF cmp PF_X, ORIG_W
    PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
    PF subge PF_X, PF_X, ORIG_W
    PF subges PF_CTL, PF_CTL, #0x10
    PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
.endm

generate_composite_function \
    pixman_composite_src_pixbuf_8888_asm_neon, 32, 0, 32, \
    FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
    8, /* number of pixels, processed in a single block */ \
    10, /* prefetch distance */ \
    default_init, \
    default_cleanup, \
    pixman_composite_src_pixbuf_8888_process_pixblock_head, \
    pixman_composite_src_pixbuf_8888_process_pixblock_tail, \
    pixman_composite_src_pixbuf_8888_process_pixblock_tail_head, \
    28, /* dst_w_basereg */ \
    0, /* dst_r_basereg */ \
    0, /* src_basereg */ \
    0 /* mask_basereg */
Note: See TracBrowser for help on using the repository browser.