source: trunk/src/3rdparty/pixman/pixman-arm-neon-asm.h@ 873

Last change on this file since 873 was 846, checked in by Dmitry A. Kuminov, 14 years ago

trunk: Merged in qt 4.7.2 sources from branches/vendor/nokia/qt.

File size: 30.8 KB
Line 
1/*
2 * Copyright © 2009 Nokia Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
21 * DEALINGS IN THE SOFTWARE.
22 *
23 * Author: Siarhei Siamashka (siarhei.siamashka@nokia.com)
24 */
25
26/*
27 * This file contains a macro ('generate_composite_function') which can
28 * construct 2D image processing functions, based on a common template.
29 * Any combinations of source, destination and mask images with 8bpp,
30 * 16bpp, 24bpp, 32bpp color formats are supported.
31 *
32 * This macro takes care of:
33 * - handling of leading and trailing unaligned pixels
34 * - doing most of the work related to L2 cache preload
35 * - encouraging the use of software pipelining for better instruction
36 * scheduling
37 *
38 * The user of this macro has to provide some configuration parameters
39 * (bit depths for the images, prefetch distance, etc.) and a set of
40 * macros, which should implement basic code chunks responsible for
41 * pixels processing. See 'pixman-arm-neon-asm.S' file for the usage
42 * examples.
43 *
44 * TODO:
45 * - try overlapped pixel method (from Ian Rickards) when processing
46 * exactly two blocks of pixels
47 * - maybe add an option to do reverse scanline processing
48 */
49
/*
 * Values for the 'flags' argument of the 'generate_composite_function'
 * macros. The flags are combined with bitwise OR and tested with '&',
 * so FLAG_DST_WRITEONLY (0) is simply the absence of FLAG_DST_READWRITE.
 */
.set FLAG_DST_WRITEONLY,      0  /* destination is never loaded */
.set FLAG_DST_READWRITE,      1  /* destination is loaded too (dst_r_bpp = dst_w_bpp) */
.set FLAG_DEINTERLEAVE_32BPP, 2  /* sets DEINTERLEAVE_32BPP_ENABLED to 1 */
57
/*
 * Stack offset at which the 'init' macro can find the source pointer;
 * the source stride, mask pointer and mask stride follow at +4, +8 and
 * +12. The value corresponds to the ten 32-bit registers (r4-r12, lr)
 * pushed on entry by 'generate_composite_function'. This is useful for
 * doing special handling for solid mask.
 */
.set ARGS_STACK_OFFSET, 40
63
/*
 * Prefetch strategies. PREFETCH_TYPE_CURRENT is set to one of these
 * values for every generated function and is tested by the 'PF' and
 * 'cache_preload_simple' macros.
 */
.set PREFETCH_TYPE_NONE,     0  /* No prefetch at all */
.set PREFETCH_TYPE_SIMPLE,   1  /* A simple, fixed-distance-ahead prefetch */
.set PREFETCH_TYPE_ADVANCED, 2  /* Advanced fine-grained prefetch */
70
71/*
72 * Definitions of supplementary pixld/pixst macros (for partial load/store of
73 * pixel data).
74 */
75
/*
 * Emit one 'op'.'elem_size' transfer of the single register d<reg1>
 * through [mem_operand] with writeback ('!'). The ':abits' alignment
 * qualifier is attached only when abits is positive.
 */
.macro pixldst1 op, elem_size, reg1, mem_operand, abits
.if abits <= 0
    op&.&elem_size {d&reg1}, [&mem_operand&]!
.else
    op&.&elem_size {d&reg1}, [&mem_operand&, :&abits&]!
.endif
.endm
83
/*
 * Two-register variant of pixldst1: transfers d<reg1> and d<reg2>
 * through [mem_operand] with writeback, adding the ':abits' alignment
 * qualifier only when abits is positive.
 */
.macro pixldst2 op, elem_size, reg1, reg2, mem_operand, abits
.if abits <= 0
    op&.&elem_size {d&reg1, d&reg2}, [&mem_operand&]!
.else
    op&.&elem_size {d&reg1, d&reg2}, [&mem_operand&, :&abits&]!
.endif
.endm
91
/*
 * Four-register variant of pixldst1: transfers d<reg1> .. d<reg4>
 * through [mem_operand] with writeback, adding the ':abits' alignment
 * qualifier only when abits is positive.
 */
.macro pixldst4 op, elem_size, reg1, reg2, reg3, reg4, mem_operand, abits
.if abits <= 0
    op&.&elem_size {d&reg1, d&reg2, d&reg3, d&reg4}, [&mem_operand&]!
.else
    op&.&elem_size {d&reg1, d&reg2, d&reg3, d&reg4}, [&mem_operand&, :&abits&]!
.endif
.endm
99
/*
 * Transfer a single lane d<reg1>[idx] through [mem_operand] with
 * writeback. The 'abits' argument is accepted but not used: no
 * alignment qualifier is ever emitted here.
 */
.macro pixldst0 op, elem_size, reg1, idx, mem_operand, abits
    op&.&elem_size {d&reg1[idx]}, [&mem_operand&]!
.endm
103
/*
 * Transfer three whole registers d<reg1>, d<reg2>, d<reg3> through
 * [mem_operand] with writeback (used by pixld/pixst for 24bpp data).
 * No alignment qualifier is emitted.
 */
.macro pixldst3 op, elem_size, reg1, reg2, reg3, mem_operand
    op&.&elem_size {d&reg1, d&reg2, d&reg3}, [&mem_operand&]!
.endm
107
/*
 * Transfer lane 'idx' of three registers d<reg1>, d<reg2>, d<reg3>
 * through [mem_operand] with writeback (used by pixld/pixst for
 * partial blocks of 24bpp data). No alignment qualifier is emitted.
 */
.macro pixldst30 op, elem_size, reg1, reg2, reg3, idx, mem_operand
    op&.&elem_size {d&reg1[idx], d&reg2[idx], d&reg3[idx]}, [&mem_operand&]!
.endm
111
/*
 * Transfer 'numbytes' bytes with instruction 'op' between memory at
 * [mem_operand] and the register slots that belong to 'basereg':
 *
 *   32 bytes -> d(basereg+4) .. d(basereg+7)
 *   16 bytes -> d(basereg+2), d(basereg+3)
 *    8 bytes -> d(basereg+1)
 *    4 bytes -> lane(s) of d(basereg+0)
 *    2 bytes -> lane(s) of d(basereg+0)
 *    1 byte  -> byte lane 1 of d(basereg+0)
 *
 * For the 4 and 2 byte cases a single lane access of that width is
 * used unless RESPECT_STRICT_ALIGNMENT is nonzero and elem_size is
 * narrower; then the transfer is split into elem_size wide lanes.
 * Any other size is rejected at assembly time.
 */
.macro pixldst numbytes, op, elem_size, basereg, mem_operand, abits
.if numbytes == 32
    pixldst4 op, elem_size, %(basereg+4), %(basereg+5), \
                              %(basereg+6), %(basereg+7), mem_operand, abits
.elseif numbytes == 16
    pixldst2 op, elem_size, %(basereg+2), %(basereg+3), mem_operand, abits
.elseif numbytes == 8
    pixldst1 op, elem_size, %(basereg+1), mem_operand, abits
.elseif numbytes == 4
    .if !RESPECT_STRICT_ALIGNMENT || (elem_size == 32)
        /* one 32-bit lane */
        pixldst0 op, 32, %(basereg+0), 1, mem_operand, abits
    .elseif elem_size == 16
        /* two 16-bit lanes */
        pixldst0 op, 16, %(basereg+0), 2, mem_operand, abits
        pixldst0 op, 16, %(basereg+0), 3, mem_operand, abits
    .else
        /* four 8-bit lanes */
        pixldst0 op, 8, %(basereg+0), 4, mem_operand, abits
        pixldst0 op, 8, %(basereg+0), 5, mem_operand, abits
        pixldst0 op, 8, %(basereg+0), 6, mem_operand, abits
        pixldst0 op, 8, %(basereg+0), 7, mem_operand, abits
    .endif
.elseif numbytes == 2
    .if !RESPECT_STRICT_ALIGNMENT || (elem_size == 16)
        /* one 16-bit lane */
        pixldst0 op, 16, %(basereg+0), 1, mem_operand, abits
    .else
        /* two 8-bit lanes */
        pixldst0 op, 8, %(basereg+0), 2, mem_operand, abits
        pixldst0 op, 8, %(basereg+0), 3, mem_operand, abits
    .endif
.elseif numbytes == 1
    pixldst0 op, 8, %(basereg+0), 1, mem_operand, abits
.else
    .error "unsupported size: numbytes"
.endif
.endm
145
/*
 * Load 'numpix' pixels of 'bpp' bits each from [mem_operand] into the
 * register slots of 'basereg'. Nothing is emitted when bpp is 0.
 *
 *  - 8 pixels of 32bpp with DEINTERLEAVE_32BPP_ENABLED set use vld4.8
 *    into d(basereg+4) .. d(basereg+7)
 *  - 24bpp data uses vld3.8: whole registers d(basereg+3..5) for
 *    8 pixels, otherwise lanes of d(basereg+0..2)
 *  - everything else goes through pixldst with vld1
 *
 * 'abits' (default 0) is forwarded as the alignment hint; the 24bpp
 * paths do not use it.
 */
.macro pixld numpix, bpp, basereg, mem_operand, abits=0
.if bpp > 0
.if (bpp == 32) && (numpix == 8) && (DEINTERLEAVE_32BPP_ENABLED != 0)
    pixldst4 vld4, 8, %(basereg+4), %(basereg+5), \
                      %(basereg+6), %(basereg+7), mem_operand, abits
.elseif (bpp == 24) && (numpix == 8)
    pixldst3 vld3, 8, %(basereg+3), %(basereg+4), %(basereg+5), mem_operand
.elseif (bpp == 24) && (numpix == 4)
    pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 4, mem_operand
    pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 5, mem_operand
    pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 6, mem_operand
    pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 7, mem_operand
.elseif (bpp == 24) && (numpix == 2)
    pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 2, mem_operand
    pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 3, mem_operand
.elseif (bpp == 24) && (numpix == 1)
    pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 1, mem_operand
.else
    pixldst %(numpix * bpp / 8), vld1, %(bpp), basereg, mem_operand, abits
.endif
.endif
.endm
168
/*
 * Store 'numpix' pixels of 'bpp' bits each from the register slots of
 * 'basereg' to [mem_operand]. This mirrors pixld, with vst4 / vst3 /
 * vst1 in place of the loads. Nothing is emitted when bpp is 0.
 * 'abits' (default 0) is forwarded as the alignment hint; the 24bpp
 * paths do not use it.
 */
.macro pixst numpix, bpp, basereg, mem_operand, abits=0
.if bpp > 0
.if (bpp == 32) && (numpix == 8) && (DEINTERLEAVE_32BPP_ENABLED != 0)
    pixldst4 vst4, 8, %(basereg+4), %(basereg+5), \
                      %(basereg+6), %(basereg+7), mem_operand, abits
.elseif (bpp == 24) && (numpix == 8)
    pixldst3 vst3, 8, %(basereg+3), %(basereg+4), %(basereg+5), mem_operand
.elseif (bpp == 24) && (numpix == 4)
    pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 4, mem_operand
    pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 5, mem_operand
    pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 6, mem_operand
    pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 7, mem_operand
.elseif (bpp == 24) && (numpix == 2)
    pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 2, mem_operand
    pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 3, mem_operand
.elseif (bpp == 24) && (numpix == 1)
    pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 1, mem_operand
.else
    pixldst %(numpix * bpp / 8), vst1, %(bpp), basereg, mem_operand, abits
.endif
.endif
.endm
191
/*
 * Load with an alignment hint: same as pixld, but passes the size of
 * the transfer in bits (bpp * numpix) as 'abits', capped at 128.
 */
.macro pixld_a numpix, bpp, basereg, mem_operand
.if (bpp * numpix) > 128
    pixld numpix, bpp, basereg, mem_operand, 128
.else
    pixld numpix, bpp, basereg, mem_operand, %(bpp * numpix)
.endif
.endm
199
/*
 * Store with an alignment hint: same as pixst, but passes the size of
 * the transfer in bits (bpp * numpix) as 'abits', capped at 128.
 */
.macro pixst_a numpix, bpp, basereg, mem_operand
.if (bpp * numpix) > 128
    pixst numpix, bpp, basereg, mem_operand, 128
.else
    pixst numpix, bpp, basereg, mem_operand, %(bpp * numpix)
.endif
.endm
207
/* vuzp.8 on d<reg1>, d<reg2>, taking plain register numbers as arguments */
.macro vuzp8 reg1, reg2
    vuzp.8 d&reg1, d&reg2
.endm
211
/* vzip.8 on d<reg1>, d<reg2>, taking plain register numbers as arguments */
.macro vzip8 reg1, reg2
    vzip.8 d&reg1, d&reg2
.endm
215
/*
 * Deinterleave B, G, R, A channels for eight 32bpp pixels held in the
 * four registers d(basereg+0) .. d(basereg+3). Emits nothing unless
 * bpp is 32 and DEINTERLEAVE_32BPP_ENABLED is nonzero.
 */
.macro pixdeinterleave bpp, basereg
.if (bpp == 32) && (DEINTERLEAVE_32BPP_ENABLED != 0)
    vuzp8 %(basereg+0), %(basereg+1)
    vuzp8 %(basereg+2), %(basereg+3)
    vuzp8 %(basereg+1), %(basereg+3)
    vuzp8 %(basereg+0), %(basereg+2)
.endif
.endm
225
/*
 * Interleave B, G, R, A channels for eight 32bpp pixels held in the
 * four registers d(basereg+0) .. d(basereg+3). Emits nothing unless
 * bpp is 32 and DEINTERLEAVE_32BPP_ENABLED is nonzero.
 */
.macro pixinterleave bpp, basereg
.if (bpp == 32) && (DEINTERLEAVE_32BPP_ENABLED != 0)
    vzip8 %(basereg+0), %(basereg+2)
    vzip8 %(basereg+1), %(basereg+3)
    vzip8 %(basereg+2), %(basereg+3)
    vzip8 %(basereg+0), %(basereg+1)
.endif
.endm
235
236/*
237 * This is a macro for implementing cache preload. The main idea is that
238 * cache preload logic is mostly independent from the rest of pixels
239 * processing code. It starts at the top left pixel and moves forward
240 * across pixels and can jump across scanlines. Prefetch distance is
241 * handled in an 'incremental' way: it starts from 0 and advances to the
242 * optimal distance over time. After reaching optimal prefetch distance,
243 * it is kept constant. There are some checks which prevent prefetching
244 * unneeded pixel lines below the image (but it still can prefetch a bit
245 * more data on the right side of the image - not a big issue and may
246 * be actually helpful when rendering text glyphs). Additional trick is
247 * the use of LDR instruction for prefetch instead of PLD when moving to
248 * the next line, the point is that we have a high chance of getting TLB
249 * miss in this case, and PLD would be useless.
250 *
251 * This sounds like it may introduce a noticeable overhead (when working with
252 * fully cached data). But in reality, due to having a separate pipeline and
253 * instruction queue for NEON unit in ARM Cortex-A8, normal ARM code can
254 * execute simultaneously with NEON and be completely shadowed by it. Thus
255 * we get no performance overhead at all (*). This looks like a very nice
256 * feature of Cortex-A8, if used wisely. We don't have a hardware prefetcher,
257 * but still can implement some rather advanced prefetch logic in software
258 * for almost zero cost!
259 *
260 * (*) The overhead of the prefetcher is visible when running some trivial
261 * pixels processing like simple copy. Anyway, having prefetch is a must
262 * when working with the graphics data.
263 */
/*
 * Guard for instructions that belong to the advanced prefetcher:
 * 'a' is the mnemonic and 'x' the rest of the line. The instruction is
 * emitted only when PREFETCH_TYPE_CURRENT is PREFETCH_TYPE_ADVANCED;
 * otherwise the invocation expands to nothing.
 */
.macro PF a, x:vararg
.if (PREFETCH_TYPE_CURRENT == PREFETCH_TYPE_ADVANCED)
    a x
.endif
.endm
269
/*
 * One step of the advanced prefetcher. All instructions go through
 * 'PF', so nothing is emitted unless advanced prefetch is selected.
 *
 *  std_increment   - unconditional advance of PF_X (skipped when 0)
 *  boost_increment - extra advance of PF_X applied while the low four
 *                    bits of PF_CTL are nonzero; each application
 *                    decrements that counter by one
 *
 * After the PLDs, if PF_X has reached ORIG_W it is wrapped back by
 * ORIG_W and 0x10 is subtracted from PF_CTL (the lines counter kept in
 * its upper bits). While that result is still non-negative, a byte
 * load with writeback moves each PF_* pointer to the next scanline.
 * The instruction order matters: the conditional instructions depend
 * on the flags set by 'tst', 'cmp' and 'subges'.
 */
.macro cache_preload std_increment, boost_increment
.if (src_bpp_shift >= 0) || (dst_r_bpp != 0) || (mask_bpp_shift >= 0)
.if regs_shortage
    /* If we are short on regs, ORIG_W is kept on stack */
    PF ldr ORIG_W, [sp]
.endif
.if std_increment != 0
    PF add PF_X, PF_X, #std_increment
.endif
    PF tst PF_CTL, #0xF
    PF addne PF_X, PF_X, #boost_increment
    PF subne PF_CTL, PF_CTL, #1
    PF cmp PF_X, ORIG_W
.if src_bpp_shift >= 0
    PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
.endif
.if dst_r_bpp != 0
    PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
.endif
.if mask_bpp_shift >= 0
    PF pld, [PF_MASK, PF_X, lsl #mask_bpp_shift]
.endif
    PF subge PF_X, PF_X, ORIG_W
    PF subges PF_CTL, PF_CTL, #0x10
.if src_bpp_shift >= 0
    PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
.endif
.if dst_r_bpp != 0
    PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
.endif
.if mask_bpp_shift >= 0
    PF ldrgeb DUMMY, [PF_MASK, MASK_STRIDE, lsl #mask_bpp_shift]!
.endif
.endif
.endm
304
/*
 * Simple prefetch: when PREFETCH_TYPE_CURRENT is PREFETCH_TYPE_SIMPLE,
 * issue one PLD per image in use (source, destination read, mask) at
 * PREFETCH_DISTANCE_SIMPLE pixels ahead of the current pointer.
 * Expands to nothing for any other prefetch type.
 */
.macro cache_preload_simple
.if (PREFETCH_TYPE_CURRENT == PREFETCH_TYPE_SIMPLE)
.if src_bpp > 0
    pld [SRC, #(PREFETCH_DISTANCE_SIMPLE * src_bpp / 8)]
.endif
.if dst_r_bpp > 0
    pld [DST_R, #(PREFETCH_DISTANCE_SIMPLE * dst_r_bpp / 8)]
.endif
.if mask_bpp > 0
    pld [MASK, #(PREFETCH_DISTANCE_SIMPLE * mask_bpp / 8)]
.endif
.endif
.endm
318
/*
 * Macro which is used to process leading pixels until destination
 * pointer is properly aligned (at 16 bytes boundary). For 24bpp
 * destinations nothing but the final '2:' label is emitted.
 *
 * For every power-of-two byte count 'lowbit' that is at least one
 * pixel and less than a full pixel block, the pixels needed to clear
 * that bit of the destination address are loaded into their register
 * slots (the 16-byte step is taken without testing the address). The
 * block is then processed once with head + tail, and the same address
 * bits, now tested on DST_W, select which slots are stored back.
 *
 * Arguments are the names of the user supplied pixel processing
 * macros; 'process_pixblock_tail_head' is accepted but not used here.
 * W is decreased by the number of pixels consumed.
 */
.macro ensure_destination_ptr_alignment process_pixblock_head, \
                                        process_pixblock_tail, \
                                        process_pixblock_tail_head
.if dst_w_bpp != 24
    tst DST_R, #0xF
    beq 2f                      /* already aligned, nothing to do */

.irp lowbit, 1, 2, 4, 8, 16
.if (dst_w_bpp <= (lowbit * 8)) && ((lowbit * 8) < (pixblock_size * dst_w_bpp))
.if lowbit < 16 /* we don't need more than 16-byte alignment */
    tst DST_R, #lowbit
    beq 1f
.endif
    pixld (lowbit * 8 / dst_w_bpp), src_bpp, src_basereg, SRC
    pixld (lowbit * 8 / dst_w_bpp), mask_bpp, mask_basereg, MASK
.if dst_r_bpp > 0
    pixld_a (lowbit * 8 / dst_r_bpp), dst_r_bpp, dst_r_basereg, DST_R
.else
    /* destination is not read: just keep DST_R moving forward */
    add DST_R, DST_R, #lowbit
.endif
    PF add PF_X, PF_X, #(lowbit * 8 / dst_w_bpp)
    sub W, W, #(lowbit * 8 / dst_w_bpp)
1:
.endif
.endr
    pixdeinterleave src_bpp, src_basereg
    pixdeinterleave mask_bpp, mask_basereg
    pixdeinterleave dst_r_bpp, dst_r_basereg

    process_pixblock_head
    cache_preload 0, pixblock_size
    cache_preload_simple
    process_pixblock_tail

    pixinterleave dst_w_bpp, dst_w_basereg
.irp lowbit, 1, 2, 4, 8, 16
.if (dst_w_bpp <= (lowbit * 8)) && ((lowbit * 8) < (pixblock_size * dst_w_bpp))
.if lowbit < 16 /* we don't need more than 16-byte alignment */
    tst DST_W, #lowbit
    beq 1f
.endif
    pixst_a (lowbit * 8 / dst_w_bpp), dst_w_bpp, dst_w_basereg, DST_W
1:
.endif
.endr
.endif
2:
.endm
373
/*
 * Handle the up to (pixblock_size - 1) pixels left at the end of a
 * scanline. SIMD processing always works on pixblock_size pixels, so
 * a shorter run is loaded piecewise (16, 8, 4, 2, 1 pixels, selected
 * by the bits of W) into the matching 'slots' of the NEON registers,
 * composited as one ordinary block with some slots unused, and then
 * written back from the same slots.
 *
 *  cache_preload_flag - set to 0 to suppress prefetch
 *  dst_aligned_flag   - nonzero if the destination accesses may carry
 *                       alignment hints (pixld_a / pixst_a)
 *
 * The remaining arguments name the user supplied pixel processing
 * macros; 'process_pixblock_tail_head' is accepted but not used here.
 */
.macro process_trailing_pixels cache_preload_flag, \
                               dst_aligned_flag, \
                               process_pixblock_head, \
                               process_pixblock_tail, \
                               process_pixblock_tail_head
    tst W, #(pixblock_size - 1)
    beq 2f                      /* no trailing pixels at all */

    /* gather the leftover pixels, largest chunk first */
.irp chunk_size, 16, 8, 4, 2, 1
.if pixblock_size > chunk_size
    tst W, #chunk_size
    beq 1f
    pixld chunk_size, src_bpp, src_basereg, SRC
    pixld chunk_size, mask_bpp, mask_basereg, MASK
.if dst_aligned_flag != 0
    pixld_a chunk_size, dst_r_bpp, dst_r_basereg, DST_R
.else
    pixld chunk_size, dst_r_bpp, dst_r_basereg, DST_R
.endif
.if cache_preload_flag != 0
    PF add PF_X, PF_X, #chunk_size
.endif
1:
.endif
.endr
    pixdeinterleave src_bpp, src_basereg
    pixdeinterleave mask_bpp, mask_basereg
    pixdeinterleave dst_r_bpp, dst_r_basereg

    process_pixblock_head
.if cache_preload_flag != 0
    cache_preload 0, pixblock_size
    cache_preload_simple
.endif
    process_pixblock_tail
    pixinterleave dst_w_bpp, dst_w_basereg

    /* write the results back from the same slots */
.irp chunk_size, 16, 8, 4, 2, 1
.if pixblock_size > chunk_size
    tst W, #chunk_size
    beq 1f
.if dst_aligned_flag != 0
    pixst_a chunk_size, dst_w_bpp, dst_w_basereg, DST_W
.else
    pixst chunk_size, dst_w_bpp, dst_w_basereg, DST_W
.endif
1:
.endif
.endr
2:
.endm
438
/*
 * Switch to the next scanline and branch back to 'start_of_loop_label'
 * unless all scanlines are done.
 *
 * W is restored to the original width (from the stack together with H
 * when regs_shortage is set, from ORIG_W otherwise). Each pointer is
 * advanced by its stride and, except for 24bpp images, rewound by the
 * scanline width. H is decremented, DST_R is resynchronised with
 * DST_W, and the loop continues while H stays non-negative. The final
 * 'bge' uses the flags from 'subs'; the instructions in between leave
 * them untouched.
 */
.macro advance_to_next_scanline start_of_loop_label
.if regs_shortage
    /* load W and H (width and height) from stack */
    ldrd W, [sp]
.else
    mov W, ORIG_W
.endif
    /* step forward by one stride ... */
    add DST_W, DST_W, DST_STRIDE, lsl #dst_bpp_shift
.if src_bpp != 0
    add SRC, SRC, SRC_STRIDE, lsl #src_bpp_shift
.endif
.if mask_bpp != 0
    add MASK, MASK, MASK_STRIDE, lsl #mask_bpp_shift
.endif
    /* ... and back by the pixels already walked over in this line */
.if (dst_w_bpp != 24)
    sub DST_W, DST_W, W, lsl #dst_bpp_shift
.endif
.if (src_bpp != 24) && (src_bpp != 0)
    sub SRC, SRC, W, lsl #src_bpp_shift
.endif
.if (mask_bpp != 24) && (mask_bpp != 0)
    sub MASK, MASK, W, lsl #mask_bpp_shift
.endif
    subs H, H, #1
    mov DST_R, DST_W
.if regs_shortage
    /* save updated height to stack */
    str H, [sp, #4]
.endif
    bge start_of_loop_label
.endm
473
/*
 * Generate a complete 2D compositing function called 'fname'.
 *
 * Arguments:
 *  src_bpp_, mask_bpp_, dst_w_bpp_ - bits per pixel of the source, mask
 *                      and destination images (0 = image not used;
 *                      source: 0/8/16/24/32, mask: 0/8/24/32,
 *                      destination: 8/16/24/32)
 *  flags             - combination of the FLAG_* constants
 *  pixblock_size_    - number of pixels handled per processing block
 *  prefetch_distance - 0 .. 15; 0 disables prefetch for this function
 *  init, cleanup     - macros expanded after the setup code and before
 *                      each function exit
 *  process_pixblock_head, process_pixblock_tail,
 *  process_pixblock_tail_head - macros implementing the pixel math
 *  *_basereg_        - first NEON D register of each register group
 *
 * Generated function, as set up by the code below: r0 = width,
 * r1 = height, r2 = destination pointer, r3 = destination stride; the
 * source pointer, source stride, mask pointer and mask stride are read
 * from the stack starting at ARGS_STACK_OFFSET (after the push of
 * r4-r12 and lr).
 *
 * Registers are allocated in the following way by default:
 * d0, d1, d2, d3     - reserved for loading source pixel data
 * d4, d5, d6, d7     - reserved for loading destination pixel data
 * d24, d25, d26, d27 - reserved for loading mask pixel data
 * d28, d29, d30, d31 - final destination pixel data for writeback to memory
 */
.macro generate_composite_function fname, \
                                   src_bpp_, \
                                   mask_bpp_, \
                                   dst_w_bpp_, \
                                   flags, \
                                   pixblock_size_, \
                                   prefetch_distance, \
                                   init, \
                                   cleanup, \
                                   process_pixblock_head, \
                                   process_pixblock_tail, \
                                   process_pixblock_tail_head, \
                                   dst_w_basereg_ = 28, \
                                   dst_r_basereg_ = 4, \
                                   src_basereg_ = 0, \
                                   mask_basereg_ = 24

    .func fname
    .global fname
    /* For ELF format also set function visibility to hidden */
#ifdef __ELF__
    .hidden fname
    .type fname, %function
#endif
fname:
    push {r4-r12, lr}   /* save all registers */

/*
 * Select prefetch type for this function. If prefetch distance is
 * set to 0 or one of the color formats is 24bpp, SIMPLE prefetch
 * has to be used instead of ADVANCED.
 */
    .set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_DEFAULT
.if prefetch_distance == 0
    .set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_NONE
.elseif (PREFETCH_TYPE_CURRENT > PREFETCH_TYPE_SIMPLE) && \
        ((src_bpp_ == 24) || (mask_bpp_ == 24) || (dst_w_bpp_ == 24))
    .set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_SIMPLE
.endif

/*
 * Make some macro arguments globally visible and accessible
 * from other macros
 */
    .set src_bpp, src_bpp_
    .set mask_bpp, mask_bpp_
    .set dst_w_bpp, dst_w_bpp_
    .set pixblock_size, pixblock_size_
    .set dst_w_basereg, dst_w_basereg_
    .set dst_r_basereg, dst_r_basereg_
    .set src_basereg, src_basereg_
    .set mask_basereg, mask_basereg_

/*
 * Assign symbolic names to registers
 */
    W           .req r0     /* width (is updated during processing) */
    H           .req r1     /* height (is updated during processing) */
    DST_W       .req r2     /* destination buffer pointer for writes */
    DST_STRIDE  .req r3     /* destination image stride */
    SRC         .req r4     /* source buffer pointer */
    SRC_STRIDE  .req r5     /* source image stride */
    DST_R       .req r6     /* destination buffer pointer for reads */

    MASK        .req r7     /* mask pointer */
    MASK_STRIDE .req r8     /* mask stride */

    PF_CTL      .req r9     /* combined lines counter and prefetch */
                            /* distance increment counter */
    PF_X        .req r10    /* pixel index in a scanline for current */
                            /* prefetch position */
    PF_SRC      .req r11    /* pointer to source scanline start */
                            /* for prefetch purposes */
    PF_DST      .req r12    /* pointer to destination scanline start */
                            /* for prefetch purposes */
    PF_MASK     .req r14    /* pointer to mask scanline start */
                            /* for prefetch purposes */
/*
 * Check whether we have enough registers for all the local variables.
 * If we don't have enough registers, original width and height are
 * kept on top of stack (and 'regs_shortage' variable is set to indicate
 * this for the rest of code). Even if there are enough registers, the
 * allocation scheme may be a bit different depending on whether source
 * or mask is not used.
 */
.if (PREFETCH_TYPE_CURRENT < PREFETCH_TYPE_ADVANCED)
    ORIG_W      .req r10    /* saved original width */
    DUMMY       .req r12    /* temporary register */
    .set regs_shortage, 0
.elseif mask_bpp == 0
    ORIG_W      .req r7     /* saved original width */
    DUMMY       .req r8     /* temporary register */
    .set regs_shortage, 0
.elseif src_bpp == 0
    ORIG_W      .req r4     /* saved original width */
    DUMMY       .req r5     /* temporary register */
    .set regs_shortage, 0
.else
    ORIG_W      .req r1     /* saved original width */
    DUMMY       .req r1     /* temporary register */
    .set regs_shortage, 1
.endif

/*
 * Translate bits per pixel into shift amounts (log2 of bytes per
 * pixel). -1 marks an unused image; 24bpp uses shift 0 because its
 * strides are corrected separately below.
 */
    .set mask_bpp_shift, -1
.if src_bpp == 32
    .set src_bpp_shift, 2
.elseif src_bpp == 24
    .set src_bpp_shift, 0
.elseif src_bpp == 16
    .set src_bpp_shift, 1
.elseif src_bpp == 8
    .set src_bpp_shift, 0
.elseif src_bpp == 0
    .set src_bpp_shift, -1
.else
    .error "requested src bpp (src_bpp) is not supported"
.endif
.if mask_bpp == 32
    .set mask_bpp_shift, 2
.elseif mask_bpp == 24
    .set mask_bpp_shift, 0
.elseif mask_bpp == 8
    .set mask_bpp_shift, 0
.elseif mask_bpp == 0
    .set mask_bpp_shift, -1
.else
    .error "requested mask bpp (mask_bpp) is not supported"
.endif
.if dst_w_bpp == 32
    .set dst_bpp_shift, 2
.elseif dst_w_bpp == 24
    .set dst_bpp_shift, 0
.elseif dst_w_bpp == 16
    .set dst_bpp_shift, 1
.elseif dst_w_bpp == 8
    .set dst_bpp_shift, 0
.else
    .error "requested dst bpp (dst_w_bpp) is not supported"
.endif

.if (((flags) & FLAG_DST_READWRITE) != 0)
    .set dst_r_bpp, dst_w_bpp
.else
    .set dst_r_bpp, 0
.endif
.if (((flags) & FLAG_DEINTERLEAVE_32BPP) != 0)
    .set DEINTERLEAVE_32BPP_ENABLED, 1
.else
    .set DEINTERLEAVE_32BPP_ENABLED, 0
.endif

.if prefetch_distance < 0 || prefetch_distance > 15
    .error "invalid prefetch distance (prefetch_distance)"
.endif

/*
 * Fetch the stack arguments. ARGS_STACK_OFFSET accounts for the
 * registers pushed above; the layout from there is source pointer,
 * source stride, mask pointer, mask stride.
 */
.if src_bpp > 0
    ldr SRC, [sp, #ARGS_STACK_OFFSET]
.endif
.if mask_bpp > 0
    ldr MASK, [sp, #(ARGS_STACK_OFFSET + 8)]
.endif
    PF mov PF_X, #0
.if src_bpp > 0
    ldr SRC_STRIDE, [sp, #(ARGS_STACK_OFFSET + 4)]
.endif
.if mask_bpp > 0
    ldr MASK_STRIDE, [sp, #(ARGS_STACK_OFFSET + 12)]
.endif
    mov DST_R, DST_W

/*
 * For 24bpp images reduce the stride by 3 * W; this matches
 * 'advance_to_next_scanline', which does not rewind 24bpp pointers.
 */
.if src_bpp == 24
    sub SRC_STRIDE, SRC_STRIDE, W
    sub SRC_STRIDE, SRC_STRIDE, W, lsl #1
.endif
.if mask_bpp == 24
    sub MASK_STRIDE, MASK_STRIDE, W
    sub MASK_STRIDE, MASK_STRIDE, W, lsl #1
.endif
.if dst_w_bpp == 24
    sub DST_STRIDE, DST_STRIDE, W
    sub DST_STRIDE, DST_STRIDE, W, lsl #1
.endif

/*
 * Setup advanced prefetcher initial state
 */
    PF mov PF_SRC, SRC
    PF mov PF_DST, DST_R
    PF mov PF_MASK, MASK
    /* PF_CTL = prefetch_distance | ((h - 1) << 4) */
    PF mov PF_CTL, H, lsl #4
    PF add PF_CTL, #(prefetch_distance - 0x10)

    init
.if regs_shortage
    push {r0, r1}       /* keep original W and H on top of the stack */
.endif
    subs H, H, #1
.if regs_shortage
    str H, [sp, #4]     /* save updated height to stack */
.else
    mov ORIG_W, W
.endif
    blt 9f
    cmp W, #(pixblock_size * 2)
    blt 8f
/*
 * This is the start of the pipelined loop, which is optimized for
 * long scanlines
 */
0:
    ensure_destination_ptr_alignment process_pixblock_head, \
                                     process_pixblock_tail, \
                                     process_pixblock_tail_head

    /* Implement "head (tail_head) ... (tail_head) tail" loop pattern */
    pixld_a pixblock_size, dst_r_bpp, \
            (dst_r_basereg - pixblock_size * dst_r_bpp / 64), DST_R
    pixld pixblock_size, src_bpp, \
          (src_basereg - pixblock_size * src_bpp / 64), SRC
    pixld pixblock_size, mask_bpp, \
          (mask_basereg - pixblock_size * mask_bpp / 64), MASK
    PF add PF_X, PF_X, #pixblock_size
    process_pixblock_head
    cache_preload 0, pixblock_size
    cache_preload_simple
    subs W, W, #(pixblock_size * 2)
    blt 2f
1:
    process_pixblock_tail_head
    cache_preload_simple
    subs W, W, #pixblock_size
    bge 1b
2:
    process_pixblock_tail
    pixst_a pixblock_size, dst_w_bpp, \
            (dst_w_basereg - pixblock_size * dst_w_bpp / 64), DST_W

    /* Process the remaining trailing pixels in the scanline */
    process_trailing_pixels 1, 1, \
                            process_pixblock_head, \
                            process_pixblock_tail, \
                            process_pixblock_tail_head
    advance_to_next_scanline 0b

.if regs_shortage
    pop {r0, r1}
.endif
    cleanup
    pop {r4-r12, pc}    /* exit */
/*
 * This is the start of the loop, designed to process images with small width
 * (less than pixblock_size * 2 pixels). In this case neither pipelining
 * nor prefetch are used.
 */
8:
    /* Process exactly pixblock_size pixels if needed */
    tst W, #pixblock_size
    beq 1f
    pixld pixblock_size, dst_r_bpp, \
          (dst_r_basereg - pixblock_size * dst_r_bpp / 64), DST_R
    pixld pixblock_size, src_bpp, \
          (src_basereg - pixblock_size * src_bpp / 64), SRC
    pixld pixblock_size, mask_bpp, \
          (mask_basereg - pixblock_size * mask_bpp / 64), MASK
    process_pixblock_head
    process_pixblock_tail
    pixst pixblock_size, dst_w_bpp, \
          (dst_w_basereg - pixblock_size * dst_w_bpp / 64), DST_W
1:
    /* Process the remaining trailing pixels in the scanline */
    process_trailing_pixels 0, 0, \
                            process_pixblock_head, \
                            process_pixblock_tail, \
                            process_pixblock_tail_head
    advance_to_next_scanline 8b
9:
.if regs_shortage
    pop {r0, r1}
.endif
    cleanup
    pop {r4-r12, pc}    /* exit */

    .unreq SRC
    .unreq MASK
    .unreq DST_R
    .unreq DST_W
    .unreq ORIG_W
    .unreq W
    .unreq H
    .unreq SRC_STRIDE
    .unreq DST_STRIDE
    .unreq MASK_STRIDE
    .unreq PF_CTL
    .unreq PF_X
    .unreq PF_SRC
    .unreq PF_DST
    .unreq PF_MASK
    .unreq DUMMY
    .endfunc
.endm
782
/*
 * A simplified variant of function generation template for a single
 * scanline processing (for implementing pixman combine functions).
 *
 * Arguments have the same meaning as for 'generate_composite_function',
 * except that there is no prefetch distance: prefetch is always
 * disabled here (PREFETCH_TYPE_NONE).
 *
 * Generated function, as set up by the code below: r0 = width,
 * r1 = destination pointer, r2 = source pointer, r3 = mask pointer;
 * ip is used as the destination read pointer. No registers are pushed
 * and the function returns with 'bx lr'.
 */
.macro generate_composite_function_single_scanline fname, \
                                                   src_bpp_, \
                                                   mask_bpp_, \
                                                   dst_w_bpp_, \
                                                   flags, \
                                                   pixblock_size_, \
                                                   init, \
                                                   cleanup, \
                                                   process_pixblock_head, \
                                                   process_pixblock_tail, \
                                                   process_pixblock_tail_head, \
                                                   dst_w_basereg_ = 28, \
                                                   dst_r_basereg_ = 4, \
                                                   src_basereg_ = 0, \
                                                   mask_basereg_ = 24

    .func fname
    .global fname
    /* For ELF format also set function visibility to hidden */
#ifdef __ELF__
    .hidden fname
    .type fname, %function
#endif
fname:
    .set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_NONE
/*
 * 'ensure_destination_ptr_alignment' expands 'cache_preload', whose
 * conditionals read the symbols below. Nothing else in this macro
 * defines them, so give them explicit "no prefetch" values instead of
 * relying on whatever an earlier 'generate_composite_function' left
 * behind. With PREFETCH_TYPE_NONE no prefetch instruction is emitted
 * either way.
 */
    .set src_bpp_shift, -1
    .set mask_bpp_shift, -1
    .set regs_shortage, 0
/*
 * Make some macro arguments globally visible and accessible
 * from other macros
 */
    .set src_bpp, src_bpp_
    .set mask_bpp, mask_bpp_
    .set dst_w_bpp, dst_w_bpp_
    .set pixblock_size, pixblock_size_
    .set dst_w_basereg, dst_w_basereg_
    .set dst_r_basereg, dst_r_basereg_
    .set src_basereg, src_basereg_
    .set mask_basereg, mask_basereg_
/*
 * Assign symbolic names to registers
 */
    W           .req r0     /* width (is updated during processing) */
    DST_W       .req r1     /* destination buffer pointer for writes */
    SRC         .req r2     /* source buffer pointer */
    DST_R       .req ip     /* destination buffer pointer for reads */
    MASK        .req r3     /* mask pointer */

.if (((flags) & FLAG_DST_READWRITE) != 0)
    .set dst_r_bpp, dst_w_bpp
.else
    .set dst_r_bpp, 0
.endif
.if (((flags) & FLAG_DEINTERLEAVE_32BPP) != 0)
    .set DEINTERLEAVE_32BPP_ENABLED, 1
.else
    .set DEINTERLEAVE_32BPP_ENABLED, 0
.endif

    init
    mov DST_R, DST_W

    /* less than one full block: only the unaligned tail path is needed */
    cmp W, #pixblock_size
    blt 8f

    ensure_destination_ptr_alignment process_pixblock_head, \
                                     process_pixblock_tail, \
                                     process_pixblock_tail_head

    subs W, W, #pixblock_size
    blt 7f

    /* Implement "head (tail_head) ... (tail_head) tail" loop pattern */
    pixld_a pixblock_size, dst_r_bpp, \
            (dst_r_basereg - pixblock_size * dst_r_bpp / 64), DST_R
    pixld pixblock_size, src_bpp, \
          (src_basereg - pixblock_size * src_bpp / 64), SRC
    pixld pixblock_size, mask_bpp, \
          (mask_basereg - pixblock_size * mask_bpp / 64), MASK
    process_pixblock_head
    subs W, W, #pixblock_size
    blt 2f
1:
    process_pixblock_tail_head
    subs W, W, #pixblock_size
    bge 1b
2:
    process_pixblock_tail
    pixst_a pixblock_size, dst_w_bpp, \
            (dst_w_basereg - pixblock_size * dst_w_bpp / 64), DST_W
7:
    /* Process the remaining trailing pixels in the scanline (dst aligned) */
    process_trailing_pixels 0, 1, \
                            process_pixblock_head, \
                            process_pixblock_tail, \
                            process_pixblock_tail_head

    cleanup
    bx lr               /* exit */
8:
    /* Process the remaining trailing pixels in the scanline (dst unaligned) */
    process_trailing_pixels 0, 0, \
                            process_pixblock_head, \
                            process_pixblock_tail, \
                            process_pixblock_tail_head

    cleanup
    bx lr               /* exit */

    .unreq SRC
    .unreq MASK
    .unreq DST_R
    .unreq DST_W
    .unreq W
    .endfunc
.endm
901
/* Empty 'init' hook for functions that need no extra setup code */
.macro default_init
.endm
904
/* Empty 'cleanup' hook for functions that need no extra teardown code */
.macro default_cleanup
.endm
Note: See TracBrowser for help on using the repository browser.