/*
 * Copyright © 2009 Nokia Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 * DEALINGS IN THE SOFTWARE.
 *
 * Author: Siarhei Siamashka (siarhei.siamashka@nokia.com)
 */

/*
 * This file contains a macro ('generate_composite_function') which can
 * construct 2D image processing functions, based on a common template.
 * Any combination of source, destination and mask images with 8bpp,
 * 16bpp, 24bpp or 32bpp color formats is supported.
 *
 * This macro takes care of:
 *  - handling of leading and trailing unaligned pixels
 *  - doing most of the work related to L2 cache preload
 *  - encouraging the use of software pipelining for better instruction
 *    scheduling
 *
 * The user of this macro has to provide some configuration parameters
 * (bit depths for the images, prefetch distance, etc.) and a set of
 * macros which implement the basic code chunks responsible for pixel
 * processing. See the 'pixman-arm-neon-asm.S' file for real usage
 * examples, and the sketch below for the general shape of an invocation.
 *
 * TODO:
 *  - try the overlapped pixel method (from Ian Rickards) when processing
 *    exactly two blocks of pixels
 *  - maybe add an option for reverse scanline processing
 */
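
/*
 * A minimal usage sketch (kept inside a comment, so it emits no code).
 * The names below are assumptions modeled on invocations found in
 * 'pixman-arm-neon-asm.S'; the init/cleanup and process_pixblock_*
 * macros must be defined by the user before such an invocation:
 *
 *     generate_composite_function \
 *         pixman_composite_over_8888_8888_asm_neon, 32, 0, 32, \
 *         FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
 *         8,    <- number of pixels processed in a single block
 *         5,    <- prefetch distance
 *         default_init, \
 *         default_cleanup, \
 *         pixman_composite_over_8888_8888_process_pixblock_head, \
 *         pixman_composite_over_8888_8888_process_pixblock_tail, \
 *         pixman_composite_over_8888_8888_process_pixblock_tail_head
 */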

/*
 * Bit flags for the 'generate_composite_function' macro, which are used
 * to tune the behavior of the generated functions.
 */
.set FLAG_DST_WRITEONLY,       0
.set FLAG_DST_READWRITE,       1
.set FLAG_DEINTERLEAVE_32BPP,  2

/*
 * Offset in the stack where the mask and source pointers/strides can be
 * accessed from the 'init' macro. This is useful for doing special
 * handling for a solid mask.
 */
.set ARGS_STACK_OFFSET,        40
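
/*
 * For example, a hypothetical 'init' macro for a solid source could
 * fetch the constant color from the stack like this (a sketch only;
 * 'solid_src_init' is not defined anywhere in this template, and DUMMY
 * is the scratch register assigned by 'generate_composite_function'):
 *
 *     .macro solid_src_init
 *         add     DUMMY, sp, #ARGS_STACK_OFFSET
 *         vld1.32 {d0[0]}, [DUMMY]    <- load the 32-bit solid color
 *         vdup.32 d0, d0[0]           <- replicate it across the register
 *     .endm
 */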

/*
 * Constants for selecting the preferred prefetch type.
 */
.set PREFETCH_TYPE_NONE,       0 /* No prefetch at all */
.set PREFETCH_TYPE_SIMPLE,     1 /* A simple, fixed-distance-ahead prefetch */
.set PREFETCH_TYPE_ADVANCED,   2 /* Advanced fine-grained prefetch */

/*
 * Definitions of supplementary pixld/pixst macros (for partial load/store
 * of pixel data).
 */

.macro pixldst1 op, elem_size, reg1, mem_operand, abits
.if abits > 0
    op&.&elem_size {d&reg1}, [&mem_operand&, :&abits&]!
.else
    op&.&elem_size {d&reg1}, [&mem_operand&]!
.endif
.endm
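
/*
 * As a concrete illustration of the '&' token concatenation used above
 * (a hypothetical expansion, never emitted by this file):
 *
 *     pixldst1 vld1, 8, 1, SRC, 64
 *
 * expands to:
 *
 *     vld1.8 {d1}, [SRC, :64]!
 */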

.macro pixldst2 op, elem_size, reg1, reg2, mem_operand, abits
.if abits > 0
    op&.&elem_size {d&reg1, d&reg2}, [&mem_operand&, :&abits&]!
.else
    op&.&elem_size {d&reg1, d&reg2}, [&mem_operand&]!
.endif
.endm

.macro pixldst4 op, elem_size, reg1, reg2, reg3, reg4, mem_operand, abits
.if abits > 0
    op&.&elem_size {d&reg1, d&reg2, d&reg3, d&reg4}, [&mem_operand&, :&abits&]!
.else
    op&.&elem_size {d&reg1, d&reg2, d&reg3, d&reg4}, [&mem_operand&]!
.endif
.endm

.macro pixldst0 op, elem_size, reg1, idx, mem_operand, abits
    op&.&elem_size {d&reg1[idx]}, [&mem_operand&]!
.endm

.macro pixldst3 op, elem_size, reg1, reg2, reg3, mem_operand
    op&.&elem_size {d&reg1, d&reg2, d&reg3}, [&mem_operand&]!
.endm

.macro pixldst30 op, elem_size, reg1, reg2, reg3, idx, mem_operand
    op&.&elem_size {d&reg1[idx], d&reg2[idx], d&reg3[idx]}, [&mem_operand&]!
.endm

.macro pixldst numbytes, op, elem_size, basereg, mem_operand, abits
.if numbytes == 32
    pixldst4 op, elem_size, %(basereg+4), %(basereg+5), \
                            %(basereg+6), %(basereg+7), mem_operand, abits
.elseif numbytes == 16
    pixldst2 op, elem_size, %(basereg+2), %(basereg+3), mem_operand, abits
.elseif numbytes == 8
    pixldst1 op, elem_size, %(basereg+1), mem_operand, abits
.elseif numbytes == 4
    .if !RESPECT_STRICT_ALIGNMENT || (elem_size == 32)
        pixldst0 op, 32, %(basereg+0), 1, mem_operand, abits
    .elseif elem_size == 16
        pixldst0 op, 16, %(basereg+0), 2, mem_operand, abits
        pixldst0 op, 16, %(basereg+0), 3, mem_operand, abits
    .else
        pixldst0 op, 8, %(basereg+0), 4, mem_operand, abits
        pixldst0 op, 8, %(basereg+0), 5, mem_operand, abits
        pixldst0 op, 8, %(basereg+0), 6, mem_operand, abits
        pixldst0 op, 8, %(basereg+0), 7, mem_operand, abits
    .endif
.elseif numbytes == 2
    .if !RESPECT_STRICT_ALIGNMENT || (elem_size == 16)
        pixldst0 op, 16, %(basereg+0), 1, mem_operand, abits
    .else
        pixldst0 op, 8, %(basereg+0), 2, mem_operand, abits
        pixldst0 op, 8, %(basereg+0), 3, mem_operand, abits
    .endif
.elseif numbytes == 1
    pixldst0 op, 8, %(basereg+0), 1, mem_operand, abits
.else
    .error "unsupported size: numbytes"
.endif
.endm
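
/*
 * For instance, loading 8 pixels of 16bpp data through the dispatcher
 * above (a hypothetical expansion for illustration, assuming basereg = 0):
 *
 *     pixldst 16, vld1, 16, 0, SRC, 128
 *
 * hits the 'numbytes == 16' case and becomes:
 *
 *     vld1.16 {d2, d3}, [SRC, :128]!
 */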

.macro pixld numpix, bpp, basereg, mem_operand, abits=0
.if bpp > 0
.if (bpp == 32) && (numpix == 8) && (DEINTERLEAVE_32BPP_ENABLED != 0)
    pixldst4 vld4, 8, %(basereg+4), %(basereg+5), \
                      %(basereg+6), %(basereg+7), mem_operand, abits
.elseif (bpp == 24) && (numpix == 8)
    pixldst3 vld3, 8, %(basereg+3), %(basereg+4), %(basereg+5), mem_operand
.elseif (bpp == 24) && (numpix == 4)
    pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 4, mem_operand
    pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 5, mem_operand
    pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 6, mem_operand
    pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 7, mem_operand
.elseif (bpp == 24) && (numpix == 2)
    pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 2, mem_operand
    pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 3, mem_operand
.elseif (bpp == 24) && (numpix == 1)
    pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 1, mem_operand
.else
    pixldst %(numpix * bpp / 8), vld1, %(bpp), basereg, mem_operand, abits
.endif
.endif
.endm
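
/*
 * Note how the (bpp == 32, numpix == 8) case above uses VLD4 to split
 * the four color channels into separate registers while loading. As an
 * illustration (hypothetical expansion, assuming basereg = 0 and the
 * 32bpp deinterleave flag enabled):
 *
 *     pixld 8, 32, 0, SRC
 *
 * becomes:
 *
 *     vld4.8 {d4, d5, d6, d7}, [SRC]!
 *
 * leaving all B components in d4, G in d5, R in d6 and A in d7.
 */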

.macro pixst numpix, bpp, basereg, mem_operand, abits=0
.if bpp > 0
.if (bpp == 32) && (numpix == 8) && (DEINTERLEAVE_32BPP_ENABLED != 0)
    pixldst4 vst4, 8, %(basereg+4), %(basereg+5), \
                      %(basereg+6), %(basereg+7), mem_operand, abits
.elseif (bpp == 24) && (numpix == 8)
    pixldst3 vst3, 8, %(basereg+3), %(basereg+4), %(basereg+5), mem_operand
.elseif (bpp == 24) && (numpix == 4)
    pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 4, mem_operand
    pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 5, mem_operand
    pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 6, mem_operand
    pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 7, mem_operand
.elseif (bpp == 24) && (numpix == 2)
    pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 2, mem_operand
    pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 3, mem_operand
.elseif (bpp == 24) && (numpix == 1)
    pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 1, mem_operand
.else
    pixldst %(numpix * bpp / 8), vst1, %(bpp), basereg, mem_operand, abits
.endif
.endif
.endm

.macro pixld_a numpix, bpp, basereg, mem_operand
.if (bpp * numpix) <= 128
    pixld numpix, bpp, basereg, mem_operand, %(bpp * numpix)
.else
    pixld numpix, bpp, basereg, mem_operand, 128
.endif
.endm

.macro pixst_a numpix, bpp, basereg, mem_operand
.if (bpp * numpix) <= 128
    pixst numpix, bpp, basereg, mem_operand, %(bpp * numpix)
.else
    pixst numpix, bpp, basereg, mem_operand, 128
.endif
.endm
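
/*
 * The '_a' variants above pass the total bit size of the transfer as the
 * NEON alignment hint, capped at 128. For example (hypothetical
 * expansions for illustration, with 32bpp deinterleaving disabled):
 *
 *     pixld_a 4, 16, 0, DST_R  ->  vld1.16 {d1}, [DST_R, :64]!
 *     pixld_a 8, 32, 0, DST_R  ->  vld1.32 {d4, d5, d6, d7}, [DST_R, :128]!
 */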

.macro vuzp8 reg1, reg2
    vuzp.8 d&reg1, d&reg2
.endm

.macro vzip8 reg1, reg2
    vzip.8 d&reg1, d&reg2
.endm

/* deinterleave B, G, R, A channels for eight 32bpp pixels in 4 registers */
.macro pixdeinterleave bpp, basereg
.if (bpp == 32) && (DEINTERLEAVE_32BPP_ENABLED != 0)
    vuzp8 %(basereg+0), %(basereg+1)
    vuzp8 %(basereg+2), %(basereg+3)
    vuzp8 %(basereg+1), %(basereg+3)
    vuzp8 %(basereg+0), %(basereg+2)
.endif
.endm

/* interleave B, G, R, A channels for eight 32bpp pixels in 4 registers */
.macro pixinterleave bpp, basereg
.if (bpp == 32) && (DEINTERLEAVE_32BPP_ENABLED != 0)
    vzip8 %(basereg+0), %(basereg+2)
    vzip8 %(basereg+1), %(basereg+3)
    vzip8 %(basereg+2), %(basereg+3)
    vzip8 %(basereg+0), %(basereg+1)
.endif
.endm
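
/*
 * The four vuzp8 steps in pixdeinterleave convert eight interleaved BGRA
 * pixels held in d(basereg+0)..d(basereg+3) into the same
 * channel-per-register layout that a vld4.8 load produces directly: all
 * B bytes in the first d register, all G in the second, and so on.
 * pixinterleave performs the inverse permutation before the data is
 * written back along code paths that use vst1 instead of vst4.
 */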

/*
 * This is a macro for implementing cache preload. The main idea is that
 * the cache preload logic is mostly independent of the rest of the pixel
 * processing code. It starts at the top left pixel, moves forward across
 * pixels and can jump across scanlines. The prefetch distance is handled
 * in an 'incremental' way: it starts from 0 and advances to the optimal
 * distance over time. After reaching the optimal prefetch distance, it
 * is kept constant. There are some checks which prevent prefetching
 * unneeded pixel lines below the image (but it still can prefetch a bit
 * more data on the right side of the image - not a big issue, and it may
 * actually be helpful when rendering text glyphs). An additional trick
 * is the use of an LDR instruction for prefetch instead of PLD when
 * moving to the next line; the point is that we have a high chance of
 * getting a TLB miss in this case, and PLD would be useless.
 *
 * This sounds like it may introduce a noticeable overhead (when working
 * with fully cached data). But in reality, due to having a separate
 * pipeline and instruction queue for the NEON unit in ARM Cortex-A8,
 * normal ARM code can execute simultaneously with NEON and be completely
 * shadowed by it. Thus we get no performance overhead at all (*). This
 * looks like a very nice feature of Cortex-A8, if used wisely. We don't
 * have a hardware prefetcher, but still can implement some rather
 * advanced prefetch logic in software for almost zero cost!
 *
 * (*) The overhead of the prefetcher is visible when running some
 * trivial pixel processing like a simple copy. Anyway, having prefetch
 * is a must when working with graphics data.
 */
.macro PF a, x:vararg
.if (PREFETCH_TYPE_CURRENT == PREFETCH_TYPE_ADVANCED)
    a x
.endif
.endm
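
/*
 * The PF macro above makes every advanced-prefetch instruction vanish
 * when a function is generated with a different prefetch type. For
 * example (a hypothetical expansion for illustration):
 *
 *     PF add PF_X, PF_X, #8
 *
 * emits 'add PF_X, PF_X, #8' only when PREFETCH_TYPE_CURRENT is
 * PREFETCH_TYPE_ADVANCED, and assembles to nothing otherwise.
 */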

.macro cache_preload std_increment, boost_increment
.if (src_bpp_shift >= 0) || (dst_r_bpp != 0) || (mask_bpp_shift >= 0)
.if regs_shortage
    PF ldr ORIG_W, [sp] /* If we are short on regs, ORIG_W is kept on stack */
.endif
.if std_increment != 0
    PF add PF_X, PF_X, #std_increment
.endif
    PF tst PF_CTL, #0xF
    PF addne PF_X, PF_X, #boost_increment
    PF subne PF_CTL, PF_CTL, #1
    PF cmp PF_X, ORIG_W
.if src_bpp_shift >= 0
    PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
.endif
.if dst_r_bpp != 0
    PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
.endif
.if mask_bpp_shift >= 0
    PF pld, [PF_MASK, PF_X, lsl #mask_bpp_shift]
.endif
    PF subge PF_X, PF_X, ORIG_W
    PF subges PF_CTL, PF_CTL, #0x10
.if src_bpp_shift >= 0
    PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
.endif
.if dst_r_bpp != 0
    PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
.endif
.if mask_bpp_shift >= 0
    PF ldrgeb DUMMY, [PF_MASK, MASK_STRIDE, lsl #mask_bpp_shift]!
.endif
.endif
.endm
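
/*
 * A worked example of the PF_CTL bookkeeping above (numbers are for
 * illustration only). With prefetch_distance = 5 and H = 100 scanlines,
 * the setup code below initializes PF_CTL = ((100 - 1) << 4) | 5 = 0x635.
 * While the low 4 bits are non-zero, each cache_preload call advances
 * PF_X by an extra boost_increment and decrements those bits, so the
 * prefetch distance ramps up over the first 5 pixel blocks. Whenever
 * PF_X reaches the end of the scanline (PF_X >= ORIG_W), PF_X is
 * rewound, the scanline counter in the high bits is decremented by 0x10,
 * and an LDR (rather than PLD) touches the start of the next prefetch
 * scanline to warm up the TLB.
 */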

.macro cache_preload_simple
.if (PREFETCH_TYPE_CURRENT == PREFETCH_TYPE_SIMPLE)
.if src_bpp > 0
    pld [SRC, #(PREFETCH_DISTANCE_SIMPLE * src_bpp / 8)]
.endif
.if dst_r_bpp > 0
    pld [DST_R, #(PREFETCH_DISTANCE_SIMPLE * dst_r_bpp / 8)]
.endif
.if mask_bpp > 0
    pld [MASK, #(PREFETCH_DISTANCE_SIMPLE * mask_bpp / 8)]
.endif
.endif
.endm

/*
 * Macro which is used to process leading pixels until the destination
 * pointer is properly aligned (at a 16-byte boundary). When the
 * destination buffer uses the 24bpp format, this is unnecessary, or
 * even pointless.
 */
.macro ensure_destination_ptr_alignment process_pixblock_head, \
                                        process_pixblock_tail, \
                                        process_pixblock_tail_head
.if dst_w_bpp != 24
    tst DST_R, #0xF
    beq 2f

.irp lowbit, 1, 2, 4, 8, 16
local skip1
.if (dst_w_bpp <= (lowbit * 8)) && ((lowbit * 8) < (pixblock_size * dst_w_bpp))
.if lowbit < 16 /* we don't need more than 16-byte alignment */
    tst DST_R, #lowbit
    beq 1f
.endif
    pixld (lowbit * 8 / dst_w_bpp), src_bpp, src_basereg, SRC
    pixld (lowbit * 8 / dst_w_bpp), mask_bpp, mask_basereg, MASK
.if dst_r_bpp > 0
    pixld_a (lowbit * 8 / dst_r_bpp), dst_r_bpp, dst_r_basereg, DST_R
.else
    add DST_R, DST_R, #lowbit
.endif
    PF add PF_X, PF_X, #(lowbit * 8 / dst_w_bpp)
    sub W, W, #(lowbit * 8 / dst_w_bpp)
1:
.endif
.endr
    pixdeinterleave src_bpp, src_basereg
    pixdeinterleave mask_bpp, mask_basereg
    pixdeinterleave dst_r_bpp, dst_r_basereg

    process_pixblock_head
    cache_preload 0, pixblock_size
    cache_preload_simple
    process_pixblock_tail

    pixinterleave dst_w_bpp, dst_w_basereg
.irp lowbit, 1, 2, 4, 8, 16
.if (dst_w_bpp <= (lowbit * 8)) && ((lowbit * 8) < (pixblock_size * dst_w_bpp))
.if lowbit < 16 /* we don't need more than 16-byte alignment */
    tst DST_W, #lowbit
    beq 1f
.endif
    pixst_a (lowbit * 8 / dst_w_bpp), dst_w_bpp, dst_w_basereg, DST_W
1:
.endif
.endr
.endif
2:
.endm
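
/*
 * A worked example of the alignment walk above (illustrative numbers):
 * for a 16bpp destination whose address ends in 0x6, the 'lowbit' pass
 * with #2 set processes 1 pixel (the address now ends in 0x8), the #4
 * test then fails, and the #8 pass processes 4 more pixels, leaving the
 * destination 16-byte aligned after 5 leading pixels in total.
 */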

/*
 * Special code for processing up to (pixblock_size - 1) remaining
 * trailing pixels. As SIMD processing performs operations on
 * pixblock_size pixels, anything smaller than this has to be loaded
 * and stored in a special way. Loading and storing of pixel data is
 * performed in such a way that we fill some 'slots' in the NEON
 * registers (some slots naturally are unused), then perform the
 * compositing operation as usual. In the end, the data is taken from
 * these 'slots' and saved to memory.
 *
 * cache_preload_flag - set to 0 to suppress prefetch
 * dst_aligned_flag   - selects whether the destination buffer
 *                      is aligned
 */
.macro process_trailing_pixels cache_preload_flag, \
                               dst_aligned_flag, \
                               process_pixblock_head, \
                               process_pixblock_tail, \
                               process_pixblock_tail_head
    tst W, #(pixblock_size - 1)
    beq 2f
.irp chunk_size, 16, 8, 4, 2, 1
.if pixblock_size > chunk_size
    tst W, #chunk_size
    beq 1f
    pixld chunk_size, src_bpp, src_basereg, SRC
    pixld chunk_size, mask_bpp, mask_basereg, MASK
.if dst_aligned_flag != 0
    pixld_a chunk_size, dst_r_bpp, dst_r_basereg, DST_R
.else
    pixld chunk_size, dst_r_bpp, dst_r_basereg, DST_R
.endif
.if cache_preload_flag != 0
    PF add PF_X, PF_X, #chunk_size
.endif
1:
.endif
.endr
    pixdeinterleave src_bpp, src_basereg
    pixdeinterleave mask_bpp, mask_basereg
    pixdeinterleave dst_r_bpp, dst_r_basereg

    process_pixblock_head
.if cache_preload_flag != 0
    cache_preload 0, pixblock_size
    cache_preload_simple
.endif
    process_pixblock_tail
    pixinterleave dst_w_bpp, dst_w_basereg
.irp chunk_size, 16, 8, 4, 2, 1
.if pixblock_size > chunk_size
    tst W, #chunk_size
    beq 1f
.if dst_aligned_flag != 0
    pixst_a chunk_size, dst_w_bpp, dst_w_basereg, DST_W
.else
    pixst chunk_size, dst_w_bpp, dst_w_basereg, DST_W
.endif
1:
.endif
.endr
2:
.endm
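
/*
 * For example (illustrative numbers): with pixblock_size = 8 and 5
 * trailing pixels left in the scanline, W & 7 = 5, so the chunk loop
 * above loads a 4-pixel chunk and then a 1-pixel chunk into their
 * 'slots', runs one head + tail pass over the partially filled block,
 * and stores the same 4 + 1 pixels back.
 */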

/*
 * Macro which performs all the needed operations to switch to the next
 * scanline and start the next loop iteration, unless all the scanlines
 * are already processed.
 */
.macro advance_to_next_scanline start_of_loop_label
.if regs_shortage
    ldrd W, [sp] /* load W and H (width and height) from stack */
.else
    mov W, ORIG_W
.endif
    add DST_W, DST_W, DST_STRIDE, lsl #dst_bpp_shift
.if src_bpp != 0
    add SRC, SRC, SRC_STRIDE, lsl #src_bpp_shift
.endif
.if mask_bpp != 0
    add MASK, MASK, MASK_STRIDE, lsl #mask_bpp_shift
.endif
.if (dst_w_bpp != 24)
    sub DST_W, DST_W, W, lsl #dst_bpp_shift
.endif
.if (src_bpp != 24) && (src_bpp != 0)
    sub SRC, SRC, W, lsl #src_bpp_shift
.endif
.if (mask_bpp != 24) && (mask_bpp != 0)
    sub MASK, MASK, W, lsl #mask_bpp_shift
.endif
    subs H, H, #1
    mov DST_R, DST_W
.if regs_shortage
    str H, [sp, #4] /* save updated height to stack */
.endif
    bge start_of_loop_label
.endm

/*
 * Registers are allocated in the following way by default:
 * d0, d1, d2, d3     - reserved for loading source pixel data
 * d4, d5, d6, d7     - reserved for loading destination pixel data
 * d24, d25, d26, d27 - reserved for loading mask pixel data
 * d28, d29, d30, d31 - final destination pixel data for writeback to memory
 */
.macro generate_composite_function fname, \
                                   src_bpp_, \
                                   mask_bpp_, \
                                   dst_w_bpp_, \
                                   flags, \
                                   pixblock_size_, \
                                   prefetch_distance, \
                                   init, \
                                   cleanup, \
                                   process_pixblock_head, \
                                   process_pixblock_tail, \
                                   process_pixblock_tail_head, \
                                   dst_w_basereg_ = 28, \
                                   dst_r_basereg_ = 4, \
                                   src_basereg_   = 0, \
                                   mask_basereg_  = 24

.func fname
.global fname
/* For ELF format also set function visibility to hidden */
#ifdef __ELF__
.hidden fname
.type fname, %function
#endif
fname:
    push {r4-r12, lr} /* save all registers */

/*
 * Select the prefetch type for this function. If the prefetch distance
 * is set to 0 or one of the color formats is 24bpp, SIMPLE prefetch
 * has to be used instead of ADVANCED.
 */
.set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_DEFAULT
.if prefetch_distance == 0
    .set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_NONE
.elseif (PREFETCH_TYPE_CURRENT > PREFETCH_TYPE_SIMPLE) && \
        ((src_bpp_ == 24) || (mask_bpp_ == 24) || (dst_w_bpp_ == 24))
    .set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_SIMPLE
.endif

/*
 * Make some macro arguments globally visible and accessible
 * from other macros.
 */
.set src_bpp, src_bpp_
.set mask_bpp, mask_bpp_
.set dst_w_bpp, dst_w_bpp_
.set pixblock_size, pixblock_size_
.set dst_w_basereg, dst_w_basereg_
.set dst_r_basereg, dst_r_basereg_
.set src_basereg, src_basereg_
.set mask_basereg, mask_basereg_

/*
 * Assign symbolic names to registers.
 */
    W           .req r0  /* width (is updated during processing) */
    H           .req r1  /* height (is updated during processing) */
    DST_W       .req r2  /* destination buffer pointer for writes */
    DST_STRIDE  .req r3  /* destination image stride */
    SRC         .req r4  /* source buffer pointer */
    SRC_STRIDE  .req r5  /* source image stride */
    DST_R       .req r6  /* destination buffer pointer for reads */

    MASK        .req r7  /* mask pointer */
    MASK_STRIDE .req r8  /* mask stride */

    PF_CTL      .req r9  /* combined lines counter and prefetch */
                         /* distance increment counter */
    PF_X        .req r10 /* pixel index in a scanline for current */
                         /* prefetch position */
    PF_SRC      .req r11 /* pointer to source scanline start */
                         /* for prefetch purposes */
    PF_DST      .req r12 /* pointer to destination scanline start */
                         /* for prefetch purposes */
    PF_MASK     .req r14 /* pointer to mask scanline start */
                         /* for prefetch purposes */
/*
 * Check whether we have enough registers for all the local variables.
 * If we don't have enough registers, the original width and height are
 * kept on top of the stack (and the 'regs_shortage' variable is set to
 * indicate this for the rest of the code). Even if there are enough
 * registers, the allocation scheme may be a bit different depending on
 * whether the source or mask is unused.
 */
.if (PREFETCH_TYPE_CURRENT < PREFETCH_TYPE_ADVANCED)
    ORIG_W      .req r10 /* saved original width */
    DUMMY       .req r12 /* temporary register */
    .set regs_shortage, 0
.elseif mask_bpp == 0
    ORIG_W      .req r7  /* saved original width */
    DUMMY       .req r8  /* temporary register */
    .set regs_shortage, 0
.elseif src_bpp == 0
    ORIG_W      .req r4  /* saved original width */
    DUMMY       .req r5  /* temporary register */
    .set regs_shortage, 0
.else
    ORIG_W      .req r1  /* saved original width */
    DUMMY       .req r1  /* temporary register */
    .set regs_shortage, 1
.endif

.set mask_bpp_shift, -1
.if src_bpp == 32
    .set src_bpp_shift, 2
.elseif src_bpp == 24
    .set src_bpp_shift, 0
.elseif src_bpp == 16
    .set src_bpp_shift, 1
.elseif src_bpp == 8
    .set src_bpp_shift, 0
.elseif src_bpp == 0
    .set src_bpp_shift, -1
.else
    .error "requested src bpp (src_bpp) is not supported"
.endif
.if mask_bpp == 32
    .set mask_bpp_shift, 2
.elseif mask_bpp == 24
    .set mask_bpp_shift, 0
.elseif mask_bpp == 8
    .set mask_bpp_shift, 0
.elseif mask_bpp == 0
    .set mask_bpp_shift, -1
.else
    .error "requested mask bpp (mask_bpp) is not supported"
.endif
.if dst_w_bpp == 32
    .set dst_bpp_shift, 2
.elseif dst_w_bpp == 24
    .set dst_bpp_shift, 0
.elseif dst_w_bpp == 16
    .set dst_bpp_shift, 1
.elseif dst_w_bpp == 8
    .set dst_bpp_shift, 0
.else
    .error "requested dst bpp (dst_w_bpp) is not supported"
.endif

.if (((flags) & FLAG_DST_READWRITE) != 0)
    .set dst_r_bpp, dst_w_bpp
.else
    .set dst_r_bpp, 0
.endif
.if (((flags) & FLAG_DEINTERLEAVE_32BPP) != 0)
    .set DEINTERLEAVE_32BPP_ENABLED, 1
.else
    .set DEINTERLEAVE_32BPP_ENABLED, 0
.endif

.if prefetch_distance < 0 || prefetch_distance > 15
    .error "invalid prefetch distance (prefetch_distance)"
.endif

.if src_bpp > 0
    ldr SRC, [sp, #40]
.endif
.if mask_bpp > 0
    ldr MASK, [sp, #48]
.endif
    PF mov PF_X, #0
.if src_bpp > 0
    ldr SRC_STRIDE, [sp, #44]
.endif
.if mask_bpp > 0
    ldr MASK_STRIDE, [sp, #52]
.endif
    mov DST_R, DST_W

.if src_bpp == 24
    sub SRC_STRIDE, SRC_STRIDE, W
    sub SRC_STRIDE, SRC_STRIDE, W, lsl #1
.endif
.if mask_bpp == 24
    sub MASK_STRIDE, MASK_STRIDE, W
    sub MASK_STRIDE, MASK_STRIDE, W, lsl #1
.endif
.if dst_w_bpp == 24
    sub DST_STRIDE, DST_STRIDE, W
    sub DST_STRIDE, DST_STRIDE, W, lsl #1
.endif

/*
 * Set up the initial state of the advanced prefetcher.
 */
    PF mov PF_SRC, SRC
    PF mov PF_DST, DST_R
    PF mov PF_MASK, MASK
    /* PF_CTL = prefetch_distance | ((h - 1) << 4) */
    PF mov PF_CTL, H, lsl #4
    PF add PF_CTL, #(prefetch_distance - 0x10)

    init
.if regs_shortage
    push {r0, r1}
.endif
    subs H, H, #1
.if regs_shortage
    str H, [sp, #4] /* save updated height to stack */
.else
    mov ORIG_W, W
.endif
    blt 9f
    cmp W, #(pixblock_size * 2)
    blt 8f
/*
 * This is the start of the pipelined loop, which is optimized for
 * long scanlines.
 */
0:
    ensure_destination_ptr_alignment process_pixblock_head, \
                                     process_pixblock_tail, \
                                     process_pixblock_tail_head

    /* Implement "head (tail_head) ... (tail_head) tail" loop pattern */
    pixld_a pixblock_size, dst_r_bpp, \
            (dst_r_basereg - pixblock_size * dst_r_bpp / 64), DST_R
    pixld pixblock_size, src_bpp, \
          (src_basereg - pixblock_size * src_bpp / 64), SRC
    pixld pixblock_size, mask_bpp, \
          (mask_basereg - pixblock_size * mask_bpp / 64), MASK
    PF add PF_X, PF_X, #pixblock_size
    process_pixblock_head
    cache_preload 0, pixblock_size
    cache_preload_simple
    subs W, W, #(pixblock_size * 2)
    blt 2f
1:
    process_pixblock_tail_head
    cache_preload_simple
    subs W, W, #pixblock_size
    bge 1b
2:
    process_pixblock_tail
    pixst_a pixblock_size, dst_w_bpp, \
            (dst_w_basereg - pixblock_size * dst_w_bpp / 64), DST_W

    /* Process the remaining trailing pixels in the scanline */
    process_trailing_pixels 1, 1, \
                            process_pixblock_head, \
                            process_pixblock_tail, \
                            process_pixblock_tail_head
    advance_to_next_scanline 0b

.if regs_shortage
    pop {r0, r1}
.endif
    cleanup
    pop {r4-r12, pc} /* exit */
/*
 * This is the start of the loop, designed to process images with small
 * width (less than pixblock_size * 2 pixels). In this case neither
 * pipelining nor prefetch is used.
 */
8:
    /* Process exactly pixblock_size pixels if needed */
    tst W, #pixblock_size
    beq 1f
    pixld pixblock_size, dst_r_bpp, \
          (dst_r_basereg - pixblock_size * dst_r_bpp / 64), DST_R
    pixld pixblock_size, src_bpp, \
          (src_basereg - pixblock_size * src_bpp / 64), SRC
    pixld pixblock_size, mask_bpp, \
          (mask_basereg - pixblock_size * mask_bpp / 64), MASK
    process_pixblock_head
    process_pixblock_tail
    pixst pixblock_size, dst_w_bpp, \
          (dst_w_basereg - pixblock_size * dst_w_bpp / 64), DST_W
1:
    /* Process the remaining trailing pixels in the scanline */
    process_trailing_pixels 0, 0, \
                            process_pixblock_head, \
                            process_pixblock_tail, \
                            process_pixblock_tail_head
    advance_to_next_scanline 8b
9:
.if regs_shortage
    pop {r0, r1}
.endif
    cleanup
    pop {r4-r12, pc} /* exit */

    .unreq SRC
    .unreq MASK
    .unreq DST_R
    .unreq DST_W
    .unreq ORIG_W
    .unreq W
    .unreq H
    .unreq SRC_STRIDE
    .unreq DST_STRIDE
    .unreq MASK_STRIDE
    .unreq PF_CTL
    .unreq PF_X
    .unreq PF_SRC
    .unreq PF_DST
    .unreq PF_MASK
    .unreq DUMMY
.endfunc
.endm

/*
 * A simplified variant of the function generation template for single
 * scanline processing (for implementing pixman combine functions).
 */
.macro generate_composite_function_single_scanline fname, \
                                                   src_bpp_, \
                                                   mask_bpp_, \
                                                   dst_w_bpp_, \
                                                   flags, \
                                                   pixblock_size_, \
                                                   init, \
                                                   cleanup, \
                                                   process_pixblock_head, \
                                                   process_pixblock_tail, \
                                                   process_pixblock_tail_head, \
                                                   dst_w_basereg_ = 28, \
                                                   dst_r_basereg_ = 4, \
                                                   src_basereg_   = 0, \
                                                   mask_basereg_  = 24

.func fname
.global fname
/* For ELF format also set function visibility to hidden */
#ifdef __ELF__
.hidden fname
.type fname, %function
#endif
fname:
.set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_NONE
/*
 * Make some macro arguments globally visible and accessible
 * from other macros.
 */
.set src_bpp, src_bpp_
.set mask_bpp, mask_bpp_
.set dst_w_bpp, dst_w_bpp_
.set pixblock_size, pixblock_size_
.set dst_w_basereg, dst_w_basereg_
.set dst_r_basereg, dst_r_basereg_
.set src_basereg, src_basereg_
.set mask_basereg, mask_basereg_
/*
 * Assign symbolic names to registers.
 */
    W           .req r0  /* width (is updated during processing) */
    DST_W       .req r1  /* destination buffer pointer for writes */
    SRC         .req r2  /* source buffer pointer */
    DST_R       .req ip  /* destination buffer pointer for reads */
    MASK        .req r3  /* mask pointer */

.if (((flags) & FLAG_DST_READWRITE) != 0)
    .set dst_r_bpp, dst_w_bpp
.else
    .set dst_r_bpp, 0
.endif
.if (((flags) & FLAG_DEINTERLEAVE_32BPP) != 0)
    .set DEINTERLEAVE_32BPP_ENABLED, 1
.else
    .set DEINTERLEAVE_32BPP_ENABLED, 0
.endif

    init
    mov DST_R, DST_W

    cmp W, #pixblock_size
    blt 8f

    ensure_destination_ptr_alignment process_pixblock_head, \
                                     process_pixblock_tail, \
                                     process_pixblock_tail_head

    subs W, W, #pixblock_size
    blt 7f

    /* Implement "head (tail_head) ... (tail_head) tail" loop pattern */
    pixld_a pixblock_size, dst_r_bpp, \
            (dst_r_basereg - pixblock_size * dst_r_bpp / 64), DST_R
    pixld pixblock_size, src_bpp, \
          (src_basereg - pixblock_size * src_bpp / 64), SRC
    pixld pixblock_size, mask_bpp, \
          (mask_basereg - pixblock_size * mask_bpp / 64), MASK
    process_pixblock_head
    subs W, W, #pixblock_size
    blt 2f
1:
    process_pixblock_tail_head
    subs W, W, #pixblock_size
    bge 1b
2:
    process_pixblock_tail
    pixst_a pixblock_size, dst_w_bpp, \
            (dst_w_basereg - pixblock_size * dst_w_bpp / 64), DST_W
7:
    /* Process the remaining trailing pixels in the scanline (dst aligned) */
    process_trailing_pixels 0, 1, \
                            process_pixblock_head, \
                            process_pixblock_tail, \
                            process_pixblock_tail_head

    cleanup
    bx lr /* exit */
8:
    /* Process the remaining trailing pixels in the scanline (dst unaligned) */
    process_trailing_pixels 0, 0, \
                            process_pixblock_head, \
                            process_pixblock_tail, \
                            process_pixblock_tail_head

    cleanup
    bx lr /* exit */

    .unreq SRC
    .unreq MASK
    .unreq DST_R
    .unreq DST_W
    .unreq W
.endfunc
.endm
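
/*
 * A minimal usage sketch for the single scanline variant (kept inside a
 * comment; the names are assumptions modeled on 'pixman-arm-neon-asm.S',
 * and the process_pixblock_* macros must be defined by the user):
 *
 *     generate_composite_function_single_scanline \
 *         pixman_composite_scanline_over_asm_neon, 32, 0, 32, \
 *         FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
 *         8,    <- number of pixels processed in a single block
 *         default_init, \
 *         default_cleanup, \
 *         pixman_composite_over_8888_8888_process_pixblock_head, \
 *         pixman_composite_over_8888_8888_process_pixblock_tail, \
 *         pixman_composite_over_8888_8888_process_pixblock_tail_head
 */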
.macro default_init
.endm

.macro default_cleanup
.endm