1 | /* match.s -- optional optimized asm version of longest match in deflate.c
|
---|
2 |
|
---|
3 | Copyright (C) 2002, 2006 Free Software Foundation, Inc.
|
---|
4 | Copyright (C) 1992-1993 Jean-loup Gailly
|
---|
5 |
|
---|
6 | This program is free software; you can redistribute it and/or modify
|
---|
7 | it under the terms of the GNU General Public License as published by
|
---|
8 | the Free Software Foundation; either version 2, or (at your option)
|
---|
9 | any later version.
|
---|
10 |
|
---|
11 | This program is distributed in the hope that it will be useful,
|
---|
12 | but WITHOUT ANY WARRANTY; without even the implied warranty of
|
---|
13 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
---|
14 | GNU General Public License for more details.
|
---|
15 |
|
---|
16 | You should have received a copy of the GNU General Public License
|
---|
17 | along with this program; if not, write to the Free Software Foundation,
|
---|
18 | Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */
|
---|
19 |
|
---|
20 | /*
|
---|
21 | * The 68020 version has been written by Francesco Potorti` <pot@cnuce.cnr.it>
|
---|
22 | * with adaptations by Carsten Steger <stegerc@informatik.tu-muenchen.de>,
|
---|
23 | * Andreas Schwab <schwab@lamothe.informatik.uni-dortmund.de> and
|
---|
24 | * Kristoffer Eriksson <ske@pkmab.se>
|
---|
25 | *
|
---|
26 | * The ia64 version has been written by Sverre Jarp (HP Labs) 2001-2002.
|
---|
27 | * Unwind directives and some reformatting for better readability added by
|
---|
28 | * David Mosberger-Tang <davidm@hpl.hp.com>.
|
---|
29 | */
|
---|
30 |
|
---|
31 | /* $Id: match.c,v 1.1 2006/11/20 08:40:34 eggert Exp $ */
|
---|
32 |
|
---|
33 | /* Preprocess with -DNO_UNDERLINE if your C compiler does not prefix
|
---|
34 | * external symbols with an underline character '_'.
|
---|
35 | */
|
---|
36 | #ifdef NO_UNDERLINE
|
---|
37 | # define _prev prev
|
---|
38 | # define _window window
|
---|
39 | # define _match_start match_start
|
---|
40 | # define _prev_length prev_length
|
---|
41 | # define _good_match good_match
|
---|
42 | # define _nice_match nice_match
|
---|
43 | # define _strstart strstart
|
---|
44 | # define _max_chain_length max_chain_length
|
---|
45 |
|
---|
46 | # define _match_init match_init
|
---|
47 | # define _longest_match longest_match
|
---|
48 | #endif
|
---|
49 |
|
---|
50 | #ifdef DYN_ALLOC
|
---|
51 | error: DYN_ALLOC not yet supported in match.s
|
---|
52 | #endif
|
---|
53 |
|
---|
54 | #if defined(i386) || defined(_I386) || defined(__i386) || defined(__i386__)
|
---|
55 |
|
---|
56 | /* This version is for 386 Unix or OS/2 in 32 bit mode.
|
---|
57 | * Warning: it uses the AT&T syntax: mov source,dest
|
---|
58 | * This file is only optional. If you want to force the C version,
|
---|
59 | * add -DNO_ASM to CFLAGS in Makefile and set OBJA to an empty string.
|
---|
60 | * If you have reduced WSIZE in gzip.h, then change its value below.
|
---|
61 | * This version assumes static allocation of the arrays (-DDYN_ALLOC not used).
|
---|
62 | */
|
---|
63 |
|
---|
64 | .file "match.S"
|
---|
65 |
|
---|
66 | #define MAX_MATCH 258
|
---|
67 | #define MAX_MATCH2 $128 /* MAX_MATCH/2-1 */
|
---|
68 | #define MIN_MATCH 3
|
---|
69 | #define WSIZE $32768
|
---|
70 | #define MAX_DIST WSIZE - MAX_MATCH - MIN_MATCH - 1
|
---|
71 |
|
---|
72 | .globl _match_init
|
---|
73 | .globl _longest_match
|
---|
74 |
|
---|
75 | .text
|
---|
76 |
|
---|
77 | _match_init:
|
---|
78 | ret
|
---|
79 |
|
---|
80 | /*-----------------------------------------------------------------------
|
---|
81 | * Set match_start to the longest match starting at the given string and
|
---|
82 | * return its length. Matches shorter or equal to prev_length are discarded,
|
---|
83 | * in which case the result is equal to prev_length and match_start is
|
---|
84 | * garbage.
|
---|
85 | * IN assertions: cur_match is the head of the hash chain for the current
|
---|
86 | * string (strstart) and its distance is <= MAX_DIST, and prev_length >= 1
|
---|
87 | */
|
---|
88 |
|
---|
89 | _longest_match: /* int longest_match(cur_match) */
|
---|
90 |
|
---|
91 | #define cur_match 20(%esp)
|
---|
92 | /* return address */ /* esp+16 */
|
---|
93 | push %ebp /* esp+12 */
|
---|
94 | push %edi /* esp+8 */
|
---|
95 | push %esi /* esp+4 */
|
---|
96 | push %ebx /* esp */
|
---|
97 |
|
---|
98 | /*
|
---|
99 | * match equ esi
|
---|
100 | * scan equ edi
|
---|
101 | * chain_length equ ebp
|
---|
102 | * best_len equ ebx
|
---|
103 | * limit equ edx
|
---|
104 | */
|
---|
105 | mov cur_match,%esi
|
---|
106 | mov _max_chain_length,%ebp /* chain_length = max_chain_length */
|
---|
107 | mov _strstart,%edi
|
---|
108 | mov %edi,%edx
|
---|
109 | sub MAX_DIST,%edx /* limit = strstart-MAX_DIST */
|
---|
110 | jae limit_ok
|
---|
111 | sub %edx,%edx /* limit = NIL */
|
---|
112 | limit_ok:
|
---|
113 | add $2+_window,%edi /* edi = offset(window+strstart+2) */
|
---|
114 | mov _prev_length,%ebx /* best_len = prev_length */
|
---|
115 | movw -3(%ebx,%edi),%ax /* ax = scan[best_len-1..best_len] */
|
---|
116 | movw -2(%edi),%cx /* cx = scan[0..1] */
|
---|
117 | cmp _good_match,%ebx /* do we have a good match already? */
|
---|
118 | jb do_scan
|
---|
119 | shr $2,%ebp /* chain_length >>= 2 */
|
---|
120 | jmp do_scan
|
---|
121 |
|
---|
122 | .align 4
|
---|
123 | long_loop:
|
---|
124 | /* at this point, edi == scan+2, esi == cur_match */
|
---|
125 | movw -3(%ebx,%edi),%ax /* ax = scan[best_len-1..best_len] */
|
---|
126 | movw -2(%edi),%cx /* cx = scan[0..1] */
|
---|
127 | short_loop:
|
---|
128 | /*
|
---|
129 | * at this point, di == scan+2, si == cur_match,
|
---|
130 | * ax = scan[best_len-1..best_len] and cx = scan[0..1]
|
---|
131 | */
|
---|
132 | and WSIZE-1, %esi
|
---|
133 | movw _prev(%esi,%esi),%si /* cur_match = prev[cur_match] */
|
---|
134 | /* top word of esi is still 0 */
|
---|
135 | cmp %edx,%esi /* cur_match <= limit ? */
|
---|
136 | jbe the_end
|
---|
137 | dec %ebp /* --chain_length */
|
---|
138 | jz the_end
|
---|
139 | do_scan:
|
---|
140 | cmpw _window-1(%ebx,%esi),%ax/* check match at best_len-1 */
|
---|
141 | jne short_loop
|
---|
142 | cmpw _window(%esi),%cx /* check min_match_length match */
|
---|
143 | jne short_loop
|
---|
144 |
|
---|
145 | lea _window+2(%esi),%esi /* si = match */
|
---|
146 | mov %edi,%eax /* ax = scan+2 */
|
---|
147 | mov MAX_MATCH2,%ecx /* scan for at most MAX_MATCH bytes */
|
---|
148 | rep; cmpsw /* loop until mismatch */
|
---|
149 | je maxmatch /* match of length MAX_MATCH? */
|
---|
150 | mismatch:
|
---|
151 | movb -2(%edi),%cl /* mismatch on first or second byte? */
|
---|
152 | subb -2(%esi),%cl /* cl = 0 if first bytes equal */
|
---|
153 | xchg %edi,%eax /* edi = scan+2, eax = end of scan */
|
---|
154 | sub %edi,%eax /* eax = len */
|
---|
155 | sub %eax,%esi /* esi = cur_match + 2 + offset(window) */
|
---|
156 | sub $2+_window,%esi /* esi = cur_match */
|
---|
157 | subb $1,%cl /* set carry if cl == 0 (cannot use DEC) */
|
---|
158 | adc $0,%eax /* eax = carry ? len+1 : len */
|
---|
159 | cmp %ebx,%eax /* len > best_len ? */
|
---|
160 | jle long_loop
|
---|
161 | mov %esi,_match_start /* match_start = cur_match */
|
---|
162 | mov %eax,%ebx /* ebx = best_len = len */
|
---|
163 | cmp _nice_match,%eax /* len >= nice_match ? */
|
---|
164 | jl long_loop
|
---|
165 | the_end:
|
---|
166 | mov %ebx,%eax /* result = eax = best_len */
|
---|
167 | pop %ebx
|
---|
168 | pop %esi
|
---|
169 | pop %edi
|
---|
170 | pop %ebp
|
---|
171 | ret
|
---|
172 | maxmatch:
|
---|
173 | cmpsb
|
---|
174 | jmp mismatch
|
---|
175 |
|
---|
176 | #else
|
---|
177 |
|
---|
178 | /* ======================== 680x0 version ================================= */
|
---|
179 |
|
---|
180 | #if defined(m68k)||defined(mc68k)||defined(__mc68000__)||defined(__MC68000__)
|
---|
181 | # ifndef mc68000
|
---|
182 | # define mc68000
|
---|
183 | # endif
|
---|
184 | #endif
|
---|
185 |
|
---|
186 | #if defined(__mc68020__) || defined(__MC68020__) || defined(sysV68)
|
---|
187 | # ifndef mc68020
|
---|
188 | # define mc68020
|
---|
189 | # endif
|
---|
190 | #endif
|
---|
191 |
|
---|
192 | #if defined(mc68020) || defined(mc68000)
|
---|
193 |
|
---|
194 | #if (defined(mc68020) || defined(NeXT)) && !defined(UNALIGNED_OK)
|
---|
195 | # define UNALIGNED_OK
|
---|
196 | #endif
|
---|
197 |
|
---|
198 | #ifdef sysV68 /* Try Motorola Delta style */
|
---|
199 |
|
---|
200 | # define GLOBAL(symbol) global symbol
|
---|
201 | # define TEXT text
|
---|
202 | # define FILE(filename) file filename
|
---|
203 | # define invert_maybe(src,dst) dst,src
|
---|
204 | # define imm(data) &data
|
---|
205 | # define reg(register) %register
|
---|
206 |
|
---|
207 | # define addl add.l
|
---|
208 | # define addql addq.l
|
---|
209 | # define blos blo.b
|
---|
210 | # define bhis bhi.b
|
---|
211 | # define bras bra.b
|
---|
212 | # define clrl clr.l
|
---|
213 | # define cmpmb cmpm.b
|
---|
214 | # define cmpw cmp.w
|
---|
215 | # define cmpl cmp.l
|
---|
216 | # define lslw lsl.w
|
---|
217 | # define lsrl lsr.l
|
---|
218 | # define movel move.l
|
---|
219 | # define movew move.w
|
---|
220 | # define moveb move.b
|
---|
221 | # define moveml movem.l
|
---|
222 | # define subl sub.l
|
---|
223 | # define subw sub.w
|
---|
224 | # define subql subq.l
|
---|
225 |
|
---|
226 | # define IndBase(bd,An) (bd,An)
|
---|
227 | # define IndBaseNdxl(bd,An,Xn) (bd,An,Xn.l)
|
---|
228 | # define IndBaseNdxw(bd,An,Xn) (bd,An,Xn.w)
|
---|
229 | # define predec(An) -(An)
|
---|
230 | # define postinc(An) (An)+
|
---|
231 |
|
---|
232 | #else /* default style (Sun 3, NeXT, Amiga, Atari) */
|
---|
233 |
|
---|
234 | # define GLOBAL(symbol) .globl symbol
|
---|
235 | # define TEXT .text
|
---|
236 | # define FILE(filename) .even
|
---|
237 | # define invert_maybe(src,dst) src,dst
|
---|
238 | # if defined(sun) || defined(mc68k)
|
---|
239 | # define imm(data) #data
|
---|
240 | # else
|
---|
241 | # define imm(data) \#data
|
---|
242 | # endif
|
---|
243 | # define reg(register) register
|
---|
244 |
|
---|
245 | # define blos bcss
|
---|
246 | # if defined(sun) || defined(mc68k)
|
---|
247 | # define movel movl
|
---|
248 | # define movew movw
|
---|
249 | # define moveb movb
|
---|
250 | # endif
|
---|
251 | # define IndBase(bd,An) An@(bd)
|
---|
252 | # define IndBaseNdxl(bd,An,Xn) An@(bd,Xn:l)
|
---|
253 | # define IndBaseNdxw(bd,An,Xn) An@(bd,Xn:w)
|
---|
254 | # define predec(An) An@-
|
---|
255 | # define postinc(An) An@+
|
---|
256 |
|
---|
257 | #endif /* styles */
|
---|
258 |
|
---|
259 | #define Best_Len reg(d0) /* unsigned */
|
---|
260 | #define Cur_Match reg(d1) /* Ipos */
|
---|
261 | #define Loop_Counter reg(d2) /* int */
|
---|
262 | #define Scan_Start reg(d3) /* unsigned short */
|
---|
263 | #define Scan_End reg(d4) /* unsigned short */
|
---|
264 | #define Limit reg(d5) /* IPos */
|
---|
265 | #define Chain_Length reg(d6) /* unsigned */
|
---|
266 | #define Scan_Test reg(d7)
|
---|
267 | #define Scan reg(a0) /* *uch */
|
---|
268 | #define Match reg(a1) /* *uch */
|
---|
269 | #define Prev_Address reg(a2) /* *Pos */
|
---|
270 | #define Scan_Ini reg(a3) /* *uch */
|
---|
271 | #define Match_Ini reg(a4) /* *uch */
|
---|
272 | #define Stack_Pointer reg(sp)
|
---|
273 |
|
---|
274 | #define MAX_MATCH 258
|
---|
275 | #define MIN_MATCH 3
|
---|
276 | #define WSIZE 32768
|
---|
277 | #define MAX_DIST (WSIZE - MAX_MATCH - MIN_MATCH - 1)
|
---|
278 |
|
---|
279 | GLOBAL (_match_init)
|
---|
280 | GLOBAL (_longest_match)
|
---|
281 |
|
---|
282 | TEXT
|
---|
283 |
|
---|
284 | FILE ("match.S")
|
---|
285 |
|
---|
286 | _match_init:
|
---|
287 | rts
|
---|
288 |
|
---|
289 | /*-----------------------------------------------------------------------
|
---|
290 | * Set match_start to the longest match starting at the given string and
|
---|
291 | * return its length. Matches shorter or equal to prev_length are discarded,
|
---|
292 | * in which case the result is equal to prev_length and match_start is
|
---|
293 | * garbage.
|
---|
294 | * IN assertions: cur_match is the head of the hash chain for the current
|
---|
295 | * string (strstart) and its distance is <= MAX_DIST, and prev_length >= 1
|
---|
296 | */
|
---|
297 |
|
---|
298 | /* int longest_match (cur_match) */
|
---|
299 |
|
---|
300 | #ifdef UNALIGNED_OK
|
---|
301 | # define pushreg 15928 /* d2-d6/a2-a4 */
|
---|
302 | # define popreg 7292
|
---|
303 | #else
|
---|
304 | # define pushreg 16184 /* d2-d7/a2-a4 */
|
---|
305 | # define popreg 7420
|
---|
306 | #endif
|
---|
307 |
|
---|
308 | _longest_match:
|
---|
309 | movel IndBase(4,Stack_Pointer),Cur_Match
|
---|
310 | moveml imm(pushreg),predec(Stack_Pointer)
|
---|
311 | movel _max_chain_length,Chain_Length
|
---|
312 | movel _prev_length,Best_Len
|
---|
313 | movel imm(_prev),Prev_Address
|
---|
314 | movel imm(_window+MIN_MATCH),Match_Ini
|
---|
315 | movel _strstart,Limit
|
---|
316 | movel Match_Ini,Scan_Ini
|
---|
317 | addl Limit,Scan_Ini
|
---|
318 | subw imm(MAX_DIST),Limit
|
---|
319 | bhis L__limit_ok
|
---|
320 | clrl Limit
|
---|
321 | L__limit_ok:
|
---|
322 | cmpl invert_maybe(_good_match,Best_Len)
|
---|
323 | blos L__length_ok
|
---|
324 | lsrl imm(2),Chain_Length
|
---|
325 | L__length_ok:
|
---|
326 | subql imm(1),Chain_Length
|
---|
327 | #ifdef UNALIGNED_OK
|
---|
328 | movew IndBase(-MIN_MATCH,Scan_Ini),Scan_Start
|
---|
329 | movew IndBaseNdxw(-MIN_MATCH-1,Scan_Ini,Best_Len),Scan_End
|
---|
330 | #else
|
---|
331 | moveb IndBase(-MIN_MATCH,Scan_Ini),Scan_Start
|
---|
332 | lslw imm(8),Scan_Start
|
---|
333 | moveb IndBase(-MIN_MATCH+1,Scan_Ini),Scan_Start
|
---|
334 | moveb IndBaseNdxw(-MIN_MATCH-1,Scan_Ini,Best_Len),Scan_End
|
---|
335 | lslw imm(8),Scan_End
|
---|
336 | moveb IndBaseNdxw(-MIN_MATCH,Scan_Ini,Best_Len),Scan_End
|
---|
337 | #endif
|
---|
338 | bras L__do_scan
|
---|
339 |
|
---|
340 | L__long_loop:
|
---|
341 | #ifdef UNALIGNED_OK
|
---|
342 | movew IndBaseNdxw(-MIN_MATCH-1,Scan_Ini,Best_Len),Scan_End
|
---|
343 | #else
|
---|
344 | moveb IndBaseNdxw(-MIN_MATCH-1,Scan_Ini,Best_Len),Scan_End
|
---|
345 | lslw imm(8),Scan_End
|
---|
346 | moveb IndBaseNdxw(-MIN_MATCH,Scan_Ini,Best_Len),Scan_End
|
---|
347 | #endif
|
---|
348 |
|
---|
349 | L__short_loop:
|
---|
350 | lslw imm(1),Cur_Match
|
---|
351 | movew IndBaseNdxl(0,Prev_Address,Cur_Match),Cur_Match
|
---|
352 | cmpw invert_maybe(Limit,Cur_Match)
|
---|
353 | dbls Chain_Length,L__do_scan
|
---|
354 | bras L__return
|
---|
355 |
|
---|
356 | L__do_scan:
|
---|
357 | movel Match_Ini,Match
|
---|
358 | addl Cur_Match,Match
|
---|
359 | #ifdef UNALIGNED_OK
|
---|
360 | cmpw invert_maybe(IndBaseNdxw(-MIN_MATCH-1,Match,Best_Len),Scan_End)
|
---|
361 | bne L__short_loop
|
---|
362 | cmpw invert_maybe(IndBase(-MIN_MATCH,Match),Scan_Start)
|
---|
363 | bne L__short_loop
|
---|
364 | #else
|
---|
365 | moveb IndBaseNdxw(-MIN_MATCH-1,Match,Best_Len),Scan_Test
|
---|
366 | lslw imm(8),Scan_Test
|
---|
367 | moveb IndBaseNdxw(-MIN_MATCH,Match,Best_Len),Scan_Test
|
---|
368 | cmpw invert_maybe(Scan_Test,Scan_End)
|
---|
369 | bne L__short_loop
|
---|
370 | moveb IndBase(-MIN_MATCH,Match),Scan_Test
|
---|
371 | lslw imm(8),Scan_Test
|
---|
372 | moveb IndBase(-MIN_MATCH+1,Match),Scan_Test
|
---|
373 | cmpw invert_maybe(Scan_Test,Scan_Start)
|
---|
374 | bne L__short_loop
|
---|
375 | #endif
|
---|
376 |
|
---|
377 | movew imm((MAX_MATCH-MIN_MATCH+1)-1),Loop_Counter
|
---|
378 | movel Scan_Ini,Scan
|
---|
379 | L__scan_loop:
|
---|
380 | cmpmb postinc(Match),postinc(Scan)
|
---|
381 | dbne Loop_Counter,L__scan_loop
|
---|
382 |
|
---|
383 | subl Scan_Ini,Scan
|
---|
384 | addql imm(MIN_MATCH-1),Scan
|
---|
385 | cmpl invert_maybe(Best_Len,Scan)
|
---|
386 | bls L__short_loop
|
---|
387 | movel Scan,Best_Len
|
---|
388 | movel Cur_Match,_match_start
|
---|
389 | cmpl invert_maybe(_nice_match,Best_Len)
|
---|
390 | blos L__long_loop
|
---|
391 | L__return:
|
---|
392 | moveml postinc(Stack_Pointer),imm(popreg)
|
---|
393 | rts
|
---|
394 |
|
---|
395 | #else
|
---|
396 |
|
---|
397 | # if defined (__ia64__)
|
---|
398 |
|
---|
399 | /* ======================== ia64 version ================================= */
|
---|
400 |
|
---|
401 | /*
|
---|
402 | * 'longest_match.S' (assembly program for gzip for the IA-64 architecture)
|
---|
403 | *
|
---|
404 | * Optimised for McKinley, but with Merced-compatibility, such as MIB+MIB, used wherever
|
---|
405 | * possible.
|
---|
406 | *
|
---|
407 | * Copyright: Sverre Jarp (HP Labs) 2001-2002
|
---|
408 | *
|
---|
409 | * See deflate.c for c-version
|
---|
410 | * Version 2 - Optimize the outer loop
|
---|
411 | */
|
---|
412 |
|
---|
413 | #include <endian.h>
|
---|
414 |
|
---|
415 | #if __BYTE_ORDER == ____BIG_ENDIAN
|
---|
416 | #define first shl
|
---|
417 | #define second shr.u
|
---|
418 | #define count czx1.l
|
---|
419 | #else
|
---|
420 | #define first shr.u
|
---|
421 | #define second shl
|
---|
422 | #define count czx1.r
|
---|
423 | #endif
|
---|
424 |
|
---|
425 | // 24 rotating register (r32 - r55)
|
---|
426 |
|
---|
427 | #define s_vmatch0 r32
|
---|
428 | #define s_vmatch1 r33
|
---|
429 | #define s_vmatbst r34
|
---|
430 | #define s_vmatbst1 r35
|
---|
431 | #define s_amatblen r36
|
---|
432 |
|
---|
433 | #define s_tm1 r56
|
---|
434 | #define s_tm2 r57
|
---|
435 | #define s_tm3 r58
|
---|
436 | #define s_tm4 r59
|
---|
437 | #define s_tm5 r60
|
---|
438 | #define s_tm6 r61
|
---|
439 | #define s_tm7 r62
|
---|
440 | #define s_tm8 r63
|
---|
441 |
|
---|
442 | #define s_vlen r31
|
---|
443 | #define s_vstrstart r30
|
---|
444 | #define s_vchainlen r29
|
---|
445 | #define s_awinbest r28
|
---|
446 | #define s_vcurmatch r27
|
---|
447 | #define s_vlimit r26
|
---|
448 | #define s_vscanend r25
|
---|
449 | #define s_vscanend1 r24
|
---|
450 | #define s_anicematch r23
|
---|
451 | #define s_vscan0 r22
|
---|
452 | #define s_vscan1 r21
|
---|
453 |
|
---|
454 | #define s_aprev r20
|
---|
455 | #define s_awindow r19
|
---|
456 | #define s_amatchstart r18
|
---|
457 | #define s_ascan r17
|
---|
458 | #define s_amatch r16
|
---|
459 | #define s_wmask r15
|
---|
460 | #define s_ascanend r14
|
---|
461 |
|
---|
462 | #define s_vspec_cmatch r11 // next iteration
|
---|
463 | #define s_lcsave r10
|
---|
464 | #define s_prsave r9
|
---|
465 | #define s_vbestlen r8 // return register
|
---|
466 |
|
---|
467 | #define s_vscan3 r3
|
---|
468 | #define s_vmatch3 r2
|
---|
469 |
|
---|
470 | #define p_no p2
|
---|
471 | #define p_yes p3
|
---|
472 | #define p_shf p4 //
|
---|
473 | #define p_bn2 p5 // Use in loop (indicating bestlen != 2)
|
---|
474 |
|
---|
475 | #define p_nbs p9 // not new best_len
|
---|
476 | #define p_nnc p10 // not nice_length
|
---|
477 | #define p_ll p11
|
---|
478 | #define p_end p12
|
---|
479 |
|
---|
480 | #define MAX_MATCH 258
|
---|
481 | #define MIN_MATCH 4
|
---|
482 | #define WSIZE 32768
|
---|
483 | #define MAX_DIST WSIZE - MAX_MATCH - MIN_MATCH - 1
|
---|
484 |
|
---|
485 | #define R_INPUT 1
|
---|
486 | #define R_LOCAL 31
|
---|
487 | #define R_OUTPUT 0
|
---|
488 | #define R_ROTATING 24
|
---|
489 | #define MLAT 3
|
---|
490 | #define SHLAT 2
|
---|
491 |
|
---|
492 | #define mova mov
|
---|
493 | #define movi0 mov
|
---|
494 | #define cgtu cmp.gt.unc
|
---|
495 | #define cgeu cmp.ge.unc
|
---|
496 | #define cneu cmp.ne.unc
|
---|
497 |
|
---|
498 | .global longest_match
|
---|
499 | .proc longest_match
|
---|
500 | .align 32
|
---|
501 | longest_match:
|
---|
502 | // -- Cycle: 0
|
---|
503 | .prologue
|
---|
504 | {.mmi
|
---|
505 | alloc r2=ar.pfs,R_INPUT,R_LOCAL,R_OUTPUT,R_ROTATING
|
---|
506 | .rotr scan[MLAT+2], match[MLAT+2], shscan0[SHLAT+1], \
|
---|
507 | shscan1[SHLAT+1], shmatch0[SHLAT+1], shmatch1[SHLAT+1]
|
---|
508 | .rotp lc[MLAT+SHLAT+2]
|
---|
509 | mova s_vspec_cmatch=in0 // cur_match from input register
|
---|
510 | add s_tm1=@gprel(strstart),gp // a(a(strstart))
|
---|
511 | }{.mmi
|
---|
512 | add s_tm3=@gprel(prev_length),gp // a(a(prev_length))
|
---|
513 | add s_tm5=@ltoff(window),gp // a(a(window))
|
---|
514 | add s_tm6=@ltoff(prev),gp // a(a(prev))
|
---|
515 | ;;
|
---|
516 | }{.mmb // Cycle: 1
|
---|
517 | ld4 s_vstrstart=[s_tm1] // strstart
|
---|
518 | ld4 s_vbestlen=[s_tm3] // best_len = prev_length
|
---|
519 | brp.loop.imp .cmploop,.cmploop+48
|
---|
520 | }{.mli
|
---|
521 | add s_tm2=@gprel(max_chain_length),gp // a(a(max_chain_length))
|
---|
522 | movl s_wmask=WSIZE-1
|
---|
523 | ;;
|
---|
524 | }{.mmi // Cycle: 2
|
---|
525 | ld8 s_aprev=[s_tm6] // a(prev)
|
---|
526 | ld8 s_awindow=[s_tm5] // a(window)
|
---|
527 | .save pr, s_prsave
|
---|
528 | movi0 s_prsave=pr // save predicates
|
---|
529 | }{.mmi
|
---|
530 | add s_tm4=@gprel(good_match),gp // a(a(good_match))
|
---|
531 | add s_tm7=@ltoff(nice_match),gp // a(a(nice_match))
|
---|
532 | add s_tm8=@ltoff(match_start),gp // a(match_start)
|
---|
533 | ;;
|
---|
534 | }{.mmi // Cycle: 3
|
---|
535 | ld8 s_anicematch=[s_tm7] // a(nice_match)
|
---|
536 | ld8 s_amatchstart=[s_tm8] // a(match_start)
|
---|
537 | .save ar.lc, s_lcsave
|
---|
538 | movi0 s_lcsave=ar.lc // save loop count register
|
---|
539 | }{.mmi
|
---|
540 | .body
|
---|
541 | add s_tm1=-(MAX_MATCH + MIN_MATCH),s_wmask // maxdist
|
---|
542 | cmp.eq p_ll,p0=r0,r0 // parallel compare initialized as 'true'
|
---|
543 | mova s_vcurmatch=s_vspec_cmatch
|
---|
544 | ;;
|
---|
545 | }{.mmi // Cycle: 4
|
---|
546 | ld4 s_vchainlen=[s_tm2] // chain_length=max_chain_length
|
---|
547 | ld4 s_tm4=[s_tm4] // v(good_match)
|
---|
548 | add s_ascan=s_awindow,s_vstrstart // scan=window + strstart
|
---|
549 | }{.mmi
|
---|
550 | sub s_vlimit=s_vstrstart, s_tm1 // limit=strstart - MAX_DIST
|
---|
551 | add s_amatch=s_awindow,s_vspec_cmatch // match=window + cur_match
|
---|
552 | and s_vspec_cmatch =s_vspec_cmatch,s_wmask
|
---|
553 | ;;
|
---|
554 | }{.mmi // Cycle: 5
|
---|
555 | add s_amatblen=s_amatch,s_vbestlen //
|
---|
556 | cneu p_bn2,p0=2,s_vbestlen // set if bestlen != 2
|
---|
557 | add s_ascanend=s_ascan,s_vbestlen // compute a(scan) + best_len
|
---|
558 | }{.mmi
|
---|
559 | ld1 s_vscan0=[s_ascan],1 // NB: s_ascan++
|
---|
560 | ld1 s_vmatch0=[s_amatch],1
|
---|
561 | cgtu p0,p_no=s_vlimit,r0 // is result positive ?
|
---|
562 | ;;
|
---|
563 | }{.mmi // Cycle: 6
|
---|
564 | ld1.nt1 s_vscan1=[s_ascan],2 // NB: s_ascan+3 in total
|
---|
565 | ld1.nt1 s_vmatch1=[s_amatch],2
|
---|
566 | add s_awinbest=s_awindow,s_vbestlen //
|
---|
567 | ;;
|
---|
568 | }{.mmi // Cycle: 7
|
---|
569 | ld1.nt1 s_vscanend=[s_ascanend],-1 // scan_end=scan[best_len]
|
---|
570 | ld1.nt1 s_vmatbst=[s_amatblen],-1
|
---|
571 | (p_no) mova s_vlimit=r0
|
---|
572 | ;;
|
---|
573 | }{.mmi // Cycle: 8
|
---|
574 | (p_bn2) ld1.nt1 s_vscanend1=[s_ascanend],1 // scan_end1=scan[best_len-1]
|
---|
575 | (p_bn2) ld1.nt1 s_vmatbst1=[s_amatblen]
|
---|
576 | shladd s_vspec_cmatch =s_vspec_cmatch,1,s_aprev
|
---|
577 | }{.mmi
|
---|
578 | cgeu p_shf,p0=s_vbestlen,s_tm4 // is (prev_length >= good_match) ?
|
---|
579 | ;;
|
---|
580 | }{.mmi // Cycle: 9
|
---|
581 | ld1.nt1 s_vscan3=[s_ascan]
|
---|
582 | ld2.nt1 s_vspec_cmatch=[s_vspec_cmatch]
|
---|
583 | mova s_vlen=3
|
---|
584 | }{.mmi
|
---|
585 | (p_shf) shr.u s_vchainlen=s_vchainlen,2 // (cur_len) >> 2
|
---|
586 | ;;
|
---|
587 | }{.mmi // Cycle: 10
|
---|
588 | ld1.nt1 s_vmatch3=[s_amatch]
|
---|
589 | // p_ll switched on as soon as we get a mismatch:
|
---|
590 | cmp.eq.and p_ll,p0=s_vmatch0,s_vscan0
|
---|
591 | cmp.eq.and p_ll,p0=s_vmatbst,s_vscanend
|
---|
592 | }{.mib
|
---|
593 | cmp.eq.and p_ll,p0=s_vmatch1,s_vscan1
|
---|
594 | (p_bn2) cmp.eq.and p_ll,p0=s_vmatbst1,s_vscanend1
|
---|
595 | (p_ll) br.cond.dpnt.many .test_more
|
---|
596 | ;;
|
---|
597 | }
|
---|
598 |
|
---|
599 | .next_iter:
|
---|
600 | {.mmi // Cycle 0
|
---|
601 | add s_amatch=s_awindow,s_vspec_cmatch // match=window + cur_match
|
---|
602 | mov s_vcurmatch=s_vspec_cmatch // current value
|
---|
603 | add s_vchainlen=-1,s_vchainlen // --chain_length
|
---|
604 | }{.mib
|
---|
605 | cmp.le.unc p_end,p0=s_vspec_cmatch,s_vlimit
|
---|
606 | and s_vspec_cmatch=s_vspec_cmatch,s_wmask
|
---|
607 | (p_end) br.cond.dptk.many .terminate
|
---|
608 | ;;
|
---|
609 | }{.mmi // Cycle 1
|
---|
610 | ld1 s_vmatch0=[s_amatch],1 // load match[0]
|
---|
611 | // compute prev[cur_match]:
|
---|
612 | shladd s_vspec_cmatch=s_vspec_cmatch,1,s_aprev
|
---|
613 | cmp.eq.unc p_end,p0=s_vchainlen,r0
|
---|
614 | } {.mib
|
---|
615 | nop.m 0
|
---|
616 | add s_amatblen=s_awinbest,s_vcurmatch // match=window + cur_match
|
---|
617 | (p_end) br.cond.dptk.many .terminate
|
---|
618 | ;;
|
---|
619 | }{.mmi // Cycle 2 (short)
|
---|
620 | ld2.nt1 s_vspec_cmatch=[s_vspec_cmatch] // get next cur_match
|
---|
621 | ;;
|
---|
622 | }{.mmi // Cycle 3 (short)
|
---|
623 | ld1.nt1 s_vmatbst=[s_amatblen],-1 // load match[best_len]
|
---|
624 | cmp.ne.unc p_ll,p0=r0,r0 // parallel compare initialized as 'false'
|
---|
625 | ;;
|
---|
626 | }{.mmi // Cycle 4 (short)
|
---|
627 | // load match[1] - - note: match += 3 (in total):
|
---|
628 | ld1.nt1 s_vmatch1=[s_amatch],2
|
---|
629 | ;;
|
---|
630 | // Cycle 5 (short)
|
---|
631 | (p_bn2) ld1.nt1 s_vmatbst1=[s_amatblen] // load match[best_len-1]
|
---|
632 | }{.mib // Here we (MOST LIKELY) pay a L2-fetch stall
|
---|
633 | // p_ll switched on as soon as we get a mismatch:
|
---|
634 | cmp.ne.or p_ll,p0=s_vmatch0,s_vscan0
|
---|
635 | cmp.ne.or p_ll,p0=s_vmatbst,s_vscanend
|
---|
636 | (p_ll) br.cond.dptk.many .next_iter
|
---|
637 | ;;
|
---|
638 | }{.mmi // Cycle 6
|
---|
639 | ld1.nt1 s_vmatch3=[s_amatch]
|
---|
640 | mova s_vlen=3
|
---|
641 | nop.i 0
|
---|
642 | }{.mib
|
---|
643 | cmp.ne.or p_ll,p0=s_vmatch1,s_vscan1
|
---|
644 | (p_bn2) cmp.ne.or p_ll,p0=s_vmatbst1,s_vscanend1
|
---|
645 | (p_ll) br.cond.dptk.many .next_iter
|
---|
646 | ;;
|
---|
647 | }
|
---|
648 |
|
---|
649 | // We have passed the first hurdle - Are there additional matches ???
|
---|
650 |
|
---|
651 | .test_more:
|
---|
652 | {.mmi // Cycle 0
|
---|
653 | and s_tm3=7,s_ascan // get byte offset
|
---|
654 | and s_tm4=7,s_amatch // get byte offset
|
---|
655 | movi0 ar.ec=MLAT+SHLAT+2 // NB: One trip more than usual
|
---|
656 | }{.mib
|
---|
657 | cmp.ne.unc p_no,p0=s_vscan3,s_vmatch3 // does not next one differ?
|
---|
658 | (p_no) br.cond.dptk.many .only3
|
---|
659 | ;;
|
---|
660 | }{.mmi // Cycle 1
|
---|
661 | and s_tm1=-8,s_ascan // get aligned address
|
---|
662 | shladd s_tm3=s_tm3,3,r0
|
---|
663 | movi0 ar.lc=31 // 32 times around the loop (8B at a time)
|
---|
664 | }{.mib
|
---|
665 | and s_tm2=-8,s_amatch // get aligned address
|
---|
666 | shladd s_tm4=s_tm4,3,r0
|
---|
667 | nop.b 0
|
---|
668 | ;;
|
---|
669 | }{.mmi // Cycle 2
|
---|
670 | ld8.nt1 scan[1]=[s_tm1],8 // load first chunk
|
---|
671 | sub s_tm5=64,s_tm3 // 64 - amount
|
---|
672 | movi0 pr.rot=1<<16
|
---|
673 | }{.mmi
|
---|
674 | ld8.nt1 match[1]=[s_tm2],8 // load first chunk
|
---|
675 | sub s_tm6=64,s_tm4 // 64 - amount
|
---|
676 | add s_vlen=-8,s_vlen // will be updated at least once
|
---|
677 | ;;
|
---|
678 | }
|
---|
679 | .align 32
|
---|
680 | .cmploop:
|
---|
681 | {.mmi // Cycle 0
|
---|
682 | (lc[0]) ld8 scan[0]=[s_tm1],8 // next scan chunk
|
---|
683 | (lc[MLAT+SHLAT+1]) add s_vlen=8,s_vlen
|
---|
684 | (lc[MLAT]) first shscan0[0]=scan[MLAT+1],s_tm3
|
---|
685 | }{.mib
|
---|
686 | (lc[MLAT+SHLAT+1]) cmp.ne.unc p_no,p0=s_tm7,s_tm8 // break search if !=
|
---|
687 | (lc[MLAT]) first shmatch0[0]=match[MLAT+1],s_tm4
|
---|
688 | (p_no) br.cond.dpnt.many .mismatch
|
---|
689 | ;;
|
---|
690 | }{.mii // Cycle 1
|
---|
691 | (lc[0]) ld8 match[0]=[s_tm2],8
|
---|
692 | // shift left(le) or right(be):
|
---|
693 | (lc[MLAT]) second shscan1[0]=scan[MLAT],s_tm5
|
---|
694 | (lc[MLAT]) second shmatch1[0]=match[MLAT],s_tm6
|
---|
695 | }{.mmb
|
---|
696 | (lc[MLAT+SHLAT]) or s_tm7=shscan0[SHLAT],shscan1[SHLAT]
|
---|
697 | (lc[MLAT+SHLAT]) or s_tm8=shmatch0[SHLAT],shmatch1[SHLAT]
|
---|
698 | br.ctop.dptk.many .cmploop
|
---|
699 | ;;
|
---|
700 | }{.mfi
|
---|
701 | mov s_vlen=258
|
---|
702 | nop.f 0
|
---|
703 | }{.mfi
|
---|
704 | nop.f 0 // realign
|
---|
705 | ;;
|
---|
706 | }
|
---|
707 | .mismatch:
|
---|
708 | {.mii // Cycle 0 (short)
|
---|
709 | (p_no) pcmp1.eq s_tm2=s_tm7,s_tm8 // find first non-matching character
|
---|
710 | nop.i 0
|
---|
711 | ;;
|
---|
712 | // Cycle 1 (short)
|
---|
713 | (p_no) count s_tm1=s_tm2
|
---|
714 | ;;
|
---|
715 | }{.mib // Cycle 2 (short)
|
---|
716 | (p_no) add s_vlen=s_vlen,s_tm1 // effective length
|
---|
717 | nop.i 0
|
---|
718 | clrrrb
|
---|
719 | ;;
|
---|
720 | }
|
---|
721 |
|
---|
722 | .only3:
|
---|
723 | {.mib // Cycle 0 (short)
|
---|
724 | cmp.gt.unc p0,p_nbs=s_vlen,s_vbestlen // (len > best_len) ?
|
---|
725 | (p_nbs) br.cond.dpnt.many .next_iter // if not, re-iternate
|
---|
726 | ;;
|
---|
727 | }{.mmi // Cycle 1 (short)
|
---|
728 | ld4 s_tm7=[s_anicematch] // nice_match
|
---|
729 | st4 [s_amatchstart]= s_vcurmatch
|
---|
730 | add s_ascanend=s_ascan,s_vlen // reset with best_len
|
---|
731 | ;;
|
---|
732 | }{.mmi // Cycle 2 (short)
|
---|
733 | mova s_vbestlen=s_vlen
|
---|
734 | add s_ascanend=-3,s_ascanend // remember extra offset
|
---|
735 | ;;
|
---|
736 | }{.mmi // Cycle 3 (short)
|
---|
737 | ld1 s_vscanend=[s_ascanend],-1 // scan_end=scan[best_len]
|
---|
738 | add s_awinbest=s_awindow,s_vbestlen // update with new best_len
|
---|
739 | cmp.ne.unc p_bn2,p0=2,s_vbestlen // set if bestlen != 2
|
---|
740 | ;;
|
---|
741 | }{.mib // Cycle 4 (short)
|
---|
742 | // scan_end1=scan[best_len-1] NB: s_ascanend reset:
|
---|
743 | ld1.nt1 s_vscanend1=[s_ascanend],1
|
---|
744 | cmp.lt.unc p_nnc,p0=s_vlen,s_tm7 // compare with nice_match
|
---|
745 | (p_nnc) br.cond.dptk.many .next_iter
|
---|
746 | ;;
|
---|
747 | }
|
---|
748 | .terminate:
|
---|
749 | {.mii // Cycle 0/1
|
---|
750 | nop.m 0
|
---|
751 | movi0 ar.lc=s_lcsave
|
---|
752 | movi0 pr=s_prsave,-1
|
---|
753 | }{.mbb
|
---|
754 | nop.m 0
|
---|
755 | nop.b 0
|
---|
756 | br.ret.sptk.many rp // ret0 is identical to best_len
|
---|
757 | ;;
|
---|
758 | }
|
---|
759 | .endp
|
---|
760 |
|
---|
761 | .global match_init
|
---|
762 | .proc match_init
|
---|
763 | match_init:
|
---|
764 | sub ret0=ret0,ret0
|
---|
765 | br.ret.sptk.many rp
|
---|
766 | .endp
|
---|
767 |
|
---|
768 | # else
|
---|
769 | error: this asm version is for 386 or 680x0 or ia64 only
|
---|
770 | # endif /* __ia64__ */
|
---|
771 | #endif /* mc68000 || mc68020 */
|
---|
772 | #endif /* i386 || _I386 */
|
---|