source: trunk/essentials/app-arch/gzip/lib/match.c

Last change on this file was 3325, checked in by bird, 18 years ago

gzip 1.3.11

File size: 22.0 KB
Line 
1/* match.s -- optional optimized asm version of longest match in deflate.c
2
3 Copyright (C) 2002, 2006 Free Software Foundation, Inc.
4 Copyright (C) 1992-1993 Jean-loup Gailly
5
6 This program is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 2, or (at your option)
9 any later version.
10
11 This program is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
15
16 You should have received a copy of the GNU General Public License
17 along with this program; if not, write to the Free Software Foundation,
18 Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */
19
20/*
21 * The 68020 version has been written by Francesco Potorti` <pot@cnuce.cnr.it>
22 * with adaptations by Carsten Steger <stegerc@informatik.tu-muenchen.de>,
23 * Andreas Schwab <schwab@lamothe.informatik.uni-dortmund.de> and
24 * Kristoffer Eriksson <ske@pkmab.se>
25 *
26 * The ia64 version has been written by Sverre Jarp (HP Labs) 2001-2002.
27 * Unwind directives and some reformatting for better readability added by
28 * David Mosberger-Tang <davidm@hpl.hp.com>.
29 */
30
31/* $Id: match.c,v 1.1 2006/11/20 08:40:34 eggert Exp $ */
32
33/* Preprocess with -DNO_UNDERLINE if your C compiler does not prefix
34 * external symbols with an underline character '_'.
35 */
36#ifdef NO_UNDERLINE
37# define _prev prev
38# define _window window
39# define _match_start match_start
40# define _prev_length prev_length
41# define _good_match good_match
42# define _nice_match nice_match
43# define _strstart strstart
44# define _max_chain_length max_chain_length
45
46# define _match_init match_init
47# define _longest_match longest_match
48#endif
49
50#ifdef DYN_ALLOC
51 error: DYN_ALLOC not yet supported in match.s
52#endif
53
54#if defined(i386) || defined(_I386) || defined(__i386) || defined(__i386__)
55
56/* This version is for 386 Unix or OS/2 in 32 bit mode.
57 * Warning: it uses the AT&T syntax: mov source,dest
58 * This file is only optional. If you want to force the C version,
59 * add -DNO_ASM to CFLAGS in Makefile and set OBJA to an empty string.
60 * If you have reduced WSIZE in gzip.h, then change its value below.
61 * This version assumes static allocation of the arrays (-DDYN_ALLOC not used).
62 */
63
64 .file "match.S"
65
66#define MAX_MATCH 258
67#define MAX_MATCH2 $128 /* MAX_MATCH/2-1 */
68#define MIN_MATCH 3
69#define WSIZE $32768
70#define MAX_DIST WSIZE - MAX_MATCH - MIN_MATCH - 1
71
72 .globl _match_init
73 .globl _longest_match
74
75 .text
76
77_match_init:
78 ret
79
80/*-----------------------------------------------------------------------
81 * Set match_start to the longest match starting at the given string and
82 * return its length. Matches shorter or equal to prev_length are discarded,
83 * in which case the result is equal to prev_length and match_start is
84 * garbage.
85 * IN assertions: cur_match is the head of the hash chain for the current
86 * string (strstart) and its distance is <= MAX_DIST, and prev_length >= 1
87 */
88
89_longest_match: /* int longest_match(cur_match) */
90
91#define cur_match 20(%esp)
92 /* return address */ /* esp+16 */
93 push %ebp /* esp+12 */
94 push %edi /* esp+8 */
95 push %esi /* esp+4 */
96 push %ebx /* esp */
97
98/*
99 * match equ esi
100 * scan equ edi
101 * chain_length equ ebp
102 * best_len equ ebx
103 * limit equ edx
104 */
105 mov cur_match,%esi
106 mov _max_chain_length,%ebp /* chain_length = max_chain_length */
107 mov _strstart,%edi
108 mov %edi,%edx
109 sub MAX_DIST,%edx /* limit = strstart-MAX_DIST */
110 jae limit_ok
111 sub %edx,%edx /* limit = NIL */
112limit_ok:
113 add $2+_window,%edi /* edi = offset(window+strstart+2) */
114 mov _prev_length,%ebx /* best_len = prev_length */
115 movw -3(%ebx,%edi),%ax /* ax = scan[best_len-1..best_len] */
116 movw -2(%edi),%cx /* cx = scan[0..1] */
117 cmp _good_match,%ebx /* do we have a good match already? */
118 jb do_scan
119 shr $2,%ebp /* chain_length >>= 2 */
120 jmp do_scan
121
122 .align 4
123long_loop:
124/* at this point, edi == scan+2, esi == cur_match */
125 movw -3(%ebx,%edi),%ax /* ax = scan[best_len-1..best_len] */
126 movw -2(%edi),%cx /* cx = scan[0..1] */
127short_loop:
128/*
129 * at this point, di == scan+2, si == cur_match,
130 * ax = scan[best_len-1..best_len] and cx = scan[0..1]
131 */
132 and WSIZE-1, %esi
133 movw _prev(%esi,%esi),%si /* cur_match = prev[cur_match] */
134 /* top word of esi is still 0 */
135 cmp %edx,%esi /* cur_match <= limit ? */
136 jbe the_end
137 dec %ebp /* --chain_length */
138 jz the_end
139do_scan:
140 cmpw _window-1(%ebx,%esi),%ax/* check match at best_len-1 */
141 jne short_loop
142 cmpw _window(%esi),%cx /* check min_match_length match */
143 jne short_loop
144
145 lea _window+2(%esi),%esi /* si = match */
146 mov %edi,%eax /* ax = scan+2 */
147 mov MAX_MATCH2,%ecx /* scan for at most MAX_MATCH bytes */
148 rep; cmpsw /* loop until mismatch */
149 je maxmatch /* match of length MAX_MATCH? */
150mismatch:
151 movb -2(%edi),%cl /* mismatch on first or second byte? */
152 subb -2(%esi),%cl /* cl = 0 if first bytes equal */
153 xchg %edi,%eax /* edi = scan+2, eax = end of scan */
154 sub %edi,%eax /* eax = len */
155 sub %eax,%esi /* esi = cur_match + 2 + offset(window) */
156 sub $2+_window,%esi /* esi = cur_match */
157 subb $1,%cl /* set carry if cl == 0 (cannot use DEC) */
158 adc $0,%eax /* eax = carry ? len+1 : len */
159 cmp %ebx,%eax /* len > best_len ? */
160 jle long_loop
161 mov %esi,_match_start /* match_start = cur_match */
162 mov %eax,%ebx /* ebx = best_len = len */
163 cmp _nice_match,%eax /* len >= nice_match ? */
164 jl long_loop
165the_end:
166 mov %ebx,%eax /* result = eax = best_len */
167 pop %ebx
168 pop %esi
169 pop %edi
170 pop %ebp
171 ret
172maxmatch:
173 cmpsb
174 jmp mismatch
175
176#else
177
178/* ======================== 680x0 version ================================= */
179
180#if defined(m68k)||defined(mc68k)||defined(__mc68000__)||defined(__MC68000__)
181# ifndef mc68000
182# define mc68000
183# endif
184#endif
185
186#if defined(__mc68020__) || defined(__MC68020__) || defined(sysV68)
187# ifndef mc68020
188# define mc68020
189# endif
190#endif
191
192#if defined(mc68020) || defined(mc68000)
193
194#if (defined(mc68020) || defined(NeXT)) && !defined(UNALIGNED_OK)
195# define UNALIGNED_OK
196#endif
197
198#ifdef sysV68 /* Try Motorola Delta style */
199
200# define GLOBAL(symbol) global symbol
201# define TEXT text
202# define FILE(filename) file filename
203# define invert_maybe(src,dst) dst,src
204# define imm(data) &data
205# define reg(register) %register
206
207# define addl add.l
208# define addql addq.l
209# define blos blo.b
210# define bhis bhi.b
211# define bras bra.b
212# define clrl clr.l
213# define cmpmb cmpm.b
214# define cmpw cmp.w
215# define cmpl cmp.l
216# define lslw lsl.w
217# define lsrl lsr.l
218# define movel move.l
219# define movew move.w
220# define moveb move.b
221# define moveml movem.l
222# define subl sub.l
223# define subw sub.w
224# define subql subq.l
225
226# define IndBase(bd,An) (bd,An)
227# define IndBaseNdxl(bd,An,Xn) (bd,An,Xn.l)
228# define IndBaseNdxw(bd,An,Xn) (bd,An,Xn.w)
229# define predec(An) -(An)
230# define postinc(An) (An)+
231
232#else /* default style (Sun 3, NeXT, Amiga, Atari) */
233
234# define GLOBAL(symbol) .globl symbol
235# define TEXT .text
236# define FILE(filename) .even
237# define invert_maybe(src,dst) src,dst
238# if defined(sun) || defined(mc68k)
239# define imm(data) #data
240# else
241# define imm(data) \#data
242# endif
243# define reg(register) register
244
245# define blos bcss
246# if defined(sun) || defined(mc68k)
247# define movel movl
248# define movew movw
249# define moveb movb
250# endif
251# define IndBase(bd,An) An@(bd)
252# define IndBaseNdxl(bd,An,Xn) An@(bd,Xn:l)
253# define IndBaseNdxw(bd,An,Xn) An@(bd,Xn:w)
254# define predec(An) An@-
255# define postinc(An) An@+
256
257#endif /* styles */
258
259#define Best_Len reg(d0) /* unsigned */
260#define Cur_Match reg(d1) /* Ipos */
261#define Loop_Counter reg(d2) /* int */
262#define Scan_Start reg(d3) /* unsigned short */
263#define Scan_End reg(d4) /* unsigned short */
264#define Limit reg(d5) /* IPos */
265#define Chain_Length reg(d6) /* unsigned */
266#define Scan_Test reg(d7)
267#define Scan reg(a0) /* *uch */
268#define Match reg(a1) /* *uch */
269#define Prev_Address reg(a2) /* *Pos */
270#define Scan_Ini reg(a3) /* *uch */
271#define Match_Ini reg(a4) /* *uch */
272#define Stack_Pointer reg(sp)
273
274#define MAX_MATCH 258
275#define MIN_MATCH 3
276#define WSIZE 32768
277#define MAX_DIST (WSIZE - MAX_MATCH - MIN_MATCH - 1)
278
279 GLOBAL (_match_init)
280 GLOBAL (_longest_match)
281
282 TEXT
283
284 FILE ("match.S")
285
286_match_init:
287 rts
288
289/*-----------------------------------------------------------------------
290 * Set match_start to the longest match starting at the given string and
291 * return its length. Matches shorter or equal to prev_length are discarded,
292 * in which case the result is equal to prev_length and match_start is
293 * garbage.
294 * IN assertions: cur_match is the head of the hash chain for the current
295 * string (strstart) and its distance is <= MAX_DIST, and prev_length >= 1
296 */
297
298/* int longest_match (cur_match) */
299
300#ifdef UNALIGNED_OK
301# define pushreg 15928 /* d2-d6/a2-a4 */
302# define popreg 7292
303#else
304# define pushreg 16184 /* d2-d7/a2-a4 */
305# define popreg 7420
306#endif
307
308_longest_match:
309 movel IndBase(4,Stack_Pointer),Cur_Match
310 moveml imm(pushreg),predec(Stack_Pointer)
311 movel _max_chain_length,Chain_Length
312 movel _prev_length,Best_Len
313 movel imm(_prev),Prev_Address
314 movel imm(_window+MIN_MATCH),Match_Ini
315 movel _strstart,Limit
316 movel Match_Ini,Scan_Ini
317 addl Limit,Scan_Ini
318 subw imm(MAX_DIST),Limit
319 bhis L__limit_ok
320 clrl Limit
321L__limit_ok:
322 cmpl invert_maybe(_good_match,Best_Len)
323 blos L__length_ok
324 lsrl imm(2),Chain_Length
325L__length_ok:
326 subql imm(1),Chain_Length
327#ifdef UNALIGNED_OK
328 movew IndBase(-MIN_MATCH,Scan_Ini),Scan_Start
329 movew IndBaseNdxw(-MIN_MATCH-1,Scan_Ini,Best_Len),Scan_End
330#else
331 moveb IndBase(-MIN_MATCH,Scan_Ini),Scan_Start
332 lslw imm(8),Scan_Start
333 moveb IndBase(-MIN_MATCH+1,Scan_Ini),Scan_Start
334 moveb IndBaseNdxw(-MIN_MATCH-1,Scan_Ini,Best_Len),Scan_End
335 lslw imm(8),Scan_End
336 moveb IndBaseNdxw(-MIN_MATCH,Scan_Ini,Best_Len),Scan_End
337#endif
338 bras L__do_scan
339
340L__long_loop:
341#ifdef UNALIGNED_OK
342 movew IndBaseNdxw(-MIN_MATCH-1,Scan_Ini,Best_Len),Scan_End
343#else
344 moveb IndBaseNdxw(-MIN_MATCH-1,Scan_Ini,Best_Len),Scan_End
345 lslw imm(8),Scan_End
346 moveb IndBaseNdxw(-MIN_MATCH,Scan_Ini,Best_Len),Scan_End
347#endif
348
349L__short_loop:
350 lslw imm(1),Cur_Match
351 movew IndBaseNdxl(0,Prev_Address,Cur_Match),Cur_Match
352 cmpw invert_maybe(Limit,Cur_Match)
353 dbls Chain_Length,L__do_scan
354 bras L__return
355
356L__do_scan:
357 movel Match_Ini,Match
358 addl Cur_Match,Match
359#ifdef UNALIGNED_OK
360 cmpw invert_maybe(IndBaseNdxw(-MIN_MATCH-1,Match,Best_Len),Scan_End)
361 bne L__short_loop
362 cmpw invert_maybe(IndBase(-MIN_MATCH,Match),Scan_Start)
363 bne L__short_loop
364#else
365 moveb IndBaseNdxw(-MIN_MATCH-1,Match,Best_Len),Scan_Test
366 lslw imm(8),Scan_Test
367 moveb IndBaseNdxw(-MIN_MATCH,Match,Best_Len),Scan_Test
368 cmpw invert_maybe(Scan_Test,Scan_End)
369 bne L__short_loop
370 moveb IndBase(-MIN_MATCH,Match),Scan_Test
371 lslw imm(8),Scan_Test
372 moveb IndBase(-MIN_MATCH+1,Match),Scan_Test
373 cmpw invert_maybe(Scan_Test,Scan_Start)
374 bne L__short_loop
375#endif
376
377 movew imm((MAX_MATCH-MIN_MATCH+1)-1),Loop_Counter
378 movel Scan_Ini,Scan
379L__scan_loop:
380 cmpmb postinc(Match),postinc(Scan)
381 dbne Loop_Counter,L__scan_loop
382
383 subl Scan_Ini,Scan
384 addql imm(MIN_MATCH-1),Scan
385 cmpl invert_maybe(Best_Len,Scan)
386 bls L__short_loop
387 movel Scan,Best_Len
388 movel Cur_Match,_match_start
389 cmpl invert_maybe(_nice_match,Best_Len)
390 blos L__long_loop
391L__return:
392 moveml postinc(Stack_Pointer),imm(popreg)
393 rts
394
395#else
396
397# if defined (__ia64__)
398
399/* ======================== ia64 version ================================= */
400
401/*
402 * 'longest_match.S' (assembly program for gzip for the IA-64 architecture)
403 *
404 * Optimised for McKinley, but with Merced-compatibility, such as MIB+MIB, used wherever
405 * possible.
406 *
407 * Copyright: Sverre Jarp (HP Labs) 2001-2002
408 *
409 * See deflate.c for c-version
410 * Version 2 - Optimize the outer loop
411 */
412
413#include <endian.h>
414
415#if __BYTE_ORDER == ____BIG_ENDIAN
416#define first shl
417#define second shr.u
418#define count czx1.l
419#else
420#define first shr.u
421#define second shl
422#define count czx1.r
423#endif
424
425// 24 rotating register (r32 - r55)
426
427#define s_vmatch0 r32
428#define s_vmatch1 r33
429#define s_vmatbst r34
430#define s_vmatbst1 r35
431#define s_amatblen r36
432
433#define s_tm1 r56
434#define s_tm2 r57
435#define s_tm3 r58
436#define s_tm4 r59
437#define s_tm5 r60
438#define s_tm6 r61
439#define s_tm7 r62
440#define s_tm8 r63
441
442#define s_vlen r31
443#define s_vstrstart r30
444#define s_vchainlen r29
445#define s_awinbest r28
446#define s_vcurmatch r27
447#define s_vlimit r26
448#define s_vscanend r25
449#define s_vscanend1 r24
450#define s_anicematch r23
451#define s_vscan0 r22
452#define s_vscan1 r21
453
454#define s_aprev r20
455#define s_awindow r19
456#define s_amatchstart r18
457#define s_ascan r17
458#define s_amatch r16
459#define s_wmask r15
460#define s_ascanend r14
461
462#define s_vspec_cmatch r11 // next iteration
463#define s_lcsave r10
464#define s_prsave r9
465#define s_vbestlen r8 // return register
466
467#define s_vscan3 r3
468#define s_vmatch3 r2
469
470#define p_no p2
471#define p_yes p3
472#define p_shf p4 //
473#define p_bn2 p5 // Use in loop (indicating bestlen != 2)
474
475#define p_nbs p9 // not new best_len
476#define p_nnc p10 // not nice_length
477#define p_ll p11
478#define p_end p12
479
480#define MAX_MATCH 258
481#define MIN_MATCH 4
482#define WSIZE 32768
483#define MAX_DIST WSIZE - MAX_MATCH - MIN_MATCH - 1
484
485#define R_INPUT 1
486#define R_LOCAL 31
487#define R_OUTPUT 0
488#define R_ROTATING 24
489#define MLAT 3
490#define SHLAT 2
491
492#define mova mov
493#define movi0 mov
494#define cgtu cmp.gt.unc
495#define cgeu cmp.ge.unc
496#define cneu cmp.ne.unc
497
498 .global longest_match
499 .proc longest_match
500 .align 32
501longest_match:
502// -- Cycle: 0
503 .prologue
504{.mmi
505 alloc r2=ar.pfs,R_INPUT,R_LOCAL,R_OUTPUT,R_ROTATING
506 .rotr scan[MLAT+2], match[MLAT+2], shscan0[SHLAT+1], \
507 shscan1[SHLAT+1], shmatch0[SHLAT+1], shmatch1[SHLAT+1]
508 .rotp lc[MLAT+SHLAT+2]
509 mova s_vspec_cmatch=in0 // cur_match from input register
510 add s_tm1=@gprel(strstart),gp // a(a(strstart))
511}{.mmi
512 add s_tm3=@gprel(prev_length),gp // a(a(prev_length))
513 add s_tm5=@ltoff(window),gp // a(a(window))
514 add s_tm6=@ltoff(prev),gp // a(a(prev))
515 ;;
516}{.mmb // Cycle: 1
517 ld4 s_vstrstart=[s_tm1] // strstart
518 ld4 s_vbestlen=[s_tm3] // best_len = prev_length
519 brp.loop.imp .cmploop,.cmploop+48
520}{.mli
521 add s_tm2=@gprel(max_chain_length),gp // a(a(max_chain_length))
522 movl s_wmask=WSIZE-1
523 ;;
524}{.mmi // Cycle: 2
525 ld8 s_aprev=[s_tm6] // a(prev)
526 ld8 s_awindow=[s_tm5] // a(window)
527 .save pr, s_prsave
528 movi0 s_prsave=pr // save predicates
529}{.mmi
530 add s_tm4=@gprel(good_match),gp // a(a(good_match))
531 add s_tm7=@ltoff(nice_match),gp // a(a(nice_match))
532 add s_tm8=@ltoff(match_start),gp // a(match_start)
533 ;;
534}{.mmi // Cycle: 3
535 ld8 s_anicematch=[s_tm7] // a(nice_match)
536 ld8 s_amatchstart=[s_tm8] // a(match_start)
537 .save ar.lc, s_lcsave
538 movi0 s_lcsave=ar.lc // save loop count register
539}{.mmi
540 .body
541 add s_tm1=-(MAX_MATCH + MIN_MATCH),s_wmask // maxdist
542 cmp.eq p_ll,p0=r0,r0 // parallel compare initialized as 'true'
543 mova s_vcurmatch=s_vspec_cmatch
544 ;;
545}{.mmi // Cycle: 4
546 ld4 s_vchainlen=[s_tm2] // chain_length=max_chain_length
547 ld4 s_tm4=[s_tm4] // v(good_match)
548 add s_ascan=s_awindow,s_vstrstart // scan=window + strstart
549}{.mmi
550 sub s_vlimit=s_vstrstart, s_tm1 // limit=strstart - MAX_DIST
551 add s_amatch=s_awindow,s_vspec_cmatch // match=window + cur_match
552 and s_vspec_cmatch =s_vspec_cmatch,s_wmask
553 ;;
554}{.mmi // Cycle: 5
555 add s_amatblen=s_amatch,s_vbestlen //
556 cneu p_bn2,p0=2,s_vbestlen // set if bestlen != 2
557 add s_ascanend=s_ascan,s_vbestlen // compute a(scan) + best_len
558}{.mmi
559 ld1 s_vscan0=[s_ascan],1 // NB: s_ascan++
560 ld1 s_vmatch0=[s_amatch],1
561 cgtu p0,p_no=s_vlimit,r0 // is result positive ?
562 ;;
563}{.mmi // Cycle: 6
564 ld1.nt1 s_vscan1=[s_ascan],2 // NB: s_ascan+3 in total
565 ld1.nt1 s_vmatch1=[s_amatch],2
566 add s_awinbest=s_awindow,s_vbestlen //
567 ;;
568}{.mmi // Cycle: 7
569 ld1.nt1 s_vscanend=[s_ascanend],-1 // scan_end=scan[best_len]
570 ld1.nt1 s_vmatbst=[s_amatblen],-1
571(p_no) mova s_vlimit=r0
572 ;;
573}{.mmi // Cycle: 8
574(p_bn2) ld1.nt1 s_vscanend1=[s_ascanend],1 // scan_end1=scan[best_len-1]
575(p_bn2) ld1.nt1 s_vmatbst1=[s_amatblen]
576 shladd s_vspec_cmatch =s_vspec_cmatch,1,s_aprev
577}{.mmi
578 cgeu p_shf,p0=s_vbestlen,s_tm4 // is (prev_length >= good_match) ?
579 ;;
580}{.mmi // Cycle: 9
581 ld1.nt1 s_vscan3=[s_ascan]
582 ld2.nt1 s_vspec_cmatch=[s_vspec_cmatch]
583 mova s_vlen=3
584}{.mmi
585(p_shf) shr.u s_vchainlen=s_vchainlen,2 // (cur_len) >> 2
586 ;;
587}{.mmi // Cycle: 10
588 ld1.nt1 s_vmatch3=[s_amatch]
589 // p_ll switched on as soon as we get a mismatch:
590 cmp.eq.and p_ll,p0=s_vmatch0,s_vscan0
591 cmp.eq.and p_ll,p0=s_vmatbst,s_vscanend
592}{.mib
593 cmp.eq.and p_ll,p0=s_vmatch1,s_vscan1
594(p_bn2) cmp.eq.and p_ll,p0=s_vmatbst1,s_vscanend1
595(p_ll) br.cond.dpnt.many .test_more
596 ;;
597}
598
599.next_iter:
600{.mmi // Cycle 0
601 add s_amatch=s_awindow,s_vspec_cmatch // match=window + cur_match
602 mov s_vcurmatch=s_vspec_cmatch // current value
603 add s_vchainlen=-1,s_vchainlen // --chain_length
604}{.mib
605 cmp.le.unc p_end,p0=s_vspec_cmatch,s_vlimit
606 and s_vspec_cmatch=s_vspec_cmatch,s_wmask
607(p_end) br.cond.dptk.many .terminate
608 ;;
609}{.mmi // Cycle 1
610 ld1 s_vmatch0=[s_amatch],1 // load match[0]
611 // compute prev[cur_match]:
612 shladd s_vspec_cmatch=s_vspec_cmatch,1,s_aprev
613 cmp.eq.unc p_end,p0=s_vchainlen,r0
614} {.mib
615 nop.m 0
616 add s_amatblen=s_awinbest,s_vcurmatch // match=window + cur_match
617(p_end) br.cond.dptk.many .terminate
618 ;;
619}{.mmi // Cycle 2 (short)
620 ld2.nt1 s_vspec_cmatch=[s_vspec_cmatch] // get next cur_match
621 ;;
622}{.mmi // Cycle 3 (short)
623 ld1.nt1 s_vmatbst=[s_amatblen],-1 // load match[best_len]
624 cmp.ne.unc p_ll,p0=r0,r0 // parallel compare initialized as 'false'
625 ;;
626}{.mmi // Cycle 4 (short)
627 // load match[1] - - note: match += 3 (in total):
628 ld1.nt1 s_vmatch1=[s_amatch],2
629 ;;
630 // Cycle 5 (short)
631(p_bn2) ld1.nt1 s_vmatbst1=[s_amatblen] // load match[best_len-1]
632}{.mib // Here we (MOST LIKELY) pay a L2-fetch stall
633 // p_ll switched on as soon as we get a mismatch:
634 cmp.ne.or p_ll,p0=s_vmatch0,s_vscan0
635 cmp.ne.or p_ll,p0=s_vmatbst,s_vscanend
636(p_ll) br.cond.dptk.many .next_iter
637 ;;
638}{.mmi // Cycle 6
639 ld1.nt1 s_vmatch3=[s_amatch]
640 mova s_vlen=3
641 nop.i 0
642}{.mib
643 cmp.ne.or p_ll,p0=s_vmatch1,s_vscan1
644(p_bn2) cmp.ne.or p_ll,p0=s_vmatbst1,s_vscanend1
645(p_ll) br.cond.dptk.many .next_iter
646 ;;
647}
648
649// We have passed the first hurdle - Are there additional matches ???
650
651.test_more:
652{.mmi // Cycle 0
653 and s_tm3=7,s_ascan // get byte offset
654 and s_tm4=7,s_amatch // get byte offset
655 movi0 ar.ec=MLAT+SHLAT+2 // NB: One trip more than usual
656}{.mib
657 cmp.ne.unc p_no,p0=s_vscan3,s_vmatch3 // does not next one differ?
658(p_no) br.cond.dptk.many .only3
659 ;;
660}{.mmi // Cycle 1
661 and s_tm1=-8,s_ascan // get aligned address
662 shladd s_tm3=s_tm3,3,r0
663 movi0 ar.lc=31 // 32 times around the loop (8B at a time)
664}{.mib
665 and s_tm2=-8,s_amatch // get aligned address
666 shladd s_tm4=s_tm4,3,r0
667 nop.b 0
668 ;;
669}{.mmi // Cycle 2
670 ld8.nt1 scan[1]=[s_tm1],8 // load first chunk
671 sub s_tm5=64,s_tm3 // 64 - amount
672 movi0 pr.rot=1<<16
673}{.mmi
674 ld8.nt1 match[1]=[s_tm2],8 // load first chunk
675 sub s_tm6=64,s_tm4 // 64 - amount
676 add s_vlen=-8,s_vlen // will be updated at least once
677 ;;
678}
679 .align 32
680.cmploop:
681{.mmi // Cycle 0
682(lc[0]) ld8 scan[0]=[s_tm1],8 // next scan chunk
683(lc[MLAT+SHLAT+1]) add s_vlen=8,s_vlen
684(lc[MLAT]) first shscan0[0]=scan[MLAT+1],s_tm3
685}{.mib
686(lc[MLAT+SHLAT+1]) cmp.ne.unc p_no,p0=s_tm7,s_tm8 // break search if !=
687(lc[MLAT]) first shmatch0[0]=match[MLAT+1],s_tm4
688(p_no) br.cond.dpnt.many .mismatch
689 ;;
690}{.mii // Cycle 1
691(lc[0]) ld8 match[0]=[s_tm2],8
692 // shift left(le) or right(be):
693(lc[MLAT]) second shscan1[0]=scan[MLAT],s_tm5
694(lc[MLAT]) second shmatch1[0]=match[MLAT],s_tm6
695}{.mmb
696(lc[MLAT+SHLAT]) or s_tm7=shscan0[SHLAT],shscan1[SHLAT]
697(lc[MLAT+SHLAT]) or s_tm8=shmatch0[SHLAT],shmatch1[SHLAT]
698 br.ctop.dptk.many .cmploop
699 ;;
700}{.mfi
701 mov s_vlen=258
702 nop.f 0
703}{.mfi
704 nop.f 0 // realign
705 ;;
706}
707.mismatch:
708{.mii // Cycle 0 (short)
709(p_no) pcmp1.eq s_tm2=s_tm7,s_tm8 // find first non-matching character
710 nop.i 0
711 ;;
712 // Cycle 1 (short)
713(p_no) count s_tm1=s_tm2
714 ;;
715}{.mib // Cycle 2 (short)
716(p_no) add s_vlen=s_vlen,s_tm1 // effective length
717 nop.i 0
718 clrrrb
719 ;;
720}
721
722.only3:
723{.mib // Cycle 0 (short)
724 cmp.gt.unc p0,p_nbs=s_vlen,s_vbestlen // (len > best_len) ?
725(p_nbs) br.cond.dpnt.many .next_iter // if not, re-iternate
726 ;;
727}{.mmi // Cycle 1 (short)
728 ld4 s_tm7=[s_anicematch] // nice_match
729 st4 [s_amatchstart]= s_vcurmatch
730 add s_ascanend=s_ascan,s_vlen // reset with best_len
731 ;;
732}{.mmi // Cycle 2 (short)
733 mova s_vbestlen=s_vlen
734 add s_ascanend=-3,s_ascanend // remember extra offset
735 ;;
736}{.mmi // Cycle 3 (short)
737 ld1 s_vscanend=[s_ascanend],-1 // scan_end=scan[best_len]
738 add s_awinbest=s_awindow,s_vbestlen // update with new best_len
739 cmp.ne.unc p_bn2,p0=2,s_vbestlen // set if bestlen != 2
740 ;;
741}{.mib // Cycle 4 (short)
742 // scan_end1=scan[best_len-1] NB: s_ascanend reset:
743 ld1.nt1 s_vscanend1=[s_ascanend],1
744 cmp.lt.unc p_nnc,p0=s_vlen,s_tm7 // compare with nice_match
745(p_nnc) br.cond.dptk.many .next_iter
746 ;;
747}
748.terminate:
749{.mii // Cycle 0/1
750 nop.m 0
751 movi0 ar.lc=s_lcsave
752 movi0 pr=s_prsave,-1
753}{.mbb
754 nop.m 0
755 nop.b 0
756 br.ret.sptk.many rp // ret0 is identical to best_len
757 ;;
758}
759 .endp
760
761 .global match_init
762 .proc match_init
763match_init:
764 sub ret0=ret0,ret0
765 br.ret.sptk.many rp
766 .endp
767
768# else
769 error: this asm version is for 386 or 680x0 or ia64 only
770# endif /* __ia64__ */
771#endif /* mc68000 || mc68020 */
772#endif /* i386 || _I386 */
Note: See TracBrowser for help on using the repository browser.