| 1 | /* match.s -- optional optimized asm version of longest match in deflate.c
|
|---|
| 2 |
|
|---|
| 3 | Copyright (C) 2002, 2006 Free Software Foundation, Inc.
|
|---|
| 4 | Copyright (C) 1992-1993 Jean-loup Gailly
|
|---|
| 5 |
|
|---|
| 6 | This program is free software; you can redistribute it and/or modify
|
|---|
| 7 | it under the terms of the GNU General Public License as published by
|
|---|
| 8 | the Free Software Foundation; either version 2, or (at your option)
|
|---|
| 9 | any later version.
|
|---|
| 10 |
|
|---|
| 11 | This program is distributed in the hope that it will be useful,
|
|---|
| 12 | but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|---|
| 13 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|---|
| 14 | GNU General Public License for more details.
|
|---|
| 15 |
|
|---|
| 16 | You should have received a copy of the GNU General Public License
|
|---|
| 17 | along with this program; if not, write to the Free Software Foundation,
|
|---|
| 18 | Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */
|
|---|
| 19 |
|
|---|
| 20 | /*
|
|---|
| 21 | * The 68020 version has been written by Francesco Potorti` <pot@cnuce.cnr.it>
|
|---|
| 22 | * with adaptations by Carsten Steger <stegerc@informatik.tu-muenchen.de>,
|
|---|
| 23 | * Andreas Schwab <schwab@lamothe.informatik.uni-dortmund.de> and
|
|---|
| 24 | * Kristoffer Eriksson <ske@pkmab.se>
|
|---|
| 25 | *
|
|---|
| 26 | * The ia64 version has been written by Sverre Jarp (HP Labs) 2001-2002.
|
|---|
| 27 | * Unwind directives and some reformatting for better readability added by
|
|---|
| 28 | * David Mosberger-Tang <davidm@hpl.hp.com>.
|
|---|
| 29 | */
|
|---|
| 30 |
|
|---|
| 31 | /* $Id: match.c,v 1.1 2006/11/20 08:40:34 eggert Exp $ */
|
|---|
| 32 |
|
|---|
| 33 | /* Preprocess with -DNO_UNDERLINE if your C compiler does not prefix
|
|---|
| 34 | * external symbols with an underline character '_'.
|
|---|
| 35 | */
|
|---|
| 36 | #ifdef NO_UNDERLINE
|
|---|
| 37 | # define _prev prev
|
|---|
| 38 | # define _window window
|
|---|
| 39 | # define _match_start match_start
|
|---|
| 40 | # define _prev_length prev_length
|
|---|
| 41 | # define _good_match good_match
|
|---|
| 42 | # define _nice_match nice_match
|
|---|
| 43 | # define _strstart strstart
|
|---|
| 44 | # define _max_chain_length max_chain_length
|
|---|
| 45 |
|
|---|
| 46 | # define _match_init match_init
|
|---|
| 47 | # define _longest_match longest_match
|
|---|
| 48 | #endif
|
|---|
| 49 |
|
|---|
| 50 | #ifdef DYN_ALLOC
|
|---|
| 51 | error: DYN_ALLOC not yet supported in match.s
|
|---|
| 52 | #endif
|
|---|
| 53 |
|
|---|
| 54 | #if defined(i386) || defined(_I386) || defined(__i386) || defined(__i386__)
|
|---|
| 55 |
|
|---|
| 56 | /* This version is for 386 Unix or OS/2 in 32 bit mode.
|
|---|
| 57 | * Warning: it uses the AT&T syntax: mov source,dest
|
|---|
| 58 | * This file is only optional. If you want to force the C version,
|
|---|
| 59 | * add -DNO_ASM to CFLAGS in Makefile and set OBJA to an empty string.
|
|---|
| 60 | * If you have reduced WSIZE in gzip.h, then change its value below.
|
|---|
| 61 | * This version assumes static allocation of the arrays (-DDYN_ALLOC not used).
|
|---|
| 62 | */
|
|---|
| 63 |
|
|---|
| 64 | .file "match.S"
|
|---|
| 65 |
|
|---|
| 66 | #define MAX_MATCH 258
|
|---|
| 67 | #define MAX_MATCH2 $128 /* MAX_MATCH/2-1 */
|
|---|
| 68 | #define MIN_MATCH 3
|
|---|
| 69 | #define WSIZE $32768
|
|---|
| 70 | #define MAX_DIST WSIZE - MAX_MATCH - MIN_MATCH - 1
|
|---|
| 71 |
|
|---|
| 72 | .globl _match_init
|
|---|
| 73 | .globl _longest_match
|
|---|
| 74 |
|
|---|
| 75 | .text
|
|---|
| 76 |
|
|---|
| 77 | _match_init:
|
|---|
| 78 | ret
|
|---|
| 79 |
|
|---|
| 80 | /*-----------------------------------------------------------------------
|
|---|
| 81 | * Set match_start to the longest match starting at the given string and
|
|---|
| 82 | * return its length. Matches shorter or equal to prev_length are discarded,
|
|---|
| 83 | * in which case the result is equal to prev_length and match_start is
|
|---|
| 84 | * garbage.
|
|---|
| 85 | * IN assertions: cur_match is the head of the hash chain for the current
|
|---|
| 86 | * string (strstart) and its distance is <= MAX_DIST, and prev_length >= 1
|
|---|
| 87 | */
|
|---|
| 88 |
|
|---|
| 89 | _longest_match: /* int longest_match(cur_match) */
|
|---|
| 90 |
|
|---|
| 91 | #define cur_match 20(%esp)
|
|---|
| 92 | /* return address */ /* esp+16 */
|
|---|
| 93 | push %ebp /* esp+12 */
|
|---|
| 94 | push %edi /* esp+8 */
|
|---|
| 95 | push %esi /* esp+4 */
|
|---|
| 96 | push %ebx /* esp */
|
|---|
| 97 |
|
|---|
| 98 | /*
|
|---|
| 99 | * match equ esi
|
|---|
| 100 | * scan equ edi
|
|---|
| 101 | * chain_length equ ebp
|
|---|
| 102 | * best_len equ ebx
|
|---|
| 103 | * limit equ edx
|
|---|
| 104 | */
|
|---|
| 105 | mov cur_match,%esi
|
|---|
| 106 | mov _max_chain_length,%ebp /* chain_length = max_chain_length */
|
|---|
| 107 | mov _strstart,%edi
|
|---|
| 108 | mov %edi,%edx
|
|---|
| 109 | sub MAX_DIST,%edx /* limit = strstart-MAX_DIST */
|
|---|
| 110 | jae limit_ok
|
|---|
| 111 | sub %edx,%edx /* limit = NIL */
|
|---|
| 112 | limit_ok:
|
|---|
| 113 | add $2+_window,%edi /* edi = offset(window+strstart+2) */
|
|---|
| 114 | mov _prev_length,%ebx /* best_len = prev_length */
|
|---|
| 115 | movw -3(%ebx,%edi),%ax /* ax = scan[best_len-1..best_len] */
|
|---|
| 116 | movw -2(%edi),%cx /* cx = scan[0..1] */
|
|---|
| 117 | cmp _good_match,%ebx /* do we have a good match already? */
|
|---|
| 118 | jb do_scan
|
|---|
| 119 | shr $2,%ebp /* chain_length >>= 2 */
|
|---|
| 120 | jmp do_scan
|
|---|
| 121 |
|
|---|
| 122 | .align 4
|
|---|
| 123 | long_loop:
|
|---|
| 124 | /* at this point, edi == scan+2, esi == cur_match */
|
|---|
| 125 | movw -3(%ebx,%edi),%ax /* ax = scan[best_len-1..best_len] */
|
|---|
| 126 | movw -2(%edi),%cx /* cx = scan[0..1] */
|
|---|
| 127 | short_loop:
|
|---|
| 128 | /*
|
|---|
| 129 | * at this point, di == scan+2, si == cur_match,
|
|---|
| 130 | * ax = scan[best_len-1..best_len] and cx = scan[0..1]
|
|---|
| 131 | */
|
|---|
| 132 | and WSIZE-1, %esi
|
|---|
| 133 | movw _prev(%esi,%esi),%si /* cur_match = prev[cur_match] */
|
|---|
| 134 | /* top word of esi is still 0 */
|
|---|
| 135 | cmp %edx,%esi /* cur_match <= limit ? */
|
|---|
| 136 | jbe the_end
|
|---|
| 137 | dec %ebp /* --chain_length */
|
|---|
| 138 | jz the_end
|
|---|
| 139 | do_scan:
|
|---|
| 140 | cmpw _window-1(%ebx,%esi),%ax/* check match at best_len-1 */
|
|---|
| 141 | jne short_loop
|
|---|
| 142 | cmpw _window(%esi),%cx /* check min_match_length match */
|
|---|
| 143 | jne short_loop
|
|---|
| 144 |
|
|---|
| 145 | lea _window+2(%esi),%esi /* si = match */
|
|---|
| 146 | mov %edi,%eax /* ax = scan+2 */
|
|---|
| 147 | mov MAX_MATCH2,%ecx /* scan for at most MAX_MATCH bytes */
|
|---|
| 148 | rep; cmpsw /* loop until mismatch */
|
|---|
| 149 | je maxmatch /* match of length MAX_MATCH? */
|
|---|
| 150 | mismatch:
|
|---|
| 151 | movb -2(%edi),%cl /* mismatch on first or second byte? */
|
|---|
| 152 | subb -2(%esi),%cl /* cl = 0 if first bytes equal */
|
|---|
| 153 | xchg %edi,%eax /* edi = scan+2, eax = end of scan */
|
|---|
| 154 | sub %edi,%eax /* eax = len */
|
|---|
| 155 | sub %eax,%esi /* esi = cur_match + 2 + offset(window) */
|
|---|
| 156 | sub $2+_window,%esi /* esi = cur_match */
|
|---|
| 157 | subb $1,%cl /* set carry if cl == 0 (cannot use DEC) */
|
|---|
| 158 | adc $0,%eax /* eax = carry ? len+1 : len */
|
|---|
| 159 | cmp %ebx,%eax /* len > best_len ? */
|
|---|
| 160 | jle long_loop
|
|---|
| 161 | mov %esi,_match_start /* match_start = cur_match */
|
|---|
| 162 | mov %eax,%ebx /* ebx = best_len = len */
|
|---|
| 163 | cmp _nice_match,%eax /* len >= nice_match ? */
|
|---|
| 164 | jl long_loop
|
|---|
| 165 | the_end:
|
|---|
| 166 | mov %ebx,%eax /* result = eax = best_len */
|
|---|
| 167 | pop %ebx
|
|---|
| 168 | pop %esi
|
|---|
| 169 | pop %edi
|
|---|
| 170 | pop %ebp
|
|---|
| 171 | ret
|
|---|
| 172 | maxmatch:
|
|---|
| 173 | cmpsb
|
|---|
| 174 | jmp mismatch
|
|---|
| 175 |
|
|---|
| 176 | #else
|
|---|
| 177 |
|
|---|
| 178 | /* ======================== 680x0 version ================================= */
|
|---|
| 179 |
|
|---|
| 180 | #if defined(m68k)||defined(mc68k)||defined(__mc68000__)||defined(__MC68000__)
|
|---|
| 181 | # ifndef mc68000
|
|---|
| 182 | # define mc68000
|
|---|
| 183 | # endif
|
|---|
| 184 | #endif
|
|---|
| 185 |
|
|---|
| 186 | #if defined(__mc68020__) || defined(__MC68020__) || defined(sysV68)
|
|---|
| 187 | # ifndef mc68020
|
|---|
| 188 | # define mc68020
|
|---|
| 189 | # endif
|
|---|
| 190 | #endif
|
|---|
| 191 |
|
|---|
| 192 | #if defined(mc68020) || defined(mc68000)
|
|---|
| 193 |
|
|---|
| 194 | #if (defined(mc68020) || defined(NeXT)) && !defined(UNALIGNED_OK)
|
|---|
| 195 | # define UNALIGNED_OK
|
|---|
| 196 | #endif
|
|---|
| 197 |
|
|---|
| 198 | #ifdef sysV68 /* Try Motorola Delta style */
|
|---|
| 199 |
|
|---|
| 200 | # define GLOBAL(symbol) global symbol
|
|---|
| 201 | # define TEXT text
|
|---|
| 202 | # define FILE(filename) file filename
|
|---|
| 203 | # define invert_maybe(src,dst) dst,src
|
|---|
| 204 | # define imm(data) &data
|
|---|
| 205 | # define reg(register) %register
|
|---|
| 206 |
|
|---|
| 207 | # define addl add.l
|
|---|
| 208 | # define addql addq.l
|
|---|
| 209 | # define blos blo.b
|
|---|
| 210 | # define bhis bhi.b
|
|---|
| 211 | # define bras bra.b
|
|---|
| 212 | # define clrl clr.l
|
|---|
| 213 | # define cmpmb cmpm.b
|
|---|
| 214 | # define cmpw cmp.w
|
|---|
| 215 | # define cmpl cmp.l
|
|---|
| 216 | # define lslw lsl.w
|
|---|
| 217 | # define lsrl lsr.l
|
|---|
| 218 | # define movel move.l
|
|---|
| 219 | # define movew move.w
|
|---|
| 220 | # define moveb move.b
|
|---|
| 221 | # define moveml movem.l
|
|---|
| 222 | # define subl sub.l
|
|---|
| 223 | # define subw sub.w
|
|---|
| 224 | # define subql subq.l
|
|---|
| 225 |
|
|---|
| 226 | # define IndBase(bd,An) (bd,An)
|
|---|
| 227 | # define IndBaseNdxl(bd,An,Xn) (bd,An,Xn.l)
|
|---|
| 228 | # define IndBaseNdxw(bd,An,Xn) (bd,An,Xn.w)
|
|---|
| 229 | # define predec(An) -(An)
|
|---|
| 230 | # define postinc(An) (An)+
|
|---|
| 231 |
|
|---|
| 232 | #else /* default style (Sun 3, NeXT, Amiga, Atari) */
|
|---|
| 233 |
|
|---|
| 234 | # define GLOBAL(symbol) .globl symbol
|
|---|
| 235 | # define TEXT .text
|
|---|
| 236 | # define FILE(filename) .even
|
|---|
| 237 | # define invert_maybe(src,dst) src,dst
|
|---|
| 238 | # if defined(sun) || defined(mc68k)
|
|---|
| 239 | # define imm(data) #data
|
|---|
| 240 | # else
|
|---|
| 241 | # define imm(data) \#data
|
|---|
| 242 | # endif
|
|---|
| 243 | # define reg(register) register
|
|---|
| 244 |
|
|---|
| 245 | # define blos bcss
|
|---|
| 246 | # if defined(sun) || defined(mc68k)
|
|---|
| 247 | # define movel movl
|
|---|
| 248 | # define movew movw
|
|---|
| 249 | # define moveb movb
|
|---|
| 250 | # endif
|
|---|
| 251 | # define IndBase(bd,An) An@(bd)
|
|---|
| 252 | # define IndBaseNdxl(bd,An,Xn) An@(bd,Xn:l)
|
|---|
| 253 | # define IndBaseNdxw(bd,An,Xn) An@(bd,Xn:w)
|
|---|
| 254 | # define predec(An) An@-
|
|---|
| 255 | # define postinc(An) An@+
|
|---|
| 256 |
|
|---|
| 257 | #endif /* styles */
|
|---|
| 258 |
|
|---|
| 259 | #define Best_Len reg(d0) /* unsigned */
|
|---|
| 260 | #define Cur_Match reg(d1) /* Ipos */
|
|---|
| 261 | #define Loop_Counter reg(d2) /* int */
|
|---|
| 262 | #define Scan_Start reg(d3) /* unsigned short */
|
|---|
| 263 | #define Scan_End reg(d4) /* unsigned short */
|
|---|
| 264 | #define Limit reg(d5) /* IPos */
|
|---|
| 265 | #define Chain_Length reg(d6) /* unsigned */
|
|---|
| 266 | #define Scan_Test reg(d7)
|
|---|
| 267 | #define Scan reg(a0) /* *uch */
|
|---|
| 268 | #define Match reg(a1) /* *uch */
|
|---|
| 269 | #define Prev_Address reg(a2) /* *Pos */
|
|---|
| 270 | #define Scan_Ini reg(a3) /* *uch */
|
|---|
| 271 | #define Match_Ini reg(a4) /* *uch */
|
|---|
| 272 | #define Stack_Pointer reg(sp)
|
|---|
| 273 |
|
|---|
| 274 | #define MAX_MATCH 258
|
|---|
| 275 | #define MIN_MATCH 3
|
|---|
| 276 | #define WSIZE 32768
|
|---|
| 277 | #define MAX_DIST (WSIZE - MAX_MATCH - MIN_MATCH - 1)
|
|---|
| 278 |
|
|---|
| 279 | GLOBAL (_match_init)
|
|---|
| 280 | GLOBAL (_longest_match)
|
|---|
| 281 |
|
|---|
| 282 | TEXT
|
|---|
| 283 |
|
|---|
| 284 | FILE ("match.S")
|
|---|
| 285 |
|
|---|
| 286 | _match_init:
|
|---|
| 287 | rts
|
|---|
| 288 |
|
|---|
| 289 | /*-----------------------------------------------------------------------
|
|---|
| 290 | * Set match_start to the longest match starting at the given string and
|
|---|
| 291 | * return its length. Matches shorter or equal to prev_length are discarded,
|
|---|
| 292 | * in which case the result is equal to prev_length and match_start is
|
|---|
| 293 | * garbage.
|
|---|
| 294 | * IN assertions: cur_match is the head of the hash chain for the current
|
|---|
| 295 | * string (strstart) and its distance is <= MAX_DIST, and prev_length >= 1
|
|---|
| 296 | */
|
|---|
| 297 |
|
|---|
| 298 | /* int longest_match (cur_match) */
|
|---|
| 299 |
|
|---|
| 300 | #ifdef UNALIGNED_OK
|
|---|
| 301 | # define pushreg 15928 /* d2-d6/a2-a4 */
|
|---|
| 302 | # define popreg 7292
|
|---|
| 303 | #else
|
|---|
| 304 | # define pushreg 16184 /* d2-d7/a2-a4 */
|
|---|
| 305 | # define popreg 7420
|
|---|
| 306 | #endif
|
|---|
| 307 |
|
|---|
| 308 | _longest_match:
|
|---|
| 309 | movel IndBase(4,Stack_Pointer),Cur_Match
|
|---|
| 310 | moveml imm(pushreg),predec(Stack_Pointer)
|
|---|
| 311 | movel _max_chain_length,Chain_Length
|
|---|
| 312 | movel _prev_length,Best_Len
|
|---|
| 313 | movel imm(_prev),Prev_Address
|
|---|
| 314 | movel imm(_window+MIN_MATCH),Match_Ini
|
|---|
| 315 | movel _strstart,Limit
|
|---|
| 316 | movel Match_Ini,Scan_Ini
|
|---|
| 317 | addl Limit,Scan_Ini
|
|---|
| 318 | subw imm(MAX_DIST),Limit
|
|---|
| 319 | bhis L__limit_ok
|
|---|
| 320 | clrl Limit
|
|---|
| 321 | L__limit_ok:
|
|---|
| 322 | cmpl invert_maybe(_good_match,Best_Len)
|
|---|
| 323 | blos L__length_ok
|
|---|
| 324 | lsrl imm(2),Chain_Length
|
|---|
| 325 | L__length_ok:
|
|---|
| 326 | subql imm(1),Chain_Length
|
|---|
| 327 | #ifdef UNALIGNED_OK
|
|---|
| 328 | movew IndBase(-MIN_MATCH,Scan_Ini),Scan_Start
|
|---|
| 329 | movew IndBaseNdxw(-MIN_MATCH-1,Scan_Ini,Best_Len),Scan_End
|
|---|
| 330 | #else
|
|---|
| 331 | moveb IndBase(-MIN_MATCH,Scan_Ini),Scan_Start
|
|---|
| 332 | lslw imm(8),Scan_Start
|
|---|
| 333 | moveb IndBase(-MIN_MATCH+1,Scan_Ini),Scan_Start
|
|---|
| 334 | moveb IndBaseNdxw(-MIN_MATCH-1,Scan_Ini,Best_Len),Scan_End
|
|---|
| 335 | lslw imm(8),Scan_End
|
|---|
| 336 | moveb IndBaseNdxw(-MIN_MATCH,Scan_Ini,Best_Len),Scan_End
|
|---|
| 337 | #endif
|
|---|
| 338 | bras L__do_scan
|
|---|
| 339 |
|
|---|
| 340 | L__long_loop:
|
|---|
| 341 | #ifdef UNALIGNED_OK
|
|---|
| 342 | movew IndBaseNdxw(-MIN_MATCH-1,Scan_Ini,Best_Len),Scan_End
|
|---|
| 343 | #else
|
|---|
| 344 | moveb IndBaseNdxw(-MIN_MATCH-1,Scan_Ini,Best_Len),Scan_End
|
|---|
| 345 | lslw imm(8),Scan_End
|
|---|
| 346 | moveb IndBaseNdxw(-MIN_MATCH,Scan_Ini,Best_Len),Scan_End
|
|---|
| 347 | #endif
|
|---|
| 348 |
|
|---|
| 349 | L__short_loop:
|
|---|
| 350 | lslw imm(1),Cur_Match
|
|---|
| 351 | movew IndBaseNdxl(0,Prev_Address,Cur_Match),Cur_Match
|
|---|
| 352 | cmpw invert_maybe(Limit,Cur_Match)
|
|---|
| 353 | dbls Chain_Length,L__do_scan
|
|---|
| 354 | bras L__return
|
|---|
| 355 |
|
|---|
| 356 | L__do_scan:
|
|---|
| 357 | movel Match_Ini,Match
|
|---|
| 358 | addl Cur_Match,Match
|
|---|
| 359 | #ifdef UNALIGNED_OK
|
|---|
| 360 | cmpw invert_maybe(IndBaseNdxw(-MIN_MATCH-1,Match,Best_Len),Scan_End)
|
|---|
| 361 | bne L__short_loop
|
|---|
| 362 | cmpw invert_maybe(IndBase(-MIN_MATCH,Match),Scan_Start)
|
|---|
| 363 | bne L__short_loop
|
|---|
| 364 | #else
|
|---|
| 365 | moveb IndBaseNdxw(-MIN_MATCH-1,Match,Best_Len),Scan_Test
|
|---|
| 366 | lslw imm(8),Scan_Test
|
|---|
| 367 | moveb IndBaseNdxw(-MIN_MATCH,Match,Best_Len),Scan_Test
|
|---|
| 368 | cmpw invert_maybe(Scan_Test,Scan_End)
|
|---|
| 369 | bne L__short_loop
|
|---|
| 370 | moveb IndBase(-MIN_MATCH,Match),Scan_Test
|
|---|
| 371 | lslw imm(8),Scan_Test
|
|---|
| 372 | moveb IndBase(-MIN_MATCH+1,Match),Scan_Test
|
|---|
| 373 | cmpw invert_maybe(Scan_Test,Scan_Start)
|
|---|
| 374 | bne L__short_loop
|
|---|
| 375 | #endif
|
|---|
| 376 |
|
|---|
| 377 | movew imm((MAX_MATCH-MIN_MATCH+1)-1),Loop_Counter
|
|---|
| 378 | movel Scan_Ini,Scan
|
|---|
| 379 | L__scan_loop:
|
|---|
| 380 | cmpmb postinc(Match),postinc(Scan)
|
|---|
| 381 | dbne Loop_Counter,L__scan_loop
|
|---|
| 382 |
|
|---|
| 383 | subl Scan_Ini,Scan
|
|---|
| 384 | addql imm(MIN_MATCH-1),Scan
|
|---|
| 385 | cmpl invert_maybe(Best_Len,Scan)
|
|---|
| 386 | bls L__short_loop
|
|---|
| 387 | movel Scan,Best_Len
|
|---|
| 388 | movel Cur_Match,_match_start
|
|---|
| 389 | cmpl invert_maybe(_nice_match,Best_Len)
|
|---|
| 390 | blos L__long_loop
|
|---|
| 391 | L__return:
|
|---|
| 392 | moveml postinc(Stack_Pointer),imm(popreg)
|
|---|
| 393 | rts
|
|---|
| 394 |
|
|---|
| 395 | #else
|
|---|
| 396 |
|
|---|
| 397 | # if defined (__ia64__)
|
|---|
| 398 |
|
|---|
| 399 | /* ======================== ia64 version ================================= */
|
|---|
| 400 |
|
|---|
| 401 | /*
|
|---|
| 402 | * 'longest_match.S' (assembly program for gzip for the IA-64 architecture)
|
|---|
| 403 | *
|
|---|
| 404 | * Optimised for McKinley, but with Merced-compatibility, such as MIB+MIB, used wherever
|
|---|
| 405 | * possible.
|
|---|
| 406 | *
|
|---|
| 407 | * Copyright: Sverre Jarp (HP Labs) 2001-2002
|
|---|
| 408 | *
|
|---|
| 409 | * See deflate.c for c-version
|
|---|
| 410 | * Version 2 - Optimize the outer loop
|
|---|
| 411 | */
|
|---|
| 412 |
|
|---|
| 413 | #include <endian.h>
|
|---|
| 414 |
|
|---|
| 415 | #if __BYTE_ORDER == ____BIG_ENDIAN
|
|---|
| 416 | #define first shl
|
|---|
| 417 | #define second shr.u
|
|---|
| 418 | #define count czx1.l
|
|---|
| 419 | #else
|
|---|
| 420 | #define first shr.u
|
|---|
| 421 | #define second shl
|
|---|
| 422 | #define count czx1.r
|
|---|
| 423 | #endif
|
|---|
| 424 |
|
|---|
| 425 | // 24 rotating register (r32 - r55)
|
|---|
| 426 |
|
|---|
| 427 | #define s_vmatch0 r32
|
|---|
| 428 | #define s_vmatch1 r33
|
|---|
| 429 | #define s_vmatbst r34
|
|---|
| 430 | #define s_vmatbst1 r35
|
|---|
| 431 | #define s_amatblen r36
|
|---|
| 432 |
|
|---|
| 433 | #define s_tm1 r56
|
|---|
| 434 | #define s_tm2 r57
|
|---|
| 435 | #define s_tm3 r58
|
|---|
| 436 | #define s_tm4 r59
|
|---|
| 437 | #define s_tm5 r60
|
|---|
| 438 | #define s_tm6 r61
|
|---|
| 439 | #define s_tm7 r62
|
|---|
| 440 | #define s_tm8 r63
|
|---|
| 441 |
|
|---|
| 442 | #define s_vlen r31
|
|---|
| 443 | #define s_vstrstart r30
|
|---|
| 444 | #define s_vchainlen r29
|
|---|
| 445 | #define s_awinbest r28
|
|---|
| 446 | #define s_vcurmatch r27
|
|---|
| 447 | #define s_vlimit r26
|
|---|
| 448 | #define s_vscanend r25
|
|---|
| 449 | #define s_vscanend1 r24
|
|---|
| 450 | #define s_anicematch r23
|
|---|
| 451 | #define s_vscan0 r22
|
|---|
| 452 | #define s_vscan1 r21
|
|---|
| 453 |
|
|---|
| 454 | #define s_aprev r20
|
|---|
| 455 | #define s_awindow r19
|
|---|
| 456 | #define s_amatchstart r18
|
|---|
| 457 | #define s_ascan r17
|
|---|
| 458 | #define s_amatch r16
|
|---|
| 459 | #define s_wmask r15
|
|---|
| 460 | #define s_ascanend r14
|
|---|
| 461 |
|
|---|
| 462 | #define s_vspec_cmatch r11 // next iteration
|
|---|
| 463 | #define s_lcsave r10
|
|---|
| 464 | #define s_prsave r9
|
|---|
| 465 | #define s_vbestlen r8 // return register
|
|---|
| 466 |
|
|---|
| 467 | #define s_vscan3 r3
|
|---|
| 468 | #define s_vmatch3 r2
|
|---|
| 469 |
|
|---|
| 470 | #define p_no p2
|
|---|
| 471 | #define p_yes p3
|
|---|
| 472 | #define p_shf p4 //
|
|---|
| 473 | #define p_bn2 p5 // Use in loop (indicating bestlen != 2)
|
|---|
| 474 |
|
|---|
| 475 | #define p_nbs p9 // not new best_len
|
|---|
| 476 | #define p_nnc p10 // not nice_length
|
|---|
| 477 | #define p_ll p11
|
|---|
| 478 | #define p_end p12
|
|---|
| 479 |
|
|---|
| 480 | #define MAX_MATCH 258
|
|---|
| 481 | #define MIN_MATCH 4
|
|---|
| 482 | #define WSIZE 32768
|
|---|
| 483 | #define MAX_DIST WSIZE - MAX_MATCH - MIN_MATCH - 1
|
|---|
| 484 |
|
|---|
| 485 | #define R_INPUT 1
|
|---|
| 486 | #define R_LOCAL 31
|
|---|
| 487 | #define R_OUTPUT 0
|
|---|
| 488 | #define R_ROTATING 24
|
|---|
| 489 | #define MLAT 3
|
|---|
| 490 | #define SHLAT 2
|
|---|
| 491 |
|
|---|
| 492 | #define mova mov
|
|---|
| 493 | #define movi0 mov
|
|---|
| 494 | #define cgtu cmp.gt.unc
|
|---|
| 495 | #define cgeu cmp.ge.unc
|
|---|
| 496 | #define cneu cmp.ne.unc
|
|---|
| 497 |
|
|---|
| 498 | .global longest_match
|
|---|
| 499 | .proc longest_match
|
|---|
| 500 | .align 32
|
|---|
| 501 | longest_match:
|
|---|
| 502 | // -- Cycle: 0
|
|---|
| 503 | .prologue
|
|---|
| 504 | {.mmi
|
|---|
| 505 | alloc r2=ar.pfs,R_INPUT,R_LOCAL,R_OUTPUT,R_ROTATING
|
|---|
| 506 | .rotr scan[MLAT+2], match[MLAT+2], shscan0[SHLAT+1], \
|
|---|
| 507 | shscan1[SHLAT+1], shmatch0[SHLAT+1], shmatch1[SHLAT+1]
|
|---|
| 508 | .rotp lc[MLAT+SHLAT+2]
|
|---|
| 509 | mova s_vspec_cmatch=in0 // cur_match from input register
|
|---|
| 510 | add s_tm1=@gprel(strstart),gp // a(a(strstart))
|
|---|
| 511 | }{.mmi
|
|---|
| 512 | add s_tm3=@gprel(prev_length),gp // a(a(prev_length))
|
|---|
| 513 | add s_tm5=@ltoff(window),gp // a(a(window))
|
|---|
| 514 | add s_tm6=@ltoff(prev),gp // a(a(prev))
|
|---|
| 515 | ;;
|
|---|
| 516 | }{.mmb // Cycle: 1
|
|---|
| 517 | ld4 s_vstrstart=[s_tm1] // strstart
|
|---|
| 518 | ld4 s_vbestlen=[s_tm3] // best_len = prev_length
|
|---|
| 519 | brp.loop.imp .cmploop,.cmploop+48
|
|---|
| 520 | }{.mli
|
|---|
| 521 | add s_tm2=@gprel(max_chain_length),gp // a(a(max_chain_length))
|
|---|
| 522 | movl s_wmask=WSIZE-1
|
|---|
| 523 | ;;
|
|---|
| 524 | }{.mmi // Cycle: 2
|
|---|
| 525 | ld8 s_aprev=[s_tm6] // a(prev)
|
|---|
| 526 | ld8 s_awindow=[s_tm5] // a(window)
|
|---|
| 527 | .save pr, s_prsave
|
|---|
| 528 | movi0 s_prsave=pr // save predicates
|
|---|
| 529 | }{.mmi
|
|---|
| 530 | add s_tm4=@gprel(good_match),gp // a(a(good_match))
|
|---|
| 531 | add s_tm7=@ltoff(nice_match),gp // a(a(nice_match))
|
|---|
| 532 | add s_tm8=@ltoff(match_start),gp // a(match_start)
|
|---|
| 533 | ;;
|
|---|
| 534 | }{.mmi // Cycle: 3
|
|---|
| 535 | ld8 s_anicematch=[s_tm7] // a(nice_match)
|
|---|
| 536 | ld8 s_amatchstart=[s_tm8] // a(match_start)
|
|---|
| 537 | .save ar.lc, s_lcsave
|
|---|
| 538 | movi0 s_lcsave=ar.lc // save loop count register
|
|---|
| 539 | }{.mmi
|
|---|
| 540 | .body
|
|---|
| 541 | add s_tm1=-(MAX_MATCH + MIN_MATCH),s_wmask // maxdist
|
|---|
| 542 | cmp.eq p_ll,p0=r0,r0 // parallel compare initialized as 'true'
|
|---|
| 543 | mova s_vcurmatch=s_vspec_cmatch
|
|---|
| 544 | ;;
|
|---|
| 545 | }{.mmi // Cycle: 4
|
|---|
| 546 | ld4 s_vchainlen=[s_tm2] // chain_length=max_chain_length
|
|---|
| 547 | ld4 s_tm4=[s_tm4] // v(good_match)
|
|---|
| 548 | add s_ascan=s_awindow,s_vstrstart // scan=window + strstart
|
|---|
| 549 | }{.mmi
|
|---|
| 550 | sub s_vlimit=s_vstrstart, s_tm1 // limit=strstart - MAX_DIST
|
|---|
| 551 | add s_amatch=s_awindow,s_vspec_cmatch // match=window + cur_match
|
|---|
| 552 | and s_vspec_cmatch =s_vspec_cmatch,s_wmask
|
|---|
| 553 | ;;
|
|---|
| 554 | }{.mmi // Cycle: 5
|
|---|
| 555 | add s_amatblen=s_amatch,s_vbestlen //
|
|---|
| 556 | cneu p_bn2,p0=2,s_vbestlen // set if bestlen != 2
|
|---|
| 557 | add s_ascanend=s_ascan,s_vbestlen // compute a(scan) + best_len
|
|---|
| 558 | }{.mmi
|
|---|
| 559 | ld1 s_vscan0=[s_ascan],1 // NB: s_ascan++
|
|---|
| 560 | ld1 s_vmatch0=[s_amatch],1
|
|---|
| 561 | cgtu p0,p_no=s_vlimit,r0 // is result positive ?
|
|---|
| 562 | ;;
|
|---|
| 563 | }{.mmi // Cycle: 6
|
|---|
| 564 | ld1.nt1 s_vscan1=[s_ascan],2 // NB: s_ascan+3 in total
|
|---|
| 565 | ld1.nt1 s_vmatch1=[s_amatch],2
|
|---|
| 566 | add s_awinbest=s_awindow,s_vbestlen //
|
|---|
| 567 | ;;
|
|---|
| 568 | }{.mmi // Cycle: 7
|
|---|
| 569 | ld1.nt1 s_vscanend=[s_ascanend],-1 // scan_end=scan[best_len]
|
|---|
| 570 | ld1.nt1 s_vmatbst=[s_amatblen],-1
|
|---|
| 571 | (p_no) mova s_vlimit=r0
|
|---|
| 572 | ;;
|
|---|
| 573 | }{.mmi // Cycle: 8
|
|---|
| 574 | (p_bn2) ld1.nt1 s_vscanend1=[s_ascanend],1 // scan_end1=scan[best_len-1]
|
|---|
| 575 | (p_bn2) ld1.nt1 s_vmatbst1=[s_amatblen]
|
|---|
| 576 | shladd s_vspec_cmatch =s_vspec_cmatch,1,s_aprev
|
|---|
| 577 | }{.mmi
|
|---|
| 578 | cgeu p_shf,p0=s_vbestlen,s_tm4 // is (prev_length >= good_match) ?
|
|---|
| 579 | ;;
|
|---|
| 580 | }{.mmi // Cycle: 9
|
|---|
| 581 | ld1.nt1 s_vscan3=[s_ascan]
|
|---|
| 582 | ld2.nt1 s_vspec_cmatch=[s_vspec_cmatch]
|
|---|
| 583 | mova s_vlen=3
|
|---|
| 584 | }{.mmi
|
|---|
| 585 | (p_shf) shr.u s_vchainlen=s_vchainlen,2 // (cur_len) >> 2
|
|---|
| 586 | ;;
|
|---|
| 587 | }{.mmi // Cycle: 10
|
|---|
| 588 | ld1.nt1 s_vmatch3=[s_amatch]
|
|---|
| 589 | // p_ll switched on as soon as we get a mismatch:
|
|---|
| 590 | cmp.eq.and p_ll,p0=s_vmatch0,s_vscan0
|
|---|
| 591 | cmp.eq.and p_ll,p0=s_vmatbst,s_vscanend
|
|---|
| 592 | }{.mib
|
|---|
| 593 | cmp.eq.and p_ll,p0=s_vmatch1,s_vscan1
|
|---|
| 594 | (p_bn2) cmp.eq.and p_ll,p0=s_vmatbst1,s_vscanend1
|
|---|
| 595 | (p_ll) br.cond.dpnt.many .test_more
|
|---|
| 596 | ;;
|
|---|
| 597 | }
|
|---|
| 598 |
|
|---|
| 599 | .next_iter:
|
|---|
| 600 | {.mmi // Cycle 0
|
|---|
| 601 | add s_amatch=s_awindow,s_vspec_cmatch // match=window + cur_match
|
|---|
| 602 | mov s_vcurmatch=s_vspec_cmatch // current value
|
|---|
| 603 | add s_vchainlen=-1,s_vchainlen // --chain_length
|
|---|
| 604 | }{.mib
|
|---|
| 605 | cmp.le.unc p_end,p0=s_vspec_cmatch,s_vlimit
|
|---|
| 606 | and s_vspec_cmatch=s_vspec_cmatch,s_wmask
|
|---|
| 607 | (p_end) br.cond.dptk.many .terminate
|
|---|
| 608 | ;;
|
|---|
| 609 | }{.mmi // Cycle 1
|
|---|
| 610 | ld1 s_vmatch0=[s_amatch],1 // load match[0]
|
|---|
| 611 | // compute prev[cur_match]:
|
|---|
| 612 | shladd s_vspec_cmatch=s_vspec_cmatch,1,s_aprev
|
|---|
| 613 | cmp.eq.unc p_end,p0=s_vchainlen,r0
|
|---|
| 614 | } {.mib
|
|---|
| 615 | nop.m 0
|
|---|
| 616 | add s_amatblen=s_awinbest,s_vcurmatch // match=window + cur_match
|
|---|
| 617 | (p_end) br.cond.dptk.many .terminate
|
|---|
| 618 | ;;
|
|---|
| 619 | }{.mmi // Cycle 2 (short)
|
|---|
| 620 | ld2.nt1 s_vspec_cmatch=[s_vspec_cmatch] // get next cur_match
|
|---|
| 621 | ;;
|
|---|
| 622 | }{.mmi // Cycle 3 (short)
|
|---|
| 623 | ld1.nt1 s_vmatbst=[s_amatblen],-1 // load match[best_len]
|
|---|
| 624 | cmp.ne.unc p_ll,p0=r0,r0 // parallel compare initialized as 'false'
|
|---|
| 625 | ;;
|
|---|
| 626 | }{.mmi // Cycle 4 (short)
|
|---|
| 627 | // load match[1] - - note: match += 3 (in total):
|
|---|
| 628 | ld1.nt1 s_vmatch1=[s_amatch],2
|
|---|
| 629 | ;;
|
|---|
| 630 | // Cycle 5 (short)
|
|---|
| 631 | (p_bn2) ld1.nt1 s_vmatbst1=[s_amatblen] // load match[best_len-1]
|
|---|
| 632 | }{.mib // Here we (MOST LIKELY) pay a L2-fetch stall
|
|---|
| 633 | // p_ll switched on as soon as we get a mismatch:
|
|---|
| 634 | cmp.ne.or p_ll,p0=s_vmatch0,s_vscan0
|
|---|
| 635 | cmp.ne.or p_ll,p0=s_vmatbst,s_vscanend
|
|---|
| 636 | (p_ll) br.cond.dptk.many .next_iter
|
|---|
| 637 | ;;
|
|---|
| 638 | }{.mmi // Cycle 6
|
|---|
| 639 | ld1.nt1 s_vmatch3=[s_amatch]
|
|---|
| 640 | mova s_vlen=3
|
|---|
| 641 | nop.i 0
|
|---|
| 642 | }{.mib
|
|---|
| 643 | cmp.ne.or p_ll,p0=s_vmatch1,s_vscan1
|
|---|
| 644 | (p_bn2) cmp.ne.or p_ll,p0=s_vmatbst1,s_vscanend1
|
|---|
| 645 | (p_ll) br.cond.dptk.many .next_iter
|
|---|
| 646 | ;;
|
|---|
| 647 | }
|
|---|
| 648 |
|
|---|
| 649 | // We have passed the first hurdle - Are there additional matches ???
|
|---|
| 650 |
|
|---|
| 651 | .test_more:
|
|---|
| 652 | {.mmi // Cycle 0
|
|---|
| 653 | and s_tm3=7,s_ascan // get byte offset
|
|---|
| 654 | and s_tm4=7,s_amatch // get byte offset
|
|---|
| 655 | movi0 ar.ec=MLAT+SHLAT+2 // NB: One trip more than usual
|
|---|
| 656 | }{.mib
|
|---|
| 657 | cmp.ne.unc p_no,p0=s_vscan3,s_vmatch3 // does not next one differ?
|
|---|
| 658 | (p_no) br.cond.dptk.many .only3
|
|---|
| 659 | ;;
|
|---|
| 660 | }{.mmi // Cycle 1
|
|---|
| 661 | and s_tm1=-8,s_ascan // get aligned address
|
|---|
| 662 | shladd s_tm3=s_tm3,3,r0
|
|---|
| 663 | movi0 ar.lc=31 // 32 times around the loop (8B at a time)
|
|---|
| 664 | }{.mib
|
|---|
| 665 | and s_tm2=-8,s_amatch // get aligned address
|
|---|
| 666 | shladd s_tm4=s_tm4,3,r0
|
|---|
| 667 | nop.b 0
|
|---|
| 668 | ;;
|
|---|
| 669 | }{.mmi // Cycle 2
|
|---|
| 670 | ld8.nt1 scan[1]=[s_tm1],8 // load first chunk
|
|---|
| 671 | sub s_tm5=64,s_tm3 // 64 - amount
|
|---|
| 672 | movi0 pr.rot=1<<16
|
|---|
| 673 | }{.mmi
|
|---|
| 674 | ld8.nt1 match[1]=[s_tm2],8 // load first chunk
|
|---|
| 675 | sub s_tm6=64,s_tm4 // 64 - amount
|
|---|
| 676 | add s_vlen=-8,s_vlen // will be updated at least once
|
|---|
| 677 | ;;
|
|---|
| 678 | }
|
|---|
| 679 | .align 32
|
|---|
| 680 | .cmploop:
|
|---|
| 681 | {.mmi // Cycle 0
|
|---|
| 682 | (lc[0]) ld8 scan[0]=[s_tm1],8 // next scan chunk
|
|---|
| 683 | (lc[MLAT+SHLAT+1]) add s_vlen=8,s_vlen
|
|---|
| 684 | (lc[MLAT]) first shscan0[0]=scan[MLAT+1],s_tm3
|
|---|
| 685 | }{.mib
|
|---|
| 686 | (lc[MLAT+SHLAT+1]) cmp.ne.unc p_no,p0=s_tm7,s_tm8 // break search if !=
|
|---|
| 687 | (lc[MLAT]) first shmatch0[0]=match[MLAT+1],s_tm4
|
|---|
| 688 | (p_no) br.cond.dpnt.many .mismatch
|
|---|
| 689 | ;;
|
|---|
| 690 | }{.mii // Cycle 1
|
|---|
| 691 | (lc[0]) ld8 match[0]=[s_tm2],8
|
|---|
| 692 | // shift left(le) or right(be):
|
|---|
| 693 | (lc[MLAT]) second shscan1[0]=scan[MLAT],s_tm5
|
|---|
| 694 | (lc[MLAT]) second shmatch1[0]=match[MLAT],s_tm6
|
|---|
| 695 | }{.mmb
|
|---|
| 696 | (lc[MLAT+SHLAT]) or s_tm7=shscan0[SHLAT],shscan1[SHLAT]
|
|---|
| 697 | (lc[MLAT+SHLAT]) or s_tm8=shmatch0[SHLAT],shmatch1[SHLAT]
|
|---|
| 698 | br.ctop.dptk.many .cmploop
|
|---|
| 699 | ;;
|
|---|
| 700 | }{.mfi
|
|---|
| 701 | mov s_vlen=258
|
|---|
| 702 | nop.f 0
|
|---|
| 703 | }{.mfi
|
|---|
| 704 | nop.f 0 // realign
|
|---|
| 705 | ;;
|
|---|
| 706 | }
|
|---|
| 707 | .mismatch:
|
|---|
| 708 | {.mii // Cycle 0 (short)
|
|---|
| 709 | (p_no) pcmp1.eq s_tm2=s_tm7,s_tm8 // find first non-matching character
|
|---|
| 710 | nop.i 0
|
|---|
| 711 | ;;
|
|---|
| 712 | // Cycle 1 (short)
|
|---|
| 713 | (p_no) count s_tm1=s_tm2
|
|---|
| 714 | ;;
|
|---|
| 715 | }{.mib // Cycle 2 (short)
|
|---|
| 716 | (p_no) add s_vlen=s_vlen,s_tm1 // effective length
|
|---|
| 717 | nop.i 0
|
|---|
| 718 | clrrrb
|
|---|
| 719 | ;;
|
|---|
| 720 | }
|
|---|
| 721 |
|
|---|
| 722 | .only3:
|
|---|
| 723 | {.mib // Cycle 0 (short)
|
|---|
| 724 | cmp.gt.unc p0,p_nbs=s_vlen,s_vbestlen // (len > best_len) ?
|
|---|
| 725 | (p_nbs) br.cond.dpnt.many .next_iter // if not, re-iternate
|
|---|
| 726 | ;;
|
|---|
| 727 | }{.mmi // Cycle 1 (short)
|
|---|
| 728 | ld4 s_tm7=[s_anicematch] // nice_match
|
|---|
| 729 | st4 [s_amatchstart]= s_vcurmatch
|
|---|
| 730 | add s_ascanend=s_ascan,s_vlen // reset with best_len
|
|---|
| 731 | ;;
|
|---|
| 732 | }{.mmi // Cycle 2 (short)
|
|---|
| 733 | mova s_vbestlen=s_vlen
|
|---|
| 734 | add s_ascanend=-3,s_ascanend // remember extra offset
|
|---|
| 735 | ;;
|
|---|
| 736 | }{.mmi // Cycle 3 (short)
|
|---|
| 737 | ld1 s_vscanend=[s_ascanend],-1 // scan_end=scan[best_len]
|
|---|
| 738 | add s_awinbest=s_awindow,s_vbestlen // update with new best_len
|
|---|
| 739 | cmp.ne.unc p_bn2,p0=2,s_vbestlen // set if bestlen != 2
|
|---|
| 740 | ;;
|
|---|
| 741 | }{.mib // Cycle 4 (short)
|
|---|
| 742 | // scan_end1=scan[best_len-1] NB: s_ascanend reset:
|
|---|
| 743 | ld1.nt1 s_vscanend1=[s_ascanend],1
|
|---|
| 744 | cmp.lt.unc p_nnc,p0=s_vlen,s_tm7 // compare with nice_match
|
|---|
| 745 | (p_nnc) br.cond.dptk.many .next_iter
|
|---|
| 746 | ;;
|
|---|
| 747 | }
|
|---|
| 748 | .terminate:
|
|---|
| 749 | {.mii // Cycle 0/1
|
|---|
| 750 | nop.m 0
|
|---|
| 751 | movi0 ar.lc=s_lcsave
|
|---|
| 752 | movi0 pr=s_prsave,-1
|
|---|
| 753 | }{.mbb
|
|---|
| 754 | nop.m 0
|
|---|
| 755 | nop.b 0
|
|---|
| 756 | br.ret.sptk.many rp // ret0 is identical to best_len
|
|---|
| 757 | ;;
|
|---|
| 758 | }
|
|---|
| 759 | .endp
|
|---|
| 760 |
|
|---|
| 761 | .global match_init
|
|---|
| 762 | .proc match_init
|
|---|
| 763 | match_init:
|
|---|
| 764 | sub ret0=ret0,ret0
|
|---|
| 765 | br.ret.sptk.many rp
|
|---|
| 766 | .endp
|
|---|
| 767 |
|
|---|
| 768 | # else
|
|---|
| 769 | error: this asm version is for 386 or 680x0 or ia64 only
|
|---|
| 770 | # endif /* __ia64__ */
|
|---|
| 771 | #endif /* mc68000 || mc68020 */
|
|---|
| 772 | #endif /* i386 || _I386 */
|
|---|