source: trunk/essentials/sys-devel/flex/nfa.c@ 3476

Last change on this file since 3476 was 3043, checked in by bird, 19 years ago

-> essentials

File size: 17.4 KB
Line 
1/* nfa - NFA construction routines */
2
3/* Copyright (c) 1990 The Regents of the University of California. */
4/* All rights reserved. */
5
6/* This code is derived from software contributed to Berkeley by */
7/* Vern Paxson. */
8
9/* The United States Government has rights in this work pursuant */
10/* to contract no. DE-AC03-76SF00098 between the United States */
11/* Department of Energy and the University of California. */
12
13/* This file is part of flex. */
14
15/* Redistribution and use in source and binary forms, with or without */
16/* modification, are permitted provided that the following conditions */
17/* are met: */
18
19/* 1. Redistributions of source code must retain the above copyright */
20/* notice, this list of conditions and the following disclaimer. */
21/* 2. Redistributions in binary form must reproduce the above copyright */
22/* notice, this list of conditions and the following disclaimer in the */
23/* documentation and/or other materials provided with the distribution. */
24
25/* Neither the name of the University nor the names of its contributors */
26/* may be used to endorse or promote products derived from this software */
27/* without specific prior written permission. */
28
29/* THIS SOFTWARE IS PROVIDED ``AS IS'' AND WITHOUT ANY EXPRESS OR */
30/* IMPLIED WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED */
31/* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR */
32/* PURPOSE. */
33
34#include "flexdef.h"
35
36
37/* declare functions that have forward references */
38
39int dupmachine PROTO ((int));
40void mkxtion PROTO ((int, int));
41
42
43/* add_accept - add an accepting state to a machine
44 *
45 * accepting_number becomes mach's accepting number.
46 */
47
48void add_accept (mach, accepting_number)
49 int mach, accepting_number;
50{
51 /* Hang the accepting number off an epsilon state. if it is associated
52 * with a state that has a non-epsilon out-transition, then the state
53 * will accept BEFORE it makes that transition, i.e., one character
54 * too soon.
55 */
56
57 if (transchar[finalst[mach]] == SYM_EPSILON)
58 accptnum[finalst[mach]] = accepting_number;
59
60 else {
61 int astate = mkstate (SYM_EPSILON);
62
63 accptnum[astate] = accepting_number;
64 (void) link_machines (mach, astate);
65 }
66}
67
68
69/* copysingl - make a given number of copies of a singleton machine
70 *
71 * synopsis
72 *
73 * newsng = copysingl( singl, num );
74 *
75 * newsng - a new singleton composed of num copies of singl
76 * singl - a singleton machine
77 * num - the number of copies of singl to be present in newsng
78 */
79
80int copysingl (singl, num)
81 int singl, num;
82{
83 int copy, i;
84
85 copy = mkstate (SYM_EPSILON);
86
87 for (i = 1; i <= num; ++i)
88 copy = link_machines (copy, dupmachine (singl));
89
90 return copy;
91}
92
93
94/* dumpnfa - debugging routine to write out an nfa */
95
96void dumpnfa (state1)
97 int state1;
98
99{
100 int sym, tsp1, tsp2, anum, ns;
101
102 fprintf (stderr,
103 _
104 ("\n\n********** beginning dump of nfa with start state %d\n"),
105 state1);
106
107 /* We probably should loop starting at firstst[state1] and going to
108 * lastst[state1], but they're not maintained properly when we "or"
109 * all of the rules together. So we use our knowledge that the machine
110 * starts at state 1 and ends at lastnfa.
111 */
112
113 /* for ( ns = firstst[state1]; ns <= lastst[state1]; ++ns ) */
114 for (ns = 1; ns <= lastnfa; ++ns) {
115 fprintf (stderr, _("state # %4d\t"), ns);
116
117 sym = transchar[ns];
118 tsp1 = trans1[ns];
119 tsp2 = trans2[ns];
120 anum = accptnum[ns];
121
122 fprintf (stderr, "%3d: %4d, %4d", sym, tsp1, tsp2);
123
124 if (anum != NIL)
125 fprintf (stderr, " [%d]", anum);
126
127 fprintf (stderr, "\n");
128 }
129
130 fprintf (stderr, _("********** end of dump\n"));
131}
132
133
134/* dupmachine - make a duplicate of a given machine
135 *
136 * synopsis
137 *
138 * copy = dupmachine( mach );
139 *
140 * copy - holds duplicate of mach
141 * mach - machine to be duplicated
142 *
143 * note that the copy of mach is NOT an exact duplicate; rather, all the
144 * transition states values are adjusted so that the copy is self-contained,
145 * as the original should have been.
146 *
147 * also note that the original MUST be contiguous, with its low and high
148 * states accessible by the arrays firstst and lastst
149 */
150
151int dupmachine (mach)
152 int mach;
153{
154 int i, init, state_offset;
155 int state = 0;
156 int last = lastst[mach];
157
158 for (i = firstst[mach]; i <= last; ++i) {
159 state = mkstate (transchar[i]);
160
161 if (trans1[i] != NO_TRANSITION) {
162 mkxtion (finalst[state], trans1[i] + state - i);
163
164 if (transchar[i] == SYM_EPSILON &&
165 trans2[i] != NO_TRANSITION)
166 mkxtion (finalst[state],
167 trans2[i] + state - i);
168 }
169
170 accptnum[state] = accptnum[i];
171 }
172
173 if (state == 0)
174 flexfatal (_("empty machine in dupmachine()"));
175
176 state_offset = state - i + 1;
177
178 init = mach + state_offset;
179 firstst[init] = firstst[mach] + state_offset;
180 finalst[init] = finalst[mach] + state_offset;
181 lastst[init] = lastst[mach] + state_offset;
182
183 return init;
184}
185
186
187/* finish_rule - finish up the processing for a rule
188 *
189 * An accepting number is added to the given machine. If variable_trail_rule
190 * is true then the rule has trailing context and both the head and trail
191 * are variable size. Otherwise if headcnt or trailcnt is non-zero then
192 * the machine recognizes a pattern with trailing context and headcnt is
193 * the number of characters in the matched part of the pattern, or zero
194 * if the matched part has variable length. trailcnt is the number of
195 * trailing context characters in the pattern, or zero if the trailing
196 * context has variable length.
197 */
198
199void finish_rule (mach, variable_trail_rule, headcnt, trailcnt,
200 pcont_act)
201 int mach, variable_trail_rule, headcnt, trailcnt, pcont_act;
202{
203 char action_text[MAXLINE];
204
205 add_accept (mach, num_rules);
206
207 /* We did this in new_rule(), but it often gets the wrong
208 * number because we do it before we start parsing the current rule.
209 */
210 rule_linenum[num_rules] = linenum;
211
212 /* If this is a continued action, then the line-number has already
213 * been updated, giving us the wrong number.
214 */
215 if (continued_action)
216 --rule_linenum[num_rules];
217
218
219 /* If the previous rule was continued action, then we inherit the
220 * previous newline flag, possibly overriding the current one.
221 */
222 if (pcont_act && rule_has_nl[num_rules - 1])
223 rule_has_nl[num_rules] = true;
224
225 sprintf (action_text, "case %d:\n", num_rules);
226 add_action (action_text);
227 if (rule_has_nl[num_rules]) {
228 sprintf (action_text, "/* rule %d can match eol */\n",
229 num_rules);
230 add_action (action_text);
231 }
232
233
234 if (variable_trail_rule) {
235 rule_type[num_rules] = RULE_VARIABLE;
236
237 if (performance_report > 0)
238 fprintf (stderr,
239 _
240 ("Variable trailing context rule at line %d\n"),
241 rule_linenum[num_rules]);
242
243 variable_trailing_context_rules = true;
244 }
245
246 else {
247 rule_type[num_rules] = RULE_NORMAL;
248
249 if (headcnt > 0 || trailcnt > 0) {
250 /* Do trailing context magic to not match the trailing
251 * characters.
252 */
253 char *scanner_cp = "YY_G(yy_c_buf_p) = yy_cp";
254 char *scanner_bp = "yy_bp";
255
256 add_action
257 ("*yy_cp = YY_G(yy_hold_char); /* undo effects of setting up yytext */\n");
258
259 if (headcnt > 0) {
260 sprintf (action_text, "%s = %s + %d;\n",
261 scanner_cp, scanner_bp, headcnt);
262 add_action (action_text);
263 }
264
265 else {
266 sprintf (action_text, "%s -= %d;\n",
267 scanner_cp, trailcnt);
268 add_action (action_text);
269 }
270
271 add_action
272 ("YY_DO_BEFORE_ACTION; /* set up yytext again */\n");
273 }
274 }
275
276 /* Okay, in the action code at this point yytext and yyleng have
277 * their proper final values for this rule, so here's the point
278 * to do any user action. But don't do it for continued actions,
279 * as that'll result in multiple YY_RULE_SETUP's.
280 */
281 if (!continued_action)
282 add_action ("YY_RULE_SETUP\n");
283
284 line_directive_out ((FILE *) 0, 1);
285}
286
287
288/* link_machines - connect two machines together
289 *
290 * synopsis
291 *
292 * new = link_machines( first, last );
293 *
294 * new - a machine constructed by connecting first to last
295 * first - the machine whose successor is to be last
296 * last - the machine whose predecessor is to be first
297 *
298 * note: this routine concatenates the machine first with the machine
299 * last to produce a machine new which will pattern-match first first
300 * and then last, and will fail if either of the sub-patterns fails.
301 * FIRST is set to new by the operation. last is unmolested.
302 */
303
304int link_machines (first, last)
305 int first, last;
306{
307 if (first == NIL)
308 return last;
309
310 else if (last == NIL)
311 return first;
312
313 else {
314 mkxtion (finalst[first], last);
315 finalst[first] = finalst[last];
316 lastst[first] = MAX (lastst[first], lastst[last]);
317 firstst[first] = MIN (firstst[first], firstst[last]);
318
319 return first;
320 }
321}
322
323
324/* mark_beginning_as_normal - mark each "beginning" state in a machine
325 * as being a "normal" (i.e., not trailing context-
326 * associated) states
327 *
328 * The "beginning" states are the epsilon closure of the first state
329 */
330
331void mark_beginning_as_normal (mach)
332 register int mach;
333{
334 switch (state_type[mach]) {
335 case STATE_NORMAL:
336 /* Oh, we've already visited here. */
337 return;
338
339 case STATE_TRAILING_CONTEXT:
340 state_type[mach] = STATE_NORMAL;
341
342 if (transchar[mach] == SYM_EPSILON) {
343 if (trans1[mach] != NO_TRANSITION)
344 mark_beginning_as_normal (trans1[mach]);
345
346 if (trans2[mach] != NO_TRANSITION)
347 mark_beginning_as_normal (trans2[mach]);
348 }
349 break;
350
351 default:
352 flexerror (_
353 ("bad state type in mark_beginning_as_normal()"));
354 break;
355 }
356}
357
358
359/* mkbranch - make a machine that branches to two machines
360 *
361 * synopsis
362 *
363 * branch = mkbranch( first, second );
364 *
365 * branch - a machine which matches either first's pattern or second's
366 * first, second - machines whose patterns are to be or'ed (the | operator)
367 *
368 * Note that first and second are NEITHER destroyed by the operation. Also,
369 * the resulting machine CANNOT be used with any other "mk" operation except
370 * more mkbranch's. Compare with mkor()
371 */
372
373int mkbranch (first, second)
374 int first, second;
375{
376 int eps;
377
378 if (first == NO_TRANSITION)
379 return second;
380
381 else if (second == NO_TRANSITION)
382 return first;
383
384 eps = mkstate (SYM_EPSILON);
385
386 mkxtion (eps, first);
387 mkxtion (eps, second);
388
389 return eps;
390}
391
392
393/* mkclos - convert a machine into a closure
394 *
395 * synopsis
396 * new = mkclos( state );
397 *
398 * new - a new state which matches the closure of "state"
399 */
400
401int mkclos (state)
402 int state;
403{
404 return mkopt (mkposcl (state));
405}
406
407
408/* mkopt - make a machine optional
409 *
410 * synopsis
411 *
412 * new = mkopt( mach );
413 *
414 * new - a machine which optionally matches whatever mach matched
415 * mach - the machine to make optional
416 *
417 * notes:
418 * 1. mach must be the last machine created
419 * 2. mach is destroyed by the call
420 */
421
422int mkopt (mach)
423 int mach;
424{
425 int eps;
426
427 if (!SUPER_FREE_EPSILON (finalst[mach])) {
428 eps = mkstate (SYM_EPSILON);
429 mach = link_machines (mach, eps);
430 }
431
432 /* Can't skimp on the following if FREE_EPSILON(mach) is true because
433 * some state interior to "mach" might point back to the beginning
434 * for a closure.
435 */
436 eps = mkstate (SYM_EPSILON);
437 mach = link_machines (eps, mach);
438
439 mkxtion (mach, finalst[mach]);
440
441 return mach;
442}
443
444
445/* mkor - make a machine that matches either one of two machines
446 *
447 * synopsis
448 *
449 * new = mkor( first, second );
450 *
451 * new - a machine which matches either first's pattern or second's
452 * first, second - machines whose patterns are to be or'ed (the | operator)
453 *
454 * note that first and second are both destroyed by the operation
455 * the code is rather convoluted because an attempt is made to minimize
456 * the number of epsilon states needed
457 */
458
459int mkor (first, second)
460 int first, second;
461{
462 int eps, orend;
463
464 if (first == NIL)
465 return second;
466
467 else if (second == NIL)
468 return first;
469
470 else {
471 /* See comment in mkopt() about why we can't use the first
472 * state of "first" or "second" if they satisfy "FREE_EPSILON".
473 */
474 eps = mkstate (SYM_EPSILON);
475
476 first = link_machines (eps, first);
477
478 mkxtion (first, second);
479
480 if (SUPER_FREE_EPSILON (finalst[first]) &&
481 accptnum[finalst[first]] == NIL) {
482 orend = finalst[first];
483 mkxtion (finalst[second], orend);
484 }
485
486 else if (SUPER_FREE_EPSILON (finalst[second]) &&
487 accptnum[finalst[second]] == NIL) {
488 orend = finalst[second];
489 mkxtion (finalst[first], orend);
490 }
491
492 else {
493 eps = mkstate (SYM_EPSILON);
494
495 first = link_machines (first, eps);
496 orend = finalst[first];
497
498 mkxtion (finalst[second], orend);
499 }
500 }
501
502 finalst[first] = orend;
503 return first;
504}
505
506
507/* mkposcl - convert a machine into a positive closure
508 *
509 * synopsis
510 * new = mkposcl( state );
511 *
512 * new - a machine matching the positive closure of "state"
513 */
514
515int mkposcl (state)
516 int state;
517{
518 int eps;
519
520 if (SUPER_FREE_EPSILON (finalst[state])) {
521 mkxtion (finalst[state], state);
522 return state;
523 }
524
525 else {
526 eps = mkstate (SYM_EPSILON);
527 mkxtion (eps, state);
528 return link_machines (state, eps);
529 }
530}
531
532
533/* mkrep - make a replicated machine
534 *
535 * synopsis
536 * new = mkrep( mach, lb, ub );
537 *
538 * new - a machine that matches whatever "mach" matched from "lb"
539 * number of times to "ub" number of times
540 *
541 * note
542 * if "ub" is INFINITE_REPEAT then "new" matches "lb" or more occurrences of "mach"
543 */
544
545int mkrep (mach, lb, ub)
546 int mach, lb, ub;
547{
548 int base_mach, tail, copy, i;
549
550 base_mach = copysingl (mach, lb - 1);
551
552 if (ub == INFINITE_REPEAT) {
553 copy = dupmachine (mach);
554 mach = link_machines (mach,
555 link_machines (base_mach,
556 mkclos (copy)));
557 }
558
559 else {
560 tail = mkstate (SYM_EPSILON);
561
562 for (i = lb; i < ub; ++i) {
563 copy = dupmachine (mach);
564 tail = mkopt (link_machines (copy, tail));
565 }
566
567 mach =
568 link_machines (mach,
569 link_machines (base_mach, tail));
570 }
571
572 return mach;
573}
574
575
576/* mkstate - create a state with a transition on a given symbol
577 *
578 * synopsis
579 *
580 * state = mkstate( sym );
581 *
582 * state - a new state matching sym
583 * sym - the symbol the new state is to have an out-transition on
584 *
585 * note that this routine makes new states in ascending order through the
586 * state array (and increments LASTNFA accordingly). The routine DUPMACHINE
587 * relies on machines being made in ascending order and that they are
588 * CONTIGUOUS. Change it and you will have to rewrite DUPMACHINE (kludge
589 * that it admittedly is)
590 */
591
592int mkstate (sym)
593 int sym;
594{
595 if (++lastnfa >= current_mns) {
596 if ((current_mns += MNS_INCREMENT) >= maximum_mns)
597 lerrif (_
598 ("input rules are too complicated (>= %d NFA states)"),
599current_mns);
600
601 ++num_reallocs;
602
603 firstst = reallocate_integer_array (firstst, current_mns);
604 lastst = reallocate_integer_array (lastst, current_mns);
605 finalst = reallocate_integer_array (finalst, current_mns);
606 transchar =
607 reallocate_integer_array (transchar, current_mns);
608 trans1 = reallocate_integer_array (trans1, current_mns);
609 trans2 = reallocate_integer_array (trans2, current_mns);
610 accptnum =
611 reallocate_integer_array (accptnum, current_mns);
612 assoc_rule =
613 reallocate_integer_array (assoc_rule, current_mns);
614 state_type =
615 reallocate_integer_array (state_type, current_mns);
616 }
617
618 firstst[lastnfa] = lastnfa;
619 finalst[lastnfa] = lastnfa;
620 lastst[lastnfa] = lastnfa;
621 transchar[lastnfa] = sym;
622 trans1[lastnfa] = NO_TRANSITION;
623 trans2[lastnfa] = NO_TRANSITION;
624 accptnum[lastnfa] = NIL;
625 assoc_rule[lastnfa] = num_rules;
626 state_type[lastnfa] = current_state_type;
627
628 /* Fix up equivalence classes base on this transition. Note that any
629 * character which has its own transition gets its own equivalence
630 * class. Thus only characters which are only in character classes
631 * have a chance at being in the same equivalence class. E.g. "a|b"
632 * puts 'a' and 'b' into two different equivalence classes. "[ab]"
633 * puts them in the same equivalence class (barring other differences
634 * elsewhere in the input).
635 */
636
637 if (sym < 0) {
638 /* We don't have to update the equivalence classes since
639 * that was already done when the ccl was created for the
640 * first time.
641 */
642 }
643
644 else if (sym == SYM_EPSILON)
645 ++numeps;
646
647 else {
648 check_char (sym);
649
650 if (useecs)
651 /* Map NUL's to csize. */
652 mkechar (sym ? sym : csize, nextecm, ecgroup);
653 }
654
655 return lastnfa;
656}
657
658
659/* mkxtion - make a transition from one state to another
660 *
661 * synopsis
662 *
663 * mkxtion( statefrom, stateto );
664 *
665 * statefrom - the state from which the transition is to be made
666 * stateto - the state to which the transition is to be made
667 */
668
669void mkxtion (statefrom, stateto)
670 int statefrom, stateto;
671{
672 if (trans1[statefrom] == NO_TRANSITION)
673 trans1[statefrom] = stateto;
674
675 else if ((transchar[statefrom] != SYM_EPSILON) ||
676 (trans2[statefrom] != NO_TRANSITION))
677 flexfatal (_("found too many transitions in mkxtion()"));
678
679 else { /* second out-transition for an epsilon state */
680 ++eps2;
681 trans2[statefrom] = stateto;
682 }
683}
684
685/* new_rule - initialize for a new rule */
686
687void new_rule ()
688{
689 if (++num_rules >= current_max_rules) {
690 ++num_reallocs;
691 current_max_rules += MAX_RULES_INCREMENT;
692 rule_type = reallocate_integer_array (rule_type,
693 current_max_rules);
694 rule_linenum = reallocate_integer_array (rule_linenum,
695 current_max_rules);
696 rule_useful = reallocate_integer_array (rule_useful,
697 current_max_rules);
698 rule_has_nl = reallocate_bool_array (rule_has_nl,
699 current_max_rules);
700 }
701
702 if (num_rules > MAX_RULE)
703 lerrif (_("too many rules (> %d)!"), MAX_RULE);
704
705 rule_linenum[num_rules] = linenum;
706 rule_useful[num_rules] = false;
707 rule_has_nl[num_rules] = false;
708}
Note: See TracBrowser for help on using the repository browser.