1 /* nfa - NFA construction routines */
4 * Copyright (c) 1990 The Regents of the University of California.
7 * This code is derived from software contributed to Berkeley by
10 * The United States Government has rights in this work pursuant
11 * to contract no. DE-AC03-76SF00098 between the United States
12 * Department of Energy and the University of California.
14 * Redistribution and use in source and binary forms are permitted provided
15 * that: (1) source distributions retain this entire copyright notice and
16 * comment, and (2) distributions including binaries display the following
17 * acknowledgement: ``This product includes software developed by the
18 * University of California, Berkeley and its contributors'' in the
19 * documentation or other materials provided with the distribution and in
20 * all advertising materials mentioning features or use of this software.
21 * Neither the name of the University nor the names of its contributors may
22 * be used to endorse or promote products derived from this software without
23 * specific prior written permission.
24 * THIS SOFTWARE IS PROVIDED ``AS IS'' AND WITHOUT ANY EXPRESS OR IMPLIED
25 * WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED WARRANTIES OF
26 * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE.
31 "@(#) $Id: nfa.c,v 1.2 1994/06/24 10:57:15 ceriel Exp $ (LBL)";
37 /* declare functions that have forward references */
39 int dupmachine PROTO((int));
40 void mkxtion PROTO((int, int));
43 /* add_accept - add an accepting state to a machine
47 * add_accept( mach, accepting_number );
49 * accepting_number becomes mach's accepting number.
52 void add_accept( mach, accepting_number )
53 int mach, accepting_number;
56 /* hang the accepting number off an epsilon state. if it is associated
57 * with a state that has a non-epsilon out-transition, then the state
58 * will accept BEFORE it makes that transition, i.e., one character
62 if ( transchar[finalst[mach]] == SYM_EPSILON )
63 accptnum[finalst[mach]] = accepting_number;
67 int astate = mkstate( SYM_EPSILON );
68 accptnum[astate] = accepting_number;
69 mach = link_machines( mach, astate );
74 /* copysingl - make a given number of copies of a singleton machine
78 * newsng = copysingl( singl, num );
80 * newsng - a new singleton composed of num copies of singl
81 * singl - a singleton machine
82 * num - the number of copies of singl to be present in newsng
85 int copysingl( singl, num )
91 copy = mkstate( SYM_EPSILON );
93 for ( i = 1; i <= num; ++i )
94 copy = link_machines( copy, dupmachine( singl ) );
100 /* dumpnfa - debugging routine to write out an nfa
107 void dumpnfa( state1 )
111 int sym, tsp1, tsp2, anum, ns;
113 fprintf( stderr, "\n\n********** beginning dump of nfa with start state %d\n",
116 /* we probably should loop starting at firstst[state1] and going to
117 * lastst[state1], but they're not maintained properly when we "or"
118 * all of the rules together. So we use our knowledge that the machine
119 * starts at state 1 and ends at lastnfa.
122 /* for ( ns = firstst[state1]; ns <= lastst[state1]; ++ns ) */
123 for ( ns = 1; ns <= lastnfa; ++ns )
125 fprintf( stderr, "state # %4d\t", ns );
132 fprintf( stderr, "%3d: %4d, %4d", sym, tsp1, tsp2 );
135 fprintf( stderr, " [%d]", anum );
137 fprintf( stderr, "\n" );
140 fprintf( stderr, "********** end of dump\n" );
144 /* dupmachine - make a duplicate of a given machine
148 * copy = dupmachine( mach );
150 * copy - holds duplicate of mach
151 * mach - machine to be duplicated
153 * note that the copy of mach is NOT an exact duplicate; rather, all the
154 * transition states values are adjusted so that the copy is self-contained,
155 * as the original should have been.
157 * also note that the original MUST be contiguous, with its low and high
158 * states accessible by the arrays firstst and lastst
161 int dupmachine( mach )
165 int i, init, state_offset;
167 int last = lastst[mach];
169 for ( i = firstst[mach]; i <= last; ++i )
171 state = mkstate( transchar[i] );
173 if ( trans1[i] != NO_TRANSITION )
175 mkxtion( finalst[state], trans1[i] + state - i );
177 if ( transchar[i] == SYM_EPSILON && trans2[i] != NO_TRANSITION )
178 mkxtion( finalst[state], trans2[i] + state - i );
181 accptnum[state] = accptnum[i];
185 flexfatal( "empty machine in dupmachine()" );
187 state_offset = state - i + 1;
189 init = mach + state_offset;
190 firstst[init] = firstst[mach] + state_offset;
191 finalst[init] = finalst[mach] + state_offset;
192 lastst[init] = lastst[mach] + state_offset;
198 /* finish_rule - finish up the processing for a rule
202 * finish_rule( mach, variable_trail_rule, headcnt, trailcnt );
204 * An accepting number is added to the given machine. If variable_trail_rule
205 * is true then the rule has trailing context and both the head and trail
206 * are variable size. Otherwise if headcnt or trailcnt is non-zero then
207 * the machine recognizes a pattern with trailing context and headcnt is
208 * the number of characters in the matched part of the pattern, or zero
209 * if the matched part has variable length. trailcnt is the number of
210 * trailing context characters in the pattern, or zero if the trailing
211 * context has variable length.
214 void finish_rule( mach, variable_trail_rule, headcnt, trailcnt )
215 int mach, variable_trail_rule, headcnt, trailcnt;
218 add_accept( mach, num_rules );
220 /* we did this in new_rule(), but it often gets the wrong
221 * number because we do it before we start parsing the current rule
223 rule_linenum[num_rules] = linenum;
225 /* if this is a continued action, then the line-number has
226 * already been updated, giving us the wrong number
228 if ( continued_action )
229 --rule_linenum[num_rules];
231 fprintf( temp_action_file, "case %d:\n", num_rules );
233 if ( variable_trail_rule )
235 rule_type[num_rules] = RULE_VARIABLE;
237 if ( performance_report )
238 fprintf( stderr, "Variable trailing context rule at line %d\n",
239 rule_linenum[num_rules] );
241 variable_trailing_context_rules = true;
246 rule_type[num_rules] = RULE_NORMAL;
248 if ( headcnt > 0 || trailcnt > 0 )
250 /* do trailing context magic to not match the trailing characters */
251 char *scanner_cp = "yy_c_buf_p = yy_cp";
252 char *scanner_bp = "yy_bp";
254 fprintf( temp_action_file,
255 "*yy_cp = yy_hold_char; /* undo effects of setting up yytext */\n" );
258 fprintf( temp_action_file, "%s = %s + %d;\n",
259 scanner_cp, scanner_bp, headcnt );
262 fprintf( temp_action_file,
263 "%s -= %d;\n", scanner_cp, trailcnt );
265 fprintf( temp_action_file,
266 "YY_DO_BEFORE_ACTION; /* set up yytext again */\n" );
270 line_directive_out( temp_action_file );
274 /* link_machines - connect two machines together
278 * new = link_machines( first, last );
280 * new - a machine constructed by connecting first to last
281 * first - the machine whose successor is to be last
282 * last - the machine whose predecessor is to be first
284 * note: this routine concatenates the machine first with the machine
285 * last to produce a machine new which will pattern-match first first
286 * and then last, and will fail if either of the sub-patterns fails.
287 * FIRST is set to new by the operation. last is unmolested.
290 int link_machines( first, last )
297 else if ( last == NIL )
302 mkxtion( finalst[first], last );
303 finalst[first] = finalst[last];
304 lastst[first] = max( lastst[first], lastst[last] );
305 firstst[first] = min( firstst[first], firstst[last] );
312 /* mark_beginning_as_normal - mark each "beginning" state in a machine
313 * as being a "normal" (i.e., not trailing context-
318 * mark_beginning_as_normal( mach )
320 * mach - machine to mark
322 * The "beginning" states are the epsilon closure of the first state
325 void mark_beginning_as_normal( mach )
329 switch ( state_type[mach] )
332 /* oh, we've already visited here */
335 case STATE_TRAILING_CONTEXT:
336 state_type[mach] = STATE_NORMAL;
338 if ( transchar[mach] == SYM_EPSILON )
340 if ( trans1[mach] != NO_TRANSITION )
341 mark_beginning_as_normal( trans1[mach] );
343 if ( trans2[mach] != NO_TRANSITION )
344 mark_beginning_as_normal( trans2[mach] );
349 flexerror( "bad state type in mark_beginning_as_normal()" );
355 /* mkbranch - make a machine that branches to two machines
359 * branch = mkbranch( first, second );
361 * branch - a machine which matches either first's pattern or second's
362 * first, second - machines whose patterns are to be or'ed (the | operator)
364 * note that first and second are NEITHER destroyed by the operation. Also,
365 * the resulting machine CANNOT be used with any other "mk" operation except
366 * more mkbranch's. Compare with mkor()
369 int mkbranch( first, second )
375 if ( first == NO_TRANSITION )
378 else if ( second == NO_TRANSITION )
381 eps = mkstate( SYM_EPSILON );
383 mkxtion( eps, first );
384 mkxtion( eps, second );
/* mkclos - convert a machine into a closure
 *
 * synopsis
 *   new = mkclos( state );
 *
 *     new - a new state which matches the closure of "state"
 *
 * The closure is built as an optional positive closure, i.e. the result
 * of mkposcl() is handed to mkopt().
 */

int mkclos( state )
int state;
{
    return ( mkopt( mkposcl( state ) ) );
}
406 /* mkopt - make a machine optional
410 * new = mkopt( mach );
412 * new - a machine which optionally matches whatever mach matched
413 * mach - the machine to make optional
416 * 1. mach must be the last machine created
417 * 2. mach is destroyed by the call
426 if ( ! SUPER_FREE_EPSILON(finalst[mach]) )
428 eps = mkstate( SYM_EPSILON );
429 mach = link_machines( mach, eps );
432 /* can't skimp on the following if FREE_EPSILON(mach) is true because
433 * some state interior to "mach" might point back to the beginning
436 eps = mkstate( SYM_EPSILON );
437 mach = link_machines( eps, mach );
439 mkxtion( mach, finalst[mach] );
445 /* mkor - make a machine that matches either one of two machines
449 * new = mkor( first, second );
451 * new - a machine which matches either first's pattern or second's
452 * first, second - machines whose patterns are to be or'ed (the | operator)
454 * note that first and second are both destroyed by the operation
455 * the code is rather convoluted because an attempt is made to minimize
456 * the number of epsilon states needed
459 int mkor( first, second )
468 else if ( second == NIL )
473 /* see comment in mkopt() about why we can't use the first state
474 * of "first" or "second" if they satisfy "FREE_EPSILON"
476 eps = mkstate( SYM_EPSILON );
478 first = link_machines( eps, first );
480 mkxtion( first, second );
482 if ( SUPER_FREE_EPSILON(finalst[first]) &&
483 accptnum[finalst[first]] == NIL )
485 orend = finalst[first];
486 mkxtion( finalst[second], orend );
489 else if ( SUPER_FREE_EPSILON(finalst[second]) &&
490 accptnum[finalst[second]] == NIL )
492 orend = finalst[second];
493 mkxtion( finalst[first], orend );
498 eps = mkstate( SYM_EPSILON );
500 first = link_machines( first, eps );
501 orend = finalst[first];
503 mkxtion( finalst[second], orend );
507 finalst[first] = orend;
512 /* mkposcl - convert a machine into a positive closure
515 * new = mkposcl( state );
517 * new - a machine matching the positive closure of "state"
526 if ( SUPER_FREE_EPSILON(finalst[state]) )
528 mkxtion( finalst[state], state );
534 eps = mkstate( SYM_EPSILON );
535 mkxtion( eps, state );
536 return ( link_machines( state, eps ) );
541 /* mkrep - make a replicated machine
544 * new = mkrep( mach, lb, ub );
546 * new - a machine that matches whatever "mach" matched from "lb"
547 * number of times to "ub" number of times
550 * if "ub" is INFINITY then "new" matches "lb" or more occurrences of "mach"
553 int mkrep( mach, lb, ub )
557 int base_mach, tail, copy, i;
559 base_mach = copysingl( mach, lb - 1 );
561 if ( ub == INFINITY )
563 copy = dupmachine( mach );
564 mach = link_machines( mach,
565 link_machines( base_mach, mkclos( copy ) ) );
570 tail = mkstate( SYM_EPSILON );
572 for ( i = lb; i < ub; ++i )
574 copy = dupmachine( mach );
575 tail = mkopt( link_machines( copy, tail ) );
578 mach = link_machines( mach, link_machines( base_mach, tail ) );
585 /* mkstate - create a state with a transition on a given symbol
589 * state = mkstate( sym );
591 * state - a new state matching sym
592 * sym - the symbol the new state is to have an out-transition on
594 * note that this routine makes new states in ascending order through the
595 * state array (and increments LASTNFA accordingly). The routine DUPMACHINE
596 * relies on machines being made in ascending order and that they are
597 * CONTIGUOUS. Change it and you will have to rewrite DUPMACHINE (kludge
598 * that it admittedly is)
605 if ( ++lastnfa >= current_mns )
607 if ( (current_mns += MNS_INCREMENT) >= MAXIMUM_MNS )
608 lerrif( "input rules are too complicated (>= %d NFA states)",
613 firstst = reallocate_integer_array( firstst, current_mns );
614 lastst = reallocate_integer_array( lastst, current_mns );
615 finalst = reallocate_integer_array( finalst, current_mns );
616 transchar = reallocate_integer_array( transchar, current_mns );
617 trans1 = reallocate_integer_array( trans1, current_mns );
618 trans2 = reallocate_integer_array( trans2, current_mns );
619 accptnum = reallocate_integer_array( accptnum, current_mns );
620 assoc_rule = reallocate_integer_array( assoc_rule, current_mns );
621 state_type = reallocate_integer_array( state_type, current_mns );
624 firstst[lastnfa] = lastnfa;
625 finalst[lastnfa] = lastnfa;
626 lastst[lastnfa] = lastnfa;
627 transchar[lastnfa] = sym;
628 trans1[lastnfa] = NO_TRANSITION;
629 trans2[lastnfa] = NO_TRANSITION;
630 accptnum[lastnfa] = NIL;
631 assoc_rule[lastnfa] = num_rules;
632 state_type[lastnfa] = current_state_type;
634 /* fix up equivalence classes base on this transition. Note that any
635 * character which has its own transition gets its own equivalence class.
636 * Thus only characters which are only in character classes have a chance
637 * at being in the same equivalence class. E.g. "a|b" puts 'a' and 'b'
638 * into two different equivalence classes. "[ab]" puts them in the same
639 * equivalence class (barring other differences elsewhere in the input).
644 /* we don't have to update the equivalence classes since that was
645 * already done when the ccl was created for the first time
649 else if ( sym == SYM_EPSILON )
655 /* map NUL's to csize */
656 mkechar( sym ? sym : csize, nextecm, ecgroup );
663 /* mkxtion - make a transition from one state to another
667 * mkxtion( statefrom, stateto );
669 * statefrom - the state from which the transition is to be made
670 * stateto - the state to which the transition is to be made
673 void mkxtion( statefrom, stateto )
674 int statefrom, stateto;
677 if ( trans1[statefrom] == NO_TRANSITION )
678 trans1[statefrom] = stateto;
680 else if ( (transchar[statefrom] != SYM_EPSILON) ||
681 (trans2[statefrom] != NO_TRANSITION) )
682 flexfatal( "found too many transitions in mkxtion()" );
685 { /* second out-transition for an epsilon state */
687 trans2[statefrom] = stateto;
691 /* new_rule - initialize for a new rule
697 * the global num_rules is incremented and the any corresponding dynamic
698 * arrays (such as rule_type[]) are grown as needed.
704 if ( ++num_rules >= current_max_rules )
707 current_max_rules += MAX_RULES_INCREMENT;
708 rule_type = reallocate_integer_array( rule_type, current_max_rules );
710 reallocate_integer_array( rule_linenum, current_max_rules );
713 if ( num_rules > MAX_RULE )
714 lerrif( "too many rules (> %d)!", MAX_RULE );
716 rule_linenum[num_rules] = linenum;