2 * (c) copyright 1987 by the Vrije Universiteit, Amsterdam, The Netherlands.
3 * See the copyright notice in the ACK home directory, in the file "Copyright".
5 /* $Id: LLlex.c,v 1.27 1994/06/27 07:57:57 ceriel Exp $ */
6 /* L E X I C A L A N A L Y Z E R */
25 #include "specials.h" /* registration of special identifiers */
27 /* Data about the token yielded */
28 struct token dot, ahead, aside; /* `dot' is what gets saved on and restored from LexStack; `ahead' and `aside' presumably hold the look-ahead token and a token put aside (see LLlex) */
29 int token_nmb = 0; /* number of the ahead token */
30 int tk_nmb_at_last_syn_err = -5/*ERR_SHADOW*/;
31 /* token number at last syntax error */
32 int idfsize = IDFSIZE; /* bound tested against the character position while an identifier is being read */
33 char sp_occurred[SP_TOTAL+1]; /* entry [id_special] is set to 1 when an identifier is scanned */
36 int ReplaceMacros = 1; /* replacing macros */
37 int AccDefined = 0; /* accept "defined(...)" */
38 int UnknownIdIsZero = 0; /* interpret unknown id as integer 0 */
39 int Unstacked = 0; /* an unstack is done */
40 extern int InputLevel; /* comment openers are only honoured by the scanner when this is zero */
42 int AccFileSpecifier = 0; /* return filespecifier <...> */
43 int EoiForNewline = 0; /* return EOI upon encountering newline */
44 int File_Inserted = 0; /* a file has just been inserted */
45 int LexSave = 0; /* last character read by GetChar */
46 #define MAX_LL_DEPTH 2 /* number of entries in LexStack; pushing asserts LexSP stays below it */
48 #define FLG_ESEEN 0x01 /* possibly a floating point number */
49 #define FLG_DOTSEEN 0x02 /* certainly a floating point number */
50 extern arith full_mask[]; /* indexed by int_size and long_size when classifying integer constants */
53 extern int lint_skip_comment; /* when zero, comment characters are handed to the lint_*comment routines */
57 static struct token LexStack[MAX_LL_DEPTH]; /* saved copies of `dot', one per nested use of the scanner */
60 /* In PushLex() the actions are taken in order to initialise or
61 re-initialise the lexical scanner.
62 E.g. at the invocation of a sub-parser that uses LLlex(), the
63 state of the current parser should be saved.
67 ASSERT(LexSP < MAX_LL_DEPTH);
68 ASSERT(ASIDE == 0); /* ASIDE = 0; */
70 LexStack[LexSP++] = dot;
76 dot = LexStack[--LexSP];
83 /* LLlex() plays the role of Lexical Analyzer for the C parser.
84 The look-ahead and putting aside of tokens are taken into
87 if (ASIDE) { /* a token is put aside */
91 else { /* read ahead and return the old one */
96 /* the following test is performed due to the dual
97 task of LLlex(): it is also called for parsing the
98 restricted constant expression following a #if or
99 #elif. The newline character causes EOF to be
100 returned in this case to stop the LLgen parsing task.
111 char *string_token();
112 arith char_constant();
116 register struct token *ptok;
118 /* GetToken() is the actual token recognizer. It calls the
119 control line interpreter if it encounters a "\n{w}*#"
120 combination. Macro replacement is also performed if it is
123 char buf[(IDFSIZE > NUMSIZE ? IDFSIZE : NUMSIZE) + 1];
124 register int ch, nch;
133 again: /* rescan the input after an error or replacement */
135 go_on: /* rescan, the following character has been read */
136 if ((ch & 0200) && ch != EOI) /* stop on non-ascii character */
138 fatal("non-ascii '\\%03o' read", ch & 0377);
140 /* keep track of the place of the token in the file */
141 ptok->tk_file = FileName;
142 ptok->tk_line = LineNumber;
144 switch (class(ch)) { /* detect character class */
145 case STNL: /* newline, vertical space or formfeed */
147 LineNumber++; /* also at vs and ff */
148 ptok->tk_file = FileName;
149 ptok->tk_line = LineNumber;
150 if (EoiForNewline) /* called in control line */
151 /* a newline in a control line indicates the
152 end-of-information of the line.
154 return ptok->tk_symb = EOI;
156 while ((ch = GetChar()),
161 || class(ch) == STSKIP)) {
162 /* blanks are allowed before hashes */
164 /* a control line follows */
171 } else if (ch == '/') {
172 if ((GetChar() == '*') && !InputLevel) {
181 /* We have to loop here, because in
182 `domacro' the nl, vt or ff is read. The
183 character following it may again be a `#'.
186 case STSKIP: /* just skip the skip characters */
188 case STGARB: /* garbage character */
192 if (040 < ch && ch < 0177) {
193 return ptok->tk_symb = ch;
195 lexerror("garbage char \\%03o", ch);
198 case STSIMP: /* a simple character, no part of compound token*/
199 return ptok->tk_symb = ch;
200 case STCOMP: /* maybe the start of a compound token */
201 nch = GetChar(); /* character lookahead */
205 return ptok->tk_symb = NOTEQUAL;
209 return ptok->tk_symb = AND;
211 return ptok->tk_symb = ANDAB;
215 return ptok->tk_symb = PLUSPLUS;
217 return ptok->tk_symb = PLUSAB;
221 return ptok->tk_symb = MINMIN;
223 return ptok->tk_symb = ARROW;
225 return ptok->tk_symb = MINAB;
228 if (AccFileSpecifier) {
229 UnGetChar(); /* pushback nch */
230 ptok->tk_bts = string_token("file specifier",
231 '>', &(ptok->tk_len));
232 return ptok->tk_symb = FILESPECIFIER;
235 if ((nch = GetChar()) == '=')
236 return ptok->tk_symb = LEFTAB;
238 return ptok->tk_symb = LEFT;
241 return ptok->tk_symb = LESSEQ;
245 return ptok->tk_symb = EQUAL;
249 return ptok->tk_symb = GREATEREQ;
251 if ((nch = GetChar()) == '=')
252 return ptok->tk_symb = RIGHTAB;
254 return ptok->tk_symb = RIGHT;
259 return ptok->tk_symb = OR;
261 return ptok->tk_symb = ORAB;
265 return ptok->tk_symb = MODAB;
269 return ptok->tk_symb = TIMESAB;
273 return ptok->tk_symb = XORAB;
277 if (nch == '*' && !InputLevel) {
283 return ptok->tk_symb = DIVAB;
286 crash("bad class for char 0%o", ch);
290 return ptok->tk_symb = ch;
291 case STCHAR: /* character constant */
292 ptok->tk_ival = char_constant("character");
294 return ptok->tk_symb = INTEGER;
295 case STSTR: /* string */
296 ptok->tk_bts = string_token("string", '"', &(ptok->tk_len));
297 ptok->tk_fund = CHAR; /* string of characters */
298 return ptok->tk_symb = STRING;
299 case STELL: /* wide character constant/string prefix */
302 ptok->tk_bts = string_token("wide character string",
303 '"', &(ptok->tk_len));
304 ptok->tk_fund = WCHAR; /* string of wide characters */
305 return ptok->tk_symb = STRING;
306 } else if (nch == '\'') {
307 ptok->tk_ival = char_constant("wide character");
309 return ptok->tk_symb = INTEGER;
315 register char *tg = &buf[0];
316 register int pos = -1;
317 register struct idf *idef;
318 extern int idfsize; /* ??? */
320 int NoExpandNext = 0;
322 if (Unstacked) EnableMacros(); /* unstack macro's when allowed. */
328 do { /* read the identifier */
329 if (++pos < idfsize) {
333 } while (in_idf(ch));
337 *tg++ = '\0'; /* mark the end of the identifier */
338 idef = ptok->tk_idf = str2idf(buf, 1);
339 sp_occurred[idef->id_special] = 1;
340 idef->id_file = ptok->tk_file;
341 idef->id_line = ptok->tk_line;
343 if (idef->id_macro && ReplaceMacros && !NoExpandNext) {
347 if (UnknownIdIsZero && idef->id_reserved != SIZEOF) {
348 ptok->tk_ival = (arith)0;
350 return ptok->tk_symb = INTEGER;
356 : idef->id_def && idef->id_def->df_sc == TYPEDEF
362 case STNUM: /* a numeric constant */
364 register int siz_left = NUMSIZE - 1;
365 register char *np = &buf[0];
368 #define store(ch) if (--siz_left >= 0) \
372 /* An embarrassing ambiguity. We have either a
373 pp-number, a field operator, an ELLIPSIS or
377 if (!is_dig(ch)) { /* . or ... */
379 if ((ch = GetChar()) == '.')
380 return ptok->tk_symb = ELLIPSIS;
381 UnGetChar(); /* not '.' */
382 ChPushBack('.'); /* sigh ... */
384 UnGetChar(); /* not '.' */
385 return ptok->tk_symb = '.';
389 flags |= FLG_DOTSEEN;
393 while(in_idf(ch) || ch == '.') {
395 if (ch == '.') flags |= FLG_DOTSEEN;
396 if (ch == 'e' || ch == 'E') {
399 if (ch == '+' || ch == '-') {
400 flags |= FLG_DOTSEEN; /* trick */
404 } else ch = GetChar();
412 lexerror("number too long");
413 if ((flags & FLG_DOTSEEN)
414 || (flags & FLG_ESEEN
416 && (*np == 'x' || *np == 'X')))) {
417 ptok->tk_fval = Salloc("0.0", (unsigned) 4);
418 ptok->tk_fund = DOUBLE;
419 return ptok->tk_symb = FLOATING;
422 ptok->tk_fund = ULONG;
423 ptok->tk_symb = INTEGER;
425 /* Now, the pp-number must be converted into a token */
426 if ((flags & FLG_DOTSEEN)
427 || (flags & FLG_ESEEN
428 && !(ch == '0' && (*np == 'x' || *np == 'X')))) {
429 strflt2tok(&buf[0], ptok);
430 return ptok->tk_symb = FLOATING;
432 strint2tok(&buf[0], ptok);
433 return ptok->tk_symb = INTEGER;
435 case STEOI: /* end of text on source file */
436 return ptok->tk_symb = EOI;
439 if (!InputLevel) goto garbage;
440 if (ch == TOKSEP) goto again;
441 /* fallthrough shouldn't happen */
443 default: /* this cannot happen */
444 crash("bad class for char 0%o", ch);
452 /* The last character read has been the '*' of '/_*'. The
453 characters, except NL and EOI, between '/_*' and the first
454 occurring '*_/' are not interpreted.
455 NL only affects the LineNumber. EOI is not legal.
457 Important note: it is not possible to stop skipping comment
458 beyond the end-of-file of an included file.
459 EOI is returned by LoadChar only on encountering EOF of the
462 register int c, oldc = '\0';
467 if (! lint_skip_comment) {
468 lint_start_comment();
469 lint_comment_char(c);
474 if (class(c) == STNL) {
476 } else if (c == EOI) {
479 if (! lint_skip_comment) lint_end_comment();
486 if (! lint_skip_comment) lint_comment_char(c);
488 } /* last Character seen was '*' */
490 if ( c != '/' && oldc == '/')
491 lexwarning("comment inside comment ?");
494 if (! lint_skip_comment) lint_comment_char(c);
498 if (! lint_skip_comment) lint_end_comment();
508 register arith val = 0;
514 lexerror("%s constant too short", nm);
518 lexerror("newline in %s constant", nm);
523 ch = quoted(GetChar());
524 if (ch >= 128) ch -= 256;
525 if (size < (int)int_size)
526 val |= ch << 8 * size;
531 lexstrict("%s constant includes more than one character", nm);
532 if (size > (int)int_size)
533 lexerror("%s constant too long", nm);
538 string_token(nm, stop_char, plen)
543 register int str_size;
544 register char *str = Malloc((unsigned) (str_size = ISTRSIZE));
545 register int pos = 0;
548 while (ch != stop_char) {
550 lexerror("newline in %s", nm);
555 lexerror("end-of-file inside %s", nm);
558 if (ch == '\\' && !AccFileSpecifier)
559 ch = quoted(GetChar());
562 str = Realloc(str, (unsigned) (str_size += RSTRSIZE));
565 str[pos++] = '\0'; /* for filenames etc. */
574 /* quoted() replaces an escaped character sequence by the
577 /* first char after backslash already in ch */
578 if (!is_oct(ch)) { /* a quoted char */
595 case 'a': /* alert */
598 case 'v': /* vertical tab */
601 case 'x': /* quoted hex */
603 register int hex = 0;
608 if ((vch = hex_val(ch)) == -1)
610 hex = hex * 16 + vch;
617 else { /* a quoted octal */
618 register int oct = 0, cnt = 0;
621 oct = oct*8 + (ch-'0');
623 } while (is_oct(ch) && ++cnt < 3);
635 return is_dig(ch) ? ch - '0'
636 : is_hex(ch) ? (ch - 'a' + 10) & 017
644 /* The routines GetChar and trigraph parse the trigraph
645 sequences and remove occurrences of \\\n.
655 /* possible trigraph sequence */
659 /* \<newline> is removed from the input stream */
670 return(LexSave = ch);
682 switch (ch) { /* its a trigraph */
718 /* strflt2tok only checks the syntax of the floating-point number and
719 * selects the right type for the number.
721 strflt2tok(fltbuf, ptok)
725 register char *cp = fltbuf;
728 while (is_dig(*cp)) cp++;
731 while (is_dig(*cp)) cp++;
733 if (*cp == 'e' || *cp == 'E') {
735 if (*cp == '+' || *cp == '-')
737 if (!is_dig(*cp)) malformed++;
738 while (is_dig(*cp)) cp++;
740 if (*cp == 'f' || *cp == 'F') {
741 if (*(cp + 1)) malformed++;
743 ptok->tk_fund = FLOAT;
744 } else if (*cp == 'l' || *cp == 'L') {
745 if (*(cp + 1)) malformed++;
747 ptok->tk_fund = LNGDBL;
749 if (*cp) malformed++;
750 ptok->tk_fund = DOUBLE;
753 lexerror("malformed floating constant");
754 ptok->tk_fval = Salloc("0.0", (unsigned) 4);
756 ptok->tk_fval = Salloc(fltbuf, (unsigned) (cp - fltbuf + 1));
760 strint2tok(intbuf, ptok)
764 register char *cp = intbuf;
766 arith val = 0, dig, ubound;
767 int uns_flg = 0, lng_flg = 0, malformed = 0, ovfl = 0;
773 if (*cp == 'x' || *cp == 'X') {
778 /* The upperbound will be the same as when computed with
779 * max_unsigned_arith / base (since base is even). The problem here
780 * is that unsigned arith is not accepted by all compilers.
782 ubound = max_arith / (base / 2);
784 while (is_hex(*cp)) {
787 malformed++; /* ignore */
790 if (val < 0 || val > ubound) ovfl++;
792 if (val < 0 && val + dig >= 0) ovfl++;
799 if (*cp == 'l' || *cp == 'L') lng_flg++;
800 else if (*cp == 'u' || *cp == 'U') uns_flg++;
808 lexerror("malformed %s integer constant",
809 (base == 10 ? "decimal"
810 : (base == 8 ? "octal"
814 lexerror("only one long suffix allowed");
816 lexerror("only one unsigned suffix allowed");
819 lexwarning("overflow in constant");
821 } else if (!lng_flg && (val & full_mask[(int)int_size]) == val) {
822 if (val >= 0 && val <= max_int) {
824 } else if (int_size == long_size) {
826 } else if (base == 10 && !uns_flg)
828 else fund = UNSIGNED;
829 } else if((val & full_mask[(int)long_size]) == val) {
830 if (val >= 0) fund = LONG;
832 } else { /* sizeof(arith) is greater than long_size */
833 ASSERT(arith_size > long_size);
834 lexwarning("constant too large for target machine");
835 /* cut the size to prevent further complaints */
836 val &= full_mask[(int)long_size];
840 /* fund can't be INT */
841 if (fund == UNSIGNED) fund = ULONG;
844 if (fund == INT) fund = UNSIGNED;
845 else if (fund == LONG) fund = ULONG;
847 ptok->tk_fund = fund;