--- /dev/null
+# Top-level makefile: builds the ndcode package and its documentation by
+# recursing into the respective subdirectories with $(MAKE).
+all: doc ndcode
+
+# doc depends on ndcode — presumably the docs build imports the built
+# package (Sphinx autodoc); TODO confirm against doc/conf.py.
+doc: ndcode
+	${MAKE} -C $@
+
+ndcode:
+	${MAKE} -C $@
+
+clean:
+	${MAKE} -C ndcode clean
+	${MAKE} -C doc clean
+
+# doc and ndcode are directory names, so they must be declared phony or
+# make would consider them permanently up to date.
+.PHONY: all doc ndcode clean
-# Minimal makefile for Sphinx documentation
-#
+# Simplified Sphinx makefile: call sphinx-build directly rather than via
+# the sphinx-quickstart boilerplate's variables and catch-all target.
+html:
+	sphinx-build -M $@ . _build
-# You can set these variables from the command line, and also
-# from the environment for the first two.
-SPHINXOPTS ?=
-SPHINXBUILD ?= sphinx-build
-SOURCEDIR = .
-BUILDDIR = _build
+# Remove all generated documentation output.
+clean:
+	rm -rf _build
-# Put it first so that "make" without argument is like "make help".
-help:
-	@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
-
-.PHONY: help Makefile
-
-# Catch-all target: route all unknown targets to Sphinx using the new
-# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
-%: Makefile
-	@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
+# html and clean are command names, not files they produce.
+.PHONY: html clean
--- /dev/null
+/*
+ * classic.css_t
+ * ~~~~~~~~~~~~~
+ *
+ * Sphinx stylesheet -- classic theme.
+ *
+ * :copyright: Copyright 2007-2020 by the Sphinx team, see AUTHORS.
+ * :license: BSD, see LICENSE for details.
+ *
+ */
+
+@import url("basic.css");
+
+/* -- page layout ----------------------------------------------------------- */
+
+html {
+ /* CSS hack for macOS's scrollbar (see #1125) */
+ background-color: #FFFFFF;
+}
+
+body {
+ font-family: sans-serif;
+ font-size: 100%;
+ background-color: #11303d;
+ color: #000;
+ margin: 0;
+ padding: 0;
+}
+
+div.document {
+ background-color: #1c4e63;
+}
+
+div.documentwrapper {
+ float: left;
+ width: 100%;
+}
+
+div.bodywrapper {
+ margin: 0 0 0 230px;
+}
+
+div.body {
+ background-color: #ffffff;
+ color: #000000;
+ padding: 0 20px 30px 20px;
+}
+
+div.footer {
+ color: #ffffff;
+ width: 100%;
+ padding: 9px 0 9px 0;
+ text-align: center;
+ font-size: 75%;
+}
+
+div.footer a {
+ color: #ffffff;
+ text-decoration: underline;
+}
+
+div.related {
+ background-color: #133f52;
+ line-height: 30px;
+ color: #ffffff;
+}
+
+div.related a {
+ color: #ffffff;
+}
+
+div.sphinxsidebar {
+}
+
+div.sphinxsidebar h3 {
+ font-family: 'Trebuchet MS', sans-serif;
+ color: #ffffff;
+ font-size: 1.4em;
+ font-weight: normal;
+ margin: 0;
+ padding: 0;
+}
+
+div.sphinxsidebar h3 a {
+ color: #ffffff;
+}
+
+div.sphinxsidebar h4 {
+ font-family: 'Trebuchet MS', sans-serif;
+ color: #ffffff;
+ font-size: 1.3em;
+ font-weight: normal;
+ margin: 5px 0 0 0;
+ padding: 0;
+}
+
+div.sphinxsidebar p {
+ color: #ffffff;
+}
+
+div.sphinxsidebar p.topless {
+ margin: 5px 10px 10px 10px;
+}
+
+div.sphinxsidebar ul {
+ margin: 10px;
+ padding: 0;
+ color: #ffffff;
+}
+
+div.sphinxsidebar a {
+ color: #98dbcc;
+}
+
+div.sphinxsidebar input {
+ border: 1px solid #98dbcc;
+ font-family: sans-serif;
+ font-size: 1em;
+}
+
+
+
+/* -- hyperlink styles ------------------------------------------------------ */
+
+a {
+ color: #355f7c;
+ text-decoration: none;
+}
+
+a:visited {
+ color: #355f7c;
+ text-decoration: none;
+}
+
+a:hover {
+ text-decoration: underline;
+}
+
+
+
+/* -- body styles ----------------------------------------------------------- */
+
+div.body h1,
+div.body h2,
+div.body h3,
+div.body h4,
+div.body h5,
+div.body h6 {
+ font-family: 'Trebuchet MS', sans-serif;
+ background-color: #f2f2f2;
+ font-weight: normal;
+ color: #20435c;
+ border-bottom: 1px solid #ccc;
+ margin: 20px -20px 10px -20px;
+ padding: 3px 0 3px 10px;
+}
+
+div.body h1 { margin-top: 0; font-size: 200%; }
+div.body h2 { font-size: 160%; }
+div.body h3 { font-size: 140%; }
+div.body h4 { font-size: 120%; }
+div.body h5 { font-size: 110%; }
+div.body h6 { font-size: 100%; }
+
+a.headerlink {
+ color: #c60f0f;
+ font-size: 0.8em;
+ padding: 0 4px 0 4px;
+ text-decoration: none;
+}
+
+a.headerlink:hover {
+ background-color: #c60f0f;
+ color: white;
+}
+
+div.body p, div.body dd, div.body li, div.body blockquote {
+ text-align: left;
+ line-height: 130%;
+}
+
+div.admonition p.admonition-title + p {
+ display: inline;
+}
+
+div.admonition p {
+ margin-bottom: 5px;
+}
+
+div.admonition pre {
+ margin-bottom: 5px;
+}
+
+div.admonition ul, div.admonition ol {
+ margin-bottom: 5px;
+}
+
+div.note {
+ background-color: #eee;
+ border: 1px solid #ccc;
+}
+
+div.seealso {
+ background-color: #ffc;
+ border: 1px solid #ff6;
+}
+
+div.topic {
+ background-color: #eee;
+}
+
+div.warning {
+ background-color: #ffe4e4;
+ border: 1px solid #f66;
+}
+
+p.admonition-title {
+ display: inline;
+}
+
+p.admonition-title:after {
+ content: ":";
+}
+
+pre {
+ padding: 5px;
+ background-color: #eeffcc;
+ color: #333333;
+ line-height: 120%;
+ border: 1px solid #ac9;
+ border-left: none;
+ border-right: none;
+}
+
+code {
+ background-color: #ecf0f3;
+ padding: 0 1px 0 1px;
+ font-size: 0.95em;
+}
+
+th, dl.field-list > dt {
+ background-color: #ede;
+}
+
+.warning code {
+ background: #efc2c2;
+}
+
+.note code {
+ background: #d6d6d6;
+}
+
+.viewcode-back {
+ font-family: sans-serif;
+}
+
+div.viewcode-block:target {
+ background-color: #f4debf;
+ border-top: 1px solid #ac9;
+ border-bottom: 1px solid #ac9;
+}
+
+div.code-block-caption {
+ color: #efefef;
+ background-color: #1c4e63;
+}
--- /dev/null
+The ``bisect_set`` module
+=========================
+
+.. automodule:: ndcode.piyacc.bisect_set
+ :members:
--- /dev/null
+The ``bison_lr1dfa`` module
+===========================
+
+.. automodule:: ndcode.piyacc.bison_lr1dfa
+ :members:
--- /dev/null
+The ``cli`` module
+==================
+
+.. automodule:: ndcode.piyacc.cli
+ :members:
+
+Design of the module
+--------------------
+
+Step 1 is fairly straightforward Python code using ``getopt.getopt()``.
+
+Step 2 is more complicated, it uses a πlex/πyacc/πtree generated parser which
+automatically creates an abstract syntax tree representing the input, and then
+it performs several passes over the tree to extract the needed information such
+as lists of terminals and non-terminals, the rules, the alternatives of each
+rule, and the symbols and/or actions and other directives of each alternative.
+How it does this is somewhat beyond the scope of this documentation, as we
+would first have to refer you to tutorials in how to use πlex/πyacc/πtree.
+
+Step 3 is the focus of this document. Internally it uses a Python class library
+that we have developed for dealing with LR1 grammars and automata. See the
+classes ``ndcode.piyacc.lr1.LR1`` and ``ndcode.piyacc.lr1dfa.LR1DFA``. These
+classes are general-purpose and there is nothing to stop you pulling them out
+and using them in your project (subject to the license which is GPLv2). At the
+moment the classes are only used for generating automata to be used later, but
+there are commented functions in each class that can perform parsing directly.
+
+Step 4 is fairly straightforward Python code using ``str.format()``. Basically
+it reads a skeleton file line-by-line looking for special lines similar to::
+
+ # GENERATE SECTION1
+
+and then expands these into a sequence like::
+
+ # GENERATE SECTION1 BEGIN
+ ...
+ # GENERATE SECTION1 END
+
+where the middle part (denoted ``...``) is printed with a ``str.format()``
+format string containing the C or Python code to be inserted into the skeleton.
+
+Often this C or Python code will contain repetitive material (e.g. switch cases
+or action handler functions or similar) and these are sent into the format
+string as a string argument, which is a ``str.join()`` of a list of items,
+each formatted in turn with ``str.format()``. These may also contain repetitive
+material embedded in a similar manner, up to several levels deep. To illustrate
+how this works, here's an example where we are given a list of strings in the
+variable ``lines`` and we generate C code to write them out with ``fwrite()``::
+
+ sys.stdout.write(
+ '''#include <stdio.h>
+
+ int main(void) {{
+ {0:s}}}
+ '''.format(
+ ''.join(
+ [
+ ' fwrite(stdout, 1, {0:d}, "{1:s}");\n'.format(
+ len(line),
+ line
+ )
+ for line in lines
+ ]
+ )
+ )
+ )
--- /dev/null
+The ``element`` module
+======================
+
+.. automodule:: ndcode.piyacc.element
+ :members:
-.. πyacc documentation master file, created by
- sphinx-quickstart on Sat Feb 15 22:14:34 2020.
- You can adapt this file completely to your liking, but it should at least
- contain the root `toctree` directive.
-
-Implementation of the πyacc parser generator
-============================================
+Welcome to πyacc's documentation
+================================
.. toctree::
- :maxdepth: 2
- :caption: Contents:
-
-The parser generator proceeds through the following stages:
-
-1. Processing the command line
-2. Parsing and analyzing the YACC/Bison-compatible input file
-3. Generating a parser definition and corresponding automaton
-4. Writing C or Python code to implement the requested parser
-
-Step 1 is fairly straightforward Python code using ``getopt.getopt()``.
-
-Step 2 is more complicated, it uses a πlex/πyacc/πtree generated parser which
-automatically creates an abstract syntax tree representing the input, and then
-it performs several passes over the tree to extract the needed information such
-as lists of terminals and non-terminals, the rules, the alternatives of each
-rule, and the symbols and/or actions and other directives of each alternative.
-How it does this is somewhat beyond the scope of this documentation, as we
-would first have to refer you to tutorials in how to use πlex/πyacc/πtree.
-
-Step 3 is the focus of this document. Internally it uses a Python class library
-that we have developed for dealing with LR1 grammars and automata. See the
-classes ``ndcode.piyacc.lr1.LR1`` and ``ndcode.piyacc.lr1dfa.LR1DFA``. These
-classes are general-purpose and there is nothing to stop you pulling them out
-and using them in your project (subject to the license which is GPLv2). At the
-moment the classes are only used for generating automata to be used later, but
-there are commented functions in each class that can perform parsing directly.
-
-Step 4 is fairly straightforward Python code using ``str.format()``. Basically
-it reads a skeleton file line-by-line looking for special lines similar to::
-
- # GENERATE SECTION1
-
-and then expands these into a sequence like::
-
- # GENERATE SECTION1 BEGIN
- ...
- # GENERATE SECTION1 END
-
-where the middle part (denoted ``...``) is printed with a ``str.format()``
-format string containing the C or Python code to be inserted into the skeleton.
-
-Often this C or Python code will contain repetitive material (e.g. switch cases
-or action handler functions or similar) and these are sent into the format
-string as an string argument, which is a ``str.join()`` of a list of items,
-each formatted in turn with ``str.format()``. These may also contain repetitive
-material embedded in a similar manner, up to several levels deep. To illustrate
-how this works, here's an example where we are given a list of strings in the
-variable ``lines`` and we generate C code to write them out with ``fwrite()``::
-
- sys.stdout.write(
- '''#include <stdio.h>
-
- int main(void) {{
- {0:s}}}
- '''.format(
- ''.join(
- [
- ' fwrite(stdout, 1, {0:d}, "{1:s}");\n'.format(
- len(line),
- line
- )
- for line in lines
- ]
- )
- )
- )
-
-
-LR1 parser
-==========
-
-The parser is structured as a list of productions, each of which describes a
-consecutive series of symbols that can be "reduced" to a new symbol.
-
-Traditionally an LR1 parser is defined as a collection of head symbols, each
-of which has a collection of alternatives, each a string of symbols, e.g.::
-
- expr <- number | binary_expr | '(' expr ')'
- binary_expr <- unary_expr | binary_expr binary_op unary_expr
- unary_expr <- expr | unary_op unary_expr
- binary_op <- '+' | '-'
- unary_op <- '-'
-
-Here we take a different approach by defining the LR1 parser as a collection
-of productions, each of which reduces to a unique head symbol. Then instead of
-referring to a head symbol which has alternatives, at each reference we list
-out the alternative head symbols explicitly. For example, based on the above,
-we would first have to split each head symbol into several indepedent ones::
-
- expr0 <- num
- expr1 <- bin_expr
- expr2 <- '(' expr ')'
- bin_expr0 <- un_expr
- bin_expr1 <- bin_expr bin_op un_expr
- un_expr0 <- expr
- un_expr1 <- un_op un_expr
- bin_op0 <- '+'
- bin_op1 <- '-'
- un_op0 <- '-'
-
-Then, each symbol would have to be changed into a set of acceptable symbols::
-
- expr0 <- {num}
- expr1 <- {bin_expr0, bin_expr1}
- expr2 <- {'('} {expr0, expr1, expr2} {')'}
- bin_expr0 <- {un_expr0, un_expr1}
- bin_expr1 <- {bin_expr0, bin_expr1} {bin_op0, bin_op1} {un_expr0, un_expr1}
- un_expr0 <- {expr0, expr1, expr2}
- un_expr1 <- {un_op0} {un_expr0, un_expr1}
- bin_op0 <- {'+'}
- bin_op1 <- {'-'}
- un_op0 <- {'-'}
-
-Whilst our representation is equivalent, it is more efficient in several ways:
-
-* It can handle simple character input streams itself without needing a
- lexical analysis front end. With the lexical analysis front-end, the parser
- receives a stream of lexical tokens. A typical ``lex`` definition for how
- to recognize a token ``NUM`` and return it to the parser might look like::
-
- [0-9]+ return NUM;
-
- Whereas, without the front-end, the parser receives a stream of characters,
- these might be ASCII characters in the range ``0..0xff``. An equivalent rule
- to the above could be written in the traditional parser language as follows::
-
- digit <- '0' | '1' | '2' | '3' | '4' | '5' | '6' | '7' | '8' | '9'
- num <- digit | num digit
-
- Or in our parser language (using symbol sets rather than alternatives) by::
-
- num0 <- {'0', '1', '2', '3', '4', '5', '6', '7', '8', '9'}
- num1 <- {num0, num1} {'0', '1', '2', '3', '4', '5', '6', '7', '8', '9'}
-
- We find the second way to be more efficient, since the parser now naturally
- supports character classes, and repurposes the same mechanism to implement
- alternatives. This principle is carried through to the generated automaton,
- so that the ``LR1DFA`` recognizes character classes the same way as a regular
- ``DFA`` automaton (such as would be generated for the ``lex``-based example).
-
-* It is common to see (non-recursive) head symbols where all alternatives have
- length one, and these can be expanded in-place and eliminated, since the
- framework has natural support for sets. Furthermore, after splitting each
- head symbol into separate productions, some productions might be eliminated::
-
- expr1 <- {bin_expr0, bin_expr1}
- expr2 <- {'('} {num, expr1, expr2} {')'}
- bin_expr0 <- {un_expr0, un_expr1}
- bin_expr1 <- {bin_expr0, bin_expr1} {'+', '-'} {un_expr0, un_expr1}
- un_expr0 <- {num, expr1, expr2}
- un_expr1 <- {'-'} {un_expr0, un_expr1}
-
-Symbols are mapped to integers. The usual mapping has the following ranges:
-
-* ``0..N_CHARACTERS - 1`` represent single characters, or possibly if a lexical
- analyzer front-end is being used, shorthand for particular lexical symbols;
-* ``N_CHARACTERS..n_terminals - 1`` represent terminal symbols which are not
- single characters, usually meaning a lexical analyzer front-end is used;
-* ``n_terminals..n_terminals + len(productions) - 1`` represent nonterminal
- symbols, i.e. the head symbols to which the productions can be reduced.
-
-A special symbol called "eof_terminal" is defined, which must be in the range
-``N_CHARACTERS <= eof_terminal < n_terminals``.
-
-If a lexical analyzer front-end is not used, ``eof_terminal`` will be the only
-terminal other than the characters, so assuming 8-bit ASCII, the characters
-will be ``0..0xff``, EOF will be ``0x100``, and nonterminals will be ``0x101``
-and above.
-
-In our scheme, nonterminal symbols do not have alternatives, meaning that each
-production reduces to a unique nonterminal. Hence there is a 1-1 mapping
-between productions and nonterminal symbols, and the symbol number is obtained
-by taking the position of the production in the productions list and adding
-the symbol number of the first nonterminal, which we call ``n_terminals``
-above. Thus it is not necessary to store what symbol a production reduces to.
-
-Since symbols are integers, symbol sets are sets of integers. We represent a
-set of integers by a list of intervals, so that we can efficiently represent
-sets of consecutive characters specified in a manner similar to character
-classes in regexes. For example, the character class ``[0-9A-Za-z]`` contains
-the ASCII intervals ``[0x30, 0x3a), [0x41, 0x5b), [0x61, 0x7b)``. Note how the
-larger end of the interval is non-inclusive. We then just flatten out the
-interval pairs into a list like ``[0x30, 0x3a, 0x41, 0x5b, 0x61, 0x7b]``. In
-this way even positions give the start of included characters, odd positions
-dl the start of excluded characters. Adjacent intervals must be merged, in such
-a way that the list is always strictly increasing. It must always be even in
-length.
-
-Then the internal representation of a grammar is quite simple, as it is just a
-list of productions, each of which is a list of symbol sets, each of which is
-a list of breaks (i.e. starting symbol numbers of included/excluded symbols).
-Production number 0 is always the start production, and it must end with EOF.
-
-We keep symbol sets separated into terminals and non-terminals, so each symbol
-set is actually two lists, not one. This is convenient for computing lookahead
-sets and generating the automaton, since terminals must be treated differently,
-so it would be annoying to have to split the list into two at each step. Note
-that in this format the non-terminals are not offset by ``n_terminals`` so the
-numbers in the non-terminals set are simply the indices into ``productions``.
-
-We allow the user to store a reference data item with each production, and we
-also keep a workspace for computing the production's lookahead sets and whether
-the production can be empty. This is calculated by scanning the production in a
-forward direction, checking for each element of each symbol set the element's
-lookahead set and can-be-empty status. We save the intermediate results of the
-scan, i.e. the lookahead set and nullability of each prefix of each production.
-
-Additionally, in a separate data structure (like the sets above, but as a map
-not a set) we keep associativity and precedence, for resolving shift/reduce
-conflicts. For compatibility, the algorithm is deliberately identical to YACC.
-
-.. automodule:: ndcode.piyacc.lr1
- :members:
+ :maxdepth: 2
+ :caption: Contents:
+
+ bisect_set
+ bison_lr1dfa
+ cli
+ element
+ lr1
Indices and tables
==================
--- /dev/null
+The ``lr1`` module
+==================
+
+.. automodule:: ndcode.piyacc.lr1
+
+Design of the module
+--------------------
+
+The parser is structured as a list of productions, each of which describes a
+consecutive series of symbols that can be "reduced" to a new symbol.
+
+Traditionally an LR1 parser is defined as a collection of head symbols, each
+of which has a collection of alternatives, each a string of symbols, e.g.::
+
+ expr <- number | binary_expr | '(' expr ')'
+ binary_expr <- unary_expr | binary_expr binary_op unary_expr
+ unary_expr <- expr | unary_op unary_expr
+ binary_op <- '+' | '-'
+ unary_op <- '-'
+
+Here we take a different approach by defining the LR1 parser as a collection
+of productions, each of which reduces to a unique head symbol. Then instead of
+referring to a head symbol which has alternatives, at each reference we list
+out the alternative head symbols explicitly. For example, based on the above,
+we would first have to split each head symbol into several independent ones::
+
+ expr0 <- num
+ expr1 <- bin_expr
+ expr2 <- '(' expr ')'
+ bin_expr0 <- un_expr
+ bin_expr1 <- bin_expr bin_op un_expr
+ un_expr0 <- expr
+ un_expr1 <- un_op un_expr
+ bin_op0 <- '+'
+ bin_op1 <- '-'
+ un_op0 <- '-'
+
+Then, each symbol would have to be changed into a set of acceptable symbols::
+
+ expr0 <- {num}
+ expr1 <- {bin_expr0, bin_expr1}
+ expr2 <- {'('} {expr0, expr1, expr2} {')'}
+ bin_expr0 <- {un_expr0, un_expr1}
+ bin_expr1 <- {bin_expr0, bin_expr1} {bin_op0, bin_op1} {un_expr0, un_expr1}
+ un_expr0 <- {expr0, expr1, expr2}
+ un_expr1 <- {un_op0} {un_expr0, un_expr1}
+ bin_op0 <- {'+'}
+ bin_op1 <- {'-'}
+ un_op0 <- {'-'}
+
+Whilst our representation is equivalent, it is more efficient in several ways:
+
+* It can handle simple character input streams itself without needing a
+ lexical analysis front end. With the lexical analysis front-end, the parser
+ receives a stream of lexical tokens. A typical ``lex`` definition for how
+ to recognize a token ``NUM`` and return it to the parser might look like::
+
+ [0-9]+ return NUM;
+
+ Whereas, without the front-end, the parser receives a stream of characters,
+ these might be ASCII characters in the range ``0..0xff``. An equivalent rule
+ to the above could be written in the traditional parser language as follows::
+
+ digit <- '0' | '1' | '2' | '3' | '4' | '5' | '6' | '7' | '8' | '9'
+ num <- digit | num digit
+
+ Or in our parser language (using symbol sets rather than alternatives) by::
+
+ num0 <- {'0', '1', '2', '3', '4', '5', '6', '7', '8', '9'}
+ num1 <- {num0, num1} {'0', '1', '2', '3', '4', '5', '6', '7', '8', '9'}
+
+ We find the second way to be more efficient, since the parser now naturally
+ supports character classes, and repurposes the same mechanism to implement
+ alternatives. This principle is carried through to the generated automaton,
+ so that the ``LR1DFA`` recognizes character classes the same way as a regular
+ ``DFA`` automaton (such as would be generated for the ``lex``-based example).
+
+* It is common to see (non-recursive) head symbols where all alternatives have
+ length one, and these can be expanded in-place and eliminated, since the
+ framework has natural support for sets. Furthermore, after splitting each
+ head symbol into separate productions, some productions might be eliminated::
+
+ expr1 <- {bin_expr0, bin_expr1}
+ expr2 <- {'('} {num, expr1, expr2} {')'}
+ bin_expr0 <- {un_expr0, un_expr1}
+ bin_expr1 <- {bin_expr0, bin_expr1} {'+', '-'} {un_expr0, un_expr1}
+ un_expr0 <- {num, expr1, expr2}
+ un_expr1 <- {'-'} {un_expr0, un_expr1}
+
+Symbols are mapped to integers. The usual mapping has the following ranges:
+
+* ``0..N_CHARACTERS - 1`` represent single characters, or possibly if a lexical
+ analyzer front-end is being used, shorthand for particular lexical symbols;
+* ``N_CHARACTERS..n_terminals - 1`` represent terminal symbols which are not
+ single characters, usually meaning a lexical analyzer front-end is used;
+* ``n_terminals..n_terminals + len(productions) - 1`` represent nonterminal
+ symbols, i.e. the head symbols to which the productions can be reduced.
+
+A special symbol called "eof_terminal" is defined, which must be in the range
+``N_CHARACTERS <= eof_terminal < n_terminals``.
+
+If a lexical analyzer front-end is not used, ``eof_terminal`` will be the only
+terminal other than the characters, so assuming 8-bit ASCII, the characters
+will be ``0..0xff``, EOF will be ``0x100``, and nonterminals will be ``0x101``
+and above.
+
+In our scheme, nonterminal symbols do not have alternatives, meaning that each
+production reduces to a unique nonterminal. Hence there is a 1-1 mapping
+between productions and nonterminal symbols, and the symbol number is obtained
+by taking the position of the production in the productions list and adding
+the symbol number of the first nonterminal, which we call ``n_terminals``
+above. Thus it is not necessary to store what symbol a production reduces to.
+
+Since symbols are integers, symbol sets are sets of integers. We represent a
+set of integers using a list of breaks -- see the ``bisect_set`` module.
+
+Then the internal representation of a grammar is quite simple, as it is just a
+list of productions, each of which is a list of symbol sets, each of which is
+a list of breaks (i.e. starting symbol numbers of included/excluded symbols).
+Production number 0 is always the start production, and it must end with EOF.
+
+We keep symbol sets separated into terminals and non-terminals, so each symbol
+set is actually two lists, not one. This is convenient for computing lookahead
+sets and generating the automaton, since terminals must be treated differently,
+so it would be annoying to have to split the list into two at each step. Note
+that in this format the non-terminals are not offset by ``n_terminals``, so the
+numbers in the non-terminals set are simply the indices into ``productions``.
+
+We allow the user to store a reference data item with each production, and we
+also keep a workspace for computing the production's lookahead sets and whether
+the production can be empty. This is calculated by scanning the production in a
+forward direction, checking for each element of each symbol set the element's
+lookahead set and can-be-empty status. We save the intermediate results of the
+scan, i.e. the lookahead set and nullability of each prefix of each production.
+
+Additionally, in a separate data structure we keep a map from nonterminals to
+associativity and precedence, for resolving shift/reduce conflicts. For
+compatibility, the algorithm is deliberately identical to YACC. The map is kept
+in a similar way to the sets discussed earlier -- see the ``bisect_set`` module
+for how we keep sets of integers. For maps from integers to arbitrary objects
+we keep two lists: a list of ``breaks``, and a list of ``objects``. Then symbol
+``symbol`` maps to ``objects[i]`` for the smallest ``i`` such that ``symbol <
+breaks[i]``, or if none of these holds, it maps to ``objects[len(breaks)]``.
--- /dev/null
+# Copyright (C) 2019 Nick Downing <nick@ndcode.org>
+# SPDX-License-Identifier: GPL-2.0-only
+#
+# This program is free software; you can redistribute it and/or modify it under
+# the terms of the GNU General Public License as published by the Free Software
+# Foundation; version 2.
+#
+# This program is distributed in the hope that it will be useful, but WITHOUT
+# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+# FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
+# details.
+#
+# You should have received a copy of the GNU General Public License along with
+# this program; if not, write to the Free Software Foundation, Inc., 51
+# Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
+
+'''
+Module which provides the base class for πtree Abstract Syntax Trees (ASTs) and
+provides serialization/deserialization. The code generated by πtree will define
+specific ASTs for your project, as derived classes of that defined here.
+
+Generally you shouldn't check this file in to your project. Instead, at around
+the same time you run πtree to generate your ``t_def.py`` or similar file, also
+install this module by using the ``pitree --install-element`` command.
+'''
+
+import xml.etree.ElementTree
+
+class Element:
+ '''Class which holds a single node of a πtree AST.'''
+
+ tag = 'Element'
+ def __init__(self, text = None, children = None):
+ self.visited = None
+ '''
+During serialization, this indicates nodes that have been encountered before,
+so that DAGs or circular constructs can be serialized and reconstructed later.
+It contains either ``None`` or ``(element, ref, seen)``.
+
+Note that it is not allowed to have multiple use of direct children (direct
+children are nodes in the ``self.children`` list). That is, a node may have at
+most one direct parent. Any DAGs or circular constructs must be via fields.
+Fields are added by creating a derived class and overriding the serialization
+methods appropriately (or using the πtree generator, which does this for you).
+'''
+ self.children = [] if children is None else children
+ '''
+Contains the direct child nodes, also of class ``Element`` or a derived class
+of ``Element``.
+
+* Often the number of children and their meanings will be known in advance. For
+ example, a ``BinaryExpression`` class might have left and right children,
+ accessed via ``children[0]`` and ``children[1]``.
+* Sometimes the number of children can be arbitrary. For example, a
+ ``Function`` class might contain an arbitrary number of statements as direct
+ children.
+
+It is expected that the types of the children will be known statically. In the
+``BinaryExpression`` example, the children would be of class ``Expression`` or
+a derived class of ``Expression``. In the ``Function`` example, the children
+would be of class ``Statement`` or a derived class of ``Statement``. When the
+children are implicitly a tuple the children can be typed independently of one
+another. When they are implicitly a list they should ideally have uniform type.
+
+If no ``children`` argument is passed to the constructor, it will default to
+``None`` and then be internally translated to a freshly constructed empty list
+``[]``. This ensures that it is mutable and not shared with other instances.
+'''
+ assert isinstance(self.children, list)
+ self.text = (
+ ['' for i in range(len(self.children) + 1)]
+ if text is None else
+ text
+ )
+ '''
+Contains strings of text to interpolate between the child nodes. Must have
+length ``len(self.children) + 1``. So, for example, if there are two child
+nodes the contents of the node can be conceptualized as ``text[0] children[0]
+text[1] children[1] text[2]``.
+
+* For nodes with children, the text is often not very significant and may be
+ set to all empty strings. For example, a ``BinaryExpression`` node could have
+ ``self.text == ['', '', '']`` and only ``children[0]`` and ``children[1]``
+ significant. On the other hand, it could store the operator as a string, such
+ as ``'+'`` or ``'-'``, in the ``text[1]`` field, if needed. This would print
+ in a natural way. A combination of approaches is also possible, so that text
+ isn't significant, but would be filled in before pretty-printing the tree.
+* For nodes with no children, often the ``text[0]`` value holds the content of
+ the node. For example, an ``Identifier`` node would usually store its
+ identifier string in ``text[0]``. Again, this would print in a natural way.
+
+If no ``text`` argument is passed to the constructor, it will default to
+``None`` and then be internally translated to a freshly constructed list of
+the correct number of empty strings. This ensures that it is the right length
+and also that it is mutable and not shared with other instances.
+'''
+ assert isinstance(self.text, list)
+ assert len(self.text) == len(self.children) + 1
+
+ def serialize(self, ref_list):
+ '''
+Internal routine that supports serialization. It should not be called directly
+-- use ``element.serialize()`` instead. This method converts ``self`` into an
+``xml.etree.ElementTree`` node, populated with the recursive conversion of its
+children. It will be overridden in a derived class if the class has fields,
+to populate the attributes of the returned ``xml.etree.ElementTree`` node.
+'''
+ element = self.visited[0]
+ if len(self.text[0]):
+ element.text = self.text[0]
+ for i in range(len(self.children)):
+ child = self.children[i]
+ if child.visited is None:
+ child_element = xml.etree.ElementTree.Element(child.tag)
+ child.visited = (child_element, -1, True)
+ child.serialize(ref_list)
+ else:
+ child_element, child_ref, child_seen = child.visited
+ assert not child_seen
+ child.visited = (child_element, child_ref, True)
+ if len(self.text[i + 1]):
+ child_element.tail = self.text[i + 1]
+ element.append(child_element)
+ return element
+ def deserialize(self, element, ref_list):
+ '''
+Internal routine that supports deserialization. It should not be called
+directly -- use ``element.deserialize()`` instead. It will be overridden in a
+derived class if the class has fields, to set the fields from the attributes
+of the ``xml.etree.ElementTree`` node passed in as the ``element`` argument.
+'''
+ pass
+
+def serialize_ref(value, ref_list):
+ '''
+Internal routine to serialize a reference and return a value that can be placed
+in the attribute dictionary of an ``xml.etree.ElementTree`` node. It is meant
+to be called from the ``serialize()`` method of a derived class of ``Element``,
+for serializing fields that are references to an AST node or AST subtree.
+
+This is a special case, since other kinds of values (``int``, ``str`` etc) can
+be serialized by the ``json`` module, whereas references must be recursively
+converted. If the reference has already been serialized its value is returned
+directly, otherwise it will be added to the list ``ref_list`` and serialized.
+The value returned is its position in ``ref_list`` (it behaves like serializing
+an integer from that point on). The ``None`` value is serialized as ``-1``.
+'''
+ if value is None:
+ ref = -1
+ else:
+ if value.visited is None:
+ element = xml.etree.ElementTree.Element(value.tag)
+ ref = len(ref_list)
+ ref_list.append(value)
+ element.set('ref', str(ref))
+ value.visited = (element, ref, False)
+ value.serialize(ref_list)
+ else:
+ element, ref, seen = value.visited
+ if ref == -1:
+ ref = len(ref_list)
+ ref_list.append(value)
+ element.set('ref', str(ref))
+ value.visited = (element, ref, seen)
+ return ref
+
+def deserialize_ref(value, ref_list):
+ '''
+Internal routine to deserialize a reference and return an object of type
+``Element`` or a derived class of ``Element``. It is meant to be called from
+the ``deserialize()`` method of a derived class of ``Element``, for
+deserializing fields that are references to an AST node or AST subtree.
+
+The reference has already been processed as an integer and hence the ``value``
+is the position in ``ref_list`` where the referenced object is to be found,
+or ``-1``. Returns that object from ``ref_list``, or ``None`` for ``-1``.
+'''
+ assert value is not None
+ return None if value == -1 else ref_list[value]
+
+def serialize(value, fout, encoding = 'unicode'):
+ '''
+Front end to the serializer. Pass in an AST that is to be serialized and an
+output stream to place the serialized output on. The encoding should be passed
+as either ``'unicode'`` for encoding to standard output or ``'utf-8'`` for
+encoding to a file descriptor, this is a bit hacky and one should refer to the
+code of ``xml.etree.ElementTree`` to see what it really does with this value.
+
+The output stream will look something like::
+
+ <root>
+ <AnObject ref="0"><ANestedObject /></AnObject>
+ <AnotherObject ref="1" />
+ </root>
+
+The object with the attribute ``ref="0"`` corresponds to the ``value``
+parameter passed in here. Any direct children, grandchildren etc will be
+serialized inside it. Objects which are accessed through fields from those
+objects will be serialized separately and be given higher reference numbers.
+Note that those secondary objects can also have direct children, which are
+serialized inside those secondary objects, and so on. The ``<root>...</root>``
+element then ends up being a collection of all objects with no direct parent.
+'''
+ ref_list = []
+ serialize_ref(value, ref_list)
+ todo = [i for i in ref_list if not i.visited[2]]
+ root = xml.etree.ElementTree.Element('root')
+ root[:] = [i.visited[0] for i in todo]
+ root.text = '\n '
+ for i in range(len(root) - 1):
+ root[i].tail = '\n '
+ root[-1].tail = '\n'
+ root.tail = '\n'
+ xml.etree.ElementTree.ElementTree(root).write(fout, encoding)
+ i = 0
+ while i < len(todo):
+ node = todo[i]
+ node.visited = None
+ todo.extend(node.children)
+ i += 1
+
+def deserialize(fin, factory = Element, encoding = 'unicode'):
+ '''
+Front end to the deserializer. Essentially, reverses the process of the
+module-level ``serialize()`` function. All the same comments apply here also.
+
+The tricky part with deserializing is knowing what kind of object to construct.
+For instance if the XML looks like this, ::
+
+ <root>
+ <AnObject ref="0">some text</AnObject>
+ </root>
+
+we want to find a constructor for a derived class of ``Element`` called
+``AnObject``. This is the role of the ``factory`` function that you pass in to
+``deserialize()``. It takes a tag name such as ``'AnObject'``, followed by the
+arguments to be passed into ``AnObject``'s constructor. Typically the
+``factory`` function will be written like this::
+
+ tag_to_class = {
+ 'AnObject': AnObject
+ }
+ def factory(tag, *args, **kwargs):
+ return tag_to_class[tag](*args, **kwargs)
+
+It is also possible to have a more complex factory function, for instance if
+you have defined an AST to use as a mini-language inside another AST, and you
+want to defer object creation to the mini-language's factory for those objects.
+'''
+ root = xml.etree.ElementTree.parse(fin).getroot()
+ assert root.tag == 'root'
+ children = [factory(i.tag) for i in root]
+ todo = list(zip(root, children))
+ i = 0
+ ref_list = []
+ while i < len(todo):
+ element, node = todo[i]
+ ref = element.get('ref')
+ if ref is not None:
+ ref = int(ref)
+ if len(ref_list) < ref + 1:
+ ref_list.extend([None] * (ref + 1 - len(ref_list)))
+ ref_list[ref] = node
+ children = [factory(i.tag) for i in element]
+ node.children = children
+ node.text = (
+ ['' if element.text is None else element.text] +
+ ['' if j.tail is None else j.tail for j in element]
+ )
+ todo.extend(zip(element, children))
+ i += 1
+ for element, node in todo:
+ node.deserialize(element, ref_list)
+ return ref_list[0]
+
+def to_text(root):
+ '''
+Convenience function to recursively extract all the text from a subtree.
+
+The result is similar to serializing the subtree (itself, its direct children,
+grandchildren etc) to XML, and then throwing away all XML tags and attributes.
+
+For example, if there are two child nodes, the value returned will be::
+
+ text[0] + to_text(children[0]) + text[1] + to_text(children[1]) + text[2]
+'''
+ assert len(root.text) == len(root.children) + 1
+ return ''.join(
+ [
+ j
+ for i in range(len(root.children))
+ for j in [root.text[i], to_text(root[i])]
+ ] +
+ [root.text[-1]]
+ )
+
+def concatenate(children, factory = Element, *args, **kwargs):
+ '''
+Convenience function to concatenate an arbitrary number of nodes into one.
+
+The nodes are concatenated into a new empty node constructed by the ``factory``
+function that you specify. Only the text and children are taken from the nodes
+being concatenated, the types of the nodes and any data in fields are ignored.
+
+The ``factory`` argument is usually a constructor for an ``Element``-derived
+object, but it can also be any arbitrary function, and any further arguments
+sent after the ``factory`` argument will be sent into the ``factory`` call.
+
+For example, suppose node ``a`` has two children and node ``b`` has one. Then
+the call ``concatenate([a, b])`` is equivalent to::
+
+ Element(
+ children = [a.children[0], a.children[1], b.children[0]],
+ text = [a.text[0], a.text[1], a.text[2] + b.text[0], b.text[1]]
+ )
+'''
+ root = factory(*args, **kwargs)
+ for child in children:
+ assert len(root.text) == len(root.children) + 1
+ assert len(child.text) == len(child.children) + 1
+ root.text[-1] += child.text[0]
+ root.children.extend(child.children)
+ root.text.extend(child.text[1:])
+ assert len(root.text) == len(root.children) + 1
+ return root
--- /dev/null
+piyacc:
+ ${MAKE} -C $@
+
+clean:
+ ${MAKE} -C piyacc clean
+
+.PHONY: piyacc clean
all: element.py lex_yy.py lex_yy_code.py t_def.py y_tab.py
-element.py: ../../bootstrap_pitree/skel/element.py
+element.py: ../../element.py
cat $< >$@
lex_yy.py: scan-gram.l ../../skel_lex_yy.py
clean:
rm -f element.py lex_yy.py lex_yy_code.py t_def.py y_tab.py
+
+.PHONY: all clean
# this program; if not, write to the Free Software Foundation, Inc., 51
# Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
-# defines the alphabet size, set this to 0x11000 for unicode
-n_characters = 0x100
+'''
+Module for dealing with sets of integers, or in particular, sets of characters
+from potentially large character sets such as Unicode. We can store and
+operate on these efficiently under the assumption that sets will contain mostly
+ranges of consecutive character codes.
+
+We encode character classes such as ``[0-9A-Z]`` as lists of breakpoints e.g.
+``[0x30, 0x3a, 0x41, 0x5b]``, which says that "included" characters begin at
+``0x30`` and ``0x41`` (the even breakpoints) whereas "excluded" characters
+begin at ``0x3a`` and ``0x5b`` (the odd breakpoints).
+'''
+
+N_CHARACTERS = 0x100
+'''Defines the alphabet size, set this to ``0x11000`` for Unicode.'''
def bisect_set_or(character_set0, character_set1):
- # calculate union of the child sets
- # we do this by calculating a series of breakpoints, at each breakpoint
- # evaluating the "or" (max) of the even/odd truth values of each child,
- # then making the output truth value even/odd by outputting if necessary
+ '''
+Calculate union of the operand sets.
+
+We do this by calculating a series of breakpoints, at each breakpoint
+evaluating the "or" (max) of the even/odd truth values of each operand,
+then making the output truth value even/odd by outputting if necessary.
+'''
+
result = []
i = 0
j = 0
return result
def bisect_set_and(character_set0, character_set1):
- # calculate intersection of the child sets
- # we do this by calculating a series of breakpoints, at each breakpoint
- # evaluating the "and" (min) of the even/odd truth values of each child,
- # then making the output truth value even/odd by outputting if necessary
+ '''
+Calculate intersection of the operand sets.
+
+We do this by calculating a series of breakpoints, at each breakpoint
+evaluating the "and" (min) of the even/odd truth values of each operand,
+then making the output truth value even/odd by outputting if necessary.
+'''
+
result = []
i = 0
j = 0
return result
def bisect_set_not(character_set):
- # calculate complement of the child set
- # if child set begins with [0], remove it, otherwise add [0] prefix
- # if child set ends with [n_characters], remove it, otherwise add [n_characters] suffix
- # the suffix part is not totally necessary, but makes sure length is even
- # (the evenness is so that single character sets can always be [c, c + 1])
+ '''
+Calculate complement of the operand set.
+
+* If operand set begins with ``[0]``, remove it, otherwise add ``[0]`` prefix.
+* If operand set ends with ``[N_CHARACTERS]``, remove it, otherwise add
+ ``[N_CHARACTERS]`` suffix.
+
+The suffix part is not totally necessary, but makes sure length is even
+(the evenness is so that single character sets can always be ``[c, c + 1]``).
+'''
+
result = list(character_set)
if result[:1] == [0]:
del result[:1]
else:
result[:0] = [0]
- if result[-1:] == [n_characters]:
+ if result[-1:] == [N_CHARACTERS]:
del result[-1:]
else:
- result.append(n_characters)
+ result.append(N_CHARACTERS)
return result
# this program; if not, write to the Free Software Foundation, Inc., 51
# Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
+'''
+Module for constructing Bison-compatible automata. That is, automata that can
+be executed using Bison's default skeleton.
+'''
+
class BisonLR1DFA:
+ '''
+Class which holds a Bison-compatible automaton.
+
+Normally all of the fields of the class are computed ahead of time by
+``LR1DFA.to_bison_lr1dfa()`` and then passed into the constructor here.
+
+At the moment, this class doesn't have any methods, it simply holds the data
+which will be needed to populate a Bison skeleton. In the future we might add
+methods for executing the automaton directly on a stream of lexical tokens.
+'''
+
def __init__(
self,
translate,
# this program; if not, write to the Free Software Foundation, Inc., 51
# Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
+'''
+Main module for the ``piyacc`` command-line utility.
+
+Contains the ``main()`` function as required by ``setuptools``, so that when
+the package is installed, ``setuptools`` can put a wrapper in ``/usr/bin`` or
+similar place. Can also be executed directly and will call ``main()`` itself.
+'''
+
import getopt
import os
import sys
EXIT_FAILURE = 1
def main():
+ '''
+Runs the following steps:
+
+1. Processing the command line
+2. Parsing and analyzing the YACC/Bison-compatible input file
+3. Generating a parser definition and corresponding automaton
+4. Writing C or Python code to implement the requested parser
+'''
+
home_dir = os.path.dirname(__file__)
try:
opts, args = getopt.getopt(
#from ndcode.piyacc import work
# defines the alphabet size, set this to 0x11000 for unicode
-n_characters = 0x100
+N_CHARACTERS = 0x100
class LR1DFA:
def __init__(
self,
states = [],
productions = [],
- n_terminals = n_characters + 1,
- eof_terminal = n_characters
+ n_terminals = N_CHARACTERS + 1,
+ eof_terminal = N_CHARACTERS
):
# states: list of state_desc
# state_desc: (terminal br, actions, nonterminal br, gotos, default reduce)