1 # Copyright (C) 2019 Nick Downing <nick@ndcode.org>
2 # SPDX-License-Identifier: GPL-2.0-only
4 # This program is free software; you can redistribute it and/or modify it under
5 # the terms of the GNU General Public License as published by the Free Software
6 # Foundation; version 2.
8 # This program is distributed in the hope that it will be useful, but WITHOUT
9 # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
10 # FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
13 # You should have received a copy of the GNU General Public License along with
14 # this program; if not, write to the Free Software Foundation, Inc., 51
15 # Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
18 Module which provides the base class for πtree Abstract Syntax Trees (ASTs) and
19 provides serialization/deserialization. The code generated by πtree will define
20 specific ASTs for your project, as derived classes of that defined here.
22 Generally you shouldn't check this file in to your project. Instead, at around
23 the same time you run πtree to generate your ``t_def.py`` or similar file, also
24 install this module by using the ``pitree --install-element`` command.
27 import xml.etree.ElementTree
30 '''Class which holds a single node of a πtree AST.'''
33 def __init__(self, text = None, children = None):
36 During serialization, this indicates nodes that have been encountered before,
37 so that DAGs or circular constructs can be serialized and reconstructed later.
38 It contains either ``None`` or ``(element, ref, seen)``.
40 Note that it is not allowed to have multiple use of direct children (direct
41 children are nodes in the ``self.children`` list). That is, a node may have at
42 most one direct parent. Any DAGs or circular constructs must be via fields.
43 Fields are added by creating a derived class and overriding the serialization
44 methods appropriately (or using the πtree generator, which does this for you).
46 self.children = [] if children is None else children
48 Contains the direct child nodes, also of class ``Element`` or a derived class
51 * Often the number of children and their meanings will be known in advance. For
52 example, a ``BinaryExpression`` class might have left and right children,
53 accessed via ``children[0]`` and ``children[1]``.
54 * Sometimes the number of children can be arbitrary. For example, a
55 ``Function`` class might contain an arbitrary number of statements as direct
58 It is expected that the types of the children will be known statically. In the
59 ``BinaryExpression`` example, the children would be of class ``Expression`` or
60 a derived class of ``Expression``. In the ``Function`` example, the children
61 would be of class ``Statement`` or a derived class of ``Statement``. When the
62 children are implicitly a tuple the children can be typed independently of one
63 another. When they are implicitly a list they should ideally have uniform type.
65 If no ``children`` argument is passed to the constructor, it will default to
66 ``None`` and then be internally translated to a freshly constructed empty list
67 ``[]``. This ensures that it is mutable and not shared with other instances.
69 assert isinstance(self.children, list)
71 ['' for i in range(len(self.children) + 1)]
76 Contains strings of text to interpolate between the child nodes. Must have
77 length ``len(self.children) + 1``. So, for example, if there are two child
78 nodes the contents of the node can be conceptualized as ``text[0] children[0]
79 text[1] children[1] text[2]``.
81 * For nodes with children, the text is often not very significant and may be
82 set to all empty strings. For example, a ``BinaryExpression`` node could have
83 ``self.text == ['', '', '']`` and only ``children[0]`` and ``children[1]``
84 significant. On the other hand, it could store the operator as a string, such
85 as ``'+'`` or ``'-'``, in the ``text[1]`` field, if needed. This would print
86 in a natural way. A combination of approaches is also possible, so that text
87 isn't significant, but would be filled in before pretty-printing the tree.
88 * For nodes with no children, often the ``text[0]`` value holds the content of
89 the node. For example, an ``Identifier`` node would usually store its
90 identifier string in ``text[0]``. Again, this would print in a natural way.
92 If no ``text`` argument is passed to the constructor, it will default to
93 ``None`` and then be internally translated to a freshly constructed list of
94 the correct number of empty strings. This ensures that it is the right length
95 and also that it is mutable and not shared with other instances.
97 assert isinstance(self.text, list)
98 assert len(self.text) == len(self.children) + 1
100 def serialize(self, ref_list):
102 Internal routine that supports serialization. It should not be called directly
103 -- use ``element.serialize()`` instead. This method converts ``self`` into an
104 ``xml.etree.ElementTree`` node, populated with the recursive conversion of its
105 children. It will be overridden in a derived class if the class has fields,
106 to populate the attributes of the returned ``xml.etree.ElementTree`` node.
108 element = self.visited[0]
109 if len(self.text[0]):
110 element.text = self.text[0]
111 for i in range(len(self.children)):
112 child = self.children[i]
113 if child.visited is None:
114 child_element = xml.etree.ElementTree.Element(child.tag)
115 child.visited = (child_element, -1, True)
116 child.serialize(ref_list)
118 child_element, child_ref, child_seen = child.visited
119 assert not child_seen
120 child.visited = (child_element, child_ref, True)
121 if len(self.text[i + 1]):
122 child_element.tail = self.text[i + 1]
123 element.append(child_element)
125 def deserialize(self, element, ref_list):
127 Internal routine that supports deserialization. It should not be called
128 directly -- use ``element.deserialize()`` instead. It will be overridden in a
129 derived class if the class has fields, to set the fields from the attributes
130 of the ``xml.etree.ElementTree`` node passed in as the ``element`` argument.
134 def serialize_ref(value, ref_list):
136 Internal routine to serialize a reference and return a value that can be placed
137 in the attribute dictionary of an ``xml.etree.ElementTree`` node. It is meant
138 to be called from the ``serialize()`` method of a derived class of ``Element``,
139 for serializing fields that are references to an AST node or AST subtree.
141 This is a special case, since other kinds of values (``int``, ``str`` etc) can
142 be serialized by the ``json`` module, whereas references must be recursively
143 converted. If the reference has already been serialized its value is returned
144 directly, otherwise it will be added to the list ``ref_list`` and serialized.
145 The value returned is its position in ``ref_list`` (it behaves like serializing
146 an integer from that point on). The ``None`` value is serialized as ``-1``.
151 if value.visited is None:
152 element = xml.etree.ElementTree.Element(value.tag)
154 ref_list.append(value)
155 element.set('ref', str(ref))
156 value.visited = (element, ref, False)
157 value.serialize(ref_list)
159 element, ref, seen = value.visited
162 ref_list.append(value)
163 element.set('ref', str(ref))
164 value.visited = (element, ref, seen)
167 def deserialize_ref(value, ref_list):
169 Internal routine to deserialize a reference and return an object of type
170 ``Element`` or a derived class of ``Element``. It is meant to be called from
171 the ``deserialize()`` method of a derived class of ``Element``, for
172 deserializing fields that are references to an AST node or AST subtree.
174 The reference has already been processed as an integer and hence the ``value``
175 is the position in ``ref_list`` where the referenced object is to be found,
176 or ``-1``. Returns that object from ``ref_list``, or ``None`` for ``-1``.
178 assert value is not None
179 return None if value == -1 else ref_list[value]
181 def serialize(value, fout, encoding = 'unicode'):
183 Front end to the serializer. Pass in an AST that is to be serialized and an
184 output stream to place the serialized output on. The encoding should be passed
185 as either ``'unicode'`` for encoding to standard output or ``'utf-8'`` for
186 encoding to a file descriptor, this is a bit hacky and one should refer to the
187 code of ``xml.etree.ElementTree`` to see what it really does with this value.
189 The output stream will look something like::
192 <AnObject ref="0"><ANestedObject /></AnObject>
193 <AnotherObject ref="1" />
196 The object with the attribute ``ref="0"`` corresponds to the ``value``
197 parameter passed in here. Any direct children, grandchildren etc will be
198 serialized inside it. Objects which are accessed through fields from those
199 objects will be serialized separately and be given higher reference numbers.
200 Note that those secondary objects can also have direct children, which are
201 serialized inside those secondary objects, and so on. The ``<root>...</root>``
202 element then ends up being a collection of all objects with no direct parent.
205 serialize_ref(value, ref_list)
206 todo = [i for i in ref_list if not i.visited[2]]
207 root = xml.etree.ElementTree.Element('root')
208 root[:] = [i.visited[0] for i in todo]
210 for i in range(len(root) - 1):
214 xml.etree.ElementTree.ElementTree(root).write(fout, encoding)
219 todo.extend(node.children)
222 def deserialize(fin, factory = Element, encoding = 'unicode'):
224 Front end to the deserializer. Essentially, reverses the process of
225 ``element.serialize()``. All the same comments apply to this function also.
227 The tricky part with deserializing is knowing what kind of object to construct.
228 For instance if the XML looks like this, ::
231 <AnObject ref="0">some text</AnObject>
234 we want to find a constructor for a derived class of ``Element`` called
235 ``AnObject``. This is the role of the ``factory`` function that you pass in to
236 ``deserialize()``. It takes a tag name such as ``'AnObject'``, followed by the
237 arguments to be passed into ``AnObject``'s constructor. Typically the
238 ``factory`` function will be written like this::
243 def factory(tag, *args, **kwargs):
244 return tag_to_class[tag](*args, **kwargs)
246 It is also possible to have a more complex factory function, for instance if
247 you have defined an AST to use as a mini-language inside another AST, and you
248 want to defer object creation to the mini-language's factory for those objects.
250 root = xml.etree.ElementTree.parse(fin).getroot()
251 assert root.tag == 'root'
252 children = [factory(i.tag) for i in root]
253 todo = list(zip(root, children))
257 element, node = todo[i]
258 ref = element.get('ref')
261 if len(ref_list) < ref + 1:
262 ref_list.extend([None] * (ref + 1 - len(ref_list)))
264 children = [factory(i.tag) for i in element]
265 node.children = children
267 ['' if element.text is None else element.text] +
268 ['' if j.tail is None else j.tail for j in element]
270 todo.extend(zip(element, children))
272 for element, node in todo:
273 node.deserialize(element, ref_list)
278 Convenience function to recursively extract all the text from a subtree.
280 The result is similar to serializing the subtree (itself, its direct children,
281 grandchildren etc) to XML, and then throwing away all XML tags and attributes.
283 For example, if there are two child nodes, the value returned will be::
285 text[0] + to_text(children[0]) + text[1] + to_text(children[1] + text[2]
287 assert len(root.text) == len(root.children) + 1
291 for i in range(len(root.children))
292 for j in [root.text[i], to_text(root[i])]
297 def concatenate(children, factory = Element, *args, **kwargs):
299 Convenience function to concatenate an arbitrary number of nodes into one.
301 The nodes are concatenated into a new empty node constructed by the ``factory``
302 function that you specify. Only the text and children are taken from the nodes
303 being concatenated, the types of the nodes and any data in fields are ignored.
305 The ``factory`` argument is usually a constructor for an ``Element``-derived
306 object, but it can also be any arbitrary function, and any further arguments
307 sent after the ``factory`` argument will be sent into the ``factory`` call.
309 For example, suppose node ``a`` has two children and node ``b`` has one. Then
310 the call ``concatenate([a, b])`` is equivalent to::
313 children = [a.children[0], a.children[1], b.children[0]],
314 text = [a.text[0], a.text[1], a.text[2] + b.text[0], b.text[1]
317 root = factory(*args, **kwargs)
318 for child in children:
319 assert len(root.text) == len(root.children) + 1
320 assert len(child.text) == len(child.children) + 1
321 root.text[-1] += child.text[0]
322 root.children.extend(child.children)
323 root.text.extend(child.text[1:])
324 assert len(root.text) == len(root.children) + 1