Fix several bugs to get Python scanner/parser basically working, processes ../tests...
[pilex.git] / regex.py
1 import bisect_set
2 import element
3 import nfa
4
# defines the alphabet size (number of distinct character codes); set this
# to 0x110000 for unicode, whose code points run from 0 to 0x10ffff
n_characters = 0x100
7
# Base class for the nodes of a regular expression tree.  On top of
# element.Element it carries n_groups, which post_process() recomputes as the
# sum of the children's n_groups (the constructor default is -1).
class Regex(element.Element):
  # GENERATE ELEMENT(int n_groups) BEGIN
  def __init__(
    self,
    tag = 'Regex',
    attrib = {},
    text = '',
    children = [],
    n_groups = -1
  ):
    element.Element.__init__(self, tag, attrib, text, children)
    # a string argument is decoded with element.deserialize_int(); any other
    # value is stored exactly as given
    if isinstance(n_groups, str):
      n_groups = element.deserialize_int(n_groups)
    self.n_groups = n_groups
  # store n_groups in the 'n_groups' attribute after the base class has
  # serialized its own state
  def serialize(self, ref_list):
    element.Element.serialize(self, ref_list)
    encoded = element.serialize_int(self.n_groups)
    self.set('n_groups', encoded)
  # reload n_groups from the 'n_groups' attribute ('-1' when it is absent)
  def deserialize(self, ref_list):
    element.Element.deserialize(self, ref_list)
    encoded = self.get('n_groups', '-1')
    self.n_groups = element.deserialize_int(encoded)
  # copy via element.Element.copy(), passing Regex as the factory unless the
  # caller supplies one, then carry n_groups over to the result
  def copy(self, factory = None):
    if factory is None:
      factory = Regex
    result = element.Element.copy(self, factory)
    result.n_groups = self.n_groups
    return result
  # append 'n_groups = ...' to params unless it still has the default of -1
  def repr_serialize(self, params):
    element.Element.repr_serialize(self, params)
    if self.n_groups == -1:
      return
    params.append('n_groups = {0:s}'.format(repr(self.n_groups)))
  def __repr__(self):
    items = []
    self.repr_serialize(items)
    joined = ', '.join(items)
    return 'regex.Regex({0:s})'.format(joined)
  # GENERATE END
  # post-process every child with the same groups/caseless arguments and set
  # n_groups to the total of the children's n_groups
  def post_process(self, groups, caseless = False):
    self.n_groups = 0
    for child in self:
      child.post_process(groups, caseless)
      self.n_groups += child.n_groups
  # Overridden by the concrete node classes, whose versions return a state
  # index into _nfa.states or -1; the base version always raises.
  def to_nfa_state(self, _nfa, group_ref_data, group_index, next_state):
    raise NotImplementedError
  # build this expression's states with group_index 0 and next_state 0, and
  # append the returned state to _nfa.start_state
  def add_to_nfa(self, _nfa, group_ref_data):
    entry_state = self.to_nfa_state(_nfa, group_ref_data, 0, 0)
    _nfa.start_state.append(entry_state)
63
# Regex node whose to_nfa_state() always returns -1 and adds no states.  The
# other node classes in this file test their children's results against -1.
class RegexNone(Regex):
  # GENERATE ELEMENT() BEGIN
  def __init__(
    self,
    tag = 'RegexNone',
    attrib = {},
    text = '',
    children = [],
    n_groups = -1
  ):
    Regex.__init__(self, tag, attrib, text, children, n_groups)
  # copy via Regex.copy(), defaulting the factory to RegexNone
  def copy(self, factory = None):
    if factory is None:
      factory = RegexNone
    return Regex.copy(self, factory)
  def __repr__(self):
    items = []
    self.repr_serialize(items)
    joined = ', '.join(items)
    return 'regex.RegexNone({0:s})'.format(joined)
  # GENERATE END
  # no state is created; all arguments are ignored
  def to_nfa_state(self, _nfa, group_ref_data, group_index, next_state):
    return -1
95
# Regex node that contributes no NFA states of its own: to_nfa_state() hands
# back the next_state it was given.
class RegexEmpty(Regex):
  # GENERATE ELEMENT() BEGIN
  def __init__(
    self,
    tag = 'RegexEmpty',
    attrib = {},
    text = '',
    children = [],
    n_groups = -1
  ):
    Regex.__init__(self, tag, attrib, text, children, n_groups)
  # copy via Regex.copy(), defaulting the factory to RegexEmpty
  def copy(self, factory = None):
    if factory is None:
      factory = RegexEmpty
    return Regex.copy(self, factory)
  def __repr__(self):
    items = []
    self.repr_serialize(items)
    joined = ', '.join(items)
    return 'regex.RegexEmpty({0:s})'.format(joined)
  # GENERATE END
  # nothing is added to _nfa; the continuation state is returned unchanged
  def to_nfa_state(self, _nfa, group_ref_data, group_index, next_state):
    return next_state
127
# Regex node matching a single character taken from character_set.
# character_set is a flat list of integers that this class passes to the
# bisect_set helpers and stores in the NFA state.  (Presumably an ascending
# list of [start, end) boundary pairs, judging by literals such as
# [0x41, 0x5b, 0x61, 0x7b] -- confirm against bisect_set.)
class RegexCharacter(Regex):
  # GENERATE ELEMENT(list(int) character_set) BEGIN
  def __init__(
    self,
    tag = 'RegexCharacter',
    attrib = {},
    text = '',
    children = [],
    n_groups = -1,
    character_set = []
  ):
    Regex.__init__(
      self,
      tag,
      attrib,
      text,
      children,
      n_groups
    )
    if isinstance(character_set, str):
      # string form: whitespace separated integers
      self.character_set = [
        element.deserialize_int(i)
        for i in character_set.split()
      ]
    else:
      # take a private copy, so that instances never share the mutable
      # default list (or the caller's list) with one another
      self.character_set = list(character_set)
  # store character_set in the 'character_set' attribute as space separated
  # serialized integers
  def serialize(self, ref_list):
    Regex.serialize(self, ref_list)
    self.set(
      'character_set',
      ' '.join([element.serialize_int(i) for i in self.character_set])
    )
  # reload character_set from the 'character_set' attribute (empty list when
  # the attribute is absent)
  def deserialize(self, ref_list):
    Regex.deserialize(self, ref_list)
    self.character_set = [
      element.deserialize_int(i)
      for i in self.get('character_set', '').split()
    ]
  # copy via Regex.copy(), defaulting the factory to RegexCharacter
  def copy(self, factory = None):
    result = Regex.copy(
      self,
      RegexCharacter if factory is None else factory
    )
    # duplicate the list so that the copy and the original do not alias it
    result.character_set = list(self.character_set)
    return result
  # append 'character_set = [...]' to params unless the set is empty
  def repr_serialize(self, params):
    Regex.repr_serialize(self, params)
    if self.character_set:
      params.append(
        'character_set = [{0:s}]'.format(
          ', '.join([repr(i) for i in self.character_set])
        )
      )
  def __repr__(self):
    params = []
    self.repr_serialize(params)
    return 'regex.RegexCharacter({0:s})'.format(', '.join(params))
  # GENERATE END
  # Post-process the children, then, if caseless, extend character_set with
  # the opposite-case counterpart of every ASCII letter it contains.
  def post_process(self, groups, caseless = False):
    Regex.post_process(self, groups, caseless)
    if caseless:
      # restrict to 'A'..'Z' (0x41..0x5a) and 'a'..'z' (0x61..0x7a)
      temp = bisect_set.bisect_set_and(
        self.character_set,
        [0x41, 0x5b, 0x61, 0x7b]
      )
      # XOR with 0x20 flips ASCII case.  Values >= 0x60 move down and values
      # < 0x60 move up, so listing the former first keeps the flipped values
      # ascending (assuming temp itself is ascending).
      self.character_set = bisect_set.bisect_set_or(
        self.character_set,
        [i ^ 0x20 for i in temp if i >= 0x60] +
        [i ^ 0x20 for i in temp if i < 0x60]
      )
  # append a STATE_CHARACTER state that refers to character_set and continues
  # at next_state; return the index of the new state
  def to_nfa_state(self, _nfa, group_ref_data, group_index, next_state):
    new_state = len(_nfa.states)
    _nfa.states.append(
      (nfa.NFA.STATE_CHARACTER, self.character_set, next_state)
    )
    return new_state
202
# Character node whose character_set is given directly.  Its post_process()
# passes caseless = False to the base class and applies case folding to its
# own character_set itself.
class RegexCharacterLiteral(RegexCharacter):
  # GENERATE ELEMENT() BEGIN
  def __init__(
    self,
    tag = 'RegexCharacterLiteral',
    attrib = {},
    text = '',
    children = [],
    n_groups = -1,
    character_set = []
  ):
    RegexCharacter.__init__(
      self, tag, attrib, text, children, n_groups, character_set
    )
  # copy via RegexCharacter.copy(), defaulting the factory to this class
  def copy(self, factory = None):
    if factory is None:
      factory = RegexCharacterLiteral
    return RegexCharacter.copy(self, factory)
  def __repr__(self):
    items = []
    self.repr_serialize(items)
    joined = ', '.join(items)
    return 'regex.RegexCharacterLiteral({0:s})'.format(joined)
  # GENERATE END
  def post_process(self, groups, caseless = False):
    RegexCharacter.post_process(self, groups, False)
    if not caseless:
      return
    # the ASCII letters present in the set ('A'..'Z' and 'a'..'z')
    letters = bisect_set.bisect_set_and(
      self.character_set,
      [0x41, 0x5b, 0x61, 0x7b]
    )
    # XOR with 0x20 flips ASCII case; values >= 0x60 are listed first, as
    # they map to the lower numbers
    flipped_down = [i ^ 0x20 for i in letters if i >= 0x60]
    flipped_up = [i ^ 0x20 for i in letters if i < 0x60]
    self.character_set = bisect_set.bisect_set_or(
      self.character_set,
      flipped_down + flipped_up
    )
246
# Character node built from two children: post_process() sets character_set
# to the first entry of child 0's set followed by the last entry of child 1's
# set, then optionally applies ASCII case folding.
class RegexCharacterRange(RegexCharacter):
  # GENERATE ELEMENT() BEGIN
  def __init__(
    self,
    tag = 'RegexCharacterRange',
    attrib = {},
    text = '',
    children = [],
    n_groups = -1,
    character_set = []
  ):
    RegexCharacter.__init__(
      self, tag, attrib, text, children, n_groups, character_set
    )
  # copy via RegexCharacter.copy(), defaulting the factory to this class
  def copy(self, factory = None):
    if factory is None:
      factory = RegexCharacterRange
    return RegexCharacter.copy(self, factory)
  def __repr__(self):
    items = []
    self.repr_serialize(items)
    joined = ', '.join(items)
    return 'regex.RegexCharacterRange({0:s})'.format(joined)
  # GENERATE END
  def post_process(self, groups, caseless = False):
    # the children are processed with caseless = False; folding is applied
    # only to the combined range below
    RegexCharacter.post_process(self, groups, False)
    range_start = self[0].character_set[0]
    range_end = self[1].character_set[-1]
    self.character_set = [range_start, range_end]
    if not caseless:
      return
    # the ASCII letters present in the range ('A'..'Z' and 'a'..'z')
    letters = bisect_set.bisect_set_and(
      self.character_set,
      [0x41, 0x5b, 0x61, 0x7b]
    )
    # XOR with 0x20 flips ASCII case; values >= 0x60 are listed first, as
    # they map to the lower numbers
    flipped_down = [i ^ 0x20 for i in letters if i >= 0x60]
    flipped_up = [i ^ 0x20 for i in letters if i < 0x60]
    self.character_set = bisect_set.bisect_set_or(
      self.character_set,
      flipped_down + flipped_up
    )
291
# Character node whose character_set becomes bisect_set_or() of its two
# children's sets during post_process().
class RegexCharacterOr(RegexCharacter):
  # GENERATE ELEMENT() BEGIN
  def __init__(
    self,
    tag = 'RegexCharacterOr',
    attrib = {},
    text = '',
    children = [],
    n_groups = -1,
    character_set = []
  ):
    RegexCharacter.__init__(
      self, tag, attrib, text, children, n_groups, character_set
    )
  # copy via RegexCharacter.copy(), defaulting the factory to this class
  def copy(self, factory = None):
    if factory is None:
      factory = RegexCharacterOr
    return RegexCharacter.copy(self, factory)
  def __repr__(self):
    items = []
    self.repr_serialize(items)
    joined = ', '.join(items)
    return 'regex.RegexCharacterOr({0:s})'.format(joined)
  # GENERATE END
  def post_process(self, groups, caseless = False):
    # the base call post-processes the children (passing caseless on), so
    # their sets are final before they are combined
    RegexCharacter.post_process(self, groups, caseless)
    left_set = self[0].character_set
    right_set = self[1].character_set
    self.character_set = bisect_set.bisect_set_or(left_set, right_set)
329
# Character node whose character_set becomes bisect_set_and() of its two
# children's sets during post_process().
class RegexCharacterAnd(RegexCharacter):
  # GENERATE ELEMENT() BEGIN
  def __init__(
    self,
    tag = 'RegexCharacterAnd',
    attrib = {},
    text = '',
    children = [],
    n_groups = -1,
    character_set = []
  ):
    RegexCharacter.__init__(
      self, tag, attrib, text, children, n_groups, character_set
    )
  # copy via RegexCharacter.copy(), defaulting the factory to this class
  def copy(self, factory = None):
    if factory is None:
      factory = RegexCharacterAnd
    return RegexCharacter.copy(self, factory)
  def __repr__(self):
    items = []
    self.repr_serialize(items)
    joined = ', '.join(items)
    return 'regex.RegexCharacterAnd({0:s})'.format(joined)
  # GENERATE END
  def post_process(self, groups, caseless = False):
    # the base call post-processes the children (passing caseless on), so
    # their sets are final before they are combined
    RegexCharacter.post_process(self, groups, caseless)
    left_set = self[0].character_set
    right_set = self[1].character_set
    self.character_set = bisect_set.bisect_set_and(left_set, right_set)
367
# Character node whose character_set becomes bisect_set_not() of its first
# child's set during post_process().
class RegexCharacterNot(RegexCharacter):
  # GENERATE ELEMENT() BEGIN
  def __init__(
    self,
    tag = 'RegexCharacterNot',
    attrib = {},
    text = '',
    children = [],
    n_groups = -1,
    character_set = []
  ):
    RegexCharacter.__init__(
      self, tag, attrib, text, children, n_groups, character_set
    )
  # copy via RegexCharacter.copy(), defaulting the factory to this class
  def copy(self, factory = None):
    if factory is None:
      factory = RegexCharacterNot
    return RegexCharacter.copy(self, factory)
  def __repr__(self):
    items = []
    self.repr_serialize(items)
    joined = ', '.join(items)
    return 'regex.RegexCharacterNot({0:s})'.format(joined)
  # GENERATE END
  def post_process(self, groups, caseless = False):
    # the base call post-processes the child (passing caseless on) before
    # its set is complemented
    RegexCharacter.post_process(self, groups, caseless)
    inner_set = self[0].character_set
    self.character_set = bisect_set.bisect_set_not(inner_set)
402
# Alternation of two child expressions.
class RegexOr(Regex):
  # GENERATE ELEMENT() BEGIN
  def __init__(
    self,
    tag = 'RegexOr',
    attrib = {},
    text = '',
    children = [],
    n_groups = -1
  ):
    Regex.__init__(self, tag, attrib, text, children, n_groups)
  # copy via Regex.copy(), defaulting the factory to RegexOr
  def copy(self, factory = None):
    if factory is None:
      factory = RegexOr
    return Regex.copy(self, factory)
  def __repr__(self):
    items = []
    self.repr_serialize(items)
    joined = ', '.join(items)
    return 'regex.RegexOr({0:s})'.format(joined)
  # GENERATE END
  # Build both children so that each continues at next_state.  If either
  # child returns -1 the other child's state is returned directly; otherwise
  # a STATE_OR state referring to both is appended and its index returned.
  def to_nfa_state(self, _nfa, group_ref_data, group_index, next_state):
    left_state = self[0].to_nfa_state(
      _nfa, group_ref_data, group_index, next_state
    )
    # the second child's groups are numbered after those of the first
    right_group_index = group_index + self[0].n_groups
    right_state = self[1].to_nfa_state(
      _nfa, group_ref_data, right_group_index, next_state
    )
    if left_state == -1:
      return right_state
    if right_state == -1:
      return left_state
    or_state = len(_nfa.states)
    _nfa.states.append((nfa.NFA.STATE_OR, left_state, right_state))
    return or_state
452
# Conjunction of two child expressions, built from a STATE_AND state whose
# branches end in a pair of join states.
class RegexAnd(Regex):
  # GENERATE ELEMENT() BEGIN
  def __init__(
    self,
    tag = 'RegexAnd',
    attrib = {},
    text = '',
    children = [],
    n_groups = -1
  ):
    Regex.__init__(self, tag, attrib, text, children, n_groups)
  # copy via Regex.copy(), defaulting the factory to RegexAnd
  def copy(self, factory = None):
    if factory is None:
      factory = RegexAnd
    return Regex.copy(self, factory)
  def __repr__(self):
    items = []
    self.repr_serialize(items)
    joined = ', '.join(items)
    return 'regex.RegexAnd({0:s})'.format(joined)
  # GENERATE END
  # Append the two join states first, then build child 0 continuing at the
  # first join state and child 1 continuing at the second (which carries
  # next_state).  Returns -1 as soon as a child returns -1 (the join states
  # already appended stay in _nfa.states); otherwise appends a STATE_AND
  # state referring to both children and returns its index.
  def to_nfa_state(self, _nfa, group_ref_data, group_index, next_state):
    join0_state = len(_nfa.states)
    _nfa.states.append(nfa.NFA.join0_state) # no arguments so use static one
    join1_state = len(_nfa.states)
    _nfa.states.append((nfa.NFA.STATE_JOIN1, next_state))
    left_state = self[0].to_nfa_state(
      _nfa, group_ref_data, group_index, join0_state
    )
    if left_state == -1:
      return -1
    # the second child's groups are numbered after those of the first
    right_group_index = group_index + self[0].n_groups
    right_state = self[1].to_nfa_state(
      _nfa, group_ref_data, right_group_index, join1_state
    )
    if right_state == -1:
      return -1
    and_state = len(_nfa.states)
    _nfa.states.append((nfa.NFA.STATE_AND, left_state, right_state))
    return and_state
506
# Concatenation of two child expressions: child 0 followed by child 1.
class RegexSequence(Regex):
  # GENERATE ELEMENT() BEGIN
  def __init__(
    self,
    tag = 'RegexSequence',
    attrib = {},
    text = '',
    children = [],
    n_groups = -1
  ):
    Regex.__init__(self, tag, attrib, text, children, n_groups)
  # copy via Regex.copy(), defaulting the factory to RegexSequence
  def copy(self, factory = None):
    if factory is None:
      factory = RegexSequence
    return Regex.copy(self, factory)
  def __repr__(self):
    items = []
    self.repr_serialize(items)
    joined = ', '.join(items)
    return 'regex.RegexSequence({0:s})'.format(joined)
  # GENERATE END
  # States are built back to front: child 1 is built first, continuing at
  # next_state, and child 0 is then built continuing at child 1's state.
  # Returns -1 without building child 0 if child 1 returns -1.
  def to_nfa_state(self, _nfa, group_ref_data, group_index, next_state):
    # the second child's groups are numbered after those of the first
    tail_group_index = group_index + self[0].n_groups
    tail_state = self[1].to_nfa_state(
      _nfa, group_ref_data, tail_group_index, next_state
    )
    if tail_state == -1:
      return -1
    return self[0].to_nfa_state(
      _nfa, group_ref_data, group_index, tail_state
    )
551
# Repetition of the first child.  count0 copies of the child are always
# emitted; count1 == -1 adds a loop back to the child after them, while any
# other count1 adds count1 - count0 optional copies.  non_greedy swaps the
# order of the two branches in each STATE_OR that is emitted.
class RegexRepeat(Regex):
  # GENERATE ELEMENT(int count0, int count1, bool non_greedy) BEGIN
  def __init__(
    self,
    tag = 'RegexRepeat',
    attrib = {},
    text = '',
    children = [],
    n_groups = -1,
    count0 = -1,
    count1 = -1,
    non_greedy = False
  ):
    Regex.__init__(self, tag, attrib, text, children, n_groups)
    # each field may arrive as a string, in which case it is decoded with the
    # matching element.deserialize_*() helper
    if isinstance(count0, str):
      count0 = element.deserialize_int(count0)
    self.count0 = count0
    if isinstance(count1, str):
      count1 = element.deserialize_int(count1)
    self.count1 = count1
    if isinstance(non_greedy, str):
      non_greedy = element.deserialize_bool(non_greedy)
    self.non_greedy = non_greedy
  # store the three fields in the attributes of the same names
  def serialize(self, ref_list):
    Regex.serialize(self, ref_list)
    self.set('count0', element.serialize_int(self.count0))
    self.set('count1', element.serialize_int(self.count1))
    self.set('non_greedy', element.serialize_bool(self.non_greedy))
  # reload the three fields, falling back to -1, -1 and false when absent
  def deserialize(self, ref_list):
    Regex.deserialize(self, ref_list)
    self.count0 = element.deserialize_int(self.get('count0', '-1'))
    self.count1 = element.deserialize_int(self.get('count1', '-1'))
    self.non_greedy = element.deserialize_bool(self.get('non_greedy', 'false'))
  # copy via Regex.copy(), defaulting the factory to RegexRepeat, and carry
  # the three fields over to the result
  def copy(self, factory = None):
    if factory is None:
      factory = RegexRepeat
    result = Regex.copy(self, factory)
    result.count0 = self.count0
    result.count1 = self.count1
    result.non_greedy = self.non_greedy
    return result
  # append each field that differs from its constructor default to params
  def repr_serialize(self, params):
    Regex.repr_serialize(self, params)
    if self.count0 != -1:
      params.append('count0 = {0:s}'.format(repr(self.count0)))
    if self.count1 != -1:
      params.append('count1 = {0:s}'.format(repr(self.count1)))
    if self.non_greedy != False:
      params.append('non_greedy = {0:s}'.format(repr(self.non_greedy)))
  def __repr__(self):
    items = []
    self.repr_serialize(items)
    joined = ', '.join(items)
    return 'regex.RegexRepeat({0:s})'.format(joined)
  # GENERATE END
  def post_process(self, groups, caseless = False):
    # total hack which will be done in a Python action in future:
    # children after the first are 'Number' elements giving count0 and
    # (optionally) count1; they are read and then removed from the tree
    n_children = len(self)
    if n_children >= 2:
      assert self[1].tag == 'Number'
      self.count0 = int(self[1].text)
      if n_children < 3:
        # a single number means an exact repeat count
        self.count1 = self.count0
      else:
        assert self[2].tag == 'Number'
        self.count1 = int(self[2].text)
      del self[1:]
    # end total hack
    Regex.post_process(self, groups, caseless)
  # Build the repetition back to front and return its entry state (or -1 if
  # one of the count0 mandatory copies returns -1).
  def to_nfa_state(self, _nfa, group_ref_data, group_index, next_state):
    count0 = self.count0
    count1 = self.count1
    if count1 == -1:
      # reserve the loop state before building the child, so that the child
      # can continue back to it; it is filled in afterwards
      loop_state = len(_nfa.states)
      _nfa.states.append(None)
      body_state = self[0].to_nfa_state(
        _nfa, group_ref_data, group_index, loop_state
      )
      if body_state != -1:
        if self.non_greedy:
          _nfa.states[loop_state] = (nfa.NFA.STATE_OR, next_state, body_state)
        else:
          _nfa.states[loop_state] = (nfa.NFA.STATE_OR, body_state, next_state)
        next_state = loop_state
    else:
      # every optional copy may either enter the child or go straight to the
      # state that follows the whole repetition
      done_state = next_state
      for _ in range(count1 - count0):
        body_state = self[0].to_nfa_state(
          _nfa, group_ref_data, group_index, next_state
        )
        if body_state == -1:
          break
        option_state = len(_nfa.states)
        if self.non_greedy:
          _nfa.states.append((nfa.NFA.STATE_OR, done_state, body_state))
        else:
          _nfa.states.append((nfa.NFA.STATE_OR, body_state, done_state))
        next_state = option_state
    # the mandatory copies go in front of whatever was built above
    for _ in range(count0):
      next_state = self[0].to_nfa_state(
        _nfa, group_ref_data, group_index, next_state
      )
      if next_state == -1:
        break
    return next_state
686
# Group around the last child expression.  post_process() registers the group
# in the groups list and to_nfa_state() brackets the child's states with two
# STATE_MARK states taken from group_ref_data.
class RegexGroup(Regex):
  # GENERATE ELEMENT() BEGIN
  def __init__(
    self,
    tag = 'RegexGroup',
    attrib = {},
    text = '',
    children = [],
    n_groups = -1
  ):
    Regex.__init__(self, tag, attrib, text, children, n_groups)
  # copy via Regex.copy(), defaulting the factory to RegexGroup
  def copy(self, factory = None):
    if factory is None:
      factory = RegexGroup
    return Regex.copy(self, factory)
  def __repr__(self):
    items = []
    self.repr_serialize(items)
    joined = ', '.join(items)
    return 'regex.RegexGroup({0:s})'.format(joined)
  # GENERATE END
  # append this group to groups before descending, then count the grouped
  # expression's groups plus one for this group itself
  def post_process(self, groups, caseless = False):
    # we use -1 here because named or action groups use self[0] for text
    groups.append(self)
    inner = self[-1]
    inner.post_process(groups, caseless)
    self.n_groups = inner.n_groups + 1
  # Built back to front: the mark from entry [1] of this group's
  # group_ref_data item is appended first (continuing at next_state), then
  # the grouped expression, then the mark from entry [0] in front of it.
  # Returns -1 if the grouped expression returns -1.
  def to_nfa_state(self, _nfa, group_ref_data, group_index, next_state):
    marks = group_ref_data[group_index]
    closing_state = len(_nfa.states)
    _nfa.states.append((nfa.NFA.STATE_MARK, marks[1], next_state))
    # nested groups are numbered after this one
    inner_state = self[-1].to_nfa_state(
      _nfa, group_ref_data, group_index + 1, closing_state
    )
    if inner_state == -1:
      return -1
    opening_state = len(_nfa.states)
    _nfa.states.append((nfa.NFA.STATE_MARK, marks[0], inner_state))
    return opening_state
740
741 # internal base class
# Element with a get_text() accessor; the Text classes nested in the group
# classes further down derive from it.
class Text(element.Element):
  # GENERATE ELEMENT() BEGIN
  def __init__(
    self,
    tag = 'Text',
    attrib = {},
    text = '',
    children = []
  ):
    element.Element.__init__(self, tag, attrib, text, children)
  # copy via element.Element.copy(), defaulting the factory to Text
  def copy(self, factory = None):
    if factory is None:
      factory = Text
    return element.Element.copy(self, factory)
  def __repr__(self):
    items = []
    self.repr_serialize(items)
    joined = ', '.join(items)
    return 'regex.Text({0:s})'.format(joined)
  # GENERATE END
  # return whatever element.get_text() yields for this element at index 0
  def get_text(self):
    text = element.get_text(self, 0)
    return text
771
# RegexGroup subclass tagged 'RegexGroupName'; it adds no behaviour of its
# own beyond the generated boilerplate.  (Presumably the group's name is held
# in a RegexGroupName.Text child -- TODO confirm against the parser.)
class RegexGroupName(RegexGroup):
  # Text element tagged 'RegexGroupName_Text'.  Inside its methods the bare
  # name Text resolves to the module-level Text base class, because a class
  # body is not an enclosing scope for the functions defined in it.
  class Text(Text):
    # GENERATE ELEMENT() BEGIN
    def __init__(
      self,
      tag = 'RegexGroupName_Text',
      attrib = {},
      text = '',
      children = []
    ):
      Text.__init__(
        self,
        tag,
        attrib,
        text,
        children
      )
    def copy(self, factory = None):
      # default to this nested class, not the module-level Text base class
      # that the bare name Text would give, in line with the other classes
      # in this file, which default the factory to themselves
      result = Text.copy(
        self,
        RegexGroupName.Text if factory is None else factory
      )
      return result
    def __repr__(self):
      params = []
      self.repr_serialize(params)
      return 'regex.RegexGroupName.Text({0:s})'.format(', '.join(params))
    # GENERATE END

  # GENERATE ELEMENT() BEGIN
  def __init__(
    self,
    tag = 'RegexGroupName',
    attrib = {},
    text = '',
    children = [],
    n_groups = -1
  ):
    RegexGroup.__init__(
      self,
      tag,
      attrib,
      text,
      children,
      n_groups
    )
  # copy via RegexGroup.copy(), defaulting the factory to RegexGroupName
  def copy(self, factory = None):
    result = RegexGroup.copy(
      self,
      RegexGroupName if factory is None else factory
    )
    return result
  def __repr__(self):
    params = []
    self.repr_serialize(params)
    return 'regex.RegexGroupName({0:s})'.format(', '.join(params))
  # GENERATE END
829
# RegexGroup subclass tagged 'RegexGroupAction'; it adds no behaviour of its
# own beyond the generated boilerplate.  (Presumably the action text is held
# in a RegexGroupAction.Text child -- TODO confirm against the parser.)
class RegexGroupAction(RegexGroup):
  # Text element tagged 'RegexGroupAction_Text'.  Inside its methods the bare
  # name Text resolves to the module-level Text base class, because a class
  # body is not an enclosing scope for the functions defined in it.
  class Text(Text):
    # GENERATE ELEMENT() BEGIN
    def __init__(
      self,
      tag = 'RegexGroupAction_Text',
      attrib = {},
      text = '',
      children = []
    ):
      Text.__init__(
        self,
        tag,
        attrib,
        text,
        children
      )
    def copy(self, factory = None):
      # default to this nested class, not the module-level Text base class
      # that the bare name Text would give, in line with the other classes
      # in this file, which default the factory to themselves
      result = Text.copy(
        self,
        RegexGroupAction.Text if factory is None else factory
      )
      return result
    def __repr__(self):
      params = []
      self.repr_serialize(params)
      return 'regex.RegexGroupAction.Text({0:s})'.format(', '.join(params))
    # GENERATE END

  # GENERATE ELEMENT() BEGIN
  def __init__(
    self,
    tag = 'RegexGroupAction',
    attrib = {},
    text = '',
    children = [],
    n_groups = -1
  ):
    RegexGroup.__init__(
      self,
      tag,
      attrib,
      text,
      children,
      n_groups
    )
  # copy via RegexGroup.copy(), defaulting the factory to RegexGroupAction
  def copy(self, factory = None):
    result = RegexGroup.copy(
      self,
      RegexGroupAction if factory is None else factory
    )
    return result
  def __repr__(self):
    params = []
    self.repr_serialize(params)
    return 'regex.RegexGroupAction({0:s})'.format(', '.join(params))
  # GENERATE END
887
# RegexGroup subclass tagged 'RegexGroupElement'; it adds no behaviour of its
# own beyond the generated boilerplate.  (Presumably the element text is held
# in a RegexGroupElement.Text child -- TODO confirm against the parser.)
class RegexGroupElement(RegexGroup):
  # Text element tagged 'RegexGroupElement_Text'.  Inside its methods the
  # bare name Text resolves to the module-level Text base class, because a
  # class body is not an enclosing scope for the functions defined in it.
  class Text(Text):
    # GENERATE ELEMENT() BEGIN
    def __init__(
      self,
      tag = 'RegexGroupElement_Text',
      attrib = {},
      text = '',
      children = []
    ):
      Text.__init__(
        self,
        tag,
        attrib,
        text,
        children
      )
    def copy(self, factory = None):
      # default to this nested class, not the module-level Text base class
      # that the bare name Text would give, in line with the other classes
      # in this file, which default the factory to themselves
      result = Text.copy(
        self,
        RegexGroupElement.Text if factory is None else factory
      )
      return result
    def __repr__(self):
      params = []
      self.repr_serialize(params)
      return 'regex.RegexGroupElement.Text({0:s})'.format(', '.join(params))
    # GENERATE END

  # GENERATE ELEMENT() BEGIN
  def __init__(
    self,
    tag = 'RegexGroupElement',
    attrib = {},
    text = '',
    children = [],
    n_groups = -1
  ):
    RegexGroup.__init__(
      self,
      tag,
      attrib,
      text,
      children,
      n_groups
    )
  # copy via RegexGroup.copy(), defaulting the factory to RegexGroupElement
  def copy(self, factory = None):
    result = RegexGroup.copy(
      self,
      RegexGroupElement if factory is None else factory
    )
    return result
  def __repr__(self):
    params = []
    self.repr_serialize(params)
    return 'regex.RegexGroupElement({0:s})'.format(', '.join(params))
  # GENERATE END
945  
# GENERATE FACTORY(element.Element) BEGIN
# maps each element tag to the class that implements it
tag_to_class = {
  'Regex': Regex,
  'RegexNone': RegexNone,
  'RegexEmpty': RegexEmpty,
  'RegexCharacter': RegexCharacter,
  'RegexCharacterLiteral': RegexCharacterLiteral,
  'RegexCharacterRange': RegexCharacterRange,
  'RegexCharacterOr': RegexCharacterOr,
  'RegexCharacterAnd': RegexCharacterAnd,
  'RegexCharacterNot': RegexCharacterNot,
  'RegexOr': RegexOr,
  'RegexAnd': RegexAnd,
  'RegexSequence': RegexSequence,
  'RegexRepeat': RegexRepeat,
  'RegexGroup': RegexGroup,
  'Text': Text,
  'RegexGroupName': RegexGroupName,
  'RegexGroupName_Text': RegexGroupName.Text,
  'RegexGroupAction': RegexGroupAction,
  'RegexGroupAction_Text': RegexGroupAction.Text,
  'RegexGroupElement': RegexGroupElement,
  'RegexGroupElement_Text': RegexGroupElement.Text
}
# Construct the element for tag using the class registered in tag_to_class,
# or a plain element.Element when the tag is not registered.  tag, attrib and
# any further arguments are passed straight to the constructor.
def factory(tag, attrib = {}, *args, **kwargs):
  element_class = tag_to_class.get(tag, element.Element)
  return element_class(tag, attrib, *args, **kwargs)
# GENERATE END
973
974 if __name__ == '__main__':
975   import sys
976   import xml.etree.ElementTree
977   import wrap_repr
978
979   _regex = RegexAnd(
980     children = [
981       RegexRepeat(
982         children = [
983           RegexCharacterNot(
984            children = [
985              RegexCharacterLiteral()
986            ],
987            character_set = [0, 256]
988          )
989        ]
990       ),
991       RegexGroup(
992         children = [
993           RegexOr(
994             children = [
995               RegexOr(
996                 children = [
997                   RegexOr(
998                     children = [
999                       RegexOr(
1000                         children = [
1001                           RegexNone(),
1002                           RegexGroup(
1003                             children = [
1004                               RegexRepeat(
1005                                 children = [
1006                                   RegexCharacterLiteral(
1007                                     character_set = [9, 14, 32, 33]
1008                                   )
1009                                 ],
1010                                 count0 = 1
1011                               )
1012                             ],
1013                             index = 1,
1014                             name = 'Whitespace'
1015                           )
1016                         ]
1017                       ),
1018                       RegexGroup(
1019                         children = [
1020                           RegexRepeat(
1021                             children = [
1022                               RegexCharacterLiteral(
1023                                 character_set = [48, 58]
1024                               )
1025                             ],
1026                             count0 = 1
1027                           )
1028                         ],
1029                         index = 2,
1030                         name = 'Number'
1031                       )
1032                     ]
1033                   ),
1034                   RegexGroup(
1035                     children = [
1036                       RegexSequence(
1037                         children = [
1038                           RegexSequence(
1039                             children = [
1040                               RegexSequence(
1041                                 children = [
1042                                   RegexSequence(
1043                                     children = [
1044                                       RegexEmpty(),
1045                                       RegexCharacterLiteral(
1046                                         character_set = [102, 103]
1047                                       )
1048                                     ]
1049                                   ),
1050                                   RegexCharacterLiteral(
1051                                     character_set = [111, 112]
1052                                   )
1053                                 ]
1054                               ),
1055                               RegexCharacterLiteral(
1056                                 character_set = [114, 115]
1057                               )
1058                             ]
1059                           ),
1060                           RegexRepeat(
1061                             children = [
1062                               RegexCharacterLiteral(
1063                                 character_set = [101, 102]
1064                               )
1065                             ],
1066                             count0 = 0,
1067                             count1 = 1
1068                           )
1069                         ]
1070                       )
1071                     ],
1072                     index = 3,
1073                     name = 'For'
1074                   )
1075                 ]
1076               ),
1077               RegexGroup(
1078                 children = [
1079                   RegexSequence(
1080                     children = [
1081                       RegexCharacterLiteral(
1082                         character_set = [65, 91, 95, 96, 97, 123]
1083                       ),
1084                       RegexRepeat(
1085                         children = [
1086                           RegexCharacterLiteral(
1087                             character_set = [48, 58, 65, 91, 95, 96, 97, 123]
1088                           )
1089                         ]
1090                       )
1091                     ]
1092                   )
1093                 ],
1094                 index = 4,
1095                 name = 'Identifier'
1096               )
1097             ]
1098           )
1099         ],
1100         index = 0
1101       )
1102     ]
1103   )
1104   sys.stdout.write( # dump the regex tree built above as Python source
         # repr(_regex) has the 'regex.' module prefix stripped and is given
         # the same '  _regex = ' lead-in as an assignment; wrap_repr is
         # passed 79, presumably the wrap width -- TODO confirm in wrap_repr
1105     wrap_repr.wrap_repr(
1106       '  _regex = {0:s}'.format(repr(_regex).replace('regex.', '')),
1107       79
1108     )
1109   )
1110
1111   _nfa = nfa.NFA() # new NFA object that _regex is added to below
       # NOTE(review): Regex.add_to_nfa near the top of this file is declared
       # as add_to_nfa(self, _nfa, group_ref_data), but this call passes only
       # _nfa -- confirm whether _regex's class overrides it or this call is
       # stale
1112   _regex.add_to_nfa(_nfa)
       # write the NFA's repr with the 'nfa.' module prefix stripped; 79 is
       # presumably the wrap width -- TODO confirm in wrap_repr
1113   sys.stdout.write(
1114     wrap_repr.wrap_repr(
1115       '  _nfa = {0:s}'.format(repr(_nfa).replace('nfa.', '')),
1116       79
1117     )
1118   )
1119
1120   text = '      id   99id id99 for fore foree forex  ' # sample input
       # Scan `text` with the NFA: match at offset i, report the groups found,
       # then continue from the end of that match until the input is used up
       # or a match fails.
       # NOTE(review): if match_text can return a match that ends at i itself
       # (zero length), i never advances and this loop does not terminate --
       # confirm that cannot happen
1121   i = 0
1122   while i < len(text):
         # show up to 72 characters of the remaining input, newlines as '$'
1123     print('text "{0:s}"'.format(text[i:i + 72].replace('\n', '$')))
1124     thread = _nfa.match_text(text, i)
1125     if thread is None:
1126       print('no match')
1127       break
1128     i = thread[0] # end position of overall match
         # per-group offsets, one slot per entry of _nfa.groups; -1 means no
         # mark for that group has been seen yet in this match
1129     group_start = [-1 for j in range(len(_nfa.groups))]
1130     group_end = [-1 for j in range(len(_nfa.groups))]
         # thread is a chain of (pos, mark, rest) tuples terminated by None
1131     while thread is not None:
1132       pos, mark, thread = thread
           # mark >> 1 is the group number; an even mark records the group's
           # start offset, an odd mark records its end offset
1133       group = mark >> 1
1134       if (mark & 1) == 0:
1135         group_start[group] = pos
             # The group is printed as soon as its start mark is seen, using
             # whatever end offset has been recorded so far -- presumably the
             # chain yields each group's end mark before its start mark;
             # verify against match_text.  _nfa.groups[group][0] is formatted
             # as the group's name.
1136         print(
1137           'group {0:d} name "{1:s}" text "{2:s}"'.format(
1138              group,
1139              _nfa.groups[group][0],
1140              text[group_start[group]:group_end[group]].replace('\n', '$')
1141           )
1142         )
1143       else:
1144         group_end[group] = pos
1145
1146   _dfa = _nfa.to_dfa() # DFA produced from the NFA by its to_dfa() method
       # write the DFA's repr with the 'dfa.' module prefix stripped; 79 is
       # presumably the wrap width -- TODO confirm in wrap_repr
1147   sys.stdout.write(
1148     wrap_repr.wrap_repr(
1149       '  _dfa = {0:s}'.format(repr(_dfa).replace('dfa.', '')),
1150       79
1151     )
1152   )
1153
1154   text = '      id   99id id99 for fore foree forex  ' # sample input
       # Scan `text` with the DFA: match at offset i, report the groups found,
       # then continue from the end of that match until the input is used up
       # or a match fails.  The statements mirror the NFA loop earlier in this
       # script, with _dfa in place of _nfa.
       # NOTE(review): if match_text can return a match that ends at i itself
       # (zero length), i never advances and this loop does not terminate --
       # confirm that cannot happen
1155   i = 0
1156   while i < len(text):
         # show up to 72 characters of the remaining input, newlines as '$'
1157     print('text "{0:s}"'.format(text[i:i + 72].replace('\n', '$')))
1158     thread = _dfa.match_text(text, i)
1159     if thread is None:
1160       print('no match')
1161       break
1162     i = thread[0] # end position of overall match
         # per-group offsets, one slot per entry of _dfa.groups; -1 means no
         # mark for that group has been seen yet in this match
1163     group_start = [-1 for j in range(len(_dfa.groups))]
1164     group_end = [-1 for j in range(len(_dfa.groups))]
         # thread is a chain of (pos, mark, rest) tuples terminated by None
1165     while thread is not None:
1166       pos, mark, thread = thread
           # mark >> 1 is the group number; an even mark records the group's
           # start offset, an odd mark records its end offset
1167       group = mark >> 1
1168       if (mark & 1) == 0:
1169         group_start[group] = pos
             # The group is printed as soon as its start mark is seen, using
             # whatever end offset has been recorded so far -- presumably the
             # chain yields each group's end mark before its start mark;
             # verify against match_text.  _dfa.groups[group][0] is formatted
             # as the group's name.
1170         print(
1171           'group {0:d} name "{1:s}" text "{2:s}"'.format(
1172              group,
1173              _dfa.groups[group][0],
1174              text[group_start[group]:group_end[group]].replace('\n', '$')
1175           )
1176         )
1177       else:
1178         group_end[group] = pos
1179
1180 # TODO: move the commented-out grammar / LR(1) parser self-test below into grammar.py:
1181 #  grammar = Grammar(children = [Grammar.Production(children = [RegexSequence(
1182 #children = [RegexSequence(children = [RegexEmpty(), RegexCharacterRule(character_set
1183 #= [288, 295], rule_name = 'whitespace_opt')]), RegexCharacterRule(character_set = [
1184 #259, 262], rule_name = 'expr0')])], nonterminal = 0), Grammar.Production(
1185 #children = [RegexSequence(children = [RegexEmpty(), RegexCharacterRule(character_set
1186 #= [262, 265], rule_name = 'expr1')])], nonterminal = 1), Grammar.Production(
1187 #children = [RegexSequence(children = [RegexEmpty(), RegexGroup(children = [
1188 #RegexSequence(children = [RegexSequence(children = [RegexSequence(children = [
1189 #RegexSequence(children = [RegexEmpty(), RegexCharacterRule(character_set = [259, 262
1190 #], rule_name = 'expr0')]), RegexCharacter(character_set = [43, 44])]),
1191 #RegexCharacterRule(character_set = [288, 295], rule_name = 'whitespace_opt')]),
1192 #RegexCharacterRule(character_set = [262, 265], rule_name = 'expr1')])], group_index
1193 #= 0, group_name = 'Add')])], nonterminal = 2), Grammar.Production(children = [
1194 #RegexSequence(children = [RegexEmpty(), RegexGroup(children = [RegexSequence(
1195 #children = [RegexSequence(children = [RegexSequence(children = [RegexSequence(
1196 #children = [RegexEmpty(), RegexCharacterRule(character_set = [259, 262], rule_name =
1197 #'expr0')]), RegexCharacter(character_set = [45, 46])]), RegexCharacterRule(character_set
1198 #= [288, 295], rule_name = 'whitespace_opt')]), RegexCharacterRule(character_set = [
1199 #262, 265], rule_name = 'expr1')])], group_index = 0, group_name = 'Subtract')])
1200 #], nonterminal = 3), Grammar.Production(children = [RegexSequence(children = [
1201 #RegexEmpty(), RegexCharacterRule(character_set = [265, 268], rule_name = 'expr2')])
1202 #], nonterminal = 4), Grammar.Production(children = [RegexSequence(children = [
1203 #RegexEmpty(), RegexGroup(children = [RegexSequence(children = [RegexSequence(
1204 #children = [RegexSequence(children = [RegexSequence(children = [RegexEmpty(),
1205 #RegexCharacterRule(character_set = [262, 265], rule_name = 'expr1')]),
1206 #RegexCharacter(character_set = [42, 43])]), RegexCharacterRule(character_set = [288, 295
1207 #], rule_name = 'whitespace_opt')]), RegexCharacterRule(character_set = [265, 268],
1208 #rule_name = 'expr2')])], group_index = 0, group_name = 'Multiply')])],
1209 #nonterminal = 5), Grammar.Production(children = [RegexSequence(children = [
1210 #RegexEmpty(), RegexGroup(children = [RegexSequence(children = [RegexSequence(
1211 #children = [RegexSequence(children = [RegexSequence(children = [RegexEmpty(),
1212 #RegexCharacterRule(character_set = [262, 265], rule_name = 'expr1')]),
1213 #RegexCharacter(character_set = [47, 48])]), RegexCharacterRule(character_set = [288, 295
1214 #], rule_name = 'whitespace_opt')]), RegexCharacterRule(character_set = [265, 268],
1215 #rule_name = 'expr2')])], group_index = 0, group_name = 'Divide')])],
1216 #nonterminal = 6), Grammar.Production(children = [RegexSequence(children = [
1217 #RegexSequence(children = [RegexEmpty(), RegexGroup(children = [RegexSequence(
1218 #children = [RegexEmpty(), RegexCharacterRule(character_set = [268, 288], rule_name =
1219 #'number')])], group_index = 0, group_name = 'Number')]), RegexCharacterRule(
1220 #character_set = [288, 295], rule_name = 'whitespace_opt')])], nonterminal = 7),
1221 #Grammar.Production(children = [RegexSequence(children = [RegexEmpty(),
1222 #RegexGroup(children = [RegexSequence(children = [RegexSequence(children = [
1223 #RegexSequence(children = [RegexEmpty(), RegexCharacter(character_set = [45, 46])]),
1224 #RegexCharacterRule(character_set = [288, 295], rule_name = 'whitespace_opt')]),
1225 #RegexCharacterRule(character_set = [265, 268], rule_name = 'expr2')])], group_index
1226 #= 0, group_name = 'Negate')])], nonterminal = 8), Grammar.Production(children =
1227 #[RegexSequence(children = [RegexSequence(children = [RegexSequence(children = [
1228 #RegexSequence(children = [RegexSequence(children = [RegexEmpty(),
1229 #RegexCharacter(character_set = [40, 41])]), RegexCharacterRule(character_set = [288, 295
1230 #], rule_name = 'whitespace_opt')]), RegexCharacterRule(character_set = [259, 262],
1231 #rule_name = 'expr0')]), RegexCharacter(character_set = [41, 42])]),
1232 #RegexCharacterRule(character_set = [288, 295], rule_name = 'whitespace_opt')])],
1233 #nonterminal = 9), Grammar.Production(children = [RegexSequence(children = [
1234 #RegexEmpty(), RegexCharacter(character_set = [48, 49])])], nonterminal = 10),
1235 #Grammar.Production(children = [RegexSequence(children = [RegexEmpty(),
1236 #RegexCharacter(character_set = [49, 50])])], nonterminal = 11), Grammar.Production(
1237 #children = [RegexSequence(children = [RegexEmpty(), RegexCharacter(character_set = [
1238 #50, 51])])], nonterminal = 12), Grammar.Production(children = [RegexSequence(
1239 #children = [RegexEmpty(), RegexCharacter(character_set = [51, 52])])], nonterminal =
1240 #13), Grammar.Production(children = [RegexSequence(children = [RegexEmpty(),
1241 #RegexCharacter(character_set = [52, 53])])], nonterminal = 14), Grammar.Production(
1242 #children = [RegexSequence(children = [RegexEmpty(), RegexCharacter(character_set = [
1243 #53, 54])])], nonterminal = 15), Grammar.Production(children = [RegexSequence(
1244 #children = [RegexEmpty(), RegexCharacter(character_set = [54, 55])])], nonterminal =
1245 #16), Grammar.Production(children = [RegexSequence(children = [RegexEmpty(),
1246 #RegexCharacter(character_set = [55, 56])])], nonterminal = 17), Grammar.Production(
1247 #children = [RegexSequence(children = [RegexEmpty(), RegexCharacter(character_set = [
1248 #56, 57])])], nonterminal = 18), Grammar.Production(children = [RegexSequence(
1249 #children = [RegexEmpty(), RegexCharacter(character_set = [57, 58])])], nonterminal =
1250 #19), Grammar.Production(children = [RegexSequence(children = [RegexSequence(
1251 #children = [RegexEmpty(), RegexCharacterRule(character_set = [268, 288], rule_name =
1252 #'number')]), RegexCharacter(character_set = [48, 49])])], nonterminal = 20),
1253 #Grammar.Production(children = [RegexSequence(children = [RegexSequence(children
1254 #= [RegexEmpty(), RegexCharacterRule(character_set = [268, 288], rule_name = 'number'
1255 #)]), RegexCharacter(character_set = [49, 50])])], nonterminal = 21),
1256 #Grammar.Production(children = [RegexSequence(children = [RegexSequence(children
1257 #= [RegexEmpty(), RegexCharacterRule(character_set = [268, 288], rule_name = 'number'
1258 #)]), RegexCharacter(character_set = [50, 51])])], nonterminal = 22),
1259 #Grammar.Production(children = [RegexSequence(children = [RegexSequence(children
1260 #= [RegexEmpty(), RegexCharacterRule(character_set = [268, 288], rule_name = 'number'
1261 #)]), RegexCharacter(character_set = [51, 52])])], nonterminal = 23),
1262 #Grammar.Production(children = [RegexSequence(children = [RegexSequence(children
1263 #= [RegexEmpty(), RegexCharacterRule(character_set = [268, 288], rule_name = 'number'
1264 #)]), RegexCharacter(character_set = [52, 53])])], nonterminal = 24),
1265 #Grammar.Production(children = [RegexSequence(children = [RegexSequence(children
1266 #= [RegexEmpty(), RegexCharacterRule(character_set = [268, 288], rule_name = 'number'
1267 #)]), RegexCharacter(character_set = [53, 54])])], nonterminal = 25),
1268 #Grammar.Production(children = [RegexSequence(children = [RegexSequence(children
1269 #= [RegexEmpty(), RegexCharacterRule(character_set = [268, 288], rule_name = 'number'
1270 #)]), RegexCharacter(character_set = [54, 55])])], nonterminal = 26),
1271 #Grammar.Production(children = [RegexSequence(children = [RegexSequence(children
1272 #= [RegexEmpty(), RegexCharacterRule(character_set = [268, 288], rule_name = 'number'
1273 #)]), RegexCharacter(character_set = [55, 56])])], nonterminal = 27),
1274 #Grammar.Production(children = [RegexSequence(children = [RegexSequence(children
1275 #= [RegexEmpty(), RegexCharacterRule(character_set = [268, 288], rule_name = 'number'
1276 #)]), RegexCharacter(character_set = [56, 57])])], nonterminal = 28),
1277 #Grammar.Production(children = [RegexSequence(children = [RegexSequence(children
1278 #= [RegexEmpty(), RegexCharacterRule(character_set = [268, 288], rule_name = 'number'
1279 #)]), RegexCharacter(character_set = [57, 58])])], nonterminal = 29),
1280 #Grammar.Production(children = [RegexEmpty()], nonterminal = 30),
1281 #Grammar.Production(children = [RegexSequence(children = [RegexSequence(children
1282 #= [RegexEmpty(), RegexCharacterRule(character_set = [288, 295], rule_name =
1283 #'whitespace_opt')]), RegexCharacter(character_set = [9, 10])])], nonterminal = 31),
1284 #Grammar.Production(children = [RegexSequence(children = [RegexSequence(children
1285 #= [RegexEmpty(), RegexCharacterRule(character_set = [288, 295], rule_name =
1286 #'whitespace_opt')]), RegexCharacter(character_set = [10, 11])])], nonterminal = 32),
1287 #Grammar.Production(children = [RegexSequence(children = [RegexSequence(children
1288 #= [RegexEmpty(), RegexCharacterRule(character_set = [288, 295], rule_name =
1289 #'whitespace_opt')]), RegexCharacter(character_set = [11, 12])])], nonterminal = 33),
1290 #Grammar.Production(children = [RegexSequence(children = [RegexSequence(children
1291 #= [RegexEmpty(), RegexCharacterRule(character_set = [288, 295], rule_name =
1292 #'whitespace_opt')]), RegexCharacter(character_set = [12, 13])])], nonterminal = 34),
1293 #Grammar.Production(children = [RegexSequence(children = [RegexSequence(children
1294 #= [RegexEmpty(), RegexCharacterRule(character_set = [288, 295], rule_name =
1295 #'whitespace_opt')]), RegexCharacter(character_set = [13, 14])])], nonterminal = 35),
1296 #Grammar.Production(children = [RegexSequence(children = [RegexSequence(children
1297 #= [RegexEmpty(), RegexCharacterRule(character_set = [288, 295], rule_name =
1298 #'whitespace_opt')]), RegexCharacter(character_set = [32, 33])])], nonterminal = 36)
1299 #], n_terminals = 258)
1300 #  #sys.stdout.write(
1301 #  #  wrap_repr.wrap_repr(
1302 #  #    '  grammar = {0:s}'.format(repr(grammar).replace('regex.', '')),
1303 #  #    79
1304 #  #  )
1305 #  #)
1306 #
1307 #  lr1 = grammar.to_lr1()
1308 #  #sys.stdout.write(
1309 #  #  wrap_repr.wrap_repr(
1310 #  #    '  lr1 = {0:s}'.format(repr(lr1).replace('regex.', '')),
1311 #  #    79
1312 #  #  )
1313 #  #)
1314 #
1315 #  lr1.parse_text('(13 + 5 * 6) * 2', 0)
1316 #  root = element.Element('root', text = '(13 + 5 * 6) * 2')
1317 #  lr1.parse_yychunk(root, 0, 0, element.Element, iter([]))
1318 #  xml.etree.ElementTree.dump(root)
1319 #
1320 #  clr1 = lr1.to_clr1()
1321 #  #sys.stdout.write(
1322 #  #  wrap_repr.wrap_repr(
1323 #  #    '  clr1 = {0:s}'.format(repr(clr1).replace('regex.', '')),
1324 #  #    79
1325 #  #  )
1326 #  #)
1327 #
1328 #  clr1.parse_text('(13 + 5 * 6) * 2', 0)
1329 #  root = element.Element('root', text = '(13 + 5 * 6) * 2')
1330 #  clr1.parse_yychunk(root, 0, 0, element.Element, iter([]))
1331 #  xml.etree.ElementTree.dump(root)
1332 #
1333 #  lalr1 = lr1.to_lalr1()
1334 #  #sys.stdout.write(
1335 #  #  wrap_repr.wrap_repr(
1336 #  #    '  lalr1 = {0:s}'.format(repr(lalr1).replace('regex.', '')),
1337 #  #    79
1338 #  #  )
1339 #  #)
1340 #
1341 #  lalr1.parse_text('(13 + 5 * 6) * 2', 0)
1342 #  root = element.Element('root', text = '(13 + 5 * 6) * 2')
1343 #  lalr1.parse_yychunk(root, 0, 0, element.Element, iter([]))
1344 #  xml.etree.ElementTree.dump(root)