Skip to content

Instantly share code, notes, and snippets.

@kaharlichenko
Created June 2, 2012 08:57
Show Gist options
  • Save kaharlichenko/2857433 to your computer and use it in GitHub Desktop.
Save kaharlichenko/2857433 to your computer and use it in GitHub Desktop.
Parsing subset of regexp
#!/usr/bin/env python
# We will consider the following regular expressions:
#
# single characters # a
# regexp1 regexp2 # ab
# regexp * # a*
# regexp1 | regexp2 # a|b
# ( regexp ) # (a|b)* -- same as (?:a|b)
import ply.lex as lex
import ply.yacc as yacc
###############
# Lexer
###############
tokens = ('LETTER', 'STAR', 'BAR', 'LPAREN', 'RPAREN')
t_LETTER = r'[a-zA-Z0-9]'
t_STAR = r'\*'
t_BAR = r'\|'
t_LPAREN = r'\('
t_RPAREN = r'\)'
def t_error(t):
# XXX Mute for now
#print 'ERROR: Lexing error'
pass
##############
# Parser
##############
# The grammar:
# GRP -> ALT
# GRP -> ( GRP )
# ALT -> SEQ
# ALT -> ALT | SEQ
# SEQ -> REP
# SEQ -> SEQ REP
# REP -> letter
# REP ->
# REP -> GRP *
def p_grp_alt(p):
'grp : alt'
p[0] = p[1]
def p_grp(p):
'grp : LPAREN grp RPAREN'
p[0] = p[2]
def p_alt_seq(p):
'alt : seq'
p[0] = p[1]
def p_alt(p):
'alt : alt BAR seq'
if p[1][0] == 'any-of':
p[0] = ('any-of', p[1][1] + [p[3]])
else:
p[0] = ('any-of', [p[1], p[3]])
def p_seq_rep(p):
'seq : rep'
p[0] = p[1]
def p_seq(p):
'seq : seq rep'
if p[1] is None:
p[0] = p[2]
elif p[1][0] == 'sequence':
p[0] = ('sequence', p[1][1] + [p[2]])
else:
p[0] = ('sequence', [p[1], p[2]])
def p_rep_letter(p):
'rep : LETTER'
p[0] = ('letter', p[1])
def p_rep_empty(p):
'rep : '
p[0] = None
def p_rep_star(p):
'rep : grp STAR'
p[0] = ('zero-or-more', p[1])
#def p_error(p):
## XXX Mute for now
#print 'ERROR: Parsing error'
#pass
lexer = lex.lex()
parser = yacc.yacc()
input4c = 'ab|c'
output4c = ('any-of', [('sequence', [('letter', 'a'), ('letter', 'b')]), ('letter', 'c')])
print parser.parse(input4c, lexer=lexer)
#print parser.parse(input4c, lexer=lexer) == output4c
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment