Skip to content

Instantly share code, notes, and snippets.

@braised-babbage
Created March 30, 2019 22:46
Show Gist options
  • Save braised-babbage/074e6472070cb0a3cf09e11a96867881 to your computer and use it in GitHub Desktop.
Save braised-babbage/074e6472070cb0a3cf09e11a96867881 to your computer and use it in GitHub Desktop.
Regular Expressions in Python
"""Regular expression parser and interpreter.
One may check whether a string s matches a regular expression r by using the
`matches` function. To construct a regular expression object from its compact
textual form, use the `parse` function.
As a combined example,
>>> r = parse("a|b*(c)")
>>> matches(r, "bbbc")
True
A full grammar for regular expressions is defined in the comments below.
"""
from typing import Tuple, Optional
from abc import ABCMeta, abstractmethod
# Internal Representation for Regular Expressions
class RegEx(metaclass=ABCMeta):
@abstractmethod
def match(self, string: str) -> Tuple[Optional[str], str]:
"""Match the regular expression against a string.
On success, this should return the pair (m, r) where
m is the matched substring and r is the remaining substring.
On failure, return None, string.
"""
pass
@dataclass
class Empty(RegEx):
def match(self, string):
return "", string
@dataclass
class Primitive(RegEx):
char: str
def match(self, string):
if string and string[0] == self.char:
return self.char, string[1:]
else:
return None, string
@dataclass
class Choice(RegEx):
first: RegEx
second: RegEx
def match(self, string):
m1, s1 = self.first.match(string)
if m1 is not None:
return m1, s1
else:
return self.second.match(string)
@dataclass
class Sequence(RegEx):
first: RegEx
second: RegEx
def match(self, string):
m1, s1 = self.first.match(string)
if m1 is not None:
m2, s2 = self.second.match(s1)
if m2 is not None:
return m1 + m2, s2
return None, string
@dataclass
class Repetition(RegEx):
internal: RegEx
def match(self, string):
matched = None
while True:
m, s = self.internal.match(string)
if m is not None:
matched = m if matched is None else matched + m
string = s
else:
return matched, string
def matches(regex: RegEx, string: str) -> bool:
m, _ = regex.match(string)
return m is not None
# EBNF grammar for regular expressions
#
# <regex> ::= <term> '|' <regex> ; Choice
# | <term>
# <term> ::= { <factor> } ; Sequence
# <factor> ::= <base> { '*' } ; Repetition
# <base> ::= <char> ; Primitive
# | '\' <char> ; Primitive
# | '(' <regex> ')'
input = "a|b|(c*)"
# Scanner primitives
# Here a `token` is represented as a string of length 1.
def peek() -> str:
"""Get the next token, leaving the input buffer unmodified."""
return input[0]
def next() -> str:
"""Get the next token, removing it from the input buffer."""
val = peek()
eat(val)
return val
def eat(tok: str):
"""Consume a specific token from the input buffer."""
global input
if peek() == tok:
input = input[1:]
else:
raise ValueError(f"expected {tok} but instead got {peek()}")
def more() -> bool:
"""Do we have any more items in the input?"""
return len(input) > 0
# Recursive Descent Parser
# Each production rule of the above grammar corresponds to a top
# level function.
def regex() -> RegEx:
t = term()
if more() and peek() == "|":
eat("|")
return Choice(t, regex())
else:
return t
def term() -> RegEx:
f = Empty()
while more() and peek() != "|" and peek() != ")":
if isinstance(f, Empty):
f = factor()
else:
f = Sequence(f, factor())
return f
def factor() -> RegEx:
b = base()
while more() and peek() == "*":
eat("*")
b = Repetition(b)
return b
def base() -> RegEx:
p = peek()
if p == "(":
eat("(")
r = regex()
eat(")")
return r
elif p == "\\":
eat("\\")
return Primitive(next())
else:
return Primitive(next())
def parse(string) -> RegEx:
global input
input = string
return regex()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment