Created
March 30, 2019 22:46
-
-
Save braised-babbage/074e6472070cb0a3cf09e11a96867881 to your computer and use it in GitHub Desktop.
Regular Expressions in Python
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""Regular expression parser and interpreter. | |
One may check whether a string s matches a regular expression r by using the | |
`matches` function. To construct a regular expression object from its compact | |
textual form, use the `parse` function. | |
As a combined example, | |
>>> r = parse("a|b*(c)") | |
>>> matches(r, "bbbc") | |
True | |
A full grammar for regular expressions is defined in the comments below. | |
""" | |
from typing import Tuple, Optional | |
from abc import ABCMeta, abstractmethod | |
# Internal Representation for Regular Expressions | |
class RegEx(metaclass=ABCMeta): | |
@abstractmethod | |
def match(self, string: str) -> Tuple[Optional[str], str]: | |
"""Match the regular expression against a string. | |
On success, this should return the pair (m, r) where | |
m is the matched substring and r is the remaining substring. | |
On failure, return None, string. | |
""" | |
pass | |
@dataclass | |
class Empty(RegEx): | |
def match(self, string): | |
return "", string | |
@dataclass | |
class Primitive(RegEx): | |
char: str | |
def match(self, string): | |
if string and string[0] == self.char: | |
return self.char, string[1:] | |
else: | |
return None, string | |
@dataclass | |
class Choice(RegEx): | |
first: RegEx | |
second: RegEx | |
def match(self, string): | |
m1, s1 = self.first.match(string) | |
if m1 is not None: | |
return m1, s1 | |
else: | |
return self.second.match(string) | |
@dataclass | |
class Sequence(RegEx): | |
first: RegEx | |
second: RegEx | |
def match(self, string): | |
m1, s1 = self.first.match(string) | |
if m1 is not None: | |
m2, s2 = self.second.match(s1) | |
if m2 is not None: | |
return m1 + m2, s2 | |
return None, string | |
@dataclass | |
class Repetition(RegEx): | |
internal: RegEx | |
def match(self, string): | |
matched = None | |
while True: | |
m, s = self.internal.match(string) | |
if m is not None: | |
matched = m if matched is None else matched + m | |
string = s | |
else: | |
return matched, string | |
def matches(regex: RegEx, string: str) -> bool: | |
m, _ = regex.match(string) | |
return m is not None | |
# EBNF grammar for regular expressions | |
# | |
# <regex> ::= <term> '|' <regex> ; Choice | |
# | <term> | |
# <term> ::= { <factor> } ; Sequence | |
# <factor> ::= <base> { '*' } ; Repetition | |
# <base> ::= <char> ; Primitive | |
# | '\' <char> ; Primitive | |
# | '(' <regex> ')' | |
input = "a|b|(c*)" | |
# Scanner primitives | |
# Here a `token` is represented as a string of length 1. | |
def peek() -> str: | |
"""Get the next token, leaving the input buffer unmodified.""" | |
return input[0] | |
def next() -> str: | |
"""Get the next token, removing it from the input buffer.""" | |
val = peek() | |
eat(val) | |
return val | |
def eat(tok: str): | |
"""Consume a specific token from the input buffer.""" | |
global input | |
if peek() == tok: | |
input = input[1:] | |
else: | |
raise ValueError(f"expected {tok} but instead got {peek()}") | |
def more() -> bool: | |
"""Do we have any more items in the input?""" | |
return len(input) > 0 | |
# Recursive Descent Parser | |
# Each production rule of the above grammar corresponds to a top | |
# level function. | |
def regex() -> RegEx: | |
t = term() | |
if more() and peek() == "|": | |
eat("|") | |
return Choice(t, regex()) | |
else: | |
return t | |
def term() -> RegEx: | |
f = Empty() | |
while more() and peek() != "|" and peek() != ")": | |
if isinstance(f, Empty): | |
f = factor() | |
else: | |
f = Sequence(f, factor()) | |
return f | |
def factor() -> RegEx: | |
b = base() | |
while more() and peek() == "*": | |
eat("*") | |
b = Repetition(b) | |
return b | |
def base() -> RegEx: | |
p = peek() | |
if p == "(": | |
eat("(") | |
r = regex() | |
eat(")") | |
return r | |
elif p == "\\": | |
eat("\\") | |
return Primitive(next()) | |
else: | |
return Primitive(next()) | |
def parse(string) -> RegEx: | |
global input | |
input = string | |
return regex() |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment