Created
August 31, 2011 09:36
-
-
Save jdahlin/1183175 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import re
import sys
# Token kind names. Each constant is spelled exactly like the corresponding
# named group in ``token_pattern`` below.
IDENTIFIER = "IDENTIFIER"
INTEGER = "INTEGER"
STRING = "STRING"
COMMENT = "COMMENT"
DOT = "DOT"
COLON = "COLON"
SEMICOLON = "SEMICOLON"
OPEN_BLOCK = "OPEN_BLOCK"
CLOSE_BLOCK = "CLOSE_BLOCK"
NEWLINE = "NEWLINE"
WHITESPACE = "WHITESPACE"

# One alternation with a named group per token kind; compiled in verbose mode,
# so the layout whitespace inside the pattern is not significant.
#
# WHITESPACE deliberately excludes "\n" (``[^\S\n]`` = any whitespace except a
# newline). With a plain ``\s+`` a run such as "  \n" or "\r\n" was consumed
# as a single WHITESPACE token and the line break never surfaced as NEWLINE.
#
# NOTE(review): IDENTIFIER is tried before STRING, so the ``[uU]?[rR]?``
# string prefixes look unreachable (``u"x"`` yields IDENTIFIER then STRING).
# Left unchanged here -- confirm the intended behavior before reordering.
token_pattern = r"""
    (?P<IDENTIFIER>[a-zA-Z_][a-zA-Z0-9_]*)
   |(?P<INTEGER>[0-9]+)
   |(?P<STRING>([uU]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*"|
                [uU]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*'))
   |(?P<COMMENT>//[^\r\n]*)
   |(?P<DOT>\.)
   |(?P<COLON>:)
   |(?P<SEMICOLON>;)
   |(?P<OPEN_BLOCK>[{])
   |(?P<CLOSE_BLOCK>[}])
   |(?P<NEWLINE>\n)
   |(?P<WHITESPACE>[^\S\n]+)
"""
token_re = re.compile(token_pattern, re.VERBOSE)
class Token(object):
    """A single lexical token.

    Stores the five constructor arguments unchanged as attributes:
    ``value``, ``kind``, ``line``, ``start`` and ``end``.
    """

    def __init__(self, value, kind, line, start, end):
        self.value, self.kind = value, kind
        self.line = line
        self.start, self.end = start, end

    def __repr__(self):
        # Show only the kind and the repr of the value; position is omitted.
        template = '<Token %s %r>'
        return template % (self.kind, self.value)
class TokenizerException(Exception):
    """Error raised when the input text cannot be fully tokenized."""
def tokenize(text):
    """Yield a ``Token`` for every lexeme in *text*, in order.

    Each token carries its 0-based line number and its ``start``/``end``
    offsets measured from the beginning of that line.

    Raises:
        TokenizerException: as soon as no token pattern matches at the
            current position. Tokens before that position have already been
            yielded, because this is a generator.
    """
    pos = 0
    line = 0
    line_start = 0
    length = len(text)
    while pos < length:
        # Anchor the match at ``pos``. ``finditer`` would silently skip over
        # characters that match no token, so bad input in the middle of the
        # text went unreported; only trailing garbage was detected.
        match = token_re.match(text, pos)
        if match is None:
            raise TokenizerException('tokenizer stopped at pos %r of %r' % (
                pos, length))
        tokkind = match.lastgroup
        tokvalue = match.group(tokkind)
        start = match.start() - line_start
        end = match.end() - line_start
        yield Token(tokvalue, tokkind, line, start, end)
        pos = match.end()
        # Advance line bookkeeping from the token text itself instead of
        # comparing the kind against lowercase names that never matched the
        # uppercase group names. Comments stop before the line break, so they
        # contribute nothing here and the following newline is counted once.
        newlines = tokvalue.count('\n')
        if newlines:
            line += newlines
            line_start = match.start() + tokvalue.rfind('\n') + 1
if __name__ == "__main__":
    # Script entry point: tokenize the file named by the first command-line
    # argument and print one token per line. Guarded so that importing the
    # module does not try to read sys.argv[1].
    with open(sys.argv[1]) as source_file:
        # Read inside ``with`` so the file handle is closed deterministically.
        source_text = source_file.read()
    for token in tokenize(source_text):
        # Parenthesized single-argument form prints the same on Python 2 and 3.
        print(token)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment