Last active
April 10, 2018 07:02
-
-
Save drslump/55752165288b7639b06f60888e22c759 to your computer and use it in GitHub Desktop.
lexer
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import sys, types, dis, struct | |
# Factories for the symbolic instructions used by the node classes below.
# Every call builds a fresh Op whose ``opcode`` is the mnemonic string; the
# mnemonic is only turned into a numeric opcode when the program is encoded.
def BINARY_SUBSCR():
    return Op('BINARY_SUBSCR')


def BUILD_TUPLE(x):
    return Op('BUILD_TUPLE', x)


def COMPARE_OP(x):
    return Op('COMPARE_OP', x)


def INPLACE_ADD():
    return Op('INPLACE_ADD')


def INPLACE_SUBTRACT():
    return Op('INPLACE_SUBTRACT')


def JUMP_ABSOLUTE(x):
    return Op('JUMP_ABSOLUTE', x)


def LOAD_CONST(x):
    return Op('LOAD_CONST', x)


def LOAD_FAST(x):
    return Op('LOAD_FAST', x)


def POP_JUMP_IF_TRUE(x):
    return Op('POP_JUMP_IF_TRUE', x)


def STORE_FAST(x):
    return Op('STORE_FAST', x)


def RETURN_VALUE():
    return Op('RETURN_VALUE')


def STATE(x):
    # '$STATE' is a pseudo instruction: it carries a label name and is
    # not looked up in dis.opmap like the mnemonics above.
    return Op('$STATE', x)
# Two instruction layouts are supported, selected once at import time:
#   * before 3.6: one opcode byte, followed by a 16-bit little-endian
#     argument when the opcode takes one;
#   * 3.6 and later: two bytes per instruction (opcode, 8-bit argument),
#     with an EXTENDED_ARG prefix instruction carrying the high byte of
#     arguments above 0xFF.
if sys.version_info < (3,6):

    def pack_opcode(opcode, arg=None):
        """Return the encoded bytes for one instruction.

        ``opcode`` is the numeric opcode; ``arg`` is its integer argument
        and is only used when ``opcode >= dis.HAVE_ARGUMENT``.
        """
        if opcode >= dis.HAVE_ARGUMENT:
            assert arg <= 0xFFFF, 'unsupported opcode arg over 16bits'
            return struct.pack('<BH', opcode, arg)
        return struct.pack('B', opcode)

    def patch_arg(buffer, offset, arg):
        """Overwrite the 16-bit argument of the instruction at ``offset``.

        ``buffer`` is the mutable bytecode buffer and is modified in place.
        """
        assert arg <= 0xFFFF, 'unsupported opcode arg over 16bits'
        packed = struct.pack('<H', arg)
        buffer[offset + 1] = packed[0]
        buffer[offset + 2] = packed[1]

else:

    def pack_opcode(opcode, arg=None):
        """Return the encoded bytes for one instruction.

        Instructions without an argument are padded with a zero byte.
        Arguments above 0xFF are emitted as an EXTENDED_ARG instruction
        (high byte) followed by the real instruction (low byte), so the
        result is either 2 or 4 bytes long.
        """
        if opcode >= dis.HAVE_ARGUMENT:
            assert arg <= 0xFFFF, 'unsupported opcode arg over 16bits'
            if arg <= 0xFF:
                return struct.pack('BB', opcode, arg)
            return struct.pack(
                'BBBB', dis.EXTENDED_ARG, arg >> 8, opcode, arg & 0xFF)
        return struct.pack('BB', opcode, 0)

    def patch_arg(buffer, offset, arg):
        """Overwrite the argument of the instruction emitted at ``offset``.

        The instruction must have been emitted in the 4-byte form
        (EXTENDED_ARG prefix + instruction); the high byte goes into the
        prefix and the low byte into the instruction itself. ``buffer`` is
        modified in place.
        """
        assert arg <= 0xFFFF, 'unsupported opcode arg over 16bits'
        # TODO: Parse opcode and adapt extended arg if needed
        # (the short 2-byte form is never patched; the previous
        # ``if False and ...`` branch for it was unreachable and is gone).
        buffer[offset + 1] = arg >> 8
        buffer[offset + 3] = arg & 0xFF
class Op(object):
    """ One symbolic instruction: a mnemonic and an optional argument
    """

    __slots__ = ('opcode', 'arg')

    def __init__(self, opcode, arg=None):
        self.opcode, self.arg = opcode, arg

    def __iter__(self):
        # Allows ``opcode, arg = op`` style unpacking.
        yield self.opcode
        yield self.arg
class Ops(list):
    """ List of Op instances that can be filled with the ``<<`` operator
    """

    def __lshift__(self, other):
        """Append ``other`` to this list, in place.

        ``other`` may be a tuple/list of items (extended as is), a single
        Op (appended) or an Abstract node (its ``opcodes()`` are appended).
        Returns ``self`` so pushes can be chained.

        Raises TypeError for any other type.
        """
        if isinstance(other, (tuple, list)):
            self.extend(other)
        elif isinstance(other, Op):
            self.append(other)
        elif isinstance(other, Abstract):
            self.extend(other.opcodes())
        else:
            # Format the type: concatenating it to a str fails by itself
            # and would hide the intended message.
            raise TypeError('Unsupported type: %r' % (type(other),))
        return self
class Abstract(object):
    """ Base class of the nodes; turns their opcodes into a function
    """

    def opcodes(self):
        """ Returns the symbolic instructions of the node (must be overridden)
        """
        raise AssertionError('Not implemented')

    def optimize(self):
        """ Optimization hook, does nothing unless overridden
        """
        pass

    def _encode(self, constnames, varnames):
        """
        Assembles the instructions from opcodes() and returns them as bytes.

        Note that constnames and varnames will be mutated: constants and
        local variable names that are not listed yet get appended, and the
        instruction argument becomes their index in the list.
        """
        code = bytearray()
        states = {}  # label -> offset in `code` where the state starts
        jumps = []  # (label, offset of the jump instruction) still to patch
        # First encode while keeping a registry of jumps and labels
        for opcode, arg in self.opcodes():
            if opcode == '$STATE':
                # Pseudo instruction: only records the label position
                states[arg] = len(code)
                continue
            if opcode == 'LOAD_CONST':
                # list.index compares by equality, so equal constants
                # share a single slot
                try:
                    arg = constnames.index(arg)
                except ValueError:
                    constnames.append(arg)
                    arg = len(constnames) - 1
            if opcode in ('STORE_FAST', 'LOAD_FAST'):
                try:
                    arg = varnames.index(arg)
                except ValueError:
                    varnames.append(arg)
                    arg = len(varnames) - 1
            if opcode == 'COMPARE_OP':
                # The operator is given as text ('==', 'in', ...)
                arg = dis.cmp_op.index(arg)
            opcode = dis.opmap[opcode]
            if opcode in dis.hasjabs and not isinstance(arg, int):
                # Jump to a label: its offset may not be known yet, so emit
                # a placeholder and patch it once everything is encoded
                jumps.append((arg, len(code)))
                arg = 65535  # forces the EXTENDED_ARG form on 3.6+, which patch_arg expects
            code.extend(pack_opcode(opcode, arg))
        # Now process the jumps to set the correct offsets
        # (a label that was never declared raises KeyError here)
        for state, offset in jumps:
            patch_arg(code, offset, states[state])
        return bytes(code)

    def compile(self, name='fsmlex', docblock=None):
        """ Builds a function with the currently configured opcodes

        The function takes (stream, ofs), where ofs defaults to 0. The
        docblock is stored as the first constant of the code object.

        NOTE(review): the positional CodeType arguments below look like the
        Python 2 / early Python 3 layout; newer interpreters presumably need
        additional fields -- confirm before running on them.
        """
        argnames = ('stream', 'ofs')
        varnames = list(argnames)
        constnames = [docblock]
        code = self._encode(constnames, varnames)
        args = [
            len(argnames),  # co_argcount -> (stream, ofs)
            len(varnames),  # co_nlocals
            2,  # co_stacksize -> maximum number of values in the stack
            0,  # co_flags -> no flags set
            code,  # co_code -> compiled bytecode
            tuple(constnames),  # co_consts -> literals in the code (first is docblock)
            (),  # co_names -> left empty here
            tuple(varnames),  # co_varnames -> list of local variables (starting with args)
            name + '.py',  # co_filename,
            name,  # co_name,
            0,  # co_firstlineno,
            bytes()  # co_lnotab
        ]
        if sys.version_info >= (3,0,0):
            args.insert(1, 0)  # co_kwonlyargcount
        co = types.CodeType(*args)
        # Empty globals; (0,) supplies the default for the last argument (ofs)
        return types.FunctionType(co, {}, name, (0,))
class State(Abstract):
    """ A labelled group of actions, emitted one after another
    """

    __slots__ = ('label', 'actions')

    def __init__(self, label=None, actions=None):
        if label is None:
            self.label = None
        else:
            self.label = str(label)
        # A falsy value (None, empty list) always gets a fresh list
        self.actions = actions or []

    def add(self, *actions):
        self.actions.extend(actions)

    def optimize(self):
        # TODO: Collapse Matches with same target
        for child in self.actions:
            child.optimize()

    def opcodes(self):
        # The label marker, when there is one, comes before the actions
        ops = Ops([STATE(self.label)] if self.label else [])
        for child in self.actions:
            ops << child
        return ops
class Match(Abstract):
    """ Jumps if the character matches the set of values

    ``values`` holds the accepted characters (joined into one string
    constant); ``label`` is the state to jump to on a match.
    """

    # Was ('value', 'label'): the attribute assigned below is `values`
    __slots__ = ('values', 'label')

    def __init__(self, values, label):
        self.values = values
        self.label = str(label)

    def opcodes(self):
        ops = Ops()
        # > if ch in self.values
        ops << LOAD_FAST('ch')
        ops << LOAD_CONST(u''.join(self.values))
        if 1 == len(self.values):
            # A single value is compared for equality instead
            ops << COMPARE_OP('==')
        else:
            ops << COMPARE_OP('in')
        ops << POP_JUMP_IF_TRUE(self.label)
        return ops
class Jump(Abstract):
    """ Unconditional jump to the state with the given label
    """

    __slots__ = ('label',)

    def __init__(self, label):
        self.label = str(label)

    def opcodes(self):
        return Ops([JUMP_ABSOLUTE(self.label)])
class Consume(Abstract):
    """ Reads the character at the current offset into `ch`, moving the
    offset forward first unless advance is false
    """

    __slots__ = ('advance',)

    def __init__(self, advance=True):
        self.advance = advance

    def opcodes(self):
        ops = Ops()
        if self.advance:
            ops << Advance()
        # > ch = stream[ofs]
        ops << (
            LOAD_FAST('stream'),
            LOAD_FAST('ofs'),
            BINARY_SUBSCR(),
            STORE_FAST('ch'),
        )
        return ops
class Advance(Abstract):
    """ Moves the offset one position forward
    """

    __slots__ = ()

    def opcodes(self):
        # > ofs = ofs + 1
        return Ops([
            LOAD_FAST('ofs'),
            LOAD_CONST(1),
            INPLACE_ADD(),
            STORE_FAST('ofs'),
        ])
class Marker(Abstract):
    """ Remembers the current offset and which alternative it belongs to
    """

    __slots__ = ('mark',)

    def __init__(self, mark):
        self.mark = mark

    def opcodes(self):
        return Ops([
            # > accept = self.mark
            LOAD_CONST(self.mark),
            STORE_FAST('accept'),
            # > marker = ofs
            LOAD_FAST('ofs'),
            STORE_FAST('marker'),
        ])
class Backtrack(Abstract):
    """ Restores the offset saved by the last Marker
    """

    __slots__ = ()

    def opcodes(self):
        # > ofs = marker
        return Ops([LOAD_FAST('marker'), STORE_FAST('ofs')])
class Accept(Abstract):
    """ Jumps to a label when the recorded mark equals the given one
    """

    __slots__ = ('mark', 'label')

    def __init__(self, mark, label):
        self.mark, self.label = mark, str(label)

    def opcodes(self):
        # > if accept == self.mark
        return Ops([
            LOAD_FAST('accept'),
            LOAD_CONST(self.mark),
            COMPARE_OP('=='),
            POP_JUMP_IF_TRUE(self.label),
        ])
class Produce(Abstract):
    """ Ends the function returning the offset and a token (may be None)
    """

    __slots__ = ('token',)

    def __init__(self, token=None):
        self.token = token

    def opcodes(self):
        # > return (ofs, self.token)
        return Ops([
            LOAD_FAST('ofs'),
            LOAD_CONST(self.token),
            BUILD_TUPLE(2),
            RETURN_VALUE(),
        ])
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from fsm import Accept, Advance, Backtrack, Consume, Jump, Match, Marker, Produce, State | |
def factory_grammar():
    """Build and compile the grammar lexer state machine.

    The states are a transcription of a re2c-generated scanner and keep its
    ``yyN`` labels. A state without a final Jump/Produce falls through to
    the state declared right after it (e.g. yy2 -> yy3, yy47 -> yy48).

    Returns the function produced by ``State.compile()``; called as
    ``lex(stream, ofs)`` it returns ``(new_ofs, token)``.
    """
    # Character classes shared by several states
    blank = ' \t'
    lower = 'abcdefghijklmnopqrstuvwxyz'
    upper = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
    regex_flags = 'ilmsux'

    machine = State(None, [
        # Start state: dispatch on the first character without advancing
        Consume(False),
        Match(blank, 'yy4'),
        Match('\n', 'yy7'),
        Match('\r', 'yy10'),
        Match('!', 'yy11'),
        Match('"', 'yy12'),
        Match('%', 'yy13'),
        Match('(', 'yy14'),
        Match(')', 'yy16'),
        Match('*', 'yy18'),
        Match('+', 'yy18'),
        Match('-', 'yy20'),
        Match('.', 'yy21'),
        Match('/', 'yy23'),
        Match(':', 'yy24'),
        Match('?', 'yy26'),
        Match(upper, 'yy27'),
        Match('[', 'yy30'),
        Match(']', 'yy32'),
        Match('_', 'yy34'),
        # Lowercase starts a RULE (yy35). This used to jump to yy27, the
        # TOKEN state, which split lowercase words into 1-char TOKENs.
        Match(lower, 'yy35'),
        Match('|', 'yy38'),
        Match('~', 'yy40'),
        Jump('yy2'),

        State('yy2', [Advance()]),
        State('yy3', [Produce(None)]),
        State('yy4', [Consume(), Match(blank, 'yy4'), Jump('yy6')]),
        State('yy6', [Produce('WS')]),
        State('yy7', [
            Consume(),
            Marker(0),
            Match(blank, 'yy42'),
            Match('\n', 'yy7'),
            Match('\r', 'yy44'),
            Jump('yy9'),
        ]),
        State('yy9', [Produce('_NL')]),
        State('yy10', [Consume(), Match('\n', 'yy7'), Jump('yy3')]),
        State('yy11', [
            Consume(),
            Marker(1),
            Match('?_', 'yy46'),
            Match(lower, 'yy35'),
            Jump('yy3'),
        ]),
        State('yy12', [
            Consume(),
            Marker(1),
            Match('\n', 'yy3'),
            Jump('yy48'),
        ]),
        State('yy13', [
            Consume(),
            Marker(1),
            Match('i', 'yy53'),
            Jump('yy3'),
        ]),
        State('yy14', [Advance(), Produce('_LPAR')]),
        State('yy16', [Advance(), Produce('_RPAR')]),
        State('yy18', [
            Consume(),
            Match('?', 'yy54'),
            Match(lower, 'yy55'),
            Jump('yy19'),
        ]),
        State('yy19', [Produce('OP')]),
        State('yy20', [Consume(), Match('>', 'yy57'), Jump('yy3')]),
        State('yy21', [Advance(), Produce('_DOT')]),
        State('yy23', [
            Consume(),
            Marker(1),
            Match('/', 'yy61'),
            Jump('yy59'),
        ]),
        State('yy24', [Advance(), Produce('_COLON')]),
        State('yy26', [Consume(), Match(lower, 'yy35'), Jump('yy62')]),
        State('yy27', [
            Consume(),
            Match('0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ_', 'yy27'),
            Jump('yy29'),
        ]),
        State('yy29', [Produce('TOKEN')]),
        State('yy30', [Advance(), Produce('_LBRA')]),
        State('yy32', [Advance(), Produce('_RBRA')]),
        State('yy34', [
            Consume(),
            Match(upper, 'yy27'),
            Match(lower, 'yy35'),
            Jump('yy3'),
        ]),
        State('yy35', [
            Consume(),
            Match('0123456789_abcdefghijklmnopqrstuvwxyz', 'yy35'),
            Jump('yy37'),
        ]),
        State('yy37', [Produce('RULE')]),
        State('yy38', [Advance(), Produce('_OR')]),
        State('yy40', [Advance(), Produce('_TILDE')]),
        State('yy42', [Consume(), Match(blank, 'yy42'), Jump('yy9')]),
        State('yy44', [Consume(), Match('\n', 'yy7'), Jump('yy45')]),
        # Failed look ahead: rewind to the marker and resume at the
        # state that matches the recorded alternative
        State('yy45', [
            Backtrack(),
            Accept(0, 'yy9'),
            Accept(1, 'yy3'),
            Accept(2, 'yy50'),
            Jump('yy69'),
        ]),
        State('yy46', [Consume(), Match(lower, 'yy35'), Jump('yy45')]),
        State('yy47', [Consume()]),
        State('yy48', [
            Match('\n', 'yy45'),
            Match('"', 'yy49'),
            Match('\\', 'yy51'),
            Jump('yy47'),
        ]),
        State('yy49', [Consume(), Match('i', 'yy63'), Jump('yy50')]),
        State('yy50', [Produce('STRING')]),
        State('yy51', [
            Consume(),
            Match('\n', 'yy45'),
            Match('"', 'yy64'),
            Match('\\', 'yy51'),
            Jump('yy47'),
        ]),
        State('yy53', [
            Consume(),
            Match('g', 'yy65'),
            Match('m', 'yy66'),
            Jump('yy45'),
        ]),
        State('yy54', [Consume(), Match(lower, 'yy55'), Jump('yy19')]),
        # NOTE(review): the re2c listing un-reads one character in its
        # yy55 (`++s; s -= 1;`); confirm whether Advance() is intended here.
        State('yy55', [Advance(), Produce('OP')]),
        State('yy57', [Advance(), Produce('_TO')]),
        State('yy59', [
            Consume(),
            Match('\n', 'yy45'),
            Match('/', 'yy67'),
            Match('\\', 'yy70'),
            Jump('yy59'),
        ]),
        State('yy61', [Consume(), Match('\n', 'yy45'), Jump('yy72')]),
        State('yy62', [Advance(), Jump('yy19')]),
        State('yy63', [Advance(), Jump('yy50')]),
        State('yy64', [
            Consume(),
            Marker(2),
            Match('\n', 'yy50'),
            Match('"', 'yy49'),
            Match('\\', 'yy51'),
            Match('i', 'yy74'),
            Jump('yy47'),
        ]),
        State('yy65', [Consume(), Match('n', 'yy75'), Jump('yy45')]),
        State('yy66', [Consume(), Match('p', 'yy76'), Jump('yy45')]),
        State('yy67', [Consume(), Match(regex_flags, 'yy67'), Jump('yy69')]),
        State('yy69', [Produce('REGEXP')]),
        State('yy70', [
            Consume(),
            Match('\n', 'yy45'),
            Match('/', 'yy77'),
            Match('\\', 'yy70'),
            Jump('yy59'),
        ]),
        State('yy72', [Produce('COMMENT')]),
        State('yy74', [
            Consume(),
            Marker(2),
            Match('\n', 'yy50'),
            Match('"', 'yy49'),
            Match('\\', 'yy51'),
            Jump('yy47'),
        ]),
        State('yy75', [Consume(), Match('o', 'yy79'), Jump('yy45')]),
        State('yy76', [Consume(), Match('o', 'yy80'), Jump('yy45')]),
        State('yy77', [
            Consume(),
            Marker(3),
            Match('\n', 'yy69'),
            Match('/', 'yy67'),
            Match('\\', 'yy70'),
            Match(regex_flags, 'yy77'),
            Jump('yy59'),
        ]),
        State('yy79', [Consume(), Match('r', 'yy81'), Jump('yy45')]),
        State('yy80', [Consume(), Match('r', 'yy82'), Jump('yy45')]),
        State('yy81', [Consume(), Match('e', 'yy83'), Jump('yy45')]),
        State('yy82', [Consume(), Match('t', 'yy85'), Jump('yy45')]),
        State('yy83', [Advance(), Produce('_IGNORE')]),
        State('yy85', [Advance(), Produce('_IMPORT')]),
    ])
    return machine.compile()
# Compiled once at import time; called as lex(stream, ofs) -> (pos, token)
lex = factory_grammar()


def genlex(stream):
    """Yield ``(offset, token, text)`` for each token found in ``stream``.

    Stops at the end of the input, or as soon as the lexer raises
    IndexError; the token being scanned at that point is not yielded.
    """
    start, limit = 0, len(stream)
    while start < limit:
        try:
            end, kind = lex(stream, start)
            yield start, kind, stream[start:end]
        except IndexError:
            return
        start = end
# Benchmark input: the grammar file without the lines that start with a
# `//` comment, repeated 100 times.
with open('lark/grammars/common.g') as fd:
    lines = [line for line in fd if not line.startswith('//')]
stream = u''.join(lines) * 100


def lexit():
    """Run the lexer over the whole benchmark input, discarding the tokens."""
    for _start, _kind, _text in genlex(stream):
        pass


from timeit import timeit
print(timeit(lexit, number=100))
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/* Generated by re2c 1.0.3 on Tue Apr 10 09:00:46 2018 */
/*
 * Scanner emitted by re2c; per the #line directives the rules live in
 * test.re2c, so changes belong there rather than in this listing.
 *
 * Each yyN label is one state of the automaton. `s` is the cursor into the
 * input and `yych` the character under inspection. States that may need to
 * back out of a longer match store the cursor in YYMARKER and the rule to
 * fall back to in yyaccept; yy45 restores the cursor and dispatches on it.
 *
 * The `{ ... }` blocks that follow a #line directive pointing at test.re2c
 * are the rule actions.
 * NOTE(review): most actions are a bare string literal with no statement
 * terminator -- presumably placeholders; confirm against test.re2c.
 */
#line 1 "/Users/drslump/tmp/test.re2c"
static bool lex(const char *s, unsigned long &u)
{
const char *YYMARKER;
/* YYCTXMARKER and c are declared but not referenced again in this listing */
const char *YYCTXMARKER;
int c = yycinit;
u = 0;
#line 17 "grammar.cc"
{
char yych;
unsigned int yyaccept = 0;
/* start state: dispatch on the first character without advancing */
yych = *s;
switch (yych) {
case '\t':
case ' ': goto yy4;
case '\n': goto yy7;
case '\r': goto yy10;
case '!': goto yy11;
case '"': goto yy12;
case '%': goto yy13;
case '(': goto yy14;
case ')': goto yy16;
case '*':
case '+': goto yy18;
case '-': goto yy20;
case '.': goto yy21;
case '/': goto yy23;
case ':': goto yy24;
case '?': goto yy26;
case 'A':
case 'B':
case 'C':
case 'D':
case 'E':
case 'F':
case 'G':
case 'H':
case 'I':
case 'J':
case 'K':
case 'L':
case 'M':
case 'N':
case 'O':
case 'P':
case 'Q':
case 'R':
case 'S':
case 'T':
case 'U':
case 'V':
case 'W':
case 'X':
case 'Y':
case 'Z': goto yy27;
case '[': goto yy30;
case ']': goto yy32;
case '_': goto yy34;
case 'a':
case 'b':
case 'c':
case 'd':
case 'e':
case 'f':
case 'g':
case 'h':
case 'i':
case 'j':
case 'k':
case 'l':
case 'm':
case 'n':
case 'o':
case 'p':
case 'q':
case 'r':
case 's':
case 't':
case 'u':
case 'v':
case 'w':
case 'x':
case 'y':
case 'z': goto yy35;
case '|': goto yy38;
case '~': goto yy40;
default: goto yy2;
}
yy2:
++s;
yy3:
#line 40 "/Users/drslump/tmp/test.re2c"
{ return false; }
#line 103 "grammar.cc"
yy4:
yych = *++s;
switch (yych) {
case '\t':
case ' ': goto yy4;
default: goto yy6;
}
yy6:
#line 25 "/Users/drslump/tmp/test.re2c"
{ "WS" }
#line 114 "grammar.cc"
yy7:
yyaccept = 0;
yych = *(YYMARKER = ++s);
switch (yych) {
case '\t':
case ' ': goto yy42;
case '\n': goto yy7;
case '\r': goto yy44;
default: goto yy9;
}
yy9:
#line 19 "/Users/drslump/tmp/test.re2c"
{ "NL" }
#line 128 "grammar.cc"
yy10:
yych = *++s;
switch (yych) {
case '\n': goto yy7;
default: goto yy3;
}
yy11:
yyaccept = 1;
yych = *(YYMARKER = ++s);
switch (yych) {
case '?':
case '_': goto yy46;
case 'a':
case 'b':
case 'c':
case 'd':
case 'e':
case 'f':
case 'g':
case 'h':
case 'i':
case 'j':
case 'k':
case 'l':
case 'm':
case 'n':
case 'o':
case 'p':
case 'q':
case 'r':
case 's':
case 't':
case 'u':
case 'v':
case 'w':
case 'x':
case 'y':
case 'z': goto yy35;
default: goto yy3;
}
yy12:
yyaccept = 1;
yych = *(YYMARKER = ++s);
switch (yych) {
case '\n': goto yy3;
default: goto yy48;
}
yy13:
yyaccept = 1;
yych = *(YYMARKER = ++s);
switch (yych) {
case 'i': goto yy53;
default: goto yy3;
}
yy14:
++s;
#line 33 "/Users/drslump/tmp/test.re2c"
{ "LPAR" }
#line 187 "grammar.cc"
yy16:
++s;
#line 36 "/Users/drslump/tmp/test.re2c"
{ "RPAR" }
#line 192 "grammar.cc"
yy18:
yych = *++s;
switch (yych) {
case '?': goto yy54;
case 'a':
case 'b':
case 'c':
case 'd':
case 'e':
case 'f':
case 'g':
case 'h':
case 'i':
case 'j':
case 'k':
case 'l':
case 'm':
case 'n':
case 'o':
case 'p':
case 'q':
case 'r':
case 's':
case 't':
case 'u':
case 'v':
case 'w':
case 'x':
case 'y':
case 'z': goto yy55;
default: goto yy19;
}
yy19:
#line 29 "/Users/drslump/tmp/test.re2c"
{ "OP" }
#line 228 "grammar.cc"
yy20:
yych = *++s;
switch (yych) {
case '>': goto yy57;
default: goto yy3;
}
yy21:
++s;
#line 31 "/Users/drslump/tmp/test.re2c"
{ "DOT" }
#line 239 "grammar.cc"
yy23:
yyaccept = 1;
yych = *(YYMARKER = ++s);
switch (yych) {
case '/': goto yy61;
default: goto yy59;
}
yy24:
++s;
#line 38 "/Users/drslump/tmp/test.re2c"
{ return "COLON"; }
#line 251 "grammar.cc"
yy26:
yych = *++s;
switch (yych) {
case 'a':
case 'b':
case 'c':
case 'd':
case 'e':
case 'f':
case 'g':
case 'h':
case 'i':
case 'j':
case 'k':
case 'l':
case 'm':
case 'n':
case 'o':
case 'p':
case 'q':
case 'r':
case 's':
case 't':
case 'u':
case 'v':
case 'w':
case 'x':
case 'y':
case 'z': goto yy35;
default: goto yy62;
}
yy27:
yych = *++s;
switch (yych) {
case '0':
case '1':
case '2':
case '3':
case '4':
case '5':
case '6':
case '7':
case '8':
case '9':
case 'A':
case 'B':
case 'C':
case 'D':
case 'E':
case 'F':
case 'G':
case 'H':
case 'I':
case 'J':
case 'K':
case 'L':
case 'M':
case 'N':
case 'O':
case 'P':
case 'Q':
case 'R':
case 'S':
case 'T':
case 'U':
case 'V':
case 'W':
case 'X':
case 'Y':
case 'Z':
case '_': goto yy27;
default: goto yy29;
}
yy29:
#line 23 "/Users/drslump/tmp/test.re2c"
{ "TOKEN" }
#line 328 "grammar.cc"
yy30:
++s;
#line 32 "/Users/drslump/tmp/test.re2c"
{ "LBRA" }
#line 333 "grammar.cc"
yy32:
++s;
#line 35 "/Users/drslump/tmp/test.re2c"
{ "RBRA" }
#line 338 "grammar.cc"
yy34:
yych = *++s;
switch (yych) {
case 'A':
case 'B':
case 'C':
case 'D':
case 'E':
case 'F':
case 'G':
case 'H':
case 'I':
case 'J':
case 'K':
case 'L':
case 'M':
case 'N':
case 'O':
case 'P':
case 'Q':
case 'R':
case 'S':
case 'T':
case 'U':
case 'V':
case 'W':
case 'X':
case 'Y':
case 'Z': goto yy27;
case 'a':
case 'b':
case 'c':
case 'd':
case 'e':
case 'f':
case 'g':
case 'h':
case 'i':
case 'j':
case 'k':
case 'l':
case 'm':
case 'n':
case 'o':
case 'p':
case 'q':
case 'r':
case 's':
case 't':
case 'u':
case 'v':
case 'w':
case 'x':
case 'y':
case 'z': goto yy35;
default: goto yy3;
}
yy35:
yych = *++s;
switch (yych) {
case '0':
case '1':
case '2':
case '3':
case '4':
case '5':
case '6':
case '7':
case '8':
case '9':
case '_':
case 'a':
case 'b':
case 'c':
case 'd':
case 'e':
case 'f':
case 'g':
case 'h':
case 'i':
case 'j':
case 'k':
case 'l':
case 'm':
case 'n':
case 'o':
case 'p':
case 'q':
case 'r':
case 's':
case 't':
case 'u':
case 'v':
case 'w':
case 'x':
case 'y':
case 'z': goto yy35;
default: goto yy37;
}
yy37:
#line 22 "/Users/drslump/tmp/test.re2c"
{ "RULE" }
#line 441 "grammar.cc"
yy38:
++s;
#line 34 "/Users/drslump/tmp/test.re2c"
{ "OR" }
#line 446 "grammar.cc"
yy40:
++s;
#line 37 "/Users/drslump/tmp/test.re2c"
{ "TILDE" }
#line 451 "grammar.cc"
yy42:
yych = *++s;
switch (yych) {
case '\t':
case ' ': goto yy42;
default: goto yy9;
}
yy44:
yych = *++s;
switch (yych) {
case '\n': goto yy7;
default: goto yy45;
}
/* backtrack: restore the saved cursor and resume at the state selected by yyaccept */
yy45:
s = YYMARKER;
switch (yyaccept) {
case 0: goto yy9;
case 1: goto yy3;
case 2: goto yy50;
default: goto yy69;
}
yy46:
yych = *++s;
switch (yych) {
case 'a':
case 'b':
case 'c':
case 'd':
case 'e':
case 'f':
case 'g':
case 'h':
case 'i':
case 'j':
case 'k':
case 'l':
case 'm':
case 'n':
case 'o':
case 'p':
case 'q':
case 'r':
case 's':
case 't':
case 'u':
case 'v':
case 'w':
case 'x':
case 'y':
case 'z': goto yy35;
default: goto yy45;
}
/* yy47 reads the next character and falls through into the yy48 dispatch */
yy47:
yych = *++s;
yy48:
switch (yych) {
case '\n': goto yy45;
case '"': goto yy49;
case '\\': goto yy51;
default: goto yy47;
}
yy49:
yych = *++s;
switch (yych) {
case 'i': goto yy63;
default: goto yy50;
}
yy50:
#line 21 "/Users/drslump/tmp/test.re2c"
{ "STRING" }
#line 522 "grammar.cc"
yy51:
yych = *++s;
switch (yych) {
case '\n': goto yy45;
case '"': goto yy64;
case '\\': goto yy51;
default: goto yy47;
}
yy53:
yych = *++s;
switch (yych) {
case 'g': goto yy65;
case 'm': goto yy66;
default: goto yy45;
}
yy54:
yych = *++s;
switch (yych) {
case 'a':
case 'b':
case 'c':
case 'd':
case 'e':
case 'f':
case 'g':
case 'h':
case 'i':
case 'j':
case 'k':
case 'l':
case 'm':
case 'n':
case 'o':
case 'p':
case 'q':
case 'r':
case 's':
case 't':
case 'u':
case 'v':
case 'w':
case 'x':
case 'y':
case 'z': goto yy55;
default: goto yy19;
}
/* the increment is undone right away, so the cursor stays on the letter just inspected */
yy55:
++s;
s -= 1;
#line 39 "/Users/drslump/tmp/test.re2c"
{ return "OP"; }
#line 574 "grammar.cc"
yy57:
++s;
#line 30 "/Users/drslump/tmp/test.re2c"
{ "TO" }
#line 579 "grammar.cc"
yy59:
yych = *++s;
switch (yych) {
case '\n': goto yy45;
case '/': goto yy67;
case '\\': goto yy70;
default: goto yy59;
}
yy61:
yych = *++s;
switch (yych) {
case '\n': goto yy45;
default: goto yy72;
}
yy62:
++s;
goto yy19;
yy63:
++s;
goto yy50;
yy64:
yyaccept = 2;
yych = *(YYMARKER = ++s);
switch (yych) {
case '\n': goto yy50;
case '"': goto yy49;
case '\\': goto yy51;
case 'i': goto yy75;
default: goto yy47;
}
yy65:
yych = *++s;
switch (yych) {
case 'n': goto yy76;
default: goto yy45;
}
yy66:
yych = *++s;
switch (yych) {
case 'p': goto yy77;
default: goto yy45;
}
yy67:
yych = *++s;
switch (yych) {
case 'i':
case 'l':
case 'm':
case 's':
case 'u':
case 'x': goto yy67;
default: goto yy69;
}
yy69:
#line 20 "/Users/drslump/tmp/test.re2c"
{ "REGEXP" }
#line 636 "grammar.cc"
yy70:
yych = *++s;
switch (yych) {
case '\n': goto yy45;
case '/': goto yy78;
case '\\': goto yy70;
default: goto yy59;
}
yy72:
yych = *++s;
switch (yych) {
case '\n': goto yy74;
default: goto yy72;
}
yy74:
#line 24 "/Users/drslump/tmp/test.re2c"
{ "COMMENT" }
#line 654 "grammar.cc"
yy75:
yyaccept = 2;
yych = *(YYMARKER = ++s);
switch (yych) {
case '\n': goto yy50;
case '"': goto yy49;
case '\\': goto yy51;
default: goto yy47;
}
yy76:
yych = *++s;
switch (yych) {
case 'o': goto yy80;
default: goto yy45;
}
yy77:
yych = *++s;
switch (yych) {
case 'o': goto yy81;
default: goto yy45;
}
yy78:
yyaccept = 3;
yych = *(YYMARKER = ++s);
switch (yych) {
case '\n': goto yy69;
case '/': goto yy67;
case '\\': goto yy70;
case 'i':
case 'l':
case 'm':
case 's':
case 'u':
case 'x': goto yy78;
default: goto yy59;
}
yy80:
yych = *++s;
switch (yych) {
case 'r': goto yy82;
default: goto yy45;
}
yy81:
yych = *++s;
switch (yych) {
case 'r': goto yy83;
default: goto yy45;
}
yy82:
yych = *++s;
switch (yych) {
case 'e': goto yy84;
default: goto yy45;
}
yy83:
yych = *++s;
switch (yych) {
case 't': goto yy86;
default: goto yy45;
}
yy84:
++s;
#line 27 "/Users/drslump/tmp/test.re2c"
{ "IGNORE" }
#line 719 "grammar.cc"
yy86:
++s;
#line 28 "/Users/drslump/tmp/test.re2c"
{ "IMPORT" }
#line 724 "grammar.cc"
}
#line 42 "/Users/drslump/tmp/test.re2c"
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import re | |
# One alternative per token type; the alternatives are tried in order at
# the given position, so the order of the lines matters.
rex = re.compile(r'''
    (?P<_NL>(\r?\n)+\s*)
    |(?P<STRING>"(\\"|\\\\|[^"\n])*?"i?)
    |(?P<RULE>!?[_?]?[a-z][_a-z0-9]*)
    |(?P<TOKEN>_?[A-Z][_A-Z0-9]*)
    |(?P<COMMENT>//[^\n]*)
    |(?P<WS>[ \t]+)
    |(?P<NUMBER>\d+)
    |(?P<_IGNORE>%ignore)
    |(?P<_IMPORT>%import)
    |(?P<OP>[+*][?]?|[?](?![a-z]))
    |(?P<_TO>->)
    |(?P<_DOT>\.)
    |(?P<_LBRA>\[)
    |(?P<_LPAR>\()
    |(?P<_OR>\|)
    |(?P<_RBRA>\])
    |(?P<_RPAR>\))
    |(?P<TILDE>~)
    |(?P<_COLON>:)
    ''', re.X | re.U)

# Group number -> token name, for the named groups only. The unnamed
# helper groups (2 and 4) are nested inside a named one and never end up
# being the last group to close.
types = dict((number, name) for name, number in rex.groupindex.items())


def relex(stream):
    """Yield ``(offset, token, text)`` for each token found in ``stream``.

    A character where no alternative matches is skipped silently.
    """
    pos = 0
    while pos < len(stream):
        found = rex.match(stream, pos)
        if found is None:
            pos += 1
            continue
        yield pos, types[found.lastindex], found.group(0)
        pos = found.end()
# Benchmark input: the grammar file without the lines that start with a
# `//` comment, repeated 100 times.
with open('lark/grammars/common.g') as fd:
    lines = [line for line in fd if not line.startswith('//')]
data = u''.join(lines) * 100


def lexit():
    """Run the regex lexer over the whole benchmark input, discarding the tokens."""
    for _pos, _kind, _text in relex(data):
        pass


from timeit import timeit
print(timeit(lexit, number=100))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment