-
-
Save DrGo/b4f9480c78db071bea3edbf6aa2cb2d7 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Package lex implements the lexical scanner for Suneido | |
package lex | |
import ( | |
"bytes" | |
"strings" | |
"unicode" | |
"unicode/utf8" | |
) | |
// lexer holds the scanning state for a single source string.
type lexer struct {
	// src is the complete input being scanned.
	src string
	// si is the byte offset of the next unread rune in src, start is the
	// offset at which the current token began, and width is the byte size
	// of the rune most recently returned by read (0 at end of input).
	si, start, width int
	// value is the decoded content of the most recently scanned string
	// literal, as stored by rawString and quotedString.
	value string
}
type Item struct { | |
token Token | |
keyword Token | |
value string | |
} | |
func Lexer(src string) *lexer { | |
return &lexer{src: src} | |
} | |
func (lxr *lexer) Next() Item { | |
token := lxr.next() | |
var value string | |
if lxr.value != "" { | |
value = lxr.value | |
} else { | |
value = lxr.src[lxr.start:lxr.si] | |
} | |
keyword := NIL | |
if token == IDENTIFIER && lxr.peek() != ':' { | |
keyword = Keyword(value) | |
} | |
return Item{token, keyword, value} | |
} | |
func (lxr *lexer) next() Token { | |
lxr.start = lxr.si | |
c := lxr.read() | |
switch c { | |
case eof: | |
return EOF | |
case '#': | |
return HASH | |
case '(': | |
return L_PAREN | |
case ')': | |
return R_PAREN | |
case ',': | |
return COMMA | |
case ';': | |
return SEMICOLON | |
case '?': | |
return Q_MARK | |
case '@': | |
return AT | |
case '[': | |
return L_BRACKET | |
case ']': | |
return R_BRACKET | |
case '{': | |
return L_CURLY | |
case '}': | |
return R_CURLY | |
case '~': | |
return BITNOT | |
case ':': | |
if lxr.match(':') { | |
return RANGELEN | |
} else { | |
return COLON | |
} | |
case '=': | |
if lxr.match('=') { | |
return IS | |
} else { | |
if lxr.match('~') { | |
return MATCH | |
} else { | |
return EQ | |
} | |
} | |
case '!': | |
if lxr.match('=') { | |
return ISNT | |
} else { | |
if lxr.match('~') { | |
return MATCHNOT | |
} else { | |
return NOT | |
} | |
} | |
case '<': | |
if lxr.match('<') { | |
if lxr.match('=') { | |
return LSHIFTEQ | |
} else { | |
return LSHIFT | |
} | |
} else if lxr.match('>') { | |
return ISNT | |
} else if lxr.match('=') { | |
return LTE | |
} else { | |
return LT | |
} | |
case '>': | |
if lxr.match('>') { | |
if lxr.match('=') { | |
return RSHIFTEQ | |
} else { | |
return RSHIFT | |
} | |
} else if lxr.match('=') { | |
return GTE | |
} else { | |
return GT | |
} | |
case '|': | |
if lxr.match('|') { | |
return OR | |
} else if lxr.match('=') { | |
return BITOREQ | |
} else { | |
return BITOR | |
} | |
case '&': | |
if lxr.match('&') { | |
return AND | |
} else if lxr.match('=') { | |
return BITANDEQ | |
} else { | |
return BITAND | |
} | |
case '^': | |
if lxr.match('=') { | |
return BITXOREQ | |
} else { | |
return BITXOR | |
} | |
case '-': | |
if lxr.match('-') { | |
return DEC | |
} else if lxr.match('=') { | |
return SUBEQ | |
} else { | |
return SUB | |
} | |
case '+': | |
if lxr.match('+') { | |
return INC | |
} else if lxr.match('=') { | |
return ADDEQ | |
} else { | |
return ADD | |
} | |
case '/': | |
if lxr.match('/') { | |
return lxr.lineComment() | |
} else if lxr.match('*') { | |
return lxr.spanComment() | |
} else if lxr.match('=') { | |
return DIVEQ | |
} else { | |
return DIV | |
} | |
case '*': | |
if lxr.match('=') { | |
return MULEQ | |
} else { | |
return MUL | |
} | |
case '%': | |
if lxr.match('=') { | |
return MODEQ | |
} else { | |
return MOD | |
} | |
case '$': | |
if lxr.match('=') { | |
return CATEQ | |
} else { | |
return CAT | |
} | |
case '`': | |
return lxr.rawString() | |
case '"': | |
case '\'': | |
return lxr.quotedString(c) | |
case '.': | |
if lxr.match('.') { | |
return RANGETO | |
} else if unicode.IsDigit(lxr.peek()) { | |
return lxr.number() | |
} else { | |
return DOT | |
} | |
case '0', '1', '2', '3', '4', '5', '6', '7', '8', '9': | |
return lxr.number() | |
default: | |
if isSpace(c) { | |
return lxr.whitespace(c) | |
} else if unicode.IsLetter(c) || c == '_' { | |
return lxr.identifier() | |
} | |
} | |
return ERROR | |
} | |
func (lxr *lexer) whitespace(c rune) Token { | |
result := WHITESPACE | |
for ; isSpace(c); c = lxr.read() { | |
if c == '\n' || c == '\r' { | |
result = NEWLINE | |
} | |
} | |
lxr.backup() | |
return result | |
} | |
func (lxr *lexer) lineComment() Token { | |
for c := lxr.read(); c != eof && c != '\n'; c = lxr.read() { | |
} | |
return COMMENT | |
} | |
func (lxr *lexer) spanComment() Token { | |
lxr.matchUntil(func() bool { return strings.HasSuffix(lxr.src[:lxr.si], "*/") }) | |
return COMMENT | |
} | |
func (lxr *lexer) rawString() Token { | |
for c := lxr.read(); c != eof && c != '`'; c = lxr.read() { | |
} | |
lxr.value = lxr.src[lxr.start+1 : lxr.si-1] | |
return STRING | |
} | |
func (lxr *lexer) quotedString(quote rune) Token { | |
var buf bytes.Buffer | |
lxr.match(quote) | |
for c := lxr.read(); c != eof && c != quote; c = lxr.read() { | |
buf.WriteRune(lxr.doesc(c)) | |
} | |
lxr.value = buf.String() | |
return STRING | |
} | |
func (lxr *lexer) doesc(c rune) rune { | |
if c != '\\' { | |
return c | |
} | |
save := lxr.si | |
c = lxr.read() | |
switch c { | |
case 'n': | |
return '\n' | |
case 't': | |
return '\t' | |
case 'r': | |
return '\r' | |
case 'x': | |
dig1 := digit(lxr.read(), 16) | |
dig2 := digit(lxr.read(), 16) | |
if dig1 != -1 && dig2 != -1 { | |
return rune(16*dig1 + dig2) | |
} | |
case '\\': | |
case '"': | |
case '\'': | |
return c | |
default: | |
dig1 := digit(lxr.read(), 8) | |
dig2 := digit(lxr.read(), 8) | |
dig3 := digit(lxr.read(), 8) | |
if dig1 != -1 && dig2 != -1 && dig3 != -1 { | |
return rune(64*dig1 + 8*dig2 + dig3) | |
} | |
} | |
lxr.si = save | |
return '\\' | |
} | |
func digit(c rune, radix int) int { | |
n := 99 | |
if isDigit(c) { | |
n = int(c - '0') | |
} else if isHexDigit(c) { | |
n = int(10 + unicode.ToLower(c) - 'a') | |
} | |
if n < radix { | |
return n | |
} else { | |
return -1 | |
} | |
} | |
// isDigit reports whether r is an ASCII decimal digit.
func isDigit(r rune) bool {
	if r < '0' {
		return false
	}
	return r <= '9'
}
// isHexDigit reports whether r is an ASCII hexadecimal digit
// (0-9, a-f or A-F).
func isHexDigit(r rune) bool {
	switch {
	case '0' <= r && r <= '9':
		return true
	case 'a' <= r && r <= 'f':
		return true
	case 'A' <= r && r <= 'F':
		return true
	}
	return false
}
func (lxr *lexer) number() Token { | |
lxr.matchOneOf("+-") | |
// Is it hex? | |
digits := "0123456789" | |
if lxr.match('0') && lxr.matchOneOf("xX") { | |
digits = hexDigits | |
} | |
lxr.matchRunOf(digits) | |
if lxr.match('.') { | |
lxr.matchRunOf(digits) | |
} | |
if lxr.matchOneOf("eE") { | |
lxr.matchOneOf("+-") | |
lxr.matchRunOf("0123456789") | |
} | |
return NUMBER | |
} | |
func (lxr *lexer) identifier() Token { | |
lxr.matchWhile(isIdentChar) | |
if !lxr.match('?') { | |
lxr.match('!') | |
} | |
return IDENTIFIER | |
} | |
// eof is the value read returns once the input is exhausted. It is negative,
// so it cannot collide with a rune decoded from the source.
const eof = -1
func (lxr *lexer) read() rune { | |
if lxr.si >= len(lxr.src) { | |
lxr.width = 0 | |
return eof | |
} | |
c, w := utf8.DecodeRuneInString(lxr.src[lxr.si:]) | |
lxr.si += w | |
lxr.width = w | |
return c | |
} | |
func (lxr *lexer) backup() { | |
lxr.si -= lxr.width | |
} | |
func (lxr *lexer) peek() rune { | |
c := lxr.read() | |
lxr.backup() | |
return c | |
} | |
func (lxr *lexer) match(c rune) bool { | |
if c == lxr.read() { | |
return true | |
} | |
lxr.backup() | |
return false | |
} | |
func (lxr *lexer) matchOneOf(valid string) bool { | |
if strings.ContainsRune(valid, lxr.read()) { | |
return true | |
} | |
lxr.backup() | |
return false | |
} | |
func (lxr *lexer) matchRunOf(valid string) { | |
for strings.ContainsRune(valid, lxr.read()) { | |
} | |
lxr.backup() | |
} | |
func (lxr *lexer) matchWhile(f func(c rune) bool) { | |
for c := lxr.read(); f(c); c = lxr.read() { | |
} | |
lxr.backup() | |
} | |
func (lxr *lexer) matchUntil(f func() bool) { | |
for c := lxr.read(); c != eof && !f(); c = lxr.read() { | |
} | |
} | |
// isIdentChar reports whether r may appear inside an identifier: an
// underscore, a Unicode letter, or a Unicode digit.
func isIdentChar(r rune) bool {
	if r == '_' {
		return true
	}
	return unicode.IsLetter(r) || unicode.IsDigit(r)
}
// hexDigits lists every character accepted as a hexadecimal digit, in both
// letter cases.
const hexDigits = "0123456789" + "abcdef" + "ABCDEF"
// isSpace reports whether c is a space, tab, carriage return or newline.
func isSpace(c rune) bool {
	switch c {
	case ' ', '\t', '\r', '\n':
		return true
	}
	return false
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package lex | |
func Keyword(s string) Token { | |
return keywords[s] | |
} | |
func (t Token) IsInfix() bool { | |
return infix[t] | |
} | |
func (t Token) String() string { | |
return tostring[t] | |
} | |
var tostring = map[Token]string{ | |
NIL: "NIL", | |
EOF: "EOF", | |
WHITESPACE: "WHITESPACE", | |
COMMENT: "COMMENT", | |
NEWLINE: "NEWLINE", | |
} | |
// Token identifies the kind of a lexical item. The same type is also used for
// keyword ids and for the AST node kinds declared at the end of the list.
type Token int

// Token values are assigned consecutively by iota, so their numeric values
// depend on declaration order. NIL is the zero value; it is what Keyword
// yields for a non-keyword and what Next stores when there is no keyword.
const (
	NIL Token = iota
	EOF
	ERROR
	IDENTIFIER
	NUMBER
	STRING
	WHITESPACE
	COMMENT
	NEWLINE
	// operators and punctuation
	HASH
	COMMA
	COLON
	SEMICOLON
	Q_MARK
	AT
	DOT
	L_PAREN
	R_PAREN
	L_BRACKET
	R_BRACKET
	L_CURLY
	R_CURLY
	IS
	ISNT
	MATCH
	MATCHNOT
	LT
	LTE
	GT
	GTE
	NOT
	INC
	DEC
	BITNOT
	ADD
	SUB
	CAT
	MUL
	DIV
	MOD
	LSHIFT
	RSHIFT
	BITOR
	BITAND
	BITXOR
	EQ
	ADDEQ
	SUBEQ
	CATEQ
	MULEQ
	DIVEQ
	MODEQ
	LSHIFTEQ
	RSHIFTEQ
	BITOREQ
	BITANDEQ
	BITXOREQ
	RANGETO
	RANGELEN
	// language keywords
	// (AND and OR are also produced by the lexer for "&&" and "||")
	AND
	BOOL
	BREAK
	BUFFER
	CALLBACK
	CASE
	CATCH
	CHAR
	CLASS
	CONTINUE
	CREATE
	DEFAULT
	DLL
	DO
	DOUBLE
	ELSE
	FALSE
	FLOAT
	FOR
	FOREVER
	FUNCTION
	GDIOBJ
	HANDLE
	IF
	IN
	INT64
	LONG
	NEW
	OR
	RESOURCE
	RETURN
	SHORT
	STRUCT
	SWITCH
	SUPER
	THIS
	THROW
	TRUE
	TRY
	VOID
	WHILE
	// query keywords
	// (none of these has an entry in the keywords table in this file)
	ALTER
	AVERAGE
	CASCADE
	COUNT
	DELETE
	DROP
	ENSURE
	EXTEND
	HISTORY
	INDEX
	INSERT
	INTERSECT
	INTO
	JOIN
	KEY
	LEFTJOIN
	LIST
	MAX
	MIN
	MINUS
	PROJECT
	REMOVE
	RENAME
	REVERSE
	SET
	SORT
	SUMMARIZE
	SVIEW
	TIMES
	TO
	TOTAL
	UNION
	UNIQUE
	UPDATE
	UPDATES
	VIEW
	WHERE
	// for AST
	// (not returned by the lexer code in this file)
	ARG
	ASSIGNOP
	BINARYOP
	BLOCK
	CALL
	DATE
	FOR_IN
	MEMBER
	METHOD
	OBJECT
	POSTINCDEC
	PREINCDEC
	RECORD
	RVALUE
	SELFREF
	SUBSCRIPT
	SYMBOL
)
var keywords = map[string]Token{ | |
"and": AND, | |
"bool": BOOL, | |
"break": BREAK, | |
"buffer": BUFFER, | |
"callback": CALLBACK, | |
"case": CASE, | |
"catch": CATCH, | |
"char": CHAR, | |
"class": CLASS, | |
"continue": CONTINUE, | |
"default": DEFAULT, | |
"dll": DLL, | |
"do": DO, | |
"double": DOUBLE, | |
"else": ELSE, | |
"false": FALSE, | |
"float": FLOAT, | |
"for": FOR, | |
"forever": FOREVER, | |
"function": FUNCTION, | |
"gdiobj": GDIOBJ, | |
"handle": HANDLE, | |
"if": IF, | |
"in": IN, | |
"int64": INT64, | |
"is": IS, | |
"isnt": ISNT, | |
"long": LONG, | |
"new": NEW, | |
"not": NOT, | |
"or": OR, | |
"resource": RESOURCE, | |
"return": RETURN, | |
"short": SHORT, | |
"string": STRING, | |
"struct": STRUCT, | |
"super": SUPER, | |
"switch": SWITCH, | |
"this": THIS, | |
"throw": THROW, | |
"true": TRUE, | |
"try": TRY, | |
"void": VOID, | |
"while": WHILE, | |
"xor": ISNT, | |
} | |
var infix = map[Token]bool{ | |
AND: true, | |
OR: true, | |
Q_MARK: true, | |
MATCH: true, | |
MATCHNOT: true, | |
ADD: true, | |
SUB: true, | |
CAT: true, | |
MUL: true, | |
DIV: true, | |
MOD: true, | |
LSHIFT: true, | |
RSHIFT: true, | |
BITOR: true, | |
BITAND: true, | |
BITXOR: true, | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment