Skip to content

Instantly share code, notes, and snippets.

@peter-leonov
Last active April 29, 2020 21:54
Show Gist options
  • Save peter-leonov/6ff4ae3ca9b86af2f4bb0d63ee7fbb41 to your computer and use it in GitHub Desktop.
Save peter-leonov/6ff4ae3ca9b86af2f4bb0d63ee7fbb41 to your computer and use it in GitHub Desktop.
Naive micro JS parser (with parser <-> lexer connection)
foo / 1 + 2 * 3 + 4 ; bar = /rex1/ /* bla-bla */ / /rex2/
// gives this naive AST
({
type: 'program',
body: [
{
type: 'statement',
body: {
type: 'expression',
body: {
type: 'operator',
operator: '+',
left: {
type: 'operator',
operator: '/',
left: { type: 'id', begin: 0, end: 3, name: 'foo' },
right: { type: 'number', begin: 6, end: 7, value: 1 },
begin: 0,
end: 7
},
right: {
type: 'operator',
operator: '+',
left: {
type: 'operator',
operator: '*',
left: { type: 'number', begin: 10, end: 11, value: 2 },
right: { type: 'number', begin: 14, end: 15, value: 3 },
begin: 10,
end: 15
},
right: { type: 'number', begin: 18, end: 19, value: 4 },
begin: 10,
end: 19
},
begin: 0,
end: 19
},
begin: 0,
end: 19
},
begin: 0,
end: 21
},
{
type: 'statement',
body: {
type: 'assignment',
left: { type: 'id', begin: 22, end: 25, name: 'bar' },
right: {
type: 'expression',
body: {
type: 'operator',
operator: '/',
left: { type: 'regexp', value: 'rex1', begin: 29, end: 34 },
right: { type: 'regexp', value: 'rex2', begin: 52, end: 57 },
begin: 29,
end: 57
},
begin: 29,
end: 57
},
begin: 22,
end: 57
},
begin: 22,
end: 57
}
]
})
const program = parse(lex(`foo / *`));

ParseError: unexpected token "*" at 6, expected "number" or "id" or "regexp"

parse program
char "f"
char "o"
char "o"
char " "
token id
parse maybeRead [ 'id' ]
char "/"
char " "
token /
parse tryMul
char "1"
char " "
token number
parse read [ 'number', 'id', 'regexp' ]
char "+"
char " "
token +
parse tryMulOrSum
char "2"
char " "
token number
parse read [ 'number', 'id', 'regexp' ]
char "*"
char " "
token *
parse tryMul
char "3"
char " "
token number
parse read [ 'number', 'id', 'regexp' ]
char "+"
char " "
token +
parse tryMulOrSum
char "4"
char " "
token number
parse read [ 'number', 'id', 'regexp' ]
char ";"
char " "
token ;
parse read [ ';', 'eof' ]
char "b"
char "a"
char "r"
char " "
token id
parse maybeRead [ 'id' ]
char "="
char " "
token =
parse maybeAssignment
char "/"
char "r"
char "e"
char "x"
char "1"
char "/"
char " "
token regexp
parse read [ 'number', 'id', 'regexp' ]
char "/"
char "*"
char " "
char "b"
char "l"
char "a"
char "-"
char "b"
char "l"
char "a"
char " "
char "*"
char "/"
char " "
char "/"
char " "
token /
parse tryMul
char "/"
char "r"
char "e"
char "x"
char "2"
char "/"
char undefined
token regexp
parse read [ 'number', 'id', 'regexp' ]
token eof
parse read [ ';', 'eof' ]
token undefined
done
/*
* TODO next time I'm bored: write AST -> source serializer
* and random tree generator to run a fuzzer.
*/
const CHAR_a = "a".charCodeAt(0);
const CHAR_z = "z".charCodeAt(0);
const CHAR_A = "A".charCodeAt(0);
const CHAR_Z = "Z".charCodeAt(0);
const CHAR_0 = "0".charCodeAt(0);
const CHAR_9 = "9".charCodeAt(0);
class SyntaxError extends Error {}
class ParseError extends Error {}
function* lex(src) {
let p = -1;
let c = "";
let comment;
let mode = "value"; // how to interpret symbols
function next() {
c = src[++p];
console.log("char", JSON.stringify(c));
}
function isAlpha() {
if (!c) return false;
const code = c.charCodeAt(0);
if (CHAR_a <= code && code <= CHAR_z) return true;
if (CHAR_A <= code && code <= CHAR_Z) return true;
if (c === "_") return true;
return false;
}
function isNumber() {
if (!c) return false;
const code = c.charCodeAt(0);
if (CHAR_0 <= code && code <= CHAR_9) return true;
return false;
}
function isWhitespace() {
return c === " " || c === "\t";
}
function eof() {
if (c !== undefined) {
return null;
}
const begin = p;
const end = p;
return {
type: "eof",
begin,
end,
};
}
function whitespace() {
while (isWhitespace()) {
next();
}
}
function semicolon() {
const begin = p;
if (c == ";") {
do {
next();
} while (c == ";");
return {
type: ";",
begin,
end: p,
};
}
}
function id() {
let name = "";
const begin = p;
while (isAlpha()) {
name += c;
next();
}
const end = p;
if (!name) {
return null;
}
return {
type: "id",
begin,
end,
name,
};
}
function number() {
let valueStr = "";
const begin = p;
while (isNumber()) {
valueStr += c;
next();
}
const end = p;
if (!valueStr) {
return null;
}
const value = Number(valueStr);
return {
type: "number",
begin,
end,
value,
};
}
function readComment() {
let body = "";
for (;;) {
while (c != "*") {
body += c;
next();
}
next();
if (c == "/") {
next();
return body;
}
body += "*";
body += c;
}
}
function readRegexp() {
const begin = p;
let value = "";
while (c != "/") {
value += c;
next();
}
next();
return {
type: "regexp",
value,
begin,
end: p,
};
}
function operator() {
const begin = p;
if (c == "=") {
next();
if (c == "=") {
next();
return {
type: "==",
begin,
end: p,
};
}
return {
type: "=",
begin,
end: p,
};
}
if (c == "+") {
next();
return {
type: "+",
begin,
end: p,
};
}
if (c == "*") {
next();
return {
type: "*",
begin,
end: p,
};
}
if (c == "/") {
next();
if (c == "*") {
next();
comment = readComment();
return "recur";
}
if (mode == "value") {
return readRegexp();
}
return {
type: "/",
begin,
end: p,
};
}
return null;
}
next();
for (;;) {
whitespace();
const maybeEof = eof();
if (maybeEof) {
yield maybeEof;
return;
}
const token = id() || operator() || number() || semicolon();
if (token == "recur") {
continue;
}
if (token) {
if (comment) {
token.comment = comment;
comment = undefined;
}
mode = yield token;
continue;
}
if (c == null) {
throw new SyntaxError(`unexpected EOF at ${p}`);
} else {
throw new SyntaxError(`unexpected symbol "${c}" at ${p}`);
}
}
}
function parse(tokens) {
let t = null;
function unexpected() {
return new ParseError(`unexpected token "${t.type}" at ${t.begin}`);
}
function expected(types) {
return new ParseError(
`unexpected token "${t.type}" at ${t.begin}, expected ${types
.map((t) => `"${t}"`)
.join(" or ")}`
);
}
// no pushing tokens back, going only forward
function next(mode) {
t = tokens.next(mode).value;
console.log("token", t && t.type);
}
const read = (types) => () => {
const _t = t;
if (types.includes(_t.type)) {
console.log("parse", "read", types);
next();
return _t;
}
throw expected(types);
};
const maybeRead = (types) => () => {
const _t = t;
if (types.includes(_t.type)) {
console.log("parse", "maybeRead", types);
next();
return _t;
}
return null;
};
const eof = read(["eof"]);
const maybeId = maybeRead(["id"]);
const subExpressionTokens = ["number", "id", "regexp"];
const maybeSubExpression = maybeRead(subExpressionTokens);
const subExpression = read(subExpressionTokens);
function tryMul(left) {
const type = t.type;
if (type != "*" && type != "/") return left;
console.log("parse", "tryMul");
next("value");
const right = tryMul(subExpression());
return {
type: "operator",
operator: type,
left,
right,
begin: left.begin,
end: right.end,
};
}
function tryMulOrSum(left) {
left = tryMul(left);
if (t.type != "+") return left;
console.log("parse", "tryMulOrSum");
next("value");
const right = tryMulOrSum(subExpression());
return {
type: "operator",
operator: "+",
left,
right,
begin: left.begin,
end: right.end,
};
}
function expression(left) {
const body = tryMulOrSum(left);
return {
type: "expression",
body,
begin: body.begin,
end: body.end,
};
}
function maybeAssignment(left) {
if (t.type != "=") return null;
console.log("parse", "maybeAssignment");
next("value");
const right = expression(subExpression());
return {
type: "assignment",
left,
right,
begin: left.begin,
end: right.end,
};
}
function maybeAssignmentOrExpression() {
const id = maybeId();
if (id) {
const assignment = maybeAssignment(id);
if (assignment) return assignment;
}
const left = id || maybeSubExpression();
if (!left) return null;
return expression(left);
}
// or closing curly brace wihout next
const endStatement = read([";", "eof"]);
function maybeStatement() {
if (t === undefined) return null;
const body = maybeAssignmentOrExpression();
const end = endStatement();
return {
type: "statement",
body,
begin: body ? body.begin : end.begin,
end: end.end,
};
}
function program() {
const body = [];
let node;
console.log("parse", "program");
next("value");
while ((node = maybeStatement())) {
body.push(node);
}
return {
type: "program",
body,
};
}
const node = program();
if (t !== undefined) {
throw new ParseError(`tokens left, next: "${t.type}"`);
}
return node;
}
// - - - - - - - - - - - - - - - - - - - - - - - - - - - -
console.log("\n\n");
const program = parse(
lex(`foo / 1 + 2 * 3 + 4 ; bar = /rex1/ /* bla-bla */ / /rex2/`)
);
console.dir(program, { depth: null });
console.log("done");
@peter-leonov
Copy link
Author

Like parsers? See also the Ruby one: https://github.com/peter-leonov/ruby-parser.js

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment