Skip to content

Instantly share code, notes, and snippets.

@ShawSumma
Created April 10, 2021 04:30
Show Gist options
  • Save ShawSumma/e1fa4dcdf01b2916f58c54c90e677af0 to your computer and use it in GitHub Desktop.
Save ShawSumma/e1fa4dcdf01b2916f58c54c90e677af0 to your computer and use it in GitHub Desktop.
module source.app;
import std.algorithm;
import std.stdio;
import std.string;
import std.array;
import std.file;
import std.conv : to;
import core.stdc.ctype;
class Grammar
{
string start;
string[][][string] productions;
this(string start_ = "start:")
{
start = start_;
}
void add(string left, string[] right)
{
if (left !in productions)
{
productions[left] = [right];
}
else
{
productions[left] ~= right;
}
}
bool isTerm(string sym)
{
return !isNonTerm(sym);
}
bool isNonTerm(string sym)
{
return !(sym !in productions);
}
override string toString()
{
string ret;
foreach (key, rules; productions)
{
ret ~= key[1 .. $];
ret ~= ": ";
foreach (index1, rule; rules)
{
if (index1 != 0)
{
ret ~= " | ";
}
foreach (index2, value; rule)
{
if (index2 != 0)
{
ret ~= ", ";
}
if (isTerm(value))
{
if (value == "\n")
{
ret ~= "'\\n'";
}
else if (value == "\t")
{
ret ~= "'\\t'";
}
else if (value == "'")
{
ret ~= "'\\'";
}
else if (value == "\r")
{
ret ~= "'\r'";
}
else
{
ret ~= "'" ~ value ~ "'";
}
}
else
{
ret ~= value[1 .. $];
}
}
}
ret ~= '\n';
}
return ret;
}
}
final class Row
{
size_t dot;
string left;
string[] right;
size_t start;
size_t end;
Row[] completes;
this(size_t dot_, string left_, string[] right_, size_t start_, size_t end_,
Row[] completes_ = null)
{
dot = dot_;
left = left_;
right = right_;
completes = completes_;
start = start_;
end = end_;
}
string next()
{
if (dot < right.length)
{
return right[dot];
}
return "";
}
bool isComplete()
{
return dot == right.length;
}
override bool opEquals(Object obj)
{
Row row = cast(Row) obj;
if (row is null)
{
return false;
}
return left == row.left && right == row.right && dot == row.dot
&& start == row.start && end == row.end;
}
}
final class Table
{
size_t k;
Row[] rows;
this(size_t k_)
{
k = k_;
}
void addRow(Row row, Row complete = null)
{
if (!rows.canFind(row))
{
rows ~= row;
}
if (complete !is null && !row.completes.canFind(complete))
{
row.completes ~= complete;
}
}
size_t length()
{
return rows.length;
}
}
class Parser
{
Grammar grammar;
Table[] tables;
string[][] words;
this(Grammar grammar_)
{
grammar = grammar_;
}
Node parse(string str)
{
string[] nextWords;
foreach (chr; str)
{
nextWords ~= [chr];
}
run(nextWords);
Row[] comp = completes;
if (comp.length != 1)
{
throw new Exception("parse error: unknown");
}
Node ret = makeNode(comp[0]);
return ret.children[0];
}
void run(string[] words_)
{
grammar.productions["GAMMA"] = [[grammar.start]];
scope (exit)
{
grammar.productions.remove("GAMMA");
}
words = null;
foreach (word; words_)
{
words ~= [word];
}
tables = null;
foreach (i; 0 .. words.length + 1)
{
addTable(i);
}
tables[0].addRow(new Row(0, "", ["GAMMA"], 0, 0));
size_t[] last;
foreach (i; 0 .. words.length + 1)
{
size_t[][] cur = tables[i].rows.map!(x => [x.start, x.end]).array;
if (cur.length == 0)
{
throw new Exception("parse error: char #" ~ to!string(last[0] + 1));
}
last = cur[0];
size_t index = 0;
while (index < tables[i].rows.length)
{
Row row = tables[i].rows[index];
if (!row.isComplete)
{
if (grammar.isNonTerm(row.next))
{
predict(row);
}
else
{
scan(row);
}
}
else
{
complete(row);
}
index++;
}
}
}
void addTable(size_t k)
{
tables ~= new Table(k);
}
void predict(Row row)
{
string nextRow = row.next;
if (string[][]* rules = nextRow in grammar.productions)
{
foreach (rule; *rules)
{
Row newRow = new Row(0, nextRow, rule, row.end, row.end);
tables[row.end].addRow(newRow);
}
}
}
void scan(Row row)
{
string nextSymbol = row.next;
if (row.end < words.length)
{
string actual = words[row.end][0];
if (nextSymbol == actual)
{
Row newRow = new Row(1, nextSymbol, [actual], row.end, row.end + 1);
tables[row.end + 1].addRow(newRow);
}
}
}
void complete(Row row)
{
size_t index = 0;
while (index < tables[row.start].rows.length)
{
Row oldRow = tables[row.start].rows[index];
if (!oldRow.isComplete && oldRow.right[oldRow.dot] == row.left)
{
Row newRow = new Row(oldRow.dot + 1, oldRow.left, oldRow.right,
oldRow.start, row.end, oldRow.completes.dup);
tables[row.end].addRow(newRow, row);
}
index++;
}
}
Node makeNode(Row row, Row[] relatives = null)
{
return new Node(this, row, relatives);
}
Row[] completes()
{
Row[] completes = null;
foreach (row; tables[$ - 1].rows)
{
if (row.left == "GAMMA")
{
completes ~= row;
}
}
return completes;
}
}
class Node
{
bool term;
string type;
string data;
Node[] children;
this(string type_, string data_)
{
type = type_;
data = data_;
term = true;
}
this(string type_, Node[] children_)
{
type = type_;
children = children_;
term = false;
}
bool isTerm()
{
return term;
}
this(Parser parser, Row row, Row[] relatives)
{
if (row.left.length == 1)
{
type = row.left;
data = row.left;
term = true;
}
else
{
type = row.left;
term = false;
foreach (subRow; row.completes)
{
children ~= parser.makeNode(subRow);
}
if (row.completes.length != 0)
{
relatives ~= row;
}
if (row.left == "GAMMA")
{
foreach (relative; relatives)
{
if (relative.start < parser.words.length)
{
string word = parser.words[relative.start][0];
children ~= new Node(word, word);
}
}
}
}
}
}
void pprint(Node node, size_t level = 0)
{
foreach (i; 0 .. level)
{
write(" ");
}
if (node.isTerm)
{
writeln("term: ", node.data);
}
else
{
writeln("node: ", node.type[1 .. $]);
}
foreach (child; node.children)
{
pprint(child, level + 1);
}
}
string nameToString(Node name)
{
string ret;
foreach (child; name.children)
{
if (child.isTerm)
{
ret ~= child.type;
}
else
{
ret ~= child.nameToString;
}
}
return ret;
}
class Walker
{
size_t symno;
Grammar grammar;
string[] ignores;
string[string] names;
string[][string] nameFlags;
this()
{
names["start"] = ":start";
grammar = new Grammar(":start");
}
string genSym(Args...)(Args args)
{
symno++;
string parens;
string[] flags;
static foreach (arg; args)
{
flags ~= arg;
}
string name = ":tmp#" ~ symno.to!string;
nameFlags[name] = flags;
// writeln(name, ": ", flags);
return name;
}
string[] getFlags(Node node)
{
if (string[]* pflags = node.type in nameFlags)
{
return *pflags;
}
return null;
}
Node clean(Node node)
{
if (node.isTerm)
{
return node;
}
string[] baseFlags = getFlags(node);
Node[] children;
foreach (child; node.children)
{
Node cchild = clean(child);
if (cchild.isTerm)
{
children ~= cchild;
continue;
}
string[] flags = getFlags(cchild);
if (flags.canFind("discard"))
{
continue;
}
if (flags.canFind("inline"))
{
children ~= cchild.children;
continue;
}
children ~= cchild;
}
if (baseFlags.canFind("norec"))
{
Node[] nextChildren;
foreach (child; children)
{
if (!child.isTerm && child.type == node.type)
{
nextChildren ~= child.children;
}
else
{
nextChildren ~= child;
}
}
children = nextChildren;
}
if (baseFlags.canFind("term"))
{
return new Node(node.type, node.nameToString);
}
else
{
return new Node(node.type, children);
}
}
string[] walk(Node node)
{
if (ignores.length == 0)
{
return walk2(node);
}
string name = genSym("inline");
string[] after = walk2(node);
grammar.add(name, after);
foreach (ignore; ignores)
{
grammar.add(name, [ignore, name]);
}
return [name];
}
string[] walk2(Node node)
{
switch (node.type)
{
default:
throw new Exception("err: " ~ node.type);
case ":ignore":
string name = genSym("discard");
grammar.add(name, walk(node.children[$ - 3]));
ignores ~= name;
walk(node.children[$ - 1]);
ignores.length--;
return null;
case ":block":
walk(node.children[2]);
return null;
case ":rules":
foreach (child; node.children)
{
if (child.type != ":ws")
{
walk(child);
}
}
return null;
case ":rule":
string varName = node.children[0].nameToString;
string name1 = ':' ~ varName;
string[] flags;
if (node.children[2].children.length == 6)
{
Node cur = node.children[2].children[2];
while (cur.children.length == 5)
{
flags ~= cur.children[0].nameToString;
cur = cur.children[$ - 1];
}
flags ~= cur.children[0].nameToString;
}
nameFlags[name1] = flags;
// writeln(name1, ": ", flags);
if (flags.length != 0)
{
string name2 = genSym("inline");
grammar.add(name1, [name2]);
grammar.add(name2, walk(node.children[$ - 1]));
}
else
{
grammar.add(name1, walk(node.children[$ - 1]));
}
return null;
case ":xor":
if (node.children.length != 1)
{
string name = genSym("inline");
grammar.add(name, walk(node.children[0]));
grammar.add(name, walk(node.children[$ - 1]));
return [name];
}
else
{
return walk(node.children[0]);
}
case ":and":
string[] rets;
if (node.children.length != 1)
{
// string name = genSym("inline");
// grammar.add(name, walk(node.children[0]) ~ walk(node.children[$ - 1]));
// return [name];
return walk(node.children[0]) ~ walk(node.children[$ - 1]);
}
else
{
return walk(node.children[0]);
}
case ":postfix":
if (node.children.length != 1)
{
char op = node.children[$ - 1].data[0];
string name = genSym("inline");
final switch (op)
{
case '+':
string[] rules = walk(node.children[0]);
grammar.add(name, rules ~ name);
grammar.add(name, rules);
break;
case '*':
grammar.add(name, walk(node.children[0]) ~ name);
grammar.add(name, []);
break;
case '!':
nameFlags[name] = ["discard"];
grammar.add(name, walk(node.children[0]));
break;
case '?':
grammar.add(name, walk(node.children[0]));
grammar.add(name, []);
break;
}
return [name];
}
else
{
return walk(node.children[0]);
}
case ":single":
return walk(node.children[0]);
case ":wrap":
return walk(node.children[2]);
case ":name":
return [':' ~ node.nameToString];
case ":char_range":
char first = node.children[0].children[1].children[0].type[0];
char last = node.children[$ - 1].children[1].children[0].type[0];
string name = genSym("inline");
foreach (char c; first .. last + 1)
{
grammar.add(name, [[c]]);
}
return [name];
case ":char":
return [node.children[1].children[0].type];
// return walk2(node.children[1]);
case ":char_body":
if (node.children.length == 1)
{
return [node.children[0].children[0].type];
}
final switch (node.children[1].data[0])
{
case 'n':
return ["\n"];
case 'r':
return ["\n"];
case 't':
return ["\t"];
}
}
}
}
Walker grammar(string of)
{
Grammar grammar = new Grammar(":rules");
grammar.add(":rules", []);
grammar.add(":rules", [":rules", ":rules"]);
grammar.add(":rules", [":ws", ":rule"]);
grammar.add(":rules", [":ws", ":ignore"]);
grammar.add(":rule", [":name", ":ws", ":rule_args", ":ws", ":xor"]);
grammar.add(":ignore", [
"%", ":ws", "i", "g", "n", "o", "r", "e", ":ws", ":xor", ":ws",
":block"
]);
grammar.add(":rule_args", [":"]);
grammar.add(":rule_args", ["(", ":ws", ")", ":ws", ":"]);
grammar.add(":rule_args", ["(", ":ws", ":rule_args_body", ")", ":ws", ":"]);
grammar.add(":rule_args_body", [
":name", ":ws", ",", ":ws", ":rule_args_body"
]);
grammar.add(":rule_args_body", [":name"]);
grammar.add(":block", ["{", ":ws", ":rules", ":ws", "}"]);
grammar.add(":xor", [":xor", ":ws", "|", ":ws", ":and"]);
grammar.add(":xor", [":and"]);
grammar.add(":and", [":and", ":ws", ",", ":ws", ":postfix"]);
grammar.add(":and", [":postfix"]);
grammar.add(":postfix", [":postfix", "!"]);
grammar.add(":postfix", [":single", "*"]);
grammar.add(":postfix", [":single", "?"]);
grammar.add(":postfix", [":single", "+"]);
grammar.add(":postfix", [":single"]);
grammar.add(":single", [":name"]);
grammar.add(":single", [":char"]);
grammar.add(":single", [":char_range"]);
grammar.add(":single", [":wrap"]);
grammar.add(":wrap", ["(", ":ws", ":xor", ":ws", ")"]);
grammar.add(":char_range", [":char", ":ws", ".", ".", ":ws", ":char"]);
grammar.add(":raw_char", ["'", ":any", "'"]);
grammar.add(":char", ["'", ":any", "'"]);
grammar.add(":char_body", [":any"]);
grammar.add(":char_body", ["\\", "n"]);
grammar.add(":char_body", ["\\", "r"]);
grammar.add(":char_body", ["\\", "t"]);
grammar.add(":char_body", ["\\", "'"]);
grammar.add(":name", ["_", ":name"]);
grammar.add(":name", [":letter", ":name"]);
grammar.add(":name", [":letter"]);
grammar.add(":name", ["_"]);
foreach (char i; 'a' .. 'z' + 1)
{
grammar.add(":letter", [[i]]);
}
foreach (char i; 'A' .. 'Z' + 1)
{
grammar.add(":letter", [[i]]);
}
foreach (char i; 0 .. char.max)
{
if (isprint(i))
{
grammar.add(":any", [[i]]);
}
}
grammar.add(":ws", []);
grammar.add(":ws", [" ", ":ws"]);
grammar.add(":ws", ["\t", ":ws"]);
grammar.add(":ws", ["\r", ":ws"]);
grammar.add(":ws", ["\n", ":ws"]);
// grammar.add(":ws", [":ws", ":ws"]);
Parser parser = new Parser(grammar);
Node ast = parser.parse(of);
Walker walker = new Walker;
walker.walk(ast);
return walker;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment