Created
April 3, 2018 06:32
-
-
Save abenkovskii/c8a0e5084e399198c3bc50cef2602ac7 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <string> | |
#include <iostream> | |
#include <algorithm> | |
#include <iterator> | |
#include <cctype> | |
using std::string; | |
using std::istream; | |
using std::find; | |
using std::begin; | |
using std::end; | |
// A single lexical token: its kind plus, for some kinds, a payload.
// The payload slot that is filled depends on which constructor was used;
// the slots that were not supplied hold 0 / 0.0 / the empty string.
class Token {
public:
    enum class Type
    {
        // keywords
        PROGRAM,
        INT,
        STRING,
        BOOL,
        REAL,
        IF,
        ELSE,
        CASE,
        OF,
        END,
        DO,
        WHILE,
        READ,
        WRITE,
        BREAK,
        NOT,
        AND,
        OR,
        TRUE,
        FALSE,
        // delimiters (single character)
        OPEN_CURLY,
        CLOSE_CURLY,
        SEMICOLON,
        COMMA,
        OPEN_ROUND,
        CLOSE_ROUND,
        ASSIGN,
        COLON,
        MUL,
        DIV,
        MOD,
        PLUS,
        MINUS,
        LT,
        GT,
        // delimiters (multiple characters)
        LE,
        GE,
        EQ,
        NEQ,
        // other
        ID,
        INT_CONST,
        STRING_CONST,
        REAL_CONST,
        SPACE,
        END_OF_FILE
    };

    // Deliberately not `explicit`: code in this file returns a bare
    // Token::Type where a Token is expected and relies on this conversion.
    Token(Type t) : type_(t) {}
    // Token carrying text.
    Token(Type t, std::string s) : type_(t), str_(s) {}
    // Token carrying an integer value.
    Token(Type t, int i) : type_(t), int_(i) {}
    // Token carrying a floating-point value.
    Token(Type t, double d) : type_(t), double_(d) {}

    // Kind of this token.
    Type type() const { return type_; }
    // Integer payload; 0 unless constructed with an int.
    int int_value() const { return int_; }
    // Floating-point payload; 0.0 unless constructed with a double.
    double real_value() const { return double_; }
    // Text payload; empty unless constructed with a string.
    const std::string &str() const { return str_; }

private:
    Type type_;
    // Default initializers keep the unused payload slots from holding
    // indeterminate values when a Token is copied or inspected.
    int int_ = 0;
    double double_ = 0.0;
    std::string str_;
};
namespace { | |
Token::Type kw_tok[] = { | |
Token::Type::PROGRAM, | |
Token::Type::INT, | |
Token::Type::STRING, | |
Token::Type::BOOL, | |
Token::Type::REAL, | |
Token::Type::IF, | |
Token::Type::ELSE, | |
Token::Type::CASE, | |
Token::Type::OF, | |
Token::Type::END, | |
Token::Type::DO, | |
Token::Type::WHILE, | |
Token::Type::READ, | |
Token::Type::WRITE, | |
Token::Type::BREAK, | |
Token::Type::NOT, | |
Token::Type::AND, | |
Token::Type::OR, | |
Token::Type::TRUE, | |
Token::Type::FALSE | |
}; | |
string kw[] = { | |
"program", | |
"int", | |
"string", | |
"bool", | |
"real", | |
"if", | |
"else", | |
"case", | |
"of", | |
"end", | |
"do", | |
"while", | |
"read", | |
"write", | |
"break", | |
"not", | |
"and", | |
"or", | |
"true", | |
"false" | |
}; | |
} | |
// XXX: stand-in error type -- a plain int, so code that throws it is
// throwing an integer value.
using UnexpectedCharacter = int;
int no_eof(int c) { | |
if(c == EOF) | |
throw UnexpectedCharacter(c); | |
return c; | |
} | |
// True when c is a decimal digit or a letter in 'a'..'z' / 'A'..'Z'.
// Letters are checked by explicit range comparison rather than isalpha().
bool is_id_char(int c) {
    if (isdigit(c))
        return true;
    const bool lower = c >= 'a' && c <= 'z';
    const bool upper = c >= 'A' && c <= 'Z';
    return lower || upper;
}
// True for exactly four characters: space, tab, line feed, carriage return.
bool my_isspace(int c) {
    switch (c) {
    case ' ':
    case '\t':
    case '\n':
    case '\r':
        return true;
    default:
        return false;
    }
}
// XXX: can't find proof that one character putback is guarantied after reading | |
// XXX: are all this chars in the basic source char set? | |
Token next_token(istream &is) { | |
int c = is.get(); | |
switch(c) { | |
case '{': return Token::Type::OPEN_CURLY; | |
case '}': return Token::Type::CLOSE_CURLY; | |
case ';': return Token::Type::SEMICOLON; | |
case ',': return Token::Type::COMMA; | |
case '(': return Token::Type::OPEN_ROUND; | |
case ')': return Token::Type::CLOSE_ROUND; | |
case ':': return Token::Type::COLON; | |
case '*': return Token::Type::MUL; | |
case '%': return Token::Type::MOD; | |
case '+': return Token::Type::PLUS; | |
case '-': return Token::Type::MINUS; | |
case EOF: return Token::Type::END_OF_FILE; | |
case '=': | |
if((c = no_eof(is.get())) == '=') | |
return Token::Type::EQ; | |
is.unget(); | |
return Token::Type::ASSIGN; | |
case '<': | |
if((c = no_eof(is.get())) == '=') | |
return Token::Type::LE; | |
is.unget(); | |
return Token::Type::LT; | |
case '>': | |
if((c = no_eof(is.get())) == '=') | |
return Token::Type::GE; | |
is.unget(); | |
return Token::Type::GT; | |
case '!': | |
if((c = no_eof(is.get())) == '=') | |
return Token::Type::NEQ; | |
throw UnexpectedCharacter(c); | |
// "/*/" -- not a comment | |
// "/* * */" -- comment | |
// "/* * / */" -- comment | |
// "/* **/" -- comment | |
case '/': | |
if((c = no_eof(is.get())) != '*') { | |
is.unget(); | |
return Token::Type::DIV; | |
} | |
goto comment; | |
comment: | |
if(no_eof(is.get()) == '*') | |
goto asterix_found; | |
else | |
goto comment; | |
asterix_found: | |
switch(no_eof(is.get())) { | |
case '/': | |
goto done; | |
case '*': | |
goto asterix_found; | |
default: | |
goto comment; | |
} | |
done: | |
return Token::Type::SPACE; | |
case '"': | |
{ | |
string s; | |
while((c = no_eof(is.get())) != '"') | |
s.push_back(c); | |
return Token(Token::Type::STRING_CONST, s); | |
} | |
default: | |
if(my_isspace(c)) { | |
while(my_isspace(c = is.get())) | |
continue; | |
if(c != EOF) | |
is.unget(); | |
return Token::Type::SPACE; | |
} else if(isdigit(c)) { | |
// TODO: when writing parser don't forget that constants can have signs | |
// bonus points: "- /* comment */ 3.14" is technically not a constant | |
string s; | |
for(;isdigit(c); c=no_eof(is.get())) | |
s.push_back(c); | |
if(c != '.') { | |
is.unget(); | |
return Token(Token::Type::INT_CONST, stoi(s)); | |
} | |
s.push_back('.'); | |
// XXX "314." should not be valid | |
for(c=no_eof(is.get()); isdigit(c); c=no_eof(is.get())) | |
s.push_back(c); | |
is.unget(); | |
return Token(Token::Type::REAL_CONST, stod(s)); | |
} else if (is_id_char(c)) { | |
string s; | |
for(;is_id_char(c); c=no_eof(is.get())) | |
s.push_back(c); | |
is.unget(); | |
auto k = find(begin(kw), end(kw), s); | |
if(k != end(kw)) | |
return Token(kw_tok[k - begin(kw)]); | |
return Token(Token::Type::ID, s); | |
} | |
throw UnexpectedCharacter(c); | |
} | |
} | |
// Placeholder entry point: it calls nothing and exits with status 0.
int main() { | |
return 0; | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment