abenkovskii · April 3, 2018 06:32
diff --git a/main.cpp b/main.cpp
 #include <string>
 #include <iostream>
 #include <algorithm>
 #include <iterator>
 #include <cctype>

 using std::string;
 using std::istream;
 using std::find;
 using std::begin;
 using std::end;

 // A single lexical token: its kind plus the payload carried by identifiers
 // and constants. Payload fields a constructor does not set keep their
 // defaults (0, 0.0, empty string).
 class Token {
 	public:
 		enum class Type
 		{
 			// kw
 			PROGRAM,
 			INT,
 			STRING,
 			BOOL,
 			REAL,
 			IF,
 			ELSE,
 			CASE,
 			OF,
 			END,
 			DO,
 			WHILE,
 			READ,
 			WRITE,
 			BREAK,
 			NOT,
 			AND,
 			OR,
 			TRUE,
 			FALSE,

 			// delim (single char)
 			OPEN_CURLY,
 			CLOSE_CURLY,
 			SEMICOLON,
 			COMMA,
 			OPEN_ROUND,
 			CLOSE_ROUND,
 			ASSIGN,
 			COLON,
 			MUL,
 			DIV,
 			MOD,
 			PLUS,
 			MINUS,
 			LT,
 			GT,

 			// delim (multiple char)
 			LE,
 			GE,
 			EQ,
 			NEQ,

 			// other
 			ID,
 			INT_CONST,
 			STRING_CONST,
 			REAL_CONST,
 			SPACE,
 			END_OF_FILE
 		};

 		// Deliberately not explicit: next_token() returns bare Type values
 		// and relies on the implicit Type -> Token conversion.
 		Token(Type t):type_(t) {}
 		// Token with a text payload (identifier name or string constant).
 		Token(Type t, std::string s):type_(t), str_(s) {}
 		// Token with an integer payload.
 		Token(Type t, int i):type_(t), int_(i) {}
 		// Token with a floating-point payload.
 		Token(Type t, double d):type_(t), double_(d) {}

 		// Kind of this token.
 		Type type() const { return type_; }
 		// Integer payload; 0 unless set through the int constructor.
 		int int_value() const { return int_; }
 		// Floating-point payload; 0.0 unless set through the double constructor.
 		double double_value() const { return double_; }
 		// Text payload; empty unless set through the string constructor.
 		const std::string &str() const { return str_; }

 	private:
 		Type type_;
 		// Default initializers keep the unused payload fields from holding
 		// indeterminate values.
 		int int_ = 0;
 		double double_ = 0.0;
 		std::string str_;
 };

 namespace {
 	Token::Type kw_tok[] = {
 		Token::Type::PROGRAM,
 		Token::Type::INT,
 		Token::Type::STRING,
 		Token::Type::BOOL,
 		Token::Type::REAL,
 		Token::Type::IF,
 		Token::Type::ELSE,
 		Token::Type::CASE,
 		Token::Type::OF,
 		Token::Type::END,
 		Token::Type::DO,
 		Token::Type::WHILE,
 		Token::Type::READ,
 		Token::Type::WRITE,
 		Token::Type::BREAK,
 		Token::Type::NOT,
 		Token::Type::AND,
 		Token::Type::OR,
 		Token::Type::TRUE,
 		Token::Type::FALSE
 	};
 	
 	string kw[] = {
 		"program",
 		"int",
 		"string",
 		"bool",
 		"real",
 		"if",
 		"else",
 		"case",
 		"of",
 		"end",
 		"do",
 		"while",
 		"read",
 		"write",
 		"break",
 		"not",
 		"and",
 		"or",
 		"true",
 		"false"
 	};
 }

 // XXX: placeholder error type. It is thrown with the offending character
 // (or EOF) as its value.
 using UnexpectedCharacter = int;

 // Passes `c` through unchanged, unless it is EOF, in which case
 // UnexpectedCharacter(EOF) is thrown.
 int no_eof(int c) {
 	const bool at_end = (c == EOF);
 	if(at_end) {
 		throw UnexpectedCharacter(c);
 	}
 	return c;
 }

 // True when `c` is a decimal digit or a letter in 'a'..'z' / 'A'..'Z'.
 bool is_id_char(int c) {
 	const bool digit = isdigit(c) != 0;
 	const bool lower = c >= 'a' && c <= 'z';
 	const bool upper = c >= 'A' && c <= 'Z';
 	return digit || lower || upper;
 }

 // True for exactly four characters: space, tab, line feed, carriage return.
 bool my_isspace(int c) {
 	switch(c) {
 		case ' ':
 		case '\t':
 		case '\n':
 		case '\r':
 			return true;
 		default:
 			return false;
 	}
 }

 // Reads one token from `is` and returns it.
 //
 // A run of whitespace and a /* ... */ comment are each reported as a SPACE
 // token; the end of input is reported as END_OF_FILE.
 //
 // Throws UnexpectedCharacter holding the offending character (EOF for a
 // premature end of input) on: an unterminated comment or string constant,
 // a '!' not followed by '=', a real constant with no digit after the '.',
 // or a character that cannot start a token. Exceptions from stoi/stod
 // (e.g. a constant too large to represent) propagate unchanged.
 //
 // Lookahead is done with peek() instead of get() + unget(), so nothing
 // depends on putback being available, and a token that ends exactly at the
 // end of input is returned normally instead of being reported as an
 // unexpected EOF. All characters matched here belong to the basic source
 // character set.
 Token next_token(istream &is) {
 	// Consumes the next character only when it equals `expected`.
 	auto accept = [&is](int expected) -> bool {
 		if(is.peek() != expected)
 			return false;
 		is.get();
 		return true;
 	};

 	int c = is.get();
 	switch(c) {
 		case '{': return Token::Type::OPEN_CURLY;
 		case '}': return Token::Type::CLOSE_CURLY;
 		case ';': return Token::Type::SEMICOLON;
 		case ',': return Token::Type::COMMA;
 		case '(': return Token::Type::OPEN_ROUND;
 		case ')': return Token::Type::CLOSE_ROUND;
 		case ':': return Token::Type::COLON;
 		case '*': return Token::Type::MUL;
 		case '%': return Token::Type::MOD;
 		case '+': return Token::Type::PLUS;
 		case '-': return Token::Type::MINUS;
 		case EOF: return Token::Type::END_OF_FILE;
 		case '=':
 			return accept('=') ? Token::Type::EQ : Token::Type::ASSIGN;
 		case '<':
 			return accept('=') ? Token::Type::LE : Token::Type::LT;
 		case '>':
 			return accept('=') ? Token::Type::GE : Token::Type::GT;
 		case '!':
 			// A lone '!' is not a token; only "!=" is.
 			if((c = no_eof(is.get())) == '=')
 				return Token::Type::NEQ;
 			throw UnexpectedCharacter(c);

 		// "/*/" -- not a comment
 		// "/* * */" -- comment
 		// "/* * / */" -- comment
 		// "/*   **/" -- comment
 		case '/':
 			{
 				if(!accept('*'))
 					return Token::Type::DIV;
 				// The comment ends at the first '/' that directly follows
 				// a '*'. The '*' of the opening "/*" does not count.
 				bool after_star = false;
 				for(;;) {
 					const int ch = no_eof(is.get());
 					if(after_star && ch == '/')
 						return Token::Type::SPACE;
 					after_star = (ch == '*');
 				}
 			}

 		case '"':
 			{
 				// No escape sequences: the constant runs to the next '"'.
 				string s;
 				while((c = no_eof(is.get())) != '"')
 					s.push_back(static_cast<char>(c));
 				return Token(Token::Type::STRING_CONST, s);
 			}

 		default:
 			if(my_isspace(c)) {
 				// Collapse the whole whitespace run into one SPACE token.
 				while(my_isspace(is.peek()))
 					is.get();
 				return Token::Type::SPACE;
 			} else if(isdigit(c)) {
 				// TODO: when writing parser don't forget that constants can have signs
 				// bonus points: "- /* comment */ 3.14" is technically not a constant
 				string s(1, static_cast<char>(c));
 				while(isdigit(is.peek()))
 					s.push_back(static_cast<char>(is.get()));
 				if(!accept('.'))
 					return Token(Token::Type::INT_CONST, stoi(s));
 				s.push_back('.');
 				// "314." is not a valid constant: at least one digit must
 				// follow the '.'.
 				c = is.get();
 				if(!isdigit(c))
 					throw UnexpectedCharacter(c);
 				s.push_back(static_cast<char>(c));
 				while(isdigit(is.peek()))
 					s.push_back(static_cast<char>(is.get()));
 				return Token(Token::Type::REAL_CONST, stod(s));
 			} else if(is_id_char(c)) {
 				string s(1, static_cast<char>(c));
 				while(is_id_char(is.peek()))
 					s.push_back(static_cast<char>(is.get()));
 				// Reserved words share the identifier syntax; look the
 				// spelling up before treating it as an ID.
 				auto k = find(begin(kw), end(kw), s);
 				if(k != end(kw))
 					return Token(kw_tok[k - begin(kw)]);
 				return Token(Token::Type::ID, s);
 			}
 			throw UnexpectedCharacter(c);
 	}
 }

 // Program entry point. It does nothing yet and reports success.
 int main() {
 	return 0;
 }
	#include <string>
	#include <iostream>
	#include <algorithm>
	#include <iterator>
	#include <cctype>

	using std::string;
	using std::istream;
	using std::find;
	using std::begin;
	using std::end;

	// A single lexical token: its kind plus the payload carried by identifiers
	// and constants. Payload fields a constructor does not set keep their
	// defaults (0, 0.0, empty string).
	class Token {
		public:
			enum class Type
			{
				// kw
				PROGRAM,
				INT,
				STRING,
				BOOL,
				REAL,
				IF,
				ELSE,
				CASE,
				OF,
				END,
				DO,
				WHILE,
				READ,
				WRITE,
				BREAK,
				NOT,
				AND,
				OR,
				TRUE,
				FALSE,

				// delim (single char)
				OPEN_CURLY,
				CLOSE_CURLY,
				SEMICOLON,
				COMMA,
				OPEN_ROUND,
				CLOSE_ROUND,
				ASSIGN,
				COLON,
				MUL,
				DIV,
				MOD,
				PLUS,
				MINUS,
				LT,
				GT,

				// delim (multiple char)
				LE,
				GE,
				EQ,
				NEQ,

				// other
				ID,
				INT_CONST,
				STRING_CONST,
				REAL_CONST,
				SPACE,
				END_OF_FILE
			};

			// Deliberately not explicit: next_token() returns bare Type values
			// and relies on the implicit Type -> Token conversion.
			Token(Type t):type_(t) {}
			// Token with a text payload (identifier name or string constant).
			Token(Type t, std::string s):type_(t), str_(s) {}
			// Token with an integer payload.
			Token(Type t, int i):type_(t), int_(i) {}
			// Token with a floating-point payload.
			Token(Type t, double d):type_(t), double_(d) {}

			// Kind of this token.
			Type type() const { return type_; }
			// Integer payload; 0 unless set through the int constructor.
			int int_value() const { return int_; }
			// Floating-point payload; 0.0 unless set through the double constructor.
			double double_value() const { return double_; }
			// Text payload; empty unless set through the string constructor.
			const std::string &str() const { return str_; }

		private:
			Type type_;
			// Default initializers keep the unused payload fields from holding
			// indeterminate values.
			int int_ = 0;
			double double_ = 0.0;
			std::string str_;
	};

	namespace {
	Token::Type kw_tok[] = {
	Token::Type::PROGRAM,
	Token::Type::INT,
	Token::Type::STRING,
	Token::Type::BOOL,
	Token::Type::REAL,
	Token::Type::IF,
	Token::Type::ELSE,
	Token::Type::CASE,
	Token::Type::OF,
	Token::Type::END,
	Token::Type::DO,
	Token::Type::WHILE,
	Token::Type::READ,
	Token::Type::WRITE,
	Token::Type::BREAK,
	Token::Type::NOT,
	Token::Type::AND,
	Token::Type::OR,
	Token::Type::TRUE,
	Token::Type::FALSE
	};

	string kw[] = {
	"program",
	"int",
	"string",
	"bool",
	"real",
	"if",
	"else",
	"case",
	"of",
	"end",
	"do",
	"while",
	"read",
	"write",
	"break",
	"not",
	"and",
	"or",
	"true",
	"false"
	};
	}

	// XXX: placeholder error type. It is thrown with the offending character
	// (or EOF) as its value.
	using UnexpectedCharacter = int;

	// Passes `c` through unchanged, unless it is EOF, in which case
	// UnexpectedCharacter(EOF) is thrown.
	int no_eof(int c) {
		const bool at_end = (c == EOF);
		if(at_end) {
			throw UnexpectedCharacter(c);
		}
		return c;
	}

	// True when `c` is a decimal digit or a letter in 'a'..'z' / 'A'..'Z'.
	// The "\|\|" sequences this copy carried were an escaping artifact and
	// are not valid C++; they are restored to the logical-or operator.
	bool is_id_char(int c) {
		return isdigit(c) || ('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z');
	}

	// True for exactly four characters: space, tab, line feed, carriage return.
	// The "\|\|" sequences this copy carried were an escaping artifact and
	// are not valid C++; they are restored to the logical-or operator.
	bool my_isspace(int c) {
		return c == ' ' || c == '\t' || c == '\n' || c == '\r';
	}

	// Reads one token from `is` and returns it.
	//
	// A run of whitespace and a /* ... */ comment are each reported as a SPACE
	// token; the end of input is reported as END_OF_FILE.
	//
	// Throws UnexpectedCharacter holding the offending character (EOF for a
	// premature end of input) on: an unterminated comment or string constant,
	// a '!' not followed by '=', a real constant with no digit after the '.',
	// or a character that cannot start a token. Exceptions from stoi/stod
	// (e.g. a constant too large to represent) propagate unchanged.
	//
	// Lookahead is done with peek() instead of get() + unget(), so nothing
	// depends on putback being available, and a token that ends exactly at the
	// end of input is returned normally instead of being reported as an
	// unexpected EOF. All characters matched here belong to the basic source
	// character set.
	Token next_token(istream &is) {
		// Consumes the next character only when it equals `expected`.
		auto accept = [&is](int expected) -> bool {
			if(is.peek() != expected)
				return false;
			is.get();
			return true;
		};

		int c = is.get();
		switch(c) {
			case '{': return Token::Type::OPEN_CURLY;
			case '}': return Token::Type::CLOSE_CURLY;
			case ';': return Token::Type::SEMICOLON;
			case ',': return Token::Type::COMMA;
			case '(': return Token::Type::OPEN_ROUND;
			case ')': return Token::Type::CLOSE_ROUND;
			case ':': return Token::Type::COLON;
			case '*': return Token::Type::MUL;
			case '%': return Token::Type::MOD;
			case '+': return Token::Type::PLUS;
			case '-': return Token::Type::MINUS;
			case EOF: return Token::Type::END_OF_FILE;
			case '=':
				return accept('=') ? Token::Type::EQ : Token::Type::ASSIGN;
			case '<':
				return accept('=') ? Token::Type::LE : Token::Type::LT;
			case '>':
				return accept('=') ? Token::Type::GE : Token::Type::GT;
			case '!':
				// A lone '!' is not a token; only "!=" is.
				if((c = no_eof(is.get())) == '=')
					return Token::Type::NEQ;
				throw UnexpectedCharacter(c);

			// "/*/" -- not a comment
			// "/* * */" -- comment
			// "/* * / */" -- comment
			// "/* **/" -- comment
			case '/':
				{
					if(!accept('*'))
						return Token::Type::DIV;
					// The comment ends at the first '/' that directly follows
					// a '*'. The '*' of the opening "/*" does not count.
					bool after_star = false;
					for(;;) {
						const int ch = no_eof(is.get());
						if(after_star && ch == '/')
							return Token::Type::SPACE;
						after_star = (ch == '*');
					}
				}

			case '"':
				{
					// No escape sequences: the constant runs to the next '"'.
					string s;
					while((c = no_eof(is.get())) != '"')
						s.push_back(static_cast<char>(c));
					return Token(Token::Type::STRING_CONST, s);
				}

			default:
				if(my_isspace(c)) {
					// Collapse the whole whitespace run into one SPACE token.
					while(my_isspace(is.peek()))
						is.get();
					return Token::Type::SPACE;
				} else if(isdigit(c)) {
					// TODO: when writing parser don't forget that constants can have signs
					// bonus points: "- /* comment */ 3.14" is technically not a constant
					string s(1, static_cast<char>(c));
					while(isdigit(is.peek()))
						s.push_back(static_cast<char>(is.get()));
					if(!accept('.'))
						return Token(Token::Type::INT_CONST, stoi(s));
					s.push_back('.');
					// "314." is not a valid constant: at least one digit must
					// follow the '.'.
					c = is.get();
					if(!isdigit(c))
						throw UnexpectedCharacter(c);
					s.push_back(static_cast<char>(c));
					while(isdigit(is.peek()))
						s.push_back(static_cast<char>(is.get()));
					return Token(Token::Type::REAL_CONST, stod(s));
				} else if(is_id_char(c)) {
					string s(1, static_cast<char>(c));
					while(is_id_char(is.peek()))
						s.push_back(static_cast<char>(is.get()));
					// Reserved words share the identifier syntax; look the
					// spelling up before treating it as an ID.
					auto k = find(begin(kw), end(kw), s);
					if(k != end(kw))
						return Token(kw_tok[k - begin(kw)]);
					return Token(Token::Type::ID, s);
				}
				throw UnexpectedCharacter(c);
		}
	}

	// Program entry point. It does nothing yet and reports success.
	int main() {
		return 0;
	}