#include <iostream> #include <fstream> #include "tokenizer/Tokenizer.h" #include "utils/String.h" static void onError(const String& message, unsigned int line) { std::cout << message << " Line: " << line << "\n"; } static bool isLetter(char32_t c) { return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z'); } static bool isDigit(char32_t c) { return c >= '0' && c <= '9'; } static bool isValidNameStart(char32_t c) { return isLetter(c) || c == '.' || c == '_'; } static bool isValidNamePart(char32_t c) { return isDigit(c) || isValidNameStart(c); } class Data { public: Data(const char* inputPath, TokenStream& tokens) : tokens(tokens) { stream.open(inputPath); } bool hasFileError() { return !stream.good(); } bool next(char32_t& c) { if(buffer != 0) { c = buffer; buffer = 0; return true; } c = stream.get(); return stream.good(); } bool peek(char32_t& c) { if(buffer != 0 || next(buffer)) { c = buffer; return true; } return false; } bool nextIf(char32_t c) { char32_t nextChar; if(peek(nextChar) && c == nextChar) { next(nextChar); return true; } return false; } void addToken(Token token) { tokens.add(token, line); } void addToken(Token token, const char* text) { tokens.add(token, line, text); } Token chooseToken(char c, Token aCharEqual, Token aChar, Token aEqual, Token other) { if(nextIf(c)) { if(nextIf('=')) { return aCharEqual; } return aChar; } else if(nextIf('=')) { return aEqual; } return other; } bool handleLiteral(char32_t c, Token token) { String s; s += (char) c; while(true) { if(s.isFull()) { onError("string buffer to small", line); return true; } char32_t data; if(!peek(data) || !isValidNamePart(data)) { break; } s += (char) data; next(data); } if(s == "if") { addToken(Token::IF); } else if(s == "else") { addToken(Token::ELSE); } else if(s == "elseif") { addToken(Token::ELSEIF); } else if(s == "while") { addToken(Token::WHILE); } else if(s == "try") { addToken(Token::TRY); } else if(s == "catch") { addToken(Token::CATCH); } else if(s == "for") { addToken(Token::FOR); } else if(s == "function") { addToken(Token::FUNCTION); } else if(s == "break") { addToken(Token::BREAK); } else if(s == "continue") { addToken(Token::CONTINUE); } else if(s == "return") { addToken(Token::RETURN); } else if(s == "true") { addToken(Token::TRUE); } else if(s == "false") { addToken(Token::FALSE); } else if(s == "null") { addToken(Token::NULL_TOKEN); } else { addToken(token, s); } return false; } bool handleNumber(char32_t c) { double number = c - '0'; char32_t data; while(peek(data)) { if(!isDigit(data)) { if(data != '.') { break; } next(data); double factor = 10; while(peek(data) && isDigit(data)) { number += (data - '0') / factor; factor *= 10; next(data); } break; } number = (number * 10) + (data - '0'); next(data); } tokens.add(Token::NUMBER, line, number); return false; } bool handleString() { String s; unsigned int oldLine = line; while(!s.isFull()) { char32_t data; if(!next(data)) { onError("non closed string literal", oldLine); return true; } if(data == '"') { addToken(Token::STRING, s); return false; } if(data == '\n') { line++; } if(data == '\\') { char32_t escape; if(!next(escape)) { onError("missing escaped character", line); return true; } switch(escape) { case 'n': data = '\n'; break; case '\\': data = '\\'; break; case '"': data = '"'; break; default: onError("invalid escaped character", line); return true; } } s += data; } onError("string buffer to small", line); return true; } bool handleOneLineComment() { char32_t data; while(next(data) && data != '\n'); line++; return false; } bool handleMultiLineComment() { char32_t first; char32_t sec = 0; unsigned int oldLine = line; while(true) { first = sec; if(!next(sec)) { onError("unclosed multiline comment", oldLine); return true; } if(first == '*' && sec == '/') { return false; } line += (sec == '\n'); } } bool handleSlash() { if(nextIf('/')) { return handleOneLineComment(); } else if(nextIf('*')) { return handleMultiLineComment(); } else if(nextIf('=')) { addToken(Token::DIV_SET); return false; } addToken(Token::DIV); return false; } bool handleSpecial(char32_t c) { switch(c) { case ' ': case '\t': case '\r': return false; case '\n': line++; return false; case '"': return handleString(); case '(': addToken(Token::OPEN_BRACKET); return false; case ')': addToken(Token::CLOSE_BRACKET); return false; case '[': addToken(Token::OPEN_SQUARE_BRACKET); return false; case ']': addToken(Token::CLOSE_SQUARE_BRACKET); return false; case '{': addToken(Token::OPEN_CURVED_BRACKET); return false; case '}': addToken(Token::CLOSE_CURVED_BRACKET); return false; case '$': return handleLiteral(c, Token::LITERAL); case '@': return handleLiteral(c, Token::LABEL); case ';': addToken(Token::SEMICOLON); return false; case ',': addToken(Token::COMMA); return false; case '~': addToken(Token::BIT_INVERT); return false; case '+': addToken(nextIf('=') ? Token::ADD_SET: (nextIf('+') ? Token::INC: Token::ADD)); return false; case '-': addToken(nextIf('=') ? Token::SUB_SET: (nextIf('-') ? Token::DEC: Token::SUB)); return false; case '!': addToken(nextIf('=') ? Token::NOT_EQUAL: Token::INVERT); break; case '=': addToken(nextIf('=') ? Token::EQUAL: Token::SET); return false; case '*': addToken(nextIf('=') ? Token::MUL_SET: Token::MUL); return false; case '/': return handleSlash(); case '%': addToken(nextIf('=') ? Token::MOD_SET: Token::MOD); return false; case '&': addToken(nextIf('=') ? Token::BIT_AND_SET: (nextIf('&') ? Token::AND: Token::BIT_AND)); return false; case '|': addToken(nextIf('=') ? Token::BIT_OR_SET: (nextIf('|') ? Token::OR: Token::BIT_OR)); return false; case '^': addToken(nextIf('=') ? Token::BIT_XOR_SET: Token::BIT_XOR); return false; case '<': addToken(chooseToken('<', Token::LEFT_SHIFT_SET, Token::LEFT_SHIFT, Token::LESS_EQUAL, Token::LESS)); return false; case '>': addToken(chooseToken('>', Token::RIGHT_SHIFT_SET, Token::RIGHT_SHIFT, Token::GREATER_EQUAL, Token::GREATER)); return false; } String s("unknown token '"); s += c; s += '\''; onError(s, line); return true; } bool handleChar(char32_t c) { if(isValidNameStart(c)) { return handleLiteral(c, Token::LITERAL); } else if(isDigit(c)) { return handleNumber(c); } return handleSpecial(c); } private: std::basic_ifstream<char32_t> stream; TokenStream& tokens; unsigned int line = 1; char32_t buffer = 0; }; bool Tokenizer::tokenize(TokenStream& tokenStream, const char* inputPath) { Data d(inputPath, tokenStream); if(d.hasFileError()) { return true; } char32_t c; while(d.next(c)) { if(d.handleChar(c)) { return true; } } d.addToken(Token::EOF_TOKEN); return false; }