#include #include "tokenizer/Tokenizer.h" #include "exceptions/PreScriptException.h" static unsigned int line = 1; static std::vector* tokens = nullptr; static std::istream* input = nullptr; static int buffer = -1; static bool isLetter(int c) { return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z'); } static bool isDigit(int c) { return c >= '0' && c <= '9'; } static bool isValidNamePart(int c) { return isLetter(c) || isDigit(c) || c == '.' || c == '_'; } static int next() { if(buffer != -1) { int r = buffer; buffer = -1; return r; } int data = input->get(); if(!input->good()) { return -1; } if((data & 0x80) != 0 && data != -1) // special char { if((data & 0x40) != 0) // this should always be true { if((data & 0x20) != 0) // 3 byte unicode { int a = input->get(); int b = input->get(); data = ((data & 0xFF) << 16) | ((a & 0xFF) << 8) | (b & 0xFF); } else // 2 byte unicode { data = ((data & 0xFF) << 8) | (input->get() & 0xFF); } } else { // should not happen as unicode starts with 11 } } return data; } static int peek() { if(buffer == -1) { buffer = next(); return buffer; } return buffer; } static bool next(char c) { if(peek() == c) { next(); return true; } return false; } static void add(TokenType type) { tokens->push_back(Token(type, line)); } static void add(TokenType type, double number) { tokens->push_back(Token(type, line, number)); } static void add(TokenType type, const std::string& text) { tokens->push_back(Token(type, line, text)); } static void add(char c, TokenType t1, TokenType t2, TokenType t3, TokenType t4) { int p = peek(); if(p == c) { next(); if(peek() == '=') { next(); add(t1); } else { add(t2); } } else if(p == '=') { next(); add(t3); } else { add(t4); } } static void handleLiteral(int c, TokenType type) { std::stringstream sBuilder; sBuilder << (char) c; while(true) { int data = peek(); if(!isValidNamePart(data)) { break; } sBuilder << (char) data; next(); } std::string s = sBuilder.str(); if(s == "if") { add(TokenType::IF); } else if(s == "if") { add(TokenType::IF); } else if(s == "else") { add(TokenType::ELSE); } else if(s == "elseif") { add(TokenType::ELSEIF); } else if(s == "while") { add(TokenType::WHILE); } else if(s == "try") { add(TokenType::TRY); } else if(s == "catch") { add(TokenType::CATCH); } else if(s == "for") { add(TokenType::FOR); } else if(s == "function") { add(TokenType::FUNCTION); } else if(s == "break") { add(TokenType::BREAK); } else if(s == "continue") { add(TokenType::CONTINUE); } else if(s == "return") { add(TokenType::RETURN); } else if(s == "true") { add(TokenType::TRUE); } else if(s == "false") { add(TokenType::FALSE); } else if(s == "null") { add(TokenType::NULL_TOKEN); } else { add(type, s); }; } static void handleNumber(int c) { double d = c - '0'; while(true) { int data = peek(); if(!isDigit(data)) { if(data == '.') { next(); double factor = 10; while(true) { int data = peek(); if(!isDigit(data)) { break; } d += (data - '0') / factor; factor *= 10; next(); } } break; } d = (d * 10) + (data - '0'); next(); } add(NUMBER, d); } static void handleString() { std::stringstream ss; int oldLine = line; while(true) { int data = next(); if(data == -1) { throw PreScriptException("non closed string literal", oldLine); } if(data == '"') { add(STRING, ss.str()); break; } if(data == '\n') { line++; } if(data == '\\') { int escape = next(); switch(escape) { case 'n': data = '\n'; break; case '\\': data = '\\'; break; case '"': data = '"'; break; default: throw PreScriptException("invalid escaped character", line); } } if(data > 0xFFFF) { ss << (char) ((data & 0xFF0000) >> 16); ss << (char) ((data & 0xFF00) >> 8); ss << (char) (data & 0xFF); } else if(data > 0xFF) { ss << (char) ((data & 0xFF00) >> 8); ss << (char) (data & 0xFF); } else { ss << (char) data; } } } static void handleOneLineComment() { while(true) { int data = next(); if(data == -1 || data == '\n') { line++; break; } } } static void handleMultiLineComment() { int first; int sec = -1; while(true) { first = sec; sec = next(); if(sec == -1 || (first == '*' && sec == '/')) { break; } if(sec == '\n') { line++; } } } static void handleSlash() { switch(peek()) { case '/': next(); handleOneLineComment(); break; case '*': next(); handleMultiLineComment(); break; case '=': next(); add(DIV_SET); break; default: add(DIV); } } static void handleSpecial(int c) { switch(c) { case ' ': case '\t': case '\r': break; case '\n': line++; break; case '"': handleString(); break; case '(': add(OPEN_BRACKET); break; case ')': add(CLOSE_BRACKET); break; case '[': add(OPEN_SQUARE_BRACKET); break; case ']': add(CLOSE_SQUARE_BRACKET); break; case '{': add(OPEN_CURVED_BRACKET); break; case '}': add(CLOSE_CURVED_BRACKET); break; case '$': handleLiteral(c, LITERAL); break; case '@': handleLiteral(c, LABEL); break; case ';': add(SEMICOLON); break; case ',': add(COMMA); break; case '~': add(BIT_INVERT); break; case '+': add(next('=') ? ADD_SET: (next('+') ? INC: ADD)); break; case '-': add(next('=') ? SUB_SET: (next('-') ? DEC: SUB)); break; case '!': add(next('=') ? NOT_EQUAL: INVERT); break; case '=': add(next('=') ? EQUAL: SET); break; case '*': add(next('=') ? MUL_SET: MUL); break; case '/': handleSlash(); break; case '%': add(next('=') ? MOD_SET: MOD); break; case '&': add(next('=') ? BIT_AND_SET: (next('&') ? AND: BIT_AND)); break; case '|': add(next('=') ? BIT_OR_SET: (next('|') ? OR: BIT_OR)); break; case '^': add(next('=') ? BIT_XOR_SET: BIT_XOR); break; case '<': add('<', LEFT_SHIFT_SET, LEFT_SHIFT, LESS_EQUAL, LESS); break; case '>': add('>', RIGHT_SHIFT_SET, RIGHT_SHIFT, GREATER_EQUAL, GREATER); break; default: throw PreScriptException("unknown token " + c, line); } } static void handleChar(int c) { if(isLetter(c) || c == '_' || c == '.') { handleLiteral(c, TokenType::LITERAL); } else if(isDigit(c)) { handleNumber(c); } else { handleSpecial(c); } } void Tokenizer::tokenize(std::vector& inTokens, std::istream& inInput) { tokens = &inTokens; input = &inInput; line = 1; buffer = -1; int c; while((c = next()) != -1) { handleChar(c); } add(EOF_TOKEN); }