#include "Tokenizer.h" #include "../exceptions/PreScriptException.h" #include #include Tokenizer::Tokenizer() { } void Tokenizer::tokenize(vector>& tokens, vector>& streams) { Tokenizer::tokens = &tokens; Tokenizer::streams = &streams; for(streamIndex = 0; streamIndex < streams.size(); streamIndex++) { buffer = -1; line = 1; int c; while((c = next()) != -1) { handleChar(c); } } add(TokenType::EOF_TOKEN); } int Tokenizer::next() { if(buffer != -1) { int r = buffer; buffer = -1; return r; } istream& in = *(*streams)[streamIndex].get(); if(!in.good()) { return -1; } int data = in.get(); if((data & 0x80) != 0 && data != -1) // special char { if((data & 0x40) != 0) // this should always be true { if((data & 0x20) != 0) // 3 byte unicode { int a = in.get(); int b = in.get(); data = ((data & 0xFF) << 16) | ((a & 0xFF) << 8) | (b & 0xFF); } else // 2 byte unicode { data = ((data & 0xFF) << 8) | (in.get() & 0xFF); } } else { // should not happen as unicode starts with 11 } } return data; } int Tokenizer::peek() { if(buffer == -1) { buffer = next(); return buffer; } return buffer; } bool Tokenizer::next(char c) { if(peek() == c) { next(); return true; } return false; } void Tokenizer::add(TokenType type) { tokens->push_back(unique_ptr(new Token(type, line))); } void Tokenizer::add(TokenType type, double data) { tokens->push_back(unique_ptr(new DoubleToken(type, line, data))); } void Tokenizer::add(TokenType type, string data) { tokens->push_back(unique_ptr(new StringToken(type, line, data))); } void Tokenizer::add(char c, TokenType t1, TokenType t2, TokenType t3, TokenType t4) { int peeked = peek(); if(peeked == c) { next(); if(peek() == '=') { next(); add(t1); } else { add(t2); } } else if(peeked == '=') { next(); add(t3); } else { add(t4); } } void Tokenizer::handleChar(int c) { if(isLetter(c) || c == '_' || c == '.') { handleLiteral(c, TokenType::LITERAL); } else if(isDigit(c)) { handleNumber(c); } else { handleSpecial(c); } } void Tokenizer::handleLiteral(int c, TokenType type) { stringstream ss; ss << (char) c; while(true) { int data = peek(); if(!isValidNamePart(data)) { break; } ss << (char) data; next(); } string s = ss.str(); if(s == "if") { add(TokenType::IF); } else if(s == "if") { add(TokenType::IF); } else if(s == "else") { add(TokenType::ELSE); } else if(s == "elseif") { add(TokenType::ELSEIF); } else if(s == "while") { add(TokenType::WHILE); } else if(s == "try") { add(TokenType::TRY); } else if(s == "catch") { add(TokenType::CATCH); } else if(s == "for") { add(TokenType::FOR); } else if(s == "function") { add(TokenType::FUNCTION); } else if(s == "break") { add(TokenType::BREAK); } else if(s == "continue") { add(TokenType::CONTINUE); } else if(s == "return") { add(TokenType::RETURN); } else if(s == "true") { add(TokenType::TRUE); } else if(s == "false") { add(TokenType::FALSE); } else if(s == "null") { add(TokenType::NULL_TOKEN); } else { add(type, s); }; } void Tokenizer::handleNumber(int c) { double d = c - '0'; while(true) { int data = peek(); if(!isDigit(data)) { if(data == '.') { next(); double factor = 10; while(true) { int data = peek(); if(!isDigit(data)) { break; } d += (data - '0') / factor; factor *= 10; next(); } } break; } d = (d * 10) + (data - '0'); next(); } add(NUMBER, d); } void Tokenizer::handleSpecial(int c) { switch(c) { case ' ': case '\t': case '\r': break; case '\n': line++; break; case '"': handleString(); break; case '(': add(OPEN_BRACKET); break; case ')': add(CLOSE_BRACKET); break; case '[': add(OPEN_SQUARE_BRACKET); break; case ']': 
add(CLOSE_SQUARE_BRACKET); break; case '{': add(OPEN_CURVED_BRACKET); break; case '}': add(CLOSE_CURVED_BRACKET); break; case '$': handleLiteral(c, LITERAL); break; case '@': handleLiteral(c, LABEL); break; case ';': add(SEMICOLON); break; case ',': add(COMMA); break; case '~': add(BIT_INVERT); break; case '+': add(next('=') ? ADD_SET : (next('+') ? INC : ADD)); break; case '-': add(next('=') ? SUB_SET : (next('-') ? DEC : SUB)); break; case '!': add(next('=') ? NOT_EQUAL : INVERT); break; case '=': add(next('=') ? EQUAL : SET); break; case '*': add(next('=') ? MUL_SET : MUL); break; case '/': handleSlash(); break; case '%': add(next('=') ? MOD_SET : MOD); break; case '&': add(next('=') ? BIT_AND_SET : (next('&') ? AND : BIT_AND)); break; case '|': add(next('=') ? BIT_OR_SET : (next('|') ? OR : BIT_OR)); break; case '^': add(next('=') ? BIT_XOR_SET : BIT_XOR); break; case '<': add('<', LEFT_SHIFT_SET, LEFT_SHIFT, LESS_EQUAL, LESS); break; case '>': add('>', RIGHT_SHIFT_SET, RIGHT_SHIFT, GREATER_EQUAL, GREATER); break; default: throw PreScriptException("unknown token " + c, line); } } void Tokenizer::handleString() { stringstream ss; while(true) { int data = next(); if(data == '"') { add(STRING, ss.str()); break; } if(data == '\n') { line++; } if(data > 0xFFFF) { ss << (char) ((data & 0xFF0000) >> 16); ss << (char) ((data & 0xFF00) >> 8); ss << (char) (data & 0xFF); } else if(data > 0xFF) { ss << (char) ((data & 0xFF00) >> 8); ss << (char) (data & 0xFF); } else { ss << (char) data; } } } void Tokenizer::handleSlash() { switch(peek()) { case '/': next(); handleOneLineComment(); break; case '*': next(); handleMultiLineComment(); break; case '=': next(); add(DIV_SET); break; default: add(DIV); } } void Tokenizer::handleOneLineComment() { while(true) { int data = next(); if(data == -1 || data == '\n') { line++; break; } } } void Tokenizer::handleMultiLineComment() { int first; int sec = -1; while(true) { first = sec; sec = next(); if(sec == -1 || (first == '*' && sec == '/')) { break; } if(sec == '\n') { line++; } } } bool Tokenizer::isLetter(int c) { return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z'); } bool Tokenizer::isDigit(int c) { return c >= '0' && c <= '9'; } bool Tokenizer::isValidNamePart(int c) { return isLetter(c) || isDigit(c) || c == '.' || c == '_'; }
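
// A minimal usage sketch, kept inside a comment so it does not affect this
// translation unit. It assumes Tokenizer.h declares Token, TokenType and the
// tokenize() signature used above, and makes the std names available
// unqualified (as this file uses them); the file name "script.txt" is purely
// illustrative, none of this is taken from the original sources.
//
//     #include "Tokenizer.h"
//     #include <fstream>
//
//     int main()
//     {
//         vector<unique_ptr<Token>> tokens;
//         vector<unique_ptr<istream>> streams;
//         streams.push_back(unique_ptr<istream>(new ifstream("script.txt")));
//
//         Tokenizer tokenizer;
//         tokenizer.tokenize(tokens, streams); // tokens now ends with EOF_TOKEN
//         return 0;
//     }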