123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389 |
- #include <sstream>
- #include "tokenizer/Tokenizer.h"
- #include "exceptions/PreScriptException.h"
- static unsigned int line = 1;
- static std::vector<Token>* tokens = nullptr;
- static std::istream* input = nullptr;
- static int buffer = -1;
// Returns true when c is an ASCII letter, upper or lower case.
static bool isLetter(int c)
{
    const bool lowerCase = 'a' <= c && c <= 'z';
    const bool upperCase = 'A' <= c && c <= 'Z';
    return lowerCase || upperCase;
}
// Returns true when c is one of the ASCII decimal digits '0'..'9'.
static bool isDigit(int c)
{
    if(c < '0')
    {
        return false;
    }
    return c <= '9';
}
- static bool isValidNamePart(int c)
- {
- return isLetter(c) || isDigit(c) || c == '.' || c == '_';
- }
- static int next()
- {
- if(buffer != -1)
- {
- int r = buffer;
- buffer = -1;
- return r;
- }
- int data = input->get();
- if(!input->good())
- {
- return -1;
- }
- if((data & 0x80) != 0 && data != -1) // special char
- {
- if((data & 0x40) != 0) // this should always be true
- {
- if((data & 0x20) != 0) // 3 byte unicode
- {
- int a = input->get();
- int b = input->get();
- data = ((data & 0xFF) << 16) | ((a & 0xFF) << 8) | (b & 0xFF);
- }
- else // 2 byte unicode
- {
- data = ((data & 0xFF) << 8) | (input->get() & 0xFF);
- }
- }
- else
- {
- // should not happen as unicode starts with 11
- }
- }
- return data;
- }
- static int peek()
- {
- if(buffer == -1)
- {
- buffer = next();
- return buffer;
- }
- return buffer;
- }
- static bool next(char c)
- {
- if(peek() == c)
- {
- next();
- return true;
- }
- return false;
- }
- static void add(TokenType type)
- {
- tokens->push_back(Token(type, line));
- }
- static void add(TokenType type, double number)
- {
- tokens->push_back(Token(type, line, number));
- }
- static void add(TokenType type, const std::string& text)
- {
- tokens->push_back(Token(type, line, text));
- }
- static void add(char c, TokenType t1, TokenType t2, TokenType t3, TokenType t4)
- {
- int p = peek();
- if(p == c)
- {
- next();
- if(peek() == '=')
- {
- next();
- add(t1);
- }
- else
- {
- add(t2);
- }
- }
- else if(p == '=')
- {
- next();
- add(t3);
- }
- else
- {
- add(t4);
- }
- }
- static void handleLiteral(int c, TokenType type)
- {
- std::stringstream sBuilder;
- sBuilder << (char) c;
- while(true)
- {
- int data = peek();
- if(!isValidNamePart(data))
- {
- break;
- }
- sBuilder << (char) data;
- next();
- }
- std::string s = sBuilder.str();
- if(s == "if") { add(TokenType::IF); }
- else if(s == "if") { add(TokenType::IF); }
- else if(s == "else") { add(TokenType::ELSE); }
- else if(s == "elseif") { add(TokenType::ELSEIF); }
- else if(s == "while") { add(TokenType::WHILE); }
- else if(s == "try") { add(TokenType::TRY); }
- else if(s == "catch") { add(TokenType::CATCH); }
- else if(s == "for") { add(TokenType::FOR); }
- else if(s == "function") { add(TokenType::FUNCTION); }
- else if(s == "break") { add(TokenType::BREAK); }
- else if(s == "continue") { add(TokenType::CONTINUE); }
- else if(s == "return") { add(TokenType::RETURN); }
- else if(s == "true") { add(TokenType::TRUE); }
- else if(s == "false") { add(TokenType::FALSE); }
- else if(s == "null") { add(TokenType::NULL_TOKEN); }
- else { add(type, s); };
- }
- static void handleNumber(int c)
- {
- double d = c - '0';
- while(true)
- {
- int data = peek();
- if(!isDigit(data))
- {
- if(data == '.')
- {
- next();
- double factor = 10;
- while(true)
- {
- int data = peek();
- if(!isDigit(data))
- {
- break;
- }
- d += (data - '0') / factor;
- factor *= 10;
- next();
- }
- }
- break;
- }
- d = (d * 10) + (data - '0');
- next();
- }
- add(NUMBER, d);
- }
- static void handleString()
- {
- std::stringstream ss;
- int oldLine = line;
- while(true)
- {
- int data = next();
- if(data == -1)
- {
- throw PreScriptException("non closed string literal", oldLine);
- }
- if(data == '"')
- {
- add(STRING, ss.str());
- break;
- }
- if(data == '\n')
- {
- line++;
- }
- if(data == '\\')
- {
- int escape = next();
- switch(escape)
- {
- case 'n': data = '\n';
- break;
- case '\\': data = '\\';
- break;
- case '"': data = '"';
- break;
- default:
- throw PreScriptException("invalid escaped character", line);
- }
- }
- if(data > 0xFFFF)
- {
- ss << (char) ((data & 0xFF0000) >> 16);
- ss << (char) ((data & 0xFF00) >> 8);
- ss << (char) (data & 0xFF);
- }
- else if(data > 0xFF)
- {
- ss << (char) ((data & 0xFF00) >> 8);
- ss << (char) (data & 0xFF);
- }
- else
- {
- ss << (char) data;
- }
- }
- }
- static void handleOneLineComment()
- {
- while(true)
- {
- int data = next();
- if(data == -1 || data == '\n')
- {
- line++;
- break;
- }
- }
- }
- static void handleMultiLineComment()
- {
- int first;
- int sec = -1;
- while(true)
- {
- first = sec;
- sec = next();
- if(sec == -1 || (first == '*' && sec == '/'))
- {
- break;
- }
- if(sec == '\n')
- {
- line++;
- }
- }
- }
- static void handleSlash()
- {
- switch(peek())
- {
- case '/':
- next();
- handleOneLineComment();
- break;
- case '*':
- next();
- handleMultiLineComment();
- break;
- case '=':
- next();
- add(DIV_SET);
- break;
- default:
- add(DIV);
- }
- }
- static void handleSpecial(int c)
- {
- switch(c)
- {
- case ' ':
- case '\t':
- case '\r': break;
- case '\n': line++;
- break;
- case '"': handleString();
- break;
- case '(': add(OPEN_BRACKET);
- break;
- case ')': add(CLOSE_BRACKET);
- break;
- case '[': add(OPEN_SQUARE_BRACKET);
- break;
- case ']': add(CLOSE_SQUARE_BRACKET);
- break;
- case '{': add(OPEN_CURVED_BRACKET);
- break;
- case '}': add(CLOSE_CURVED_BRACKET);
- break;
- case '$': handleLiteral(c, LITERAL);
- break;
- case '@': handleLiteral(c, LABEL);
- break;
- case ';': add(SEMICOLON);
- break;
- case ',': add(COMMA);
- break;
- case '~': add(BIT_INVERT);
- break;
- case '+': add(next('=') ? ADD_SET: (next('+') ? INC: ADD));
- break;
- case '-': add(next('=') ? SUB_SET: (next('-') ? DEC: SUB));
- break;
- case '!': add(next('=') ? NOT_EQUAL: INVERT);
- break;
- case '=': add(next('=') ? EQUAL: SET);
- break;
- case '*': add(next('=') ? MUL_SET: MUL);
- break;
- case '/': handleSlash();
- break;
- case '%': add(next('=') ? MOD_SET: MOD);
- break;
- case '&': add(next('=') ? BIT_AND_SET: (next('&') ? AND: BIT_AND));
- break;
- case '|': add(next('=') ? BIT_OR_SET: (next('|') ? OR: BIT_OR));
- break;
- case '^': add(next('=') ? BIT_XOR_SET: BIT_XOR);
- break;
- case '<': add('<', LEFT_SHIFT_SET, LEFT_SHIFT, LESS_EQUAL, LESS);
- break;
- case '>': add('>', RIGHT_SHIFT_SET, RIGHT_SHIFT, GREATER_EQUAL, GREATER);
- break;
- default: throw PreScriptException("unknown token " + c, line);
- }
- }
- static void handleChar(int c)
- {
- if(isLetter(c) || c == '_' || c == '.')
- {
- handleLiteral(c, TokenType::LITERAL);
- }
- else if(isDigit(c))
- {
- handleNumber(c);
- }
- else
- {
- handleSpecial(c);
- }
- }
- void Tokenizer::tokenize(std::vector<Token>& inTokens, std::istream& inInput)
- {
- tokens = &inTokens;
- input = &inInput;
-
- line = 1;
- buffer = -1;
- int c;
- while((c = next()) != -1)
- {
- handleChar(c);
- }
- add(EOF_TOKEN);
- }
|