#include #include "tokenizer/Tokenizer.h" static unsigned int line = 1; static TokenStream* tokens = nullptr; static Tokenizer::i32stream* input = nullptr; static char32_t buffer = 0; static void onError(const std::string& message, unsigned int line) { std::cout << message << " Line: " << line << std::endl; } static void convertChar(char32_t c, char* buffer) { if(c <= 0x7F) { buffer[0] = (char) c; buffer[1] = '\0'; } else if(c <= 0x7FF) { buffer[0] = (char) (0xC0 | ((c >> 6) & 0x1F)); buffer[1] = (char) (0x80 | ((c >> 0) & 0x3F)); buffer[2] = '\0'; } else if(c <= 0xFFFF) { buffer[0] = (char) (0xE0 | ((c >> 12) & 0x0F)); buffer[1] = (char) (0x80 | ((c >> 6) & 0x3F)); buffer[2] = (char) (0x80 | ((c >> 0) & 0x3F)); buffer[3] = '\0'; } else { buffer[0] = (char) (0xF0 | ((c >> 18) & 0x07)); buffer[1] = (char) (0x80 | ((c >> 12) & 0x3F)); buffer[2] = (char) (0x80 | ((c >> 6) & 0x3F)); buffer[3] = (char) (0x80 | ((c >> 0) & 0x3F)); buffer[4] = '\0'; } } static bool isLetter(char32_t c) { return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z'); } static bool isDigit(char32_t c) { return c >= '0' && c <= '9'; } static bool isValidNameStart(char32_t c) { return isLetter(c) || c == '.' || c == '_'; } static bool isValidNamePart(char32_t c) { return isDigit(c) || isValidNameStart(c); } static bool next(char32_t& c) { if(buffer != 0) { c = buffer; buffer = 0; return true; } c = input->get(); return input->good(); } static bool peek(char32_t& c) { if(buffer != 0 || next(buffer)) { c = buffer; return true; } return false; } static bool nextIf(char32_t c) { char32_t nextChar; if(peek(nextChar) && c == nextChar) { next(nextChar); return true; } return false; } static void add(TokenType type) { tokens->add(type, line); } static void add(TokenType type, const std::string& text) { tokens->add(type, line, text); } static TokenType chooseTokenType(char c, TokenType aCharEqual, TokenType aChar, TokenType aEqual, TokenType other) { if(nextIf(c)) { if(nextIf('=')) { return aCharEqual; } return aChar; } else if(nextIf('=')) { return aEqual; } return other; } static bool handleLiteral(char32_t c, TokenType type) { std::stringstream sBuilder; sBuilder << (char) c; while(true) { char32_t data; if(!peek(data) || !isValidNamePart(data)) { break; } sBuilder << (char) data; next(data); } std::string s = sBuilder.str(); if(s == "if") { add(TokenType::IF); } else if(s == "if") { add(TokenType::IF); } else if(s == "else") { add(TokenType::ELSE); } else if(s == "elseif") { add(TokenType::ELSEIF); } else if(s == "while") { add(TokenType::WHILE); } else if(s == "try") { add(TokenType::TRY); } else if(s == "catch") { add(TokenType::CATCH); } else if(s == "for") { add(TokenType::FOR); } else if(s == "function") { add(TokenType::FUNCTION); } else if(s == "break") { add(TokenType::BREAK); } else if(s == "continue") { add(TokenType::CONTINUE); } else if(s == "return") { add(TokenType::RETURN); } else if(s == "true") { add(TokenType::TRUE); } else if(s == "false") { add(TokenType::FALSE); } else if(s == "null") { add(TokenType::NULL_TOKEN); } else { add(type, s); } return false; } static bool handleNumber(char32_t c) { double number = c - '0'; char32_t data; while(peek(data)) { if(!isDigit(data)) { if(data != '.') { break; } next(data); double factor = 10; while(peek(data) && isDigit(data)) { number += (data - '0') / factor; factor *= 10; next(data); } break; } number = (number * 10) + (data - '0'); next(data); } tokens->add(NUMBER, line, number); return false; } static bool handleString() { std::stringstream ss; unsigned int oldLine = line; while(true) { char32_t data; if(!next(data)) { onError("non closed string literal", oldLine); return true; } if(data == '"') { add(STRING, ss.str()); return false; } if(data == '\n') { line++; } if(data == '\\') { char32_t escape; if(!next(escape)) { onError("missing escaped character", line); return true; } switch(escape) { case 'n': data = '\n'; break; case '\\': data = '\\'; break; case '"': data = '"'; break; default: onError("invalid escaped character", line); return true; } } char buffer[5]; convertChar(data, buffer); ss << buffer; } } static bool handleOneLineComment() { char32_t data; while(next(data) && data != '\n'); line++; return false; } static bool handleMultiLineComment() { char32_t first; char32_t sec = 0; unsigned int oldLine = line; while(true) { first = sec; if(!next(sec)) { onError("unclosed multiline comment", oldLine); return true; } if(first == '*' && sec == '/') { return false; } line += (sec == '\n'); } } static bool handleSlash() { if(nextIf('/')) { return handleOneLineComment(); } else if(nextIf('*')) { return handleMultiLineComment(); } else if(nextIf('=')) { add(DIV_SET); return false; } add(DIV); return false; } static bool handleSpecial(char32_t c) { switch(c) { case ' ': case '\t': case '\r': return false; case '\n': line++; return false; case '"': return handleString(); case '(': add(OPEN_BRACKET); return false; case ')': add(CLOSE_BRACKET); return false; case '[': add(OPEN_SQUARE_BRACKET); return false; case ']': add(CLOSE_SQUARE_BRACKET); return false; case '{': add(OPEN_CURVED_BRACKET); return false; case '}': add(CLOSE_CURVED_BRACKET); return false; case '$': return handleLiteral(c, LITERAL); case '@': return handleLiteral(c, LABEL); case ';': add(SEMICOLON); return false; case ',': add(COMMA); return false; case '~': add(BIT_INVERT); return false; case '+': add(nextIf('=') ? ADD_SET: (nextIf('+') ? INC: ADD)); return false; case '-': add(nextIf('=') ? SUB_SET: (nextIf('-') ? DEC: SUB)); return false; case '!': add(nextIf('=') ? NOT_EQUAL: INVERT); break; case '=': add(nextIf('=') ? EQUAL: SET); return false; case '*': add(nextIf('=') ? MUL_SET: MUL); return false; case '/': return handleSlash(); case '%': add(nextIf('=') ? MOD_SET: MOD); return false; case '&': add(nextIf('=') ? BIT_AND_SET: (nextIf('&') ? AND: BIT_AND)); return false; case '|': add(nextIf('=') ? BIT_OR_SET: (nextIf('|') ? OR: BIT_OR)); return false; case '^': add(nextIf('=') ? BIT_XOR_SET: BIT_XOR); return false; case '<': add(chooseTokenType('<', LEFT_SHIFT_SET, LEFT_SHIFT, LESS_EQUAL, LESS)); return false; case '>': add(chooseTokenType('>', RIGHT_SHIFT_SET, RIGHT_SHIFT, GREATER_EQUAL, GREATER)); return false; } char buffer[5]; convertChar(c, buffer); onError(std::string("unknown token '") + buffer + "'", line); return true; } static bool handleChar(char32_t c) { if(isValidNameStart(c)) { return handleLiteral(c, TokenType::LITERAL); } else if(isDigit(c)) { return handleNumber(c); } return handleSpecial(c); } bool Tokenizer::tokenize(TokenStream& inTokens, i32stream& inInput) { tokens = &inTokens; input = &inInput; line = 1; buffer = 0; char32_t c; while(next(c)) { if(handleChar(c)) { return true; } } add(EOF_TOKEN); return false; }