123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335 |
- #include <sstream>
- #include "tokenizer/Tokenizer.h"
- static unsigned int line = 1;
- static TokenStream* tokens = nullptr;
- static Tokenizer::i32stream* input = nullptr;
- static char32_t buffer = 0;
// Reports a tokenizer error on standard output as "<message> Line: <line>",
// terminated by std::endl (newline plus flush).
// Note: the parameter `line` hides the file-level line counter inside this
// function; the caller decides which line number is reported.
static void onError(const std::string& message, unsigned int line) {
    std::cout << message;
    std::cout << " Line: " << line;
    std::cout << std::endl;
}
// Encodes the code point `c` as a NUL-terminated UTF-8 sequence in `buffer`.
// `buffer` must have room for at least 5 chars (up to 4 encoded bytes plus the
// terminator). The code point is not validated: everything above 0xFFFF is
// written as a four byte sequence and excess high bits are masked away.
static void convertChar(char32_t c, char* buffer) {
    int length;              // number of encoded bytes
    unsigned int leadMarker; // fixed high bits of the first byte
    unsigned int leadMask;   // payload bits that fit into the first byte
    if(c <= 0x7F) {
        length = 1;
        leadMarker = 0x00;
        leadMask = 0x7F;
    } else if(c <= 0x7FF) {
        length = 2;
        leadMarker = 0xC0;
        leadMask = 0x1F;
    } else if(c <= 0xFFFF) {
        length = 3;
        leadMarker = 0xE0;
        leadMask = 0x0F;
    } else {
        length = 4;
        leadMarker = 0xF0;
        leadMask = 0x07;
    }

    // The lead byte carries the most significant payload bits ...
    buffer[0] = static_cast<char>(leadMarker | ((c >> (6 * (length - 1))) & leadMask));
    // ... and every continuation byte carries the next six bits.
    for(int i = 1; i < length; i++) {
        buffer[i] = static_cast<char>(0x80 | ((c >> (6 * (length - 1 - i))) & 0x3F));
    }
    buffer[length] = '\0';
}
// True for the ASCII letters a-z and A-Z only; no other code point counts as
// a letter.
static bool isLetter(char32_t c) {
    const bool isLower = U'a' <= c && c <= U'z';
    const bool isUpper = U'A' <= c && c <= U'Z';
    return isLower || isUpper;
}
// True for the ASCII decimal digits 0-9 only.
static bool isDigit(char32_t c) {
    if(c < U'0') {
        return false;
    }
    return c <= U'9';
}
- static bool isValidNameStart(char32_t c) {
- return isLetter(c) || c == '.' || c == '_';
- }
- static bool isValidNamePart(char32_t c) {
- return isDigit(c) || isValidNameStart(c);
- }
- static bool next(char32_t& c) {
- if(buffer != 0) {
- c = buffer;
- buffer = 0;
- return true;
- }
- c = input->get();
- return input->good();
- }
- static bool peek(char32_t& c) {
- if(buffer != 0 || next(buffer)) {
- c = buffer;
- return true;
- }
- return false;
- }
- static bool nextIf(char32_t c) {
- char32_t nextChar;
- if(peek(nextChar) && c == nextChar) {
- next(nextChar);
- return true;
- }
- return false;
- }
- static void add(TokenType type) {
- tokens->add(type, line);
- }
- static void add(TokenType type, const std::string& text) {
- tokens->add(type, line, text);
- }
- static TokenType chooseTokenType(char c, TokenType aCharEqual, TokenType aChar, TokenType aEqual, TokenType other) {
- if(nextIf(c)) {
- if(nextIf('=')) {
- return aCharEqual;
- }
- return aChar;
- } else if(nextIf('=')) {
- return aEqual;
- }
- return other;
- }
- static bool handleLiteral(char32_t c, TokenType type) {
- std::stringstream sBuilder;
- sBuilder << (char) c;
- while(true) {
- char32_t data;
- if(!peek(data) || !isValidNamePart(data)) {
- break;
- }
- sBuilder << (char) data;
- next(data);
- }
- std::string s = sBuilder.str();
- if(s == "if") {
- add(TokenType::IF);
- } else if(s == "if") {
- add(TokenType::IF);
- } else if(s == "else") {
- add(TokenType::ELSE);
- } else if(s == "elseif") {
- add(TokenType::ELSEIF);
- } else if(s == "while") {
- add(TokenType::WHILE);
- } else if(s == "try") {
- add(TokenType::TRY);
- } else if(s == "catch") {
- add(TokenType::CATCH);
- } else if(s == "for") {
- add(TokenType::FOR);
- } else if(s == "function") {
- add(TokenType::FUNCTION);
- } else if(s == "break") {
- add(TokenType::BREAK);
- } else if(s == "continue") {
- add(TokenType::CONTINUE);
- } else if(s == "return") {
- add(TokenType::RETURN);
- } else if(s == "true") {
- add(TokenType::TRUE);
- } else if(s == "false") {
- add(TokenType::FALSE);
- } else if(s == "null") {
- add(TokenType::NULL_TOKEN);
- } else {
- add(type, s);
- }
- return false;
- }
- static bool handleNumber(char32_t c) {
- double number = c - '0';
- char32_t data;
- while(peek(data)) {
- if(!isDigit(data)) {
- if(data != '.') {
- break;
- }
- next(data);
- double factor = 10;
- while(peek(data) && isDigit(data)) {
- number += (data - '0') / factor;
- factor *= 10;
- next(data);
- }
- break;
- }
- number = (number * 10) + (data - '0');
- next(data);
- }
- tokens->add(NUMBER, line, number);
- return false;
- }
- static bool handleString() {
- std::stringstream ss;
- unsigned int oldLine = line;
- while(true) {
- char32_t data;
- if(!next(data)) {
- onError("non closed string literal", oldLine);
- return true;
- }
- if(data == '"') {
- add(STRING, ss.str());
- return false;
- }
- if(data == '\n') {
- line++;
- }
- if(data == '\\') {
- char32_t escape;
- if(!next(escape)) {
- onError("missing escaped character", line);
- return true;
- }
- switch(escape) {
- case 'n': data = '\n';
- break;
- case '\\': data = '\\';
- break;
- case '"': data = '"';
- break;
- default:
- onError("invalid escaped character", line);
- return true;
- }
- }
- char buffer[5];
- convertChar(data, buffer);
- ss << buffer;
- }
- }
- static bool handleOneLineComment() {
- char32_t data;
- while(next(data) && data != '\n');
- line++;
- return false;
- }
- static bool handleMultiLineComment() {
- char32_t first;
- char32_t sec = 0;
- unsigned int oldLine = line;
- while(true) {
- first = sec;
- if(!next(sec)) {
- onError("unclosed multiline comment", oldLine);
- return true;
- }
- if(first == '*' && sec == '/') {
- return false;
- }
- line += (sec == '\n');
- }
- }
- static bool handleSlash() {
- if(nextIf('/')) {
- return handleOneLineComment();
- } else if(nextIf('*')) {
- return handleMultiLineComment();
- } else if(nextIf('=')) {
- add(DIV_SET);
- return false;
- }
- add(DIV);
- return false;
- }
- static bool handleSpecial(char32_t c) {
- switch(c) {
- case ' ':
- case '\t':
- case '\r':
- return false;
- case '\n': line++;
- return false;
- case '"':
- return handleString();
- case '(': add(OPEN_BRACKET);
- return false;
- case ')': add(CLOSE_BRACKET);
- return false;
- case '[': add(OPEN_SQUARE_BRACKET);
- return false;
- case ']': add(CLOSE_SQUARE_BRACKET);
- return false;
- case '{': add(OPEN_CURVED_BRACKET);
- return false;
- case '}': add(CLOSE_CURVED_BRACKET);
- return false;
- case '$':
- return handleLiteral(c, LITERAL);
- case '@':
- return handleLiteral(c, LABEL);
- case ';': add(SEMICOLON);
- return false;
- case ',': add(COMMA);
- return false;
- case '~': add(BIT_INVERT);
- return false;
- case '+': add(nextIf('=') ? ADD_SET: (nextIf('+') ? INC: ADD));
- return false;
- case '-': add(nextIf('=') ? SUB_SET: (nextIf('-') ? DEC: SUB));
- return false;
- case '!': add(nextIf('=') ? NOT_EQUAL: INVERT);
- break;
- case '=': add(nextIf('=') ? EQUAL: SET);
- return false;
- case '*': add(nextIf('=') ? MUL_SET: MUL);
- return false;
- case '/':
- return handleSlash();
- case '%': add(nextIf('=') ? MOD_SET: MOD);
- return false;
- case '&': add(nextIf('=') ? BIT_AND_SET: (nextIf('&') ? AND: BIT_AND));
- return false;
- case '|': add(nextIf('=') ? BIT_OR_SET: (nextIf('|') ? OR: BIT_OR));
- return false;
- case '^': add(nextIf('=') ? BIT_XOR_SET: BIT_XOR);
- return false;
- case '<': add(chooseTokenType('<', LEFT_SHIFT_SET, LEFT_SHIFT, LESS_EQUAL, LESS));
- return false;
- case '>': add(chooseTokenType('>', RIGHT_SHIFT_SET, RIGHT_SHIFT, GREATER_EQUAL, GREATER));
- return false;
- }
- char buffer[5];
- convertChar(c, buffer);
- onError(std::string("unknown token '") + buffer + "'", line);
- return true;
- }
- static bool handleChar(char32_t c) {
- if(isValidNameStart(c)) {
- return handleLiteral(c, TokenType::LITERAL);
- } else if(isDigit(c)) {
- return handleNumber(c);
- }
- return handleSpecial(c);
- }
- bool Tokenizer::tokenize(TokenStream& inTokens, i32stream& inInput) {
- tokens = &inTokens;
- input = &inInput;
- line = 1;
- buffer = 0;
- char32_t c;
- while(next(c)) {
- if(handleChar(c)) {
- return true;
- }
- }
- add(EOF_TOKEN);
- return false;
- }
|