#include <stdio.h>
#include <string.h>
#include <stdbool.h>

#include "tokenizer/Tokenizer.h"

static FILE* input = NULL;
static TokenStream* tokens = NULL;
static unsigned int line = 1;
static char32_t buffer = 0;

static void tokenizer_onError(const char* message, unsigned int line) {
    printf("%s Line: %u\n", message, line);
}

// Writes the packed multi-byte representation produced by tokenizer_next()
// back out as raw UTF-8 bytes; returns the number of bytes written.
static size_t tokenizer_printChar(char32_t c, char* buffer) {
    if(c <= 0x7F) {
        buffer[0] = (char) c;
        return 1;
    } else if(c < 0xE00000) {
        buffer[0] = (char) ((c >> 8) & 0xFF);
        buffer[1] = (char) ((c >> 0) & 0xFF);
        return 2;
    } else if(c <= 0xF0000000) {
        buffer[0] = (char) ((c >> 16) & 0xFF);
        buffer[1] = (char) ((c >> 8) & 0xFF);
        buffer[2] = (char) ((c >> 0) & 0xFF);
        return 3;
    }
    buffer[0] = (char) ((c >> 24) & 0xFF);
    buffer[1] = (char) ((c >> 16) & 0xFF);
    buffer[2] = (char) ((c >> 8) & 0xFF);
    buffer[3] = (char) ((c >> 0) & 0xFF);
    return 4;
}

static bool tokenizer_isLetter(char32_t c) {
    return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z');
}

static bool tokenizer_isDigit(char32_t c) {
    return c >= '0' && c <= '9';
}

static bool tokenizer_isValidNameStart(char32_t c) {
    return tokenizer_isLetter(c) || c == '.' || c == '_';
}

static bool tokenizer_isValidNamePart(char32_t c) {
    return tokenizer_isDigit(c) || tokenizer_isValidNameStart(c);
}

// Reads the next character, consuming a pending peeked character first.
// Multi-byte UTF-8 sequences are packed into a single char32_t with the
// leading byte in the most significant position.
static bool tokenizer_next(char32_t* c) {
    if(buffer != 0) {
        *c = buffer;
        buffer = 0;
        return true;
    }
    int in = fgetc(input);
    if(in == EOF) {
        return false;
    }
    if((in & 0x80) == 0) {
        *c = in;
        return true;
    }
    if((in >> 5) == 0x6) {
        *c = (in << 8) | fgetc(input);
        return true;
    }
    if((in >> 4) == 0xE) {
        *c = (in << 16) | (fgetc(input) << 8) | fgetc(input);
        return true;
    }
    if((in >> 3) == 0x1E) {
        *c = (in << 24) | (fgetc(input) << 16) | (fgetc(input) << 8) | fgetc(input);
        return true;
    }
    *c = in; // stray continuation byte: pass it through instead of leaving *c unset
    return true;
}
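// Illustration (added note, not part of the original source): the packed
// representation round-trips through the two functions above. A two-byte
// UTF-8 character such as U+00E9 ('é', bytes 0xC3 0xA9) is packed by
// tokenizer_next() as (0xC3 << 8) | 0xA9 == 0xC3A9, and tokenizer_printChar()
// classifies any value in (0x7F, 0xE00000) as two bytes and emits them again:
//
//     char out[4];
//     size_t n = tokenizer_printChar(0xC3A9, out);
//     // n == 2, out[0] == (char) 0xC3, out[1] == (char) 0xA9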
static bool tokenizer_peek(char32_t* c) {
    if(buffer != 0 || tokenizer_next(&buffer)) {
        *c = buffer;
        return true;
    }
    return false;
}

static bool tokenizer_nextIf(char32_t c) {
    char32_t nextChar;
    if(tokenizer_peek(&nextChar) && c == nextChar) {
        tokenizer_next(&nextChar);
        return true;
    }
    return false;
}

static void tokenizer_addToken(Token token) {
    tokens->add(token, line);
}

static void tokenizer_addStringToken(Token token, const char* text) {
    tokens->add(token, line, text);
}

// Disambiguates the four operators that can follow a character c:
// "cc=" -> aCharEqual, "cc" -> aChar, "c=" -> aEqual, bare "c" -> other.
static Token tokenizer_chooseToken(char c, Token aCharEqual, Token aChar, Token aEqual, Token other) {
    if(tokenizer_nextIf(c)) {
        if(tokenizer_nextIf('=')) {
            return aCharEqual;
        }
        return aChar;
    } else if(tokenizer_nextIf('=')) {
        return aEqual;
    }
    return other;
}

static bool tokenizer_handleLiteral(char32_t c, Token token) {
    const size_t bufferSize = 1024;
    char buffer[bufferSize];
    size_t index = 1;
    buffer[0] = (char) c;
    while(index < bufferSize - 1) {
        char32_t data;
        if(!tokenizer_peek(&data) || !tokenizer_isValidNamePart(data)) {
            break;
        }
        buffer[index++] = (char) data;
        tokenizer_next(&data);
    }
    buffer[index] = '\0';
    if(strcmp(buffer, "if") == 0) {
        tokenizer_addToken(Token::IF);
    } else if(strcmp(buffer, "else") == 0) {
        tokenizer_addToken(Token::ELSE);
    } else if(strcmp(buffer, "elseif") == 0) {
        tokenizer_addToken(Token::ELSEIF);
    } else if(strcmp(buffer, "while") == 0) {
        tokenizer_addToken(Token::WHILE);
    } else if(strcmp(buffer, "try") == 0) {
        tokenizer_addToken(Token::TRY);
    } else if(strcmp(buffer, "catch") == 0) {
        tokenizer_addToken(Token::CATCH);
    } else if(strcmp(buffer, "for") == 0) {
        tokenizer_addToken(Token::FOR);
    } else if(strcmp(buffer, "function") == 0) {
        tokenizer_addToken(Token::FUNCTION);
    } else if(strcmp(buffer, "break") == 0) {
        tokenizer_addToken(Token::BREAK);
    } else if(strcmp(buffer, "continue") == 0) {
        tokenizer_addToken(Token::CONTINUE);
    } else if(strcmp(buffer, "return") == 0) {
        tokenizer_addToken(Token::RETURN);
    } else if(strcmp(buffer, "true") == 0) {
        tokenizer_addToken(Token::TRUE);
    } else if(strcmp(buffer, "false") == 0) {
        tokenizer_addToken(Token::FALSE);
    } else if(strcmp(buffer, "null") == 0) {
        tokenizer_addToken(Token::NULL_TOKEN);
    } else {
        tokenizer_addStringToken(token, buffer);
    }
    return false;
}

static bool tokenizer_handleNumber(char32_t c) {
    double number = c - '0';
    char32_t data;
    while(tokenizer_peek(&data)) {
        if(!tokenizer_isDigit(data)) {
            if(data != '.') {
                break;
            }
            tokenizer_next(&data);
            double factor = 10;
            while(tokenizer_peek(&data) && tokenizer_isDigit(data)) {
                number += (data - '0') / factor;
                factor *= 10;
                tokenizer_next(&data);
            }
            break;
        }
        number = (number * 10) + (data - '0');
        tokenizer_next(&data);
    }
    tokens->add(Token::NUMBER, line, number);
    return false;
}
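// Illustration (added note, not part of the original source): for the input
// "3.14", tokenizer_handleNumber() is entered with c == '3', so number starts
// at 3. Seeing '.', it switches to the fraction loop, which accumulates
// 3 + 1/10 + 4/100 == 3.14 before a single Token::NUMBER is emitted.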
static bool tokenizer_handleString() {
    const size_t bufferSize = 1024;
    char buffer[bufferSize];
    size_t index = 0;
    unsigned int oldLine = line;
    while(index + 4 < bufferSize) {
        char32_t data;
        if(!tokenizer_next(&data)) {
            tokenizer_onError("non closed string literal", oldLine);
            return true;
        }
        if(data == '"') {
            buffer[index] = '\0';
            tokenizer_addStringToken(Token::STRING, buffer);
            return false;
        }
        if(data == '\n') {
            line++;
        }
        if(data == '\\') {
            char32_t escape;
            if(!tokenizer_next(&escape)) {
                tokenizer_onError("missing escaped character", line);
                return true;
            }
            switch(escape) {
                case 'n': data = '\n'; break;
                case '\\': data = '\\'; break;
                case '"': data = '"'; break;
                default:
                    tokenizer_onError("invalid escaped character", line);
                    return true;
            }
        }
        index += tokenizer_printChar(data, buffer + index);
    }
    tokenizer_onError("string buffer too small", line);
    return true;
}

static bool tokenizer_handleOneLineComment() {
    char32_t data;
    while(tokenizer_next(&data)) {
        if(data == '\n') {
            line++; // only count the newline if one was actually read
            break;
        }
    }
    return false;
}

static bool tokenizer_handleMultiLineComment() {
    char32_t first;
    char32_t sec = 0;
    unsigned int oldLine = line;
    while(true) {
        first = sec;
        if(!tokenizer_next(&sec)) {
            tokenizer_onError("unclosed multiline comment", oldLine);
            return true;
        }
        if(first == '*' && sec == '/') {
            return false;
        }
        line += (sec == '\n');
    }
}

static bool tokenizer_handleSlash() {
    if(tokenizer_nextIf('/')) {
        return tokenizer_handleOneLineComment();
    } else if(tokenizer_nextIf('*')) {
        return tokenizer_handleMultiLineComment();
    } else if(tokenizer_nextIf('=')) {
        tokenizer_addToken(Token::DIV_SET);
        return false;
    }
    tokenizer_addToken(Token::DIV);
    return false;
}

static bool tokenizer_handleSpecial(char32_t c) {
    switch(c) {
        case ' ':
        case '\t':
        case '\r': return false;
        case '\n': line++; return false;
        case '"': return tokenizer_handleString();
        case '(': tokenizer_addToken(Token::OPEN_BRACKET); return false;
        case ')': tokenizer_addToken(Token::CLOSE_BRACKET); return false;
        case '[': tokenizer_addToken(Token::OPEN_SQUARE_BRACKET); return false;
        case ']': tokenizer_addToken(Token::CLOSE_SQUARE_BRACKET); return false;
        case '{': tokenizer_addToken(Token::OPEN_CURVED_BRACKET); return false;
        case '}': tokenizer_addToken(Token::CLOSE_CURVED_BRACKET); return false;
        case '$': return tokenizer_handleLiteral(c, Token::LITERAL);
        case '@': return tokenizer_handleLiteral(c, Token::LABEL);
        case ';': tokenizer_addToken(Token::SEMICOLON); return false;
        case ',': tokenizer_addToken(Token::COMMA); return false;
        case '~': tokenizer_addToken(Token::BIT_INVERT); return false;
        case '+': tokenizer_addToken(tokenizer_nextIf('=') ? Token::ADD_SET : (tokenizer_nextIf('+') ? Token::INC : Token::ADD)); return false;
        case '-': tokenizer_addToken(tokenizer_nextIf('=') ? Token::SUB_SET : (tokenizer_nextIf('-') ? Token::DEC : Token::SUB)); return false;
        case '!': tokenizer_addToken(tokenizer_nextIf('=') ? Token::NOT_EQUAL : Token::INVERT); return false; // was a bare break, which fell through to the unknown-token error below
        case '=': tokenizer_addToken(tokenizer_nextIf('=') ? Token::EQUAL : Token::SET); return false;
        case '*': tokenizer_addToken(tokenizer_nextIf('=') ? Token::MUL_SET : Token::MUL); return false;
        case '/': return tokenizer_handleSlash();
        case '%': tokenizer_addToken(tokenizer_nextIf('=') ? Token::MOD_SET : Token::MOD); return false;
        case '&': tokenizer_addToken(tokenizer_nextIf('=') ? Token::BIT_AND_SET : (tokenizer_nextIf('&') ? Token::AND : Token::BIT_AND)); return false;
        case '|': tokenizer_addToken(tokenizer_nextIf('=') ? Token::BIT_OR_SET : (tokenizer_nextIf('|') ? Token::OR : Token::BIT_OR)); return false;
        case '^': tokenizer_addToken(tokenizer_nextIf('=') ? Token::BIT_XOR_SET : Token::BIT_XOR); return false;
        case '<': tokenizer_addToken(tokenizer_chooseToken('<', Token::LEFT_SHIFT_SET, Token::LEFT_SHIFT, Token::LESS_EQUAL, Token::LESS)); return false;
        case '>': tokenizer_addToken(tokenizer_chooseToken('>', Token::RIGHT_SHIFT_SET, Token::RIGHT_SHIFT, Token::GREATER_EQUAL, Token::GREATER)); return false;
    }
    char buffer[32];
    strncpy(buffer, "unknown token '", 32);
    size_t index = strlen(buffer);
    index += tokenizer_printChar(c, buffer + index);
    buffer[index] = '\'';
    buffer[index + 1] = '\0';
    tokenizer_onError(buffer, line);
    return true;
}

static bool tokenizer_handleChar(char32_t c) {
    if(tokenizer_isValidNameStart(c)) {
        return tokenizer_handleLiteral(c, Token::LITERAL);
    } else if(tokenizer_isDigit(c)) {
        return tokenizer_handleNumber(c);
    }
    return tokenizer_handleSpecial(c);
}

bool tokenize(TokenStream* tokenStream, const char* inputPath) {
    input = fopen(inputPath, "r");
    if(input == NULL) {
        return true;
    }
    tokens = tokenStream;
    line = 1;
    buffer = 0;
    char32_t c;
    while(tokenizer_next(&c)) {
        if(tokenizer_handleChar(c)) {
            fclose(input); // this error path previously leaked the input file
            input = NULL;
            return true;
        }
    }
    tokenizer_addToken(Token::EOF_TOKEN);
    fclose(input);
    input = NULL;
    return false;
}
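/*
 * Usage sketch (illustrative, not part of the original file): how a caller
 * might drive tokenize(). TokenStream's constructor and any inspection API
 * are assumptions here; only its add() overloads appear above.
 *
 *     TokenStream stream;
 *     if(tokenize(&stream, "script.txt")) {
 *         // tokenize() returns true on error (unreadable path, unknown
 *         // token, unterminated string or comment); on success the stream
 *         // ends with Token::EOF_TOKEN.
 *     }
 */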