123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328 |
- #include <iostream>
- #include <fstream>
- #include "tokenizer/Tokenizer.h"
- #include "utils/String.h"
- static void onError(const String& message, unsigned int line) {
- std::cout << message << " Line: " << line << "\n";
- }
// Returns true for the ASCII letters a-z and A-Z only.
static bool isLetter(char32_t c) {
    const bool isLowerCase = 'a' <= c && c <= 'z';
    if(isLowerCase) {
        return true;
    }
    const bool isUpperCase = 'A' <= c && c <= 'Z';
    return isUpperCase;
}
// Returns true for the ASCII decimal digits 0-9 only.
static bool isDigit(char32_t c) {
    if(c < '0') {
        return false;
    }
    return c <= '9';
}
- static bool isValidNameStart(char32_t c) {
- return isLetter(c) || c == '.' || c == '_';
- }
- static bool isValidNamePart(char32_t c) {
- return isDigit(c) || isValidNameStart(c);
- }
- class Data {
- public:
- Data(const char* inputPath, TokenStream& tokens) : tokens(tokens) {
- stream.open(inputPath);
- }
- bool hasFileError() {
- return !stream.good();
- }
- bool next(char32_t& c) {
- if(buffer != 0) {
- c = buffer;
- buffer = 0;
- return true;
- }
- c = stream.get();
- return stream.good();
- }
- bool peek(char32_t& c) {
- if(buffer != 0 || next(buffer)) {
- c = buffer;
- return true;
- }
- return false;
- }
- bool nextIf(char32_t c) {
- char32_t nextChar;
- if(peek(nextChar) && c == nextChar) {
- next(nextChar);
- return true;
- }
- return false;
- }
- void addToken(Token token) {
- tokens.add(token, line);
- }
- void addToken(Token token, const char* text) {
- tokens.add(token, line, text);
- }
- Token chooseToken(char c, Token aCharEqual, Token aChar, Token aEqual, Token other) {
- if(nextIf(c)) {
- if(nextIf('=')) {
- return aCharEqual;
- }
- return aChar;
- } else if(nextIf('=')) {
- return aEqual;
- }
- return other;
- }
- bool handleLiteral(char32_t c, Token token) {
- String s;
- s += (char) c;
- while(true) {
- if(s.isFull()) {
- onError("string buffer to small", line);
- return true;
- }
- char32_t data;
- if(!peek(data) || !isValidNamePart(data)) {
- break;
- }
- s += (char) data;
- next(data);
- }
- if(s == "if") {
- addToken(Token::IF);
- } else if(s == "else") {
- addToken(Token::ELSE);
- } else if(s == "elseif") {
- addToken(Token::ELSEIF);
- } else if(s == "while") {
- addToken(Token::WHILE);
- } else if(s == "try") {
- addToken(Token::TRY);
- } else if(s == "catch") {
- addToken(Token::CATCH);
- } else if(s == "for") {
- addToken(Token::FOR);
- } else if(s == "function") {
- addToken(Token::FUNCTION);
- } else if(s == "break") {
- addToken(Token::BREAK);
- } else if(s == "continue") {
- addToken(Token::CONTINUE);
- } else if(s == "return") {
- addToken(Token::RETURN);
- } else if(s == "true") {
- addToken(Token::TRUE);
- } else if(s == "false") {
- addToken(Token::FALSE);
- } else if(s == "null") {
- addToken(Token::NULL_TOKEN);
- } else {
- addToken(token, s);
- }
- return false;
- }
- bool handleNumber(char32_t c) {
- double number = c - '0';
- char32_t data;
- while(peek(data)) {
- if(!isDigit(data)) {
- if(data != '.') {
- break;
- }
- next(data);
- double factor = 10;
- while(peek(data) && isDigit(data)) {
- number += (data - '0') / factor;
- factor *= 10;
- next(data);
- }
- break;
- }
- number = (number * 10) + (data - '0');
- next(data);
- }
- tokens.add(Token::NUMBER, line, number);
- return false;
- }
- bool handleString() {
- String s;
- unsigned int oldLine = line;
- while(!s.isFull()) {
- char32_t data;
- if(!next(data)) {
- onError("non closed string literal", oldLine);
- return true;
- }
- if(data == '"') {
- addToken(Token::STRING, s);
- return false;
- }
- if(data == '\n') {
- line++;
- }
- if(data == '\\') {
- char32_t escape;
- if(!next(escape)) {
- onError("missing escaped character", line);
- return true;
- }
- switch(escape) {
- case 'n': data = '\n';
- break;
- case '\\': data = '\\';
- break;
- case '"': data = '"';
- break;
- default:
- onError("invalid escaped character", line);
- return true;
- }
- }
- s += data;
- }
- onError("string buffer to small", line);
- return true;
- }
- bool handleOneLineComment() {
- char32_t data;
- while(next(data) && data != '\n');
- line++;
- return false;
- }
- bool handleMultiLineComment() {
- char32_t first;
- char32_t sec = 0;
- unsigned int oldLine = line;
- while(true) {
- first = sec;
- if(!next(sec)) {
- onError("unclosed multiline comment", oldLine);
- return true;
- }
- if(first == '*' && sec == '/') {
- return false;
- }
- line += (sec == '\n');
- }
- }
- bool handleSlash() {
- if(nextIf('/')) {
- return handleOneLineComment();
- } else if(nextIf('*')) {
- return handleMultiLineComment();
- } else if(nextIf('=')) {
- addToken(Token::DIV_SET);
- return false;
- }
- addToken(Token::DIV);
- return false;
- }
- bool handleSpecial(char32_t c) {
- switch(c) {
- case ' ':
- case '\t':
- case '\r':
- return false;
- case '\n': line++;
- return false;
- case '"':
- return handleString();
- case '(': addToken(Token::OPEN_BRACKET);
- return false;
- case ')': addToken(Token::CLOSE_BRACKET);
- return false;
- case '[': addToken(Token::OPEN_SQUARE_BRACKET);
- return false;
- case ']': addToken(Token::CLOSE_SQUARE_BRACKET);
- return false;
- case '{': addToken(Token::OPEN_CURVED_BRACKET);
- return false;
- case '}': addToken(Token::CLOSE_CURVED_BRACKET);
- return false;
- case '$':
- return handleLiteral(c, Token::LITERAL);
- case '@':
- return handleLiteral(c, Token::LABEL);
- case ';': addToken(Token::SEMICOLON);
- return false;
- case ',': addToken(Token::COMMA);
- return false;
- case '~': addToken(Token::BIT_INVERT);
- return false;
- case '+': addToken(nextIf('=') ? Token::ADD_SET: (nextIf('+') ? Token::INC: Token::ADD));
- return false;
- case '-': addToken(nextIf('=') ? Token::SUB_SET: (nextIf('-') ? Token::DEC: Token::SUB));
- return false;
- case '!': addToken(nextIf('=') ? Token::NOT_EQUAL: Token::INVERT);
- break;
- case '=': addToken(nextIf('=') ? Token::EQUAL: Token::SET);
- return false;
- case '*': addToken(nextIf('=') ? Token::MUL_SET: Token::MUL);
- return false;
- case '/':
- return handleSlash();
- case '%': addToken(nextIf('=') ? Token::MOD_SET: Token::MOD);
- return false;
- case '&': addToken(nextIf('=') ? Token::BIT_AND_SET: (nextIf('&') ? Token::AND: Token::BIT_AND));
- return false;
- case '|': addToken(nextIf('=') ? Token::BIT_OR_SET: (nextIf('|') ? Token::OR: Token::BIT_OR));
- return false;
- case '^': addToken(nextIf('=') ? Token::BIT_XOR_SET: Token::BIT_XOR);
- return false;
- case '<': addToken(chooseToken('<', Token::LEFT_SHIFT_SET, Token::LEFT_SHIFT, Token::LESS_EQUAL, Token::LESS));
- return false;
- case '>': addToken(chooseToken('>', Token::RIGHT_SHIFT_SET, Token::RIGHT_SHIFT, Token::GREATER_EQUAL, Token::GREATER));
- return false;
- }
- String s("unknown token '");
- s += c;
- s += '\'';
- onError(s, line);
- return true;
- }
- bool handleChar(char32_t c) {
- if(isValidNameStart(c)) {
- return handleLiteral(c, Token::LITERAL);
- } else if(isDigit(c)) {
- return handleNumber(c);
- }
- return handleSpecial(c);
- }
- private:
- std::basic_ifstream<char32_t> stream;
- TokenStream& tokens;
- unsigned int line = 1;
- char32_t buffer = 0;
- };
- bool Tokenizer::tokenize(TokenStream& tokenStream, const char* inputPath) {
- Data d(inputPath, tokenStream);
- if(d.hasFileError()) {
- return true;
- }
- char32_t c;
- while(d.next(c)) {
- if(d.handleChar(c)) {
- return true;
- }
- }
- d.addToken(Token::EOF_TOKEN);
- return false;
- }
|