123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380 |
- #include <stdio.h>
- #include <uchar.h>
- #include <string.h>
- #include "tokenizer/Tokenizer.h"
// Tokenizer state. The tokenizer processes one file at a time and is not
// reentrant: all state lives in these file-scope statics.
static FILE* input = NULL;          // source file currently being tokenized
static TokenStream* tokens = NULL;  // destination stream for produced tokens
static unsigned int line = 1;       // current 1-based line number (for token tagging and errors)
static char32_t buffer = 0;         // one-character pushback buffer; 0 means empty
// Reports a tokenizer error with the line it occurred on.
// Fix: write diagnostics to stderr instead of stdout, so error text is not
// interleaved with normal program output. The parameter is also renamed so it
// no longer shadows the file-scope `line` counter.
static void tokenizer_onError(const char* message, unsigned int errorLine) {
    fprintf(stderr, "%s Line: %u\n", message, errorLine);
}
// Writes the raw UTF-8 bytes packed inside `c` into `buffer` (most significant
// byte first) and returns the number of bytes written (1-4). Note: `c` holds
// the UTF-8 byte sequence packed into a char32_t, not a Unicode code point.
// No NUL terminator is appended.
static size_t tokenizer_printChar(char32_t c, char* buffer) {
    // Determine the sequence length from the packed value's magnitude.
    size_t length = 4;
    if(c <= 0x7F) {
        length = 1;
    } else if(c < 0xE00000) {
        length = 2;
    } else if(c <= 0xF0000000) {
        length = 3;
    }
    // Emit the bytes from the highest occupied position downwards.
    for(size_t i = 0; i < length; i++) {
        size_t shift = 8 * (length - 1 - i);
        buffer[i] = (char) ((c >> shift) & 0xFF);
    }
    return length;
}
// True for ASCII letters a-z / A-Z; multi-byte characters never qualify.
static bool tokenizer_isLetter(char32_t c) {
    return ('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z');
}
// True for ASCII decimal digits 0-9.
static bool tokenizer_isDigit(char32_t c) {
    return '0' <= c && c <= '9';
}
// True for characters that may begin a name: ASCII letters, '.' and '_'.
// (The letter test is inlined here; behavior matches tokenizer_isLetter.)
static bool tokenizer_isValidNameStart(char32_t c) {
    if(c == '.' || c == '_') {
        return true;
    }
    return ('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z');
}
// True for characters that may continue a name: digits plus everything a name
// may start with (letters, '.', '_').
static bool tokenizer_isValidNamePart(char32_t c) {
    return ('0' <= c && c <= '9')
        || ('a' <= c && c <= 'z')
        || ('A' <= c && c <= 'Z')
        || c == '.' || c == '_';
}
- static bool tokenizer_next(char32_t* c) {
- if(buffer != 0) {
- *c = buffer;
- buffer = 0;
- return true;
- }
- int in = fgetc(input);
- if(in == EOF) {
- return false;
- }
- if((in & 0x80) == 0) {
- *c = in;
- return true;
- }
- if((in >> 5) == 0x6) {
- *c = (in << 8) | fgetc(input);
- return true;
- }
- if((in >> 4) == 0xE) {
- *c = (in << 16) | (fgetc(input) << 8) | fgetc(input);
- return true;
- }
- if((in >> 3) == 0x1E) {
- *c = (in << 24) | (fgetc(input) << 16) | (fgetc(input) << 8) | fgetc(input);
- return true;
- }
- return true;
- }
- static bool tokenizer_peek(char32_t* c) {
- if(buffer != 0 || tokenizer_next(&buffer)) {
- *c = buffer;
- return true;
- }
- return false;
- }
- static bool tokenizer_nextIf(char32_t c) {
- char32_t nextChar;
- if(tokenizer_peek(&nextChar) && c == nextChar) {
- tokenizer_next(&nextChar);
- return true;
- }
- return false;
- }
// Appends a value-less token to the output stream, tagged with the current line.
static void tokenizer_addToken(Token token) {
    tokens->add(token, line);
}
// Appends a token carrying string payload (e.g. LITERAL, STRING, LABEL),
// tagged with the current line.
static void tokenizer_addStringToken(Token token, const char* text) {
    tokens->add(token, line, text);
}
- static Token tokenizer_chooseToken(char c, Token aCharEqual, Token aChar, Token aEqual, Token other) {
- if(tokenizer_nextIf(c)) {
- if(tokenizer_nextIf('=')) {
- return aCharEqual;
- }
- return aChar;
- } else if(tokenizer_nextIf('=')) {
- return aEqual;
- }
- return other;
- }
- static bool tokenizer_handleLiteral(char32_t c, Token token) {
- const size_t bufferSize = 1024;
- char buffer[bufferSize];
- size_t index = 1;
- buffer[0] = c;
- while(index < bufferSize - 1) {
- char32_t data;
- if(!tokenizer_peek(&data) || !tokenizer_isValidNamePart(data)) {
- break;
- }
- buffer[index++] = data;
- tokenizer_next(&data);
- }
- buffer[index] = '\0';
- if(strcmp(buffer, "if") == 0) {
- tokenizer_addToken(Token::IF);
- } else if(strcmp(buffer, "else") == 0) {
- tokenizer_addToken(Token::ELSE);
- } else if(strcmp(buffer, "elseif") == 0) {
- tokenizer_addToken(Token::ELSEIF);
- } else if(strcmp(buffer, "while") == 0) {
- tokenizer_addToken(Token::WHILE);
- } else if(strcmp(buffer, "try") == 0) {
- tokenizer_addToken(Token::TRY);
- } else if(strcmp(buffer, "catch") == 0) {
- tokenizer_addToken(Token::CATCH);
- } else if(strcmp(buffer, "for") == 0) {
- tokenizer_addToken(Token::FOR);
- } else if(strcmp(buffer, "function") == 0) {
- tokenizer_addToken(Token::FUNCTION);
- } else if(strcmp(buffer, "break") == 0) {
- tokenizer_addToken(Token::BREAK);
- } else if(strcmp(buffer, "continue") == 0) {
- tokenizer_addToken(Token::CONTINUE);
- } else if(strcmp(buffer, "return") == 0) {
- tokenizer_addToken(Token::RETURN);
- } else if(strcmp(buffer, "true") == 0) {
- tokenizer_addToken(Token::TRUE);
- } else if(strcmp(buffer, "false") == 0) {
- tokenizer_addToken(Token::FALSE);
- } else if(strcmp(buffer, "null") == 0) {
- tokenizer_addToken(Token::NULL_TOKEN);
- } else {
- tokenizer_addStringToken(token, buffer);
- }
- return false;
- }
- static bool tokenizer_handleNumber(char32_t c) {
- double number = c - '0';
- char32_t data;
- while(tokenizer_peek(&data)) {
- if(!tokenizer_isDigit(data)) {
- if(data != '.') {
- break;
- }
- tokenizer_next(&data);
- double factor = 10;
- while(tokenizer_peek(&data) && tokenizer_isDigit(data)) {
- number += (data - '0') / factor;
- factor *= 10;
- tokenizer_next(&data);
- }
- break;
- }
- number = (number * 10) + (data - '0');
- tokenizer_next(&data);
- }
- tokens->add(Token::NUMBER, line, number);
- return false;
- }
- static bool tokenizer_handleString() {
- const size_t bufferSize = 1024;
- char buffer[bufferSize];
- size_t index = 0;
- unsigned int oldLine = line;
- while(index + 4 < bufferSize) {
- char32_t data;
- if(!tokenizer_next(&data)) {
- tokenizer_onError("non closed string literal", oldLine);
- return true;
- }
- if(data == '"') {
- buffer[index] = '\0';
- tokenizer_addStringToken(Token::STRING, buffer);
- return false;
- }
- if(data == '\n') {
- line++;
- }
- if(data == '\\') {
- char32_t escape;
- if(!tokenizer_next(&escape)) {
- tokenizer_onError("missing escaped character", line);
- return true;
- }
- switch(escape) {
- case 'n': data = '\n';
- break;
- case '\\': data = '\\';
- break;
- case '"': data = '"';
- break;
- default:
- tokenizer_onError("invalid escaped character", line);
- return true;
- }
- }
- index += tokenizer_printChar(data, buffer + index);
- }
- tokenizer_onError("string buffer to small", line);
- return true;
- }
- static bool tokenizer_handleOneLineComment() {
- char32_t data;
- while(tokenizer_next(&data) && data != '\n');
- line++;
- return false;
- }
- static bool tokenizer_handleMultiLineComment() {
- char32_t first;
- char32_t sec = 0;
- unsigned int oldLine = line;
- while(true) {
- first = sec;
- if(!tokenizer_next(&sec)) {
- tokenizer_onError("unclosed multiline comment", oldLine);
- return true;
- }
- if(first == '*' && sec == '/') {
- return false;
- }
- line += (sec == '\n');
- }
- }
- static bool tokenizer_handleSlash() {
- if(tokenizer_nextIf('/')) {
- return tokenizer_handleOneLineComment();
- } else if(tokenizer_nextIf('*')) {
- return tokenizer_handleMultiLineComment();
- } else if(tokenizer_nextIf('=')) {
- tokenizer_addToken(Token::DIV_SET);
- return false;
- }
- tokenizer_addToken(Token::DIV);
- return false;
- }
- static bool tokenizer_handleSpecial(char32_t c) {
- switch(c) {
- case ' ':
- case '\t':
- case '\r':
- return false;
- case '\n': line++;
- return false;
- case '"':
- return tokenizer_handleString();
- case '(': tokenizer_addToken(Token::OPEN_BRACKET);
- return false;
- case ')': tokenizer_addToken(Token::CLOSE_BRACKET);
- return false;
- case '[': tokenizer_addToken(Token::OPEN_SQUARE_BRACKET);
- return false;
- case ']': tokenizer_addToken(Token::CLOSE_SQUARE_BRACKET);
- return false;
- case '{': tokenizer_addToken(Token::OPEN_CURVED_BRACKET);
- return false;
- case '}': tokenizer_addToken(Token::CLOSE_CURVED_BRACKET);
- return false;
- case '$':
- return tokenizer_handleLiteral(c, Token::LITERAL);
- case '@':
- return tokenizer_handleLiteral(c, Token::LABEL);
- case ';': tokenizer_addToken(Token::SEMICOLON);
- return false;
- case ',': tokenizer_addToken(Token::COMMA);
- return false;
- case '~': tokenizer_addToken(Token::BIT_INVERT);
- return false;
- case '+': tokenizer_addToken(tokenizer_nextIf('=') ? Token::ADD_SET: (tokenizer_nextIf('+') ? Token::INC: Token::ADD));
- return false;
- case '-': tokenizer_addToken(tokenizer_nextIf('=') ? Token::SUB_SET: (tokenizer_nextIf('-') ? Token::DEC: Token::SUB));
- return false;
- case '!': tokenizer_addToken(tokenizer_nextIf('=') ? Token::NOT_EQUAL: Token::INVERT);
- break;
- case '=': tokenizer_addToken(tokenizer_nextIf('=') ? Token::EQUAL: Token::SET);
- return false;
- case '*': tokenizer_addToken(tokenizer_nextIf('=') ? Token::MUL_SET: Token::MUL);
- return false;
- case '/':
- return tokenizer_handleSlash();
- case '%': tokenizer_addToken(tokenizer_nextIf('=') ? Token::MOD_SET: Token::MOD);
- return false;
- case '&': tokenizer_addToken(tokenizer_nextIf('=') ? Token::BIT_AND_SET: (tokenizer_nextIf('&') ? Token::AND: Token::BIT_AND));
- return false;
- case '|': tokenizer_addToken(tokenizer_nextIf('=') ? Token::BIT_OR_SET: (tokenizer_nextIf('|') ? Token::OR: Token::BIT_OR));
- return false;
- case '^': tokenizer_addToken(tokenizer_nextIf('=') ? Token::BIT_XOR_SET: Token::BIT_XOR);
- return false;
- case '<': tokenizer_addToken(tokenizer_chooseToken('<', Token::LEFT_SHIFT_SET, Token::LEFT_SHIFT, Token::LESS_EQUAL, Token::LESS));
- return false;
- case '>': tokenizer_addToken(tokenizer_chooseToken('>', Token::RIGHT_SHIFT_SET, Token::RIGHT_SHIFT, Token::GREATER_EQUAL, Token::GREATER));
- return false;
- }
- char buffer[32];
- strncpy(buffer, "unknown token '", 32);
- size_t index = strlen(buffer);
- index += tokenizer_printChar(c, buffer + index);
- buffer[index] = '\'';
- buffer[index + 1] = '\0';
- tokenizer_onError(buffer, line);
- return true;
- }
- static bool tokenizer_handleChar(char32_t c) {
- if(tokenizer_isValidNameStart(c)) {
- return tokenizer_handleLiteral(c, Token::LITERAL);
- } else if(tokenizer_isDigit(c)) {
- return tokenizer_handleNumber(c);
- }
- return tokenizer_handleSpecial(c);
- }
- bool tokenize(TokenStream* tokenStream, const char* inputPath) {
- input = fopen(inputPath, "r");
- if(input == NULL) {
- return true;
- }
- tokens = tokenStream;
- line = 1;
- buffer = 0;
- char32_t c;
- while(tokenizer_next(&c)) {
- if(tokenizer_handleChar(c)) {
- return true;
- }
- }
- tokenizer_addToken(Token::EOF_TOKEN);
- fclose(input);
- input = NULL;
- return false;
- }
|