|
@@ -0,0 +1,385 @@
|
|
|
+#include <stdio.h>
|
|
|
+#include <uchar.h>
|
|
|
+#include <string.h>
|
|
|
+
|
|
|
+#include "tokenizer/Tokenizer.h"
|
|
|
+
|
|
|
/* Tokenizer state (single-threaded; one tokenize() run at a time). */
static FILE* input = NULL;          /* source file currently being read */
static TokenStream* tokens = NULL;  /* output stream tokens are appended to */
static unsigned int line = 1;       /* 1-based line number for error/token tagging */
static char32_t buffer = 0;         /* one-character pushback; 0 means empty
                                       (NUL therefore cannot be buffered) */
|
|
|
+
|
|
|
/* Reports a tokenizer error together with the source line it refers to. */
static void tokenizer_onError(const char* message, unsigned int errorLine) {
    printf("%s Line: %u\n", message, errorLine);
}
|
|
|
+
|
|
|
/*
 * Serializes one character back into its raw byte form.
 * Characters are stored here as their UTF-8 bytes packed big-endian into a
 * char32_t (see tokenizer_next), so emitting them is a matter of deciding the
 * byte count from the value range and writing the bytes most-significant first.
 * Returns the number of bytes written (1..4); 'buffer' must have room for 4.
 */
static size_t tokenizer_printChar(char32_t c, char* buffer) {
    size_t count;
    if(c <= 0x7F) {
        count = 1;                  /* plain ASCII */
    } else if(c < 0xE00000) {
        count = 2;                  /* packed 2-byte sequence (0xC2xx..0xDFxx) */
    } else if(c <= 0xF0000000) {
        count = 3;                  /* packed 3-byte sequence (0xE0xxxx..) */
    } else {
        count = 4;                  /* packed 4-byte sequence (0xF0xxxxxx..) */
    }
    for(size_t i = 0; i < count; i++) {
        buffer[i] = (char) ((c >> (8 * (count - 1 - i))) & 0xFF);
    }
    return count;
}
|
|
|
+
|
|
|
/* True for ASCII letters only; non-ASCII characters are never letters here. */
static bool tokenizer_isLetter(char32_t c) {
    if(c >= 'A' && c <= 'Z') {
        return true;
    }
    return c >= 'a' && c <= 'z';
}
|
|
|
+
|
|
|
/* True for the ASCII decimal digits '0'..'9'. */
static bool tokenizer_isDigit(char32_t c) {
    return !(c < '0' || c > '9');
}
|
|
|
+
|
|
|
/* A name may start with an ASCII letter, a dot or an underscore. */
static bool tokenizer_isValidNameStart(char32_t c) {
    if(c == '.' || c == '_') {
        return true;
    }
    /* inlined ASCII letter check */
    return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z');
}
|
|
|
+
|
|
|
/* Any later character of a name: a digit or anything a name may start with. */
static bool tokenizer_isValidNamePart(char32_t c) {
    return (c >= '0' && c <= '9')
        || (c >= 'a' && c <= 'z')
        || (c >= 'A' && c <= 'Z')
        || c == '.'
        || c == '_';
}
|
|
|
+
|
|
|
+static bool tokenizer_next(char32_t* c) {
|
|
|
+ if(buffer != 0) {
|
|
|
+ *c = buffer;
|
|
|
+ buffer = 0;
|
|
|
+ return true;
|
|
|
+ }
|
|
|
+ int in = fgetc(input);
|
|
|
+ if(in == EOF) {
|
|
|
+ return false;
|
|
|
+ }
|
|
|
+ if((in & 0x80) == 0) {
|
|
|
+ *c = in;
|
|
|
+ return true;
|
|
|
+ }
|
|
|
+ if((in >> 5) == 0x6) {
|
|
|
+ *c = (in << 8) | fgetc(input);
|
|
|
+ return true;
|
|
|
+ }
|
|
|
+ if((in >> 4) == 0xE) {
|
|
|
+ *c = (in << 16) | (fgetc(input) << 8) | fgetc(input);
|
|
|
+ return true;
|
|
|
+ }
|
|
|
+ if((in >> 3) == 0x1E) {
|
|
|
+ *c = (in << 24) | (fgetc(input) << 16) | (fgetc(input) << 8) | fgetc(input);
|
|
|
+ return true;
|
|
|
+ }
|
|
|
+ return true;
|
|
|
+}
|
|
|
+
|
|
|
+static bool tokenizer_peek(char32_t* c) {
|
|
|
+ if(buffer != 0 || tokenizer_next(&buffer)) {
|
|
|
+ *c = buffer;
|
|
|
+
|
|
|
+ return true;
|
|
|
+ }
|
|
|
+ return false;
|
|
|
+}
|
|
|
+
|
|
|
/*
 * Consumes the next character only if it equals 'c'; returns whether it did.
 * On mismatch (or end of input) nothing is consumed.
 */
static bool tokenizer_nextIf(char32_t c) {
    char32_t peeked;
    if(!tokenizer_peek(&peeked) || peeked != c) {
        return false;
    }
    tokenizer_next(&peeked);  /* discard the matched character */
    return true;
}
|
|
|
+
|
|
|
// Appends a payload-free token to the output stream, tagged with the
// current source line.
static void tokenizer_addToken(Token token) {
    addToken(tokens, token, line);
}
|
|
|
+
|
|
|
// Appends a token carrying a string payload (name, label or string literal)
// to the output stream, tagged with the current source line.
static void tokenizer_addStringToken(Token token, const char* text) {
    addStringToken(tokens, token, line, text);
}
|
|
|
+
|
|
|
+static Token tokenizer_chooseToken(char c, Token aCharEqual, Token aChar, Token aEqual, Token other) {
|
|
|
+ if(tokenizer_nextIf(c)) {
|
|
|
+ if(tokenizer_nextIf('=')) {
|
|
|
+ return aCharEqual;
|
|
|
+ }
|
|
|
+ return aChar;
|
|
|
+ } else if(tokenizer_nextIf('=')) {
|
|
|
+
|
|
|
+ return aEqual;
|
|
|
+ }
|
|
|
+ return other;
|
|
|
+}
|
|
|
+
|
|
|
+static bool tokenizer_handleLiteral(char32_t c, Token token) {
|
|
|
+ const size_t bufferSize = 1024;
|
|
|
+ char buffer[bufferSize];
|
|
|
+ size_t index = 1;
|
|
|
+ buffer[0] = c;
|
|
|
+
|
|
|
+ while(index < bufferSize - 1) {
|
|
|
+ char32_t data;
|
|
|
+ if(!tokenizer_peek(&data) || !tokenizer_isValidNamePart(data)) {
|
|
|
+ break;
|
|
|
+ }
|
|
|
+ buffer[index++] = data;
|
|
|
+ tokenizer_next(&data);
|
|
|
+ }
|
|
|
+ buffer[index] = '\0';
|
|
|
+
|
|
|
+ if(strcmp(buffer, "if") == 0) {
|
|
|
+ tokenizer_addToken(IF);
|
|
|
+ } else if(strcmp(buffer, "if") == 0) {
|
|
|
+ tokenizer_addToken(IF);
|
|
|
+ } else if(strcmp(buffer, "else") == 0) {
|
|
|
+ tokenizer_addToken(ELSE);
|
|
|
+ } else if(strcmp(buffer, "elseif") == 0) {
|
|
|
+ tokenizer_addToken(ELSEIF);
|
|
|
+ } else if(strcmp(buffer, "while") == 0) {
|
|
|
+ tokenizer_addToken(WHILE);
|
|
|
+ } else if(strcmp(buffer, "try") == 0) {
|
|
|
+ tokenizer_addToken(TRY);
|
|
|
+ } else if(strcmp(buffer, "catch") == 0) {
|
|
|
+ tokenizer_addToken(CATCH);
|
|
|
+ } else if(strcmp(buffer, "for") == 0) {
|
|
|
+ tokenizer_addToken(FOR);
|
|
|
+ } else if(strcmp(buffer, "function") == 0) {
|
|
|
+ tokenizer_addToken(FUNCTION);
|
|
|
+ } else if(strcmp(buffer, "break") == 0) {
|
|
|
+ tokenizer_addToken(BREAK);
|
|
|
+ } else if(strcmp(buffer, "continue") == 0) {
|
|
|
+ tokenizer_addToken(CONTINUE);
|
|
|
+ } else if(strcmp(buffer, "return") == 0) {
|
|
|
+ tokenizer_addToken(RETURN);
|
|
|
+ } else if(strcmp(buffer, "true") == 0) {
|
|
|
+ tokenizer_addToken(TRUE);
|
|
|
+ } else if(strcmp(buffer, "false") == 0) {
|
|
|
+ tokenizer_addToken(FALSE);
|
|
|
+ } else if(strcmp(buffer, "null") == 0) {
|
|
|
+ tokenizer_addToken(NULL_TOKEN);
|
|
|
+ } else {
|
|
|
+
|
|
|
+ tokenizer_addStringToken(token, buffer);
|
|
|
+ }
|
|
|
+ return false;
|
|
|
+}
|
|
|
+
|
|
|
// Parses a decimal number literal whose first digit 'c' was already consumed.
// Accepts an optional single fractional part ("123", "1.5", "2." are all
// valid); a second '.' terminates the number. The value is accumulated in a
// double and emitted as a NUMBER token. Always returns false (never fails).
static bool tokenizer_handleNumber(char32_t c) {
    double number = c - '0';
    char32_t data;
    while(tokenizer_peek(&data)) {
        if(!tokenizer_isDigit(data)) {
            if(data != '.') {
                break;  // not part of the number; leave it for the caller
            }
            tokenizer_next(&data);  // consume the '.'
            double factor = 10;
            // Each fractional digit contributes digit / 10^position.
            while(tokenizer_peek(&data) && tokenizer_isDigit(data)) {
                number += (data - '0') / factor;
                factor *= 10;
                tokenizer_next(&data);
            }
            break;  // only one fractional part allowed
        }
        number = (number * 10) + (data - '0');
        tokenizer_next(&data);  // consume the integer digit
    }
    addDoubleToken(tokens, NUMBER, line, number);

    return false;
}
|
|
|
+
|
|
|
+static bool tokenizer_handleString() {
|
|
|
+ const size_t bufferSize = 1024;
|
|
|
+ char buffer[bufferSize];
|
|
|
+ size_t index = 0;
|
|
|
+
|
|
|
+ unsigned int oldLine = line;
|
|
|
+ while(index + 4 < bufferSize) {
|
|
|
+ char32_t data;
|
|
|
+ if(!tokenizer_next(&data)) {
|
|
|
+ tokenizer_onError("non closed string literal", oldLine);
|
|
|
+ return true;
|
|
|
+ }
|
|
|
+ if(data == '"') {
|
|
|
+ buffer[index] = '\0';
|
|
|
+ tokenizer_addStringToken(STRING, buffer);
|
|
|
+ return false;
|
|
|
+ }
|
|
|
+ if(data == '\n') {
|
|
|
+ line++;
|
|
|
+ }
|
|
|
+ if(data == '\\') {
|
|
|
+ char32_t escape;
|
|
|
+ if(!tokenizer_next(&escape)) {
|
|
|
+ tokenizer_onError("missing escaped character", line);
|
|
|
+ return true;
|
|
|
+ }
|
|
|
+ switch(escape) {
|
|
|
+ case 'n': data = '\n';
|
|
|
+ break;
|
|
|
+ case '\\': data = '\\';
|
|
|
+ break;
|
|
|
+ case '"': data = '"';
|
|
|
+ break;
|
|
|
+ default:
|
|
|
+ tokenizer_onError("invalid escaped character", line);
|
|
|
+ return true;
|
|
|
+ }
|
|
|
+ }
|
|
|
+ index += tokenizer_printChar(data, buffer + index);
|
|
|
+ }
|
|
|
+ tokenizer_onError("string buffer to small", line);
|
|
|
+
|
|
|
+ return true;
|
|
|
+}
|
|
|
+
|
|
|
+static bool tokenizer_handleOneLineComment() {
|
|
|
+ char32_t data;
|
|
|
+ while(tokenizer_next(&data) && data != '\n');
|
|
|
+ line++;
|
|
|
+
|
|
|
+ return false;
|
|
|
+}
|
|
|
+
|
|
|
+static bool tokenizer_handleMultiLineComment() {
|
|
|
+ char32_t first;
|
|
|
+ char32_t sec = 0;
|
|
|
+ unsigned int oldLine = line;
|
|
|
+ while(true) {
|
|
|
+ first = sec;
|
|
|
+ if(!tokenizer_next(&sec)) {
|
|
|
+ tokenizer_onError("unclosed multiline comment", oldLine);
|
|
|
+ return true;
|
|
|
+ }
|
|
|
+ if(first == '*' && sec == '/') {
|
|
|
+
|
|
|
+ return false;
|
|
|
+ }
|
|
|
+ line += (sec == '\n');
|
|
|
+ }
|
|
|
+}
|
|
|
+
|
|
|
+static bool tokenizer_handleSlash() {
|
|
|
+ if(tokenizer_nextIf('/')) {
|
|
|
+ return tokenizer_handleOneLineComment();
|
|
|
+ } else if(tokenizer_nextIf('*')) {
|
|
|
+ return tokenizer_handleMultiLineComment();
|
|
|
+ } else if(tokenizer_nextIf('=')) {
|
|
|
+ tokenizer_addToken(DIV_SET);
|
|
|
+ return false;
|
|
|
+ }
|
|
|
+ tokenizer_addToken(DIV);
|
|
|
+
|
|
|
+ return false;
|
|
|
+}
|
|
|
+
|
|
|
+static bool tokenizer_handleSpecial(char32_t c) {
|
|
|
+ switch(c) {
|
|
|
+ case ' ':
|
|
|
+ case '\t':
|
|
|
+ case '\r':
|
|
|
+ return false;
|
|
|
+ case '\n': line++;
|
|
|
+ return false;
|
|
|
+ case '"':
|
|
|
+ return tokenizer_handleString();
|
|
|
+ case '(': tokenizer_addToken(OPEN_BRACKET);
|
|
|
+ return false;
|
|
|
+ case ')': tokenizer_addToken(CLOSE_BRACKET);
|
|
|
+ return false;
|
|
|
+ case '[': tokenizer_addToken(OPEN_SQUARE_BRACKET);
|
|
|
+ return false;
|
|
|
+ case ']': tokenizer_addToken(CLOSE_SQUARE_BRACKET);
|
|
|
+ return false;
|
|
|
+ case '{': tokenizer_addToken(OPEN_CURVED_BRACKET);
|
|
|
+ return false;
|
|
|
+ case '}': tokenizer_addToken(CLOSE_CURVED_BRACKET);
|
|
|
+ return false;
|
|
|
+ case '$':
|
|
|
+ return tokenizer_handleLiteral(c, LITERAL);
|
|
|
+ case '@':
|
|
|
+ return tokenizer_handleLiteral(c, LABEL);
|
|
|
+ case ';': tokenizer_addToken(SEMICOLON);
|
|
|
+ return false;
|
|
|
+ case ',': tokenizer_addToken(COMMA);
|
|
|
+ return false;
|
|
|
+ case '~': tokenizer_addToken(BIT_INVERT);
|
|
|
+ return false;
|
|
|
+ case '+': tokenizer_addToken(tokenizer_nextIf('=') ? ADD_SET: (tokenizer_nextIf('+') ? INC: ADD));
|
|
|
+ return false;
|
|
|
+ case '-': tokenizer_addToken(tokenizer_nextIf('=') ? SUB_SET: (tokenizer_nextIf('-') ? DEC: SUB));
|
|
|
+ return false;
|
|
|
+ case '!': tokenizer_addToken(tokenizer_nextIf('=') ? NOT_EQUAL: INVERT);
|
|
|
+ break;
|
|
|
+ case '=': tokenizer_addToken(tokenizer_nextIf('=') ? EQUAL: SET);
|
|
|
+ return false;
|
|
|
+ case '*': tokenizer_addToken(tokenizer_nextIf('=') ? MUL_SET: MUL);
|
|
|
+ return false;
|
|
|
+ case '/':
|
|
|
+ return tokenizer_handleSlash();
|
|
|
+ case '%': tokenizer_addToken(tokenizer_nextIf('=') ? MOD_SET: MOD);
|
|
|
+ return false;
|
|
|
+ case '&': tokenizer_addToken(tokenizer_nextIf('=') ? BIT_AND_SET: (tokenizer_nextIf('&') ? AND: BIT_AND));
|
|
|
+ return false;
|
|
|
+ case '|': tokenizer_addToken(tokenizer_nextIf('=') ? BIT_OR_SET: (tokenizer_nextIf('|') ? OR: BIT_OR));
|
|
|
+ return false;
|
|
|
+ case '^': tokenizer_addToken(tokenizer_nextIf('=') ? BIT_XOR_SET: BIT_XOR);
|
|
|
+ return false;
|
|
|
+ case '<': tokenizer_addToken(tokenizer_chooseToken('<', LEFT_SHIFT_SET, LEFT_SHIFT, LESS_EQUAL, LESS));
|
|
|
+ return false;
|
|
|
+ case '>': tokenizer_addToken(tokenizer_chooseToken('>', RIGHT_SHIFT_SET, RIGHT_SHIFT, GREATER_EQUAL, GREATER));
|
|
|
+ return false;
|
|
|
+ }
|
|
|
+ char buffer[32];
|
|
|
+ strncpy(buffer, "unknown token '", 32);
|
|
|
+ size_t index = strlen(buffer);
|
|
|
+ index += tokenizer_printChar(c, buffer + index);
|
|
|
+ buffer[index] = '\'';
|
|
|
+ buffer[index + 1] = '\0';
|
|
|
+ tokenizer_onError(buffer, line);
|
|
|
+
|
|
|
+ return true;
|
|
|
+}
|
|
|
+
|
|
|
+static bool tokenizer_handleChar(char32_t c) {
|
|
|
+ if(tokenizer_isValidNameStart(c)) {
|
|
|
+ return tokenizer_handleLiteral(c, LITERAL);
|
|
|
+ } else if(tokenizer_isDigit(c)) {
|
|
|
+
|
|
|
+ return tokenizer_handleNumber(c);
|
|
|
+ }
|
|
|
+ return tokenizer_handleSpecial(c);
|
|
|
+}
|
|
|
+
|
|
|
+bool tokenize(TokenStream* tokenStream, const char* inputPath) {
|
|
|
+ input = fopen(inputPath, "r");
|
|
|
+ if(input == NULL) {
|
|
|
+ return true;
|
|
|
+ }
|
|
|
+ tokens = tokenStream;
|
|
|
+ line = 1;
|
|
|
+ buffer = 0;
|
|
|
+
|
|
|
+ char32_t c;
|
|
|
+ while(tokenizer_next(&c)) {
|
|
|
+ if(tokenizer_handleChar(c)) {
|
|
|
+ return true;
|
|
|
+ }
|
|
|
+ }
|
|
|
+ tokenizer_addToken(EOF_TOKEN);
|
|
|
+
|
|
|
+ fclose(input);
|
|
|
+ input = NULL;
|
|
|
+ return false;
|
|
|
+}
|