#include #include #include #include #include #include #include #include "tokenizer/FileTokens.h" #include "tokenizer/Tokenizer.h" #include "utils/SnuviUtils.h" #define TOKEN_BUFFER_LENGTH (1024 * 1024) #define ERROR_LENGTH 256 static FileTokens fileTokens; static int fileTokenIndex; static jmp_buf errorJump; static Error* error; static char tokenBuffer[TOKEN_BUFFER_LENGTH]; static int writeIndex = 0; static int readIndex = 0; static int16 line = 1; static void tError(const char* format, ...) { va_list args; va_start(args, format); eInitErrorV(error, "path not set", line, format, args); va_end(args); longjmp(errorJump, 0); } static void tAdd(const void* data, int length) { if(writeIndex + length > TOKEN_BUFFER_LENGTH) { tError("the token buffer is too small"); } memcpy(tokenBuffer + writeIndex, data, length); writeIndex += length; } static void tAddToken(Token token) { unsigned char c = token; tAdd(&c, 1); tAdd(&line, sizeof(line)); } static bool tReadTokens(void* dest, int length) { if(readIndex + length > writeIndex) { return true; } memcpy(dest, tokenBuffer + readIndex, length); readIndex += length; return false; } static FileToken* tNextToken() { if(fileTokenIndex >= fileTokens.length) { tError("unexpected end of file"); } return fileTokens.tokens + fileTokenIndex++; } static bool tReadSingleIf(char c) { if(fileTokenIndex >= fileTokens.length || fileTokens.tokens[fileTokenIndex].type != FT_SINGLE || fileTokens.tokens[fileTokenIndex].single != c) { return false; } fileTokenIndex++; return true; } static void tParseLiteral(const char* buffer) { if(strcmp(buffer, "return") == 0) { tAddToken(T_RETURN); } else if(strcmp(buffer, "if") == 0) { tAddToken(T_IF); } else if(strcmp(buffer, "else") == 0) { tAddToken(T_ELSE); } else if(strcmp(buffer, "while") == 0) { tAddToken(T_WHILE); } else if(strcmp(buffer, "for") == 0) { tAddToken(T_FOR); } else if(strcmp(buffer, "break") == 0) { tAddToken(T_BREAK); } else if(strcmp(buffer, "continue") == 0) { tAddToken(T_CONTINUE); } else if(strcmp(buffer, "int") == 0) { tAddToken(T_INT); } else if(strcmp(buffer, "void") == 0) { tAddToken(T_VOID); } else if(strcmp(buffer, "float") == 0) { tAddToken(T_FLOAT); } else if(strcmp(buffer, "struct") == 0) { tAddToken(T_STRUCT); } else if(strcmp(buffer, "new") == 0) { tAddToken(T_NEW); } else if(strcmp(buffer, "length") == 0) { tAddToken(T_LENGTH); } else if(strcmp(buffer, "true") == 0) { int32 i = 1; tAddToken(T_INT_VALUE); tAdd(&i, sizeof(int32)); } else if(strcmp(buffer, "false") == 0) { int32 i = 0; tAddToken(T_INT_VALUE); tAdd(&i, sizeof(int32)); } else { tAddToken(T_LITERAL); tAdd(buffer, strlen(buffer) + 1); } } static long long tParseInt(const char* s) { long long l = 0; for(int i = 0; s[i] != '\0'; i++) { if(l > (LLONG_MAX / 10)) { tError("invalid number on line %d", line); } l *= 10; if(!isNumber(s[i])) { tError("invalid character in number '%c' on line %d", s[i], line); } int digit = s[i] - '0'; if(l > LLONG_MAX - digit) { tError("invalid number on line %d", line); } l += digit; } return l; } static void tParseNumber(const char* buffer) { long long l = tParseInt(buffer); if(tReadSingleIf('.')) { FileToken* t = tNextToken(); if(t->type != FT_LITERAL) { tError("expected literal after comma of number on line %d", line); } long long comma = tParseInt(t->literal); float f = comma; while(f > 1.0f) { f /= 10.0f; } f += l; tAddToken(T_FLOAT_VALUE); tAdd(&f, sizeof(float)); } else { if(l > INT32_MAX) { tError("invalid int on line %d", line); } int32 i = l; tAddToken(T_INT_VALUE); tAdd(&i, sizeof(int32)); } } static int32 tNextUnicodePart() { FileToken* t = tNextToken(); if(t->type == FT_SINGLE) { return t->single; } else if(t->type == FT_LITERAL) { int length = strlen(t->literal); if(length != 1) { tError("unicode literal has wrong length %d", length); } return t->literal[0]; } else { tError("cannot read next unicode character part on line %d", line); return 0; } } static int32 tUnicode(int32 c) { if(c == '\\') { switch(tNextUnicodePart()) { case '"': c = '"'; break; case '\\': c = '\\'; break; case 'n': c = '\n'; break; case 'r': c = '\r'; break; case 't': c = '\t'; break; default: tError("unknown escaped character at line %d", line); } } if((c & 0xE0) == 0xC0) { c = ((c & 0x1F) << 6) | (tNextUnicodePart() & 0x3F); } else if((c & 0xF0) == 0xE0) { c = ((c & 0xF) << 12) | ((tNextUnicodePart() & 0x3F) << 6); c |= tNextUnicodePart(&fileTokens) & 0x3F; } else if((c & 0xF8) == 0xF0) { c = ((c & 0x7) << 18) | ((tNextUnicodePart() & 0x3F) << 12); c |= (tNextUnicodePart() & 0x3F) << 6; c |= tNextUnicodePart() & 0x3F; } return c; } static int32 tReadNextPart(const char* s, int* index, int length) { if(*index >= length) { tError("missing escape character"); } return s[(*index)++]; } static int32 tStringUnicode(const char* s, int* index, int length) { int32 c = s[(*index)++]; if(c == '\\') { switch(tReadNextPart(s, index, length)) { case '"': c = '"'; break; case '\\': c = '\\'; break; case 'n': c = '\n'; break; case 'r': c = '\r'; break; case 't': c = '\t'; break; default: tError("unknown escaped character at line %d", line); } } if((c & 0xE0) == 0xC0) { c = ((c & 0x1F) << 6) | (tReadNextPart(s, index, length) & 0x3F); } else if((c & 0xF0) == 0xE0) { c = ((c & 0xF) << 12) | ((tReadNextPart(s, index, length) & 0x3F) << 6); c |= tReadNextPart(s, index, length) & 0x3F; } else if((c & 0xF8) == 0xF0) { c = ((c & 0x7) << 18) | ((tReadNextPart(s, index, length) & 0x3F) << 12); c |= (tReadNextPart(s, index, length) & 0x3F) << 6; c |= tReadNextPart(s, index, length) & 0x3F; } return c; } static void tAddString() { tAddToken(T_TEXT); FileToken* t = tNextToken(); if(t->type == FT_SINGLE) { if(t->single != '"') { tError("unexpected single '%d'", t->single); } } else if(t->type == FT_LITERAL) { int length = strlen(t->literal); const char* s = t->literal; int index = 0; while(index < length) { int32 c = tStringUnicode(s, &index, length); tAdd(&c, sizeof(int32)); } if(!tReadSingleIf('"')) { tError("unclosed string"); } } else { tError("unexpected string file token %d", t); } int32 c = 0; tAdd(&c, sizeof(int32)); } static void tAddUnicode() { FileToken* t = tNextToken(); if(t->type == FT_LITERAL) { int length = strlen(t->literal); if(length != 1) { tError("invalid character on line %d", line); } int32 c = t->literal[0]; tAddToken(T_INT_VALUE); tAdd(&c, sizeof(int32)); } else if(t->type == FT_SINGLE) { int32 c = tUnicode(t->single); tAddToken(T_INT_VALUE); tAdd(&c, sizeof(int32)); } else { tError("invalid character on line %d", line); } if(!tReadSingleIf('\'')) { tError("expecting unicode end"); } } static void tAddToken2(Token te, Token t) { if(tReadSingleIf('=')) { tAddToken(te); } else { tAddToken(t); } } static void tAddToken3(int c, Token tc, Token te, Token t) { if(tReadSingleIf(c)) { tAddToken(tc); } else { tAddToken2(te, t); } } static void tAddToken4(int c, Token tce, Token tc, Token te, Token t) { if(tReadSingleIf(c)) { tAddToken2(tce, tc); } else { tAddToken2(te, t); } } static void tParseToken(FileToken* t) { switch(t->type) { case FT_PATH: // TODO: do something useful with the path return; case FT_END_PATH: // TODO: do something useful return; case FT_NEWLINE: line++; return; case FT_LITERAL: { const char* buffer = t->literal; if(isLetter(buffer[0])) { tParseLiteral(buffer); } else if(isNumber(buffer[0])) { tParseNumber(buffer); } else { tError("invalid literal string '%s'", buffer); } return; } case FT_SPACE: return; case FT_SINGLE: { char c = t->single; switch(c) { case ' ': return; case '+': tAddToken3('+', T_INCREMENT, T_ADD_SET, T_ADD); return; case '-': tAddToken3('-', T_DECREMENT, T_SUB_SET, T_SUB); return; case '*': tAddToken2(T_MUL_SET, T_MUL); return; case '/': tAddToken2(T_DIV_SET, T_DIV); return; case '%': tAddToken2(T_MOD_SET, T_MOD); return; case '<': tAddToken4('<', T_LEFT_SHIFT_SET, T_LEFT_SHIFT, T_LESS_EQUAL, T_LESS); return; case '>': tAddToken4('>', T_RIGHT_SHIFT_SET, T_RIGHT_SHIFT, T_GREATER_EQUAL, T_GREATER); return; case '=': tAddToken2(T_EQUAL, T_SET); return; case '!': tAddToken2(T_NOT_EQUAL, T_NOT); return; case '&': tAddToken3('&', T_AND, T_BIT_AND_SET, T_BIT_AND); return; case '|': tAddToken3('|', T_OR, T_BIT_OR_SET, T_BIT_OR); return; case '~': tAddToken(T_BIT_NOT); return; case '^': tAddToken2(T_BIT_XOR_SET, T_BIT_XOR); return; case ',': tAddToken(T_COMMA); return; case ';': tAddToken(T_SEMICOLON); return; case '(': tAddToken(T_OPEN_BRACKET); return; case ')': tAddToken(T_CLOSE_BRACKET); return; case '{': tAddToken(T_OPEN_CURVED_BRACKET); return; case '}': tAddToken(T_CLOSE_CURVED_BRACKET); return; case '"': tAddString(); return; case '\'': tAddUnicode(); return; case '.': tAddToken(T_POINT); return; case '[': tAddToken(T_OPEN_SQUARE_BRACKET); return; case ']': tAddToken(T_CLOSE_SQUARE_BRACKET); return; } if(isprint(c)) { tError("unknown character on line %d: %c", line, c); } else { tError("unknown character on line %d: %d", line, (int)c); } } } } static void tParseFile() { readIndex = 0; writeIndex = 0; line = 1; fileTokenIndex = 0; while(fileTokenIndex < fileTokens.length) { tParseToken(fileTokens.tokens + fileTokenIndex++); } } void tTokenize(const char* path, Error* e) { error = e; ftInit(path, &fileTokens, e); if(!eHasError(e) && !setjmp(errorJump)) { tParseFile(); } if(eHasError(e)) { ftPrint(&fileTokens); } ftDelete(&fileTokens); } Token tPeekToken() { if(readIndex >= writeIndex) { return T_END; } return tokenBuffer[readIndex]; } bool tReadTokenAndLine(Token* t, int16* line) { if(readIndex >= writeIndex) { return true; } *t = tokenBuffer[readIndex++]; return tReadTokens(line, sizeof(int16)); } bool tReadInt32(int32* i) { return tReadTokens(i, sizeof(int32)); } bool tReadFloat(float* f) { return tReadTokens(f, sizeof(float)); } const char* tReadString() { const char* s = tokenBuffer + readIndex; while(readIndex <= writeIndex) { if(tokenBuffer[readIndex++] == '\0') { return s; } } return NULL; } int tGetMarker() { return readIndex; } void tReset(int marker) { readIndex = marker; }