#include #include #include #include #include #include #include "tokenizer/FileReader.h" #include "tokenizer/Tokenizer.h" #include "utils/SnuviUtils.h" #define TOKEN_BUFFER_LENGTH (1024 * 1024) #define ERROR_LENGTH 256 static FileReader fileReader; static jmp_buf errorJump; static Error* error; static char tokenBuffer[TOKEN_BUFFER_LENGTH]; static int writeIndex = 0; static int readIndex = 0; static int16 line = 1; static void tError(const char* format, ...) { va_list args; va_start(args, format); eInitErrorV(error, line, format, args); va_end(args); longjmp(errorJump, 0); } static void tAdd(const void* data, int length) { if(writeIndex + length > TOKEN_BUFFER_LENGTH) { tError("the token buffer is too small"); } memcpy(tokenBuffer + writeIndex, data, length); writeIndex += length; } static void tAddToken(Token token) { unsigned char c = token; tAdd(&c, 1); tAdd(&line, sizeof(line)); } static bool tReadTokens(void* dest, int length) { if(readIndex + length > writeIndex) { return true; } memcpy(dest, tokenBuffer + readIndex, length); readIndex += length; return false; } static void tParseLiteral(int c) { int index = 1; char buffer[64]; buffer[0] = c; while(isAllowedInName(frPeek(&fileReader))) { if(index >= 63) { tError("literal is too long"); } buffer[index++] = frRead(&fileReader); } buffer[index] = '\0'; if(strcmp(buffer, "return") == 0) { tAddToken(T_RETURN); } else if(strcmp(buffer, "if") == 0) { tAddToken(T_IF); } else if(strcmp(buffer, "else") == 0) { tAddToken(T_ELSE); } else if(strcmp(buffer, "while") == 0) { tAddToken(T_WHILE); } else if(strcmp(buffer, "for") == 0) { tAddToken(T_FOR); } else if(strcmp(buffer, "break") == 0) { tAddToken(T_BREAK); } else if(strcmp(buffer, "continue") == 0) { tAddToken(T_CONTINUE); } else if(strcmp(buffer, "int") == 0) { tAddToken(T_INT); } else if(strcmp(buffer, "void") == 0) { tAddToken(T_VOID); } else if(strcmp(buffer, "float") == 0) { tAddToken(T_FLOAT); } else if(strcmp(buffer, "struct") == 0) { tAddToken(T_STRUCT); } else if(strcmp(buffer, "new") == 0) { tAddToken(T_NEW); } else if(strcmp(buffer, "length") == 0) { tAddToken(T_LENGTH); } else if(strcmp(buffer, "true") == 0) { int32 i = 1; tAddToken(T_INT_VALUE); tAdd(&i, sizeof(int32)); } else if(strcmp(buffer, "false") == 0) { int32 i = 0; tAddToken(T_INT_VALUE); tAdd(&i, sizeof(int32)); } else { tAddToken(T_LITERAL); tAdd(buffer, index + 1); } } static bool tParseInt(char* s, int length, long long* l) { *l = 0; for(int i = 0; i < length; i++) { if(*l > (LLONG_MAX / 10)) { return true; } *l *= 10; int digit = s[i] - '0'; if(*l > LLONG_MAX - digit) { return true; } *l += digit; } return false; } static void tParseNumber(int c) { int index = 1; char buffer[64]; buffer[0] = c; bool point = false; while(true) { int c = frPeek(&fileReader); if(c == '.') { point = true; } else if(!isNumber(c)) { break; } else if(index >= 63) { tError("number is too long"); } buffer[index++] = frRead(&fileReader); } buffer[index] = '\0'; if(point) { char* end = NULL; float f = strtof(buffer, &end); if(end[0] != '\0') { tError("invalid float on line %d", line); } tAddToken(T_FLOAT_VALUE); tAdd(&f, sizeof(float)); } else { long long l; if(tParseInt(buffer, index, &l) || l > INT32_MAX) { tError("invalid int on line %d", line); } int32 i = l; tAddToken(T_INT_VALUE); tAdd(&i, sizeof(int32)); } } static int32 tUnicode(int32 c) { if(c == '\\') { switch(frRead(&fileReader)) { case '"': c = '"'; break; case '\\': c = '\\'; break; case 'n': c = '\n'; break; case 'r': c = '\r'; break; case 't': c = '\t'; break; default: tError("unknown escaped character at line %d", line); } } if((c & 0xE0) == 0xC0) { c = ((c & 0x1F) << 6) | (frRead(&fileReader) & 0x3F); } else if((c & 0xF0) == 0xE0) { c = ((c & 0xF) << 12) | ((frRead(&fileReader) & 0x3F) << 6); c |= frRead(&fileReader) & 0x3F; } else if((c & 0xF8) == 0xF0) { c = ((c & 0x7) << 18) | ((frRead(&fileReader) & 0x3F) << 12); c |= (frRead(&fileReader) & 0x3F) << 6; c |= frRead(&fileReader) & 0x3F; } return c; } static void tAddString() { tAddToken(T_TEXT); while(true) { int32 c = frRead(&fileReader); if(c == '"') { break; } else if(c == EOF) { tError("unclosed string starting at line %d", line); } c = tUnicode(c); tAdd(&c, sizeof(int32)); } int32 c = 0; tAdd(&c, sizeof(int32)); } static void tAddUnicode() { int32 c = frRead(&fileReader); c = tUnicode(c); tAddToken(T_INT_VALUE); tAdd(&c, sizeof(int32)); if(frRead(&fileReader) != '\'') { tError("expecting unicode end"); } } static void tAddToken2(Token te, Token t) { if(frReadIf(&fileReader, '=')) { tAddToken(te); } else { tAddToken(t); } } static void tAddToken3(int c, Token tc, Token te, Token t) { if(frReadIf(&fileReader, c)) { tAddToken(tc); } else { tAddToken2(te, t); } } static void tAddToken4(int c, Token tce, Token tc, Token te, Token t) { if(frReadIf(&fileReader, c)) { tAddToken2(tce, tc); } else { tAddToken2(te, t); } } static void tLineComment() { while(true) { int c = frRead(&fileReader); if(c == EOF || c == '\n') { line++; return; } } } static void tMultipleLineComment() { while(true) { int c = frRead(&fileReader); if(c == EOF) { tError("unclosed comment at line %d", line); } else if(c == '\n') { line++; } else if(c == '*' && frReadIf(&fileReader, '/')) { return; } } } static void tSlash() { if(frReadIf(&fileReader, '/')) { tLineComment(); } else if(frReadIf(&fileReader, '*')) { tMultipleLineComment(); } else { tAddToken2(T_DIV_SET, T_DIV); } } static void tParseToken(int c) { if(isLetter(c)) { tParseLiteral(c); return; } else if(isNumber(c)) { tParseNumber(c); return; } switch(c) { case ' ': return; case '\n': line++; return; case '+': tAddToken3('+', T_INCREMENT, T_ADD_SET, T_ADD); return; case '-': tAddToken3('-', T_DECREMENT, T_SUB_SET, T_SUB); return; case '*': tAddToken2(T_MUL_SET, T_MUL); return; case '/': tSlash(); return; case '%': tAddToken2(T_MOD_SET, T_MOD); return; case '<': tAddToken4('<', T_LEFT_SHIFT_SET, T_LEFT_SHIFT, T_LESS_EQUAL, T_LESS); return; case '>': tAddToken4('>', T_RIGHT_SHIFT_SET, T_RIGHT_SHIFT, T_GREATER_EQUAL, T_GREATER); return; case '=': tAddToken2(T_EQUAL, T_SET); return; case '!': tAddToken2(T_NOT_EQUAL, T_NOT); return; case '&': tAddToken3('&', T_AND, T_BIT_AND_SET, T_BIT_AND); return; case '|': tAddToken3('|', T_OR, T_BIT_OR_SET, T_BIT_OR); return; case '~': tAddToken(T_BIT_NOT); return; case '^': tAddToken2(T_BIT_XOR_SET, T_BIT_XOR); return; case ',': tAddToken(T_COMMA); return; case ';': tAddToken(T_SEMICOLON); return; case '(': tAddToken(T_OPEN_BRACKET); return; case ')': tAddToken(T_CLOSE_BRACKET); return; case '{': tAddToken(T_OPEN_CURVED_BRACKET); return; case '}': tAddToken(T_CLOSE_CURVED_BRACKET); return; case '"': tAddString(); return; case '\'': tAddUnicode(); return; case '.': tAddToken(T_POINT); return; case '[': tAddToken(T_OPEN_SQUARE_BRACKET); return; case ']': tAddToken(T_CLOSE_SQUARE_BRACKET); return; } tError("unknown character on line %d: %c", line, c); } static void tParseFile() { readIndex = 0; writeIndex = 0; line = 1; while(true) { int c = frRead(&fileReader); if(c == EOF) { return; } tParseToken(c); } } void tTokenize(const char* path, Error* e) { error = e; frInit(path, &fileReader, e); if(!eHasError(e) && !setjmp(errorJump)) { tParseFile(); } frDelete(&fileReader); } Token tPeekToken() { if(readIndex >= writeIndex) { return T_END; } return tokenBuffer[readIndex]; } bool tReadTokenAndLine(Token* t, int16* line) { if(readIndex >= writeIndex) { return true; } *t = tokenBuffer[readIndex++]; return tReadTokens(line, sizeof(int16)); } bool tReadInt32(int32* i) { return tReadTokens(i, sizeof(int32)); } bool tReadFloat(float* f) { return tReadTokens(f, sizeof(float)); } const char* tReadString() { const char* s = tokenBuffer + readIndex; while(readIndex <= writeIndex) { if(tokenBuffer[readIndex++] == '\0') { return s; } } return NULL; } int tGetMarker() { return readIndex; } void tReset(int marker) { readIndex = marker; }