#include #include #include #include #include #include #include "tokenizer/File.h" #include "tokenizer/Tokenizer.h" #include "utils/Utils.h" #define TOKEN_BUFFER_LENGTH (1024 * 1024) #define ERROR_LENGTH 256 static jmp_buf errorJump; static char tokenBuffer[TOKEN_BUFFER_LENGTH]; static int writeIndex = 0; static int readIndex = 0; static int16 line = 1; static char error[ERROR_LENGTH] = {'\0'}; static void tError(const char* format, ...) { va_list args; va_start(args, format); vsnprintf(error, ERROR_LENGTH, format, args); va_end(args); longjmp(errorJump, 0); } static void tAdd(const void* data, int length) { if(writeIndex + length > TOKEN_BUFFER_LENGTH) { tError("the token buffer is too small"); } memcpy(tokenBuffer + writeIndex, data, length); writeIndex += length; } static void tAddToken(Token token) { unsigned char c = token; tAdd(&c, 1); tAdd(&line, sizeof(line)); } static bool tReadTokens(void* dest, int length) { if(readIndex + length > writeIndex) { return false; } memcpy(dest, tokenBuffer + readIndex, length); readIndex += length; return true; } static void tParseLiteral(int c) { int index = 1; char buffer[64]; buffer[0] = c; while(isAllowedInName(fPeek())) { if(index >= 63) { tError("literal is too long"); } buffer[index++] = fRead(); } buffer[index] = '\0'; Token t = tFromName(buffer); if(t != T_END) { tAddToken(t); } else { tAddToken(T_LITERAL); tAdd(buffer, index + 1); } } static bool tParseInt(char* s, int length, long long* l) { *l = 0; for(int i = 0; i < length; i++) { if(*l > (LLONG_MAX / 10)) { return true; } *l *= 10; int digit = s[i] - '0'; if(*l > LLONG_MAX - digit) { return true; } *l += digit; } return false; } static void tParseNumber(int c) { int index = 1; char buffer[64]; buffer[0] = c; bool point = false; while(true) { int c = fPeek(); if(c == '.') { point = true; } else if(!isNumber(c)) { break; } else if(index >= 63) { tError("number is too long"); } buffer[index++] = fRead(); } buffer[index] = '\0'; if(fPeek() == 'L' || fPeek() == 'l') { fRead(); if(point) { tError("invalid mix of long and float", line); } long long l; if(tParseInt(buffer, index, &l) || l > INT64_MAX) { tError("invalid long on line %d", line); } int64 i = l; tAddToken(T_CONST_INT64); tAdd(&i, sizeof(int64)); } else if(point) { char* end = NULL; float f = strtof(buffer, &end); if(end[0] != '\0') { tError("invalid float on line %d", line); } tAddToken(T_CONST_FLOAT); tAdd(&f, sizeof(float)); } else { long long l; if(tParseInt(buffer, index, &l) || l > INT32_MAX) { tError("invalid int on line %d", line); } int32 i = l; tAddToken(T_CONST_INT32); tAdd(&i, sizeof(int32)); } } static void tAddString() { tAddToken(T_TEXT); while(true) { int c = fRead(); if(c == '"') { break; } else if(c == '\\') { switch(fRead()) { case '"': c = '"'; break; case '\\': c = '\\'; break; default: tError("unknown escaped character at line %d", line); } } else if(c == EOF) { tError("unclosed string starting at line %d", line); } tAdd(&c, 1); } char c = '\0'; tAdd(&c, 1); } static void tAddUnicode() { int32 c = fRead(); if((c & 0xE0) == 0xC0) { c = ((c & 0x1F) << 6) | (fRead() & 0x3F); } else if((c & 0xF0) == 0xE0) { c = ((c & 0xF) << 12) | ((fRead() & 0x3F) << 6); c |= fRead() & 0x3F; } else if((c & 0xF8) == 0xF0) { c = ((c & 0x7) << 18) | ((fRead() & 0x3F) << 12); c |= (fRead() & 0x3F) << 6; c |= fRead() & 0x3F; } tAddToken(T_CONST_INT32); tAdd(&c, sizeof(int32)); if(fRead() != '\'') { tError("expecting unicode end"); } } static void tAddToken2(Token te, Token t) { if(fReadIf('=')) { tAddToken(te); } else { tAddToken(t); } } static void tAddToken3(int c, Token tc, Token te, Token t) { if(fReadIf(c)) { tAddToken(tc); } else { tAddToken2(te, t); } } static void tAddToken4(int c, Token tce, Token tc, Token te, Token t) { if(fReadIf(c)) { tAddToken2(tce, tc); } else { tAddToken2(te, t); } } static void tAddTokenMinus() { tAddToken3('-', T_DECREMENT, T_SUB_SET, fReadIf('>') ? T_ARROW : T_SUB); } static void tLineComment() { while(true) { int c = fRead(); if(c == EOF || c == '\n') { line++; return; } } } static void tMultipleLineComment() { while(true) { int c = fRead(); if(c == EOF) { tError("unclosed comment at line %d", line); } else if(c == '\n') { line++; } else if(c == '*' && fReadIf('/')) { return; } } } static void tSlash() { if(fReadIf('/')) { tLineComment(); } else if(fReadIf('*')) { tMultipleLineComment(); } else { tAddToken2(T_DIV_SET, T_DIV); } } static void tParseToken(int c) { if(isLetter(c)) { tParseLiteral(c); return; } else if(isNumber(c)) { tParseNumber(c); return; } switch(c) { case ' ': return; case '\n': line++; return; case '+': tAddToken3('+', T_INCREMENT, T_ADD_SET, T_ADD); return; case '-': tAddTokenMinus(); return; case '*': tAddToken2(T_MUL_SET, T_MUL); return; case '/': tSlash(); return; case '%': tAddToken2(T_MOD_SET, T_MOD); return; case '<': tAddToken4('<', T_LEFT_SHIFT_SET, T_LEFT_SHIFT, T_LESS_EQUAL, T_LESS); return; case '>': tAddToken4('>', T_RIGHT_SHIFT_SET, T_RIGHT_SHIFT, T_GREATER_EQUAL, T_GREATER); return; case '=': tAddToken2(T_EQUAL, T_SET); return; case '!': tAddToken2(T_NOT_EQUAL, T_NOT); return; case '&': tAddToken3('&', T_AND, T_BIT_AND_SET, T_BIT_AND); return; case '|': tAddToken3('|', T_OR, T_BIT_OR_SET, T_BIT_OR); return; case '~': tAddToken(T_BIT_NOT); return; case '^': tAddToken2(T_BIT_XOR_SET, T_BIT_XOR); return; case ',': tAddToken(T_COMMA); return; case ';': tAddToken(T_SEMICOLON); return; case '(': tAddToken(T_OPEN_BRACKET); return; case ')': tAddToken(T_CLOSE_BRACKET); return; case '{': tAddToken(T_OPEN_CURVED_BRACKET); return; case '}': tAddToken(T_CLOSE_CURVED_BRACKET); return; case '"': tAddString(); return; case '\'': tAddUnicode(); return; case '.': tAddToken(T_POINT); return; case '[': tAddToken(T_OPEN_SQUARE_BRACKET); return; case ']': tAddToken(T_CLOSE_SQUARE_BRACKET); return; } tError("unknown character on line %d: %c", line, c); } static void tParseFile() { readIndex = 0; writeIndex = 0; line = 1; error[0] = '\0'; while(true) { int c = fRead(); if(c == EOF) { return; } tParseToken(c); } } bool tTokenize(const char* path) { if(!setjmp(errorJump)) { fOpen(path, tError); tParseFile(); } fClose(); return error[0] != '\0'; } const char* tGetError() { return error; } int tGetLine() { return line; } void tResetReader() { readIndex = 0; } Token tPeekToken() { if(readIndex >= writeIndex) { return T_END; } return tokenBuffer[readIndex]; } Token tReadToken() { if(readIndex >= writeIndex) { return T_END; } return tokenBuffer[readIndex++]; } bool tReadInt16(int16* i) { return tReadTokens(i, sizeof(int16)); } bool tReadInt32(int32* i) { return tReadTokens(i, sizeof(int32)); } bool tReadInt64(int64* i) { return tReadTokens(i, sizeof(int64)); } bool tReadFloat(float* f) { return tReadTokens(f, sizeof(float)); } const char* tReadString() { const char* s = tokenBuffer + readIndex; while(readIndex <= writeIndex) { if(tokenBuffer[readIndex++] == '\0') { return s; } } return NULL; } int tGetMarker() { return readIndex; } void tReset(int marker) { readIndex = marker; }