فهرست منبع

Refactor to struct tokenizer

Kajetan Johannes Hammerle 2 هفته پیش
والد
کامیت
4993e50afc
9 فایلهای تغییر یافته به همراه 96 افزوده شده و 351 حذف شده
  1. 2 0
      src/Code.c
  2. 1 1
      src/Code.h
  3. 63 322
      src/Compiler.c
  4. 2 1
      src/Compiler.h
  5. 0 6
      src/Constants.h
  6. 23 19
      src/Main.c
  7. 2 1
      src/Tokenizer.c
  8. 2 0
      src/Values.c
  9. 1 1
      src/Values.h

+ 2 - 0
src/Code.c

@@ -3,6 +3,8 @@
 #include <assert.h>
 #include <string.h>
 
+#include "Constants.h"
+
 static u8 code[MAX_CODE];
 static size_t codeIndex = 0;
 static size_t codeExecutionIndex = 0;

+ 1 - 1
src/Code.h

@@ -1,7 +1,7 @@
 #ifndef BASIC_CODE_H
 #define BASIC_CODE_H
 
-#include "Constants.h"
+#include "Types.h"
 
 typedef enum : u8 {
     ADD,

+ 63 - 322
src/Compiler.c

@@ -1,397 +1,138 @@
 #include "Compiler.h"
 
-#include <errno.h>
 #include <setjmp.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 
 #include "Code.h"
-#include "Constants.h"
 
 static Error error = {};
-static FILE* file = nullptr;
-static size_t lineCounter = 0;
 static jmp_buf jumpPosition = {};
-static u8 tokens[MAX_TOKENS] = {};
-static size_t tokenReadIndex = 0;
-static size_t tokenWriteIndex = 0;
 
-typedef enum : u8 { LITERAL, INT64, DOUBLE, STRING, PLUS, NEWLINE, END } Token;
-
-#define THROW_ERROR(...)                                   \
-    snprintf(error.text, sizeof(error.text), __VA_ARGS__); \
+#define THROW_ERROR(format, ...)                                       \
+    snprintf(                                                          \
+        error.text, sizeof(error.text), "Line %zu | " format, t->line, \
+        __VA_ARGS__);                                                  \
     longjmp(jumpPosition, 1)
 
-#define CODE(command)                                              \
-    do {                                                           \
-        if(command) {                                              \
-            THROW_ERROR("Code overflow on line %zu", lineCounter); \
-        }                                                          \
+#define CODE(command)                                         \
+    do {                                                      \
+        if(command) {                                         \
+            THROW_ERROR("Line %zu | Code overflow", t->line); \
+        }                                                     \
     } while(false)
 
-static void cleanup() {
-    if(file != nullptr) {
-        fclose(file);
-        file = nullptr;
-    }
-}
-
-static void reset() {
-    error.text[0] = '\0';
-    lineCounter = 0;
-    tokenReadIndex = 0;
-    tokenWriteIndex = 0;
-}
-
-static void addTokenN(const void* p, size_t n) {
-    if(tokenWriteIndex + n > MAX_TOKENS) {
-        THROW_ERROR("Line %zu has too much tokens", lineCounter);
-    }
-    memcpy(tokens + tokenWriteIndex, p, n);
-    tokenWriteIndex += n;
-}
-
-static void addToken(Token t) {
-    addTokenN(&t, sizeof(t));
-}
-
-static void addChar(char c) {
-    addTokenN(&c, sizeof(c));
-}
-
-static bool isLetter(char c) {
-    return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z');
-}
-
-static bool isNumber(char c) {
-    return c >= '0' && c <= '9';
-}
-
-static bool isAlphaNumeric(char c) {
-    return isLetter(c) || isNumber(c);
-}
-
-[[noreturn]] static void invalidToken(char c, int line) {
-    THROW_ERROR("Unexpected token '%c' on line %zu %d", c, lineCounter, line);
-}
-
-static bool isTokenEnd(char c) {
-    return c == ' ' || c == '\0' || c == '\n';
+[[noreturn]] static void unexpectedToken(Tokenizer* t, Token token) {
+    char buffer[128];
+    tokenizerPrintToken(&token, buffer, sizeof(buffer));
+    THROW_ERROR("Line %zu | Unexpected %s token", t->line, buffer);
 }
 
-static const char* tokenizeLiteral(const char* s) {
-    addToken(LITERAL);
-    addChar(*s);
-    while(true) {
-        char c = *(++s);
-        if(isAlphaNumeric(c)) {
-            addChar(c);
-        } else if(isTokenEnd(c)) {
-            break;
-        } else {
-            invalidToken(c, __LINE__);
-        }
-    }
-    addChar('\0');
-    return s;
-}
-
-[[noreturn]] static void invalidNumber() {
-    THROW_ERROR("Invalid number on line %zu", lineCounter);
-}
-
-static const char* tokenizeNumber(const char* s) {
-    size_t nIndex = 0;
-    char number[64] = {};
-    number[nIndex++] = *s;
-    while(true) {
-        char c = *(++s);
-        if(isTokenEnd(c)) {
-            break;
-        } else if(!isNumber(c) && c != '.') {
-            invalidToken(c, __LINE__);
-        } else if(nIndex >= sizeof(number) - 1) {
-            invalidNumber();
-        }
-        number[nIndex++] = c;
+static Token consumeToken(Tokenizer* t, TokenType type) { // read the next token and require it to be of `type`
+    Token actual = tokenizerNext(t);
+    if(actual.type != type) {
+        unexpectedToken(t, actual); // declared [[noreturn]]: control never falls through
     }
-    char* end = nullptr;
-    errno = 0;
-    i64 i = strtoll(number, &end, 10);
-    if(errno != 0) {
-        invalidNumber();
-    } else if(*end == '\0') {
-        addToken(INT64);
-        addTokenN(&i, sizeof(i));
-        return s;
-    }
-    double d = strtod(number, &end);
-    if(errno != 0) {
-        invalidNumber();
-    } else if(*end != '\0') {
-        invalidNumber();
-    }
-    addToken(DOUBLE);
-    addTokenN(&d, sizeof(d));
-    return s;
+    return actual; // hand the matched token back so callers can read its fields
 }
 
-static const char* tokenizeString(const char* s) {
-    addToken(STRING);
-    while(true) {
-        char c = *(++s);
-        if(c == '\0') {
-            THROW_ERROR("Unclosed string on line %zu", lineCounter);
-        } else if(c == '"') {
-            s++;
-            break;
-        }
-        addChar(c);
-    }
-    addChar('\0');
-    return s;
-}
-
-static void tokenizeLineString(const char* s) {
-    while(true) {
-        char c = *s;
-        if(isLetter(c)) {
-            s = tokenizeLiteral(s);
-        } else if(isNumber(c)) {
-            s = tokenizeNumber(s);
-        } else if(c == '"') {
-            s = tokenizeString(s);
-        } else if(c == '\n') {
-            addToken(NEWLINE);
-            break;
-        } else if(c == ' ') {
-            s++;
-        } else if(c == '+') {
-            addToken(PLUS);
-            s++;
-        } else if(c == '\0') {
-            break;
-        } else {
-            invalidToken(c, __LINE__);
-        }
-    }
-}
-
-static void tokenizeLine() {
-    tokenReadIndex = 0;
-    tokenWriteIndex = 0;
-    lineCounter++;
-    char line[256] = {};
-    if(fgets(line, sizeof(line), file) == nullptr) {
-        return;
-    }
-    char c = line[sizeof(line) - 2];
-    if(c != '\n' && c != '\0') {
-        THROW_ERROR("Line %zu is too long", lineCounter);
-    }
-    tokenizeLineString(line);
-}
-
-static Token peekToken() {
-    if(tokenReadIndex < tokenWriteIndex) {
-        return tokens[tokenReadIndex];
-    }
-    tokenizeLine();
-    return tokenReadIndex < tokenWriteIndex ? tokens[tokenReadIndex] : END;
-}
-
-static Token nextToken() {
-    if(tokenReadIndex < tokenWriteIndex) {
-        return tokens[tokenReadIndex++];
-    }
-    tokenizeLine();
-    return tokenReadIndex < tokenWriteIndex ? tokens[tokenReadIndex++] : END;
-}
-
-static const char* peekLiteral() {
-    Token t = peekToken();
-    if(t != LITERAL) {
-        return nullptr;
-    }
-    return (char*)(tokens + tokenReadIndex + sizeof(Token));
-}
-
-static const char* readString() {
-    if(tokenReadIndex >= tokenWriteIndex) {
-        THROW_ERROR("readString on empty buffer, line %zu", lineCounter);
-    }
-    const char* c = (char*)(tokens + tokenReadIndex);
-    while(tokenReadIndex < tokenWriteIndex && tokens[tokenReadIndex] != '\0') {
-        tokenReadIndex++;
-    }
-    tokenReadIndex++;
-    return c;
-}
-
-static i64 readInt64() {
-    if(tokenReadIndex + sizeof(i64) >= tokenWriteIndex) {
-        THROW_ERROR("readInt64 on empty buffer, line %zu", lineCounter);
-    }
-    i64 i = 0;
-    memcpy(&i, tokens + tokenReadIndex, sizeof(i));
-    tokenReadIndex += sizeof(i);
-    return i;
-}
-
-static double readDouble() {
-    if(tokenReadIndex + sizeof(double) >= tokenWriteIndex) {
-        THROW_ERROR("readDouble on empty buffer, line %zu", lineCounter);
-    }
-    double d = 0;
-    memcpy(&d, tokens + tokenReadIndex, sizeof(d));
-    tokenReadIndex += sizeof(d);
-    return d;
-}
-
-[[noreturn]] static void unexpectedToken(Token t) {
-    switch(t) {
-        case LITERAL:
-            THROW_ERROR(
-                "Unexpected literal(%s) on line %zu", readString(),
-                lineCounter);
-            break;
-        case INT64:
-            THROW_ERROR(
-                "Unexpected int(%ld) on line %zu", readInt64(), lineCounter);
-            break;
-        case DOUBLE:
-            THROW_ERROR(
-                "Unexpected double(%lf) on line %zu", readDouble(),
-                lineCounter);
-            break;
-        case STRING:
-            THROW_ERROR(
-                "Unexpected string(%s) on line %zu", readString(), lineCounter);
-            break;
-        case PLUS:
-            THROW_ERROR("Unexpected plus on line %zu", lineCounter);
-            break;
-        case NEWLINE:
-            THROW_ERROR("Unexpected newline on line %zu", lineCounter);
-            break;
-        case END: THROW_ERROR("Unexpected end on line %zu", lineCounter); break;
-    }
-    THROW_ERROR("Unexpected unknown token on line %zu", lineCounter);
-}
-
-static void consumeToken(Token t) {
-    Token actual = nextToken();
-    if(t != actual) {
-        unexpectedToken(actual);
-    }
-}
-
-static void consumeLiteral(const char* name) {
-    consumeToken(LITERAL);
-    const char* actual = readString();
+static void consumeLiteral(Tokenizer* t, const char* name) { // require the next token to be a LITERAL whose text equals `name`
+    Token token = consumeToken(t, LITERAL);
+    const char* actual = token.stringValue;
     if(strcmp(actual, name) != 0) {
-        THROW_ERROR("Unexpected literal(%s) on line %zu", actual, lineCounter);
+        THROW_ERROR("Unexpected literal(%s)", actual); // THROW_ERROR adds the "Line %zu | " prefix itself
     }
 }
 
-static void compileConstant() {
-    Token t = nextToken();
-    if(t == STRING) {
+static void compileConstant(Tokenizer* t) { // read one token and emit the push instruction for a STRING or INT64 constant
+    Token token = tokenizerNext(t);
+    if(token.type == STRING) {
         CODE(codePushInstruction(PUSH_CONSTANT_STRING));
-        CODE(codePushConstantString(readString()));
-    } else if(t == INT64) {
+        CODE(codePushConstantString(token.stringValue));
+    } else if(token.type == INT64) {
         CODE(codePushInstruction(PUSH_INT64));
-        CODE(codePushI64(readInt64()));
+        CODE(codePushI64(token.intValue));
     } else {
-        unexpectedToken(t);
+        unexpectedToken(t, token); // every other token type is rejected here
     }
 }
 
-static void compileAdd() {
-    compileConstant();
-    while(peekToken() == PLUS) {
-        nextToken();
-        compileConstant();
+static void compileAdd(Tokenizer* t) { // compile constant (PLUS constant)*, emitting ADD after each right-hand operand
+    compileConstant(t);
+    while(tokenizerPeek(t).type == PLUS) {
+        tokenizerNext(t); // consume the PLUS that was only peeked above
+        compileConstant(t);
         CODE(codePushInstruction(ADD));
     }
 }
 
-static void compileExpression() {
-    compileAdd();
+static void compileExpression(Tokenizer* t) { // expression entry point; currently only forwards to compileAdd
+    compileAdd(t);
 }
 
-static void compileLine(Token t);
+static void compileLine(Tokenizer* t, Token token); // forward declaration: compileIf and compileLine call each other
 
-static void compileIf() {
-    compileExpression();
+static void compileIf(Tokenizer* t) { // compile: <expression> "then" NEWLINE <lines...> "endif" NEWLINE
+    compileExpression(t);
     CODE(codePushInstruction(JUMP_ON_0));
     size_t posIndex = codeGetWritePosition();
     CODE(codePushSize(0));
-    consumeLiteral("then");
-    consumeToken(NEWLINE);
+    consumeLiteral(t, "then");
+    consumeToken(t, NEWLINE);
     while(true) {
-        const char* s = peekLiteral();
-        if(s == nullptr || strcmp(s, "endif") != 0) {
-            compileLine(nextToken());
-            continue;
-        } else {
+        Token token = tokenizerPeek(t);
+        if(token.type == LITERAL && strcmp(token.stringValue, "endif") == 0) { // stop at "endif"; it was only peeked, not consumed
             break;
         }
+        compileLine(t, tokenizerNext(t)); // any other token starts a body line
     }
-    consumeLiteral("endif");
-    consumeToken(NEWLINE);
+    consumeLiteral(t, "endif");
+    consumeToken(t, NEWLINE); // below: rewind to posIndex, push endIndex where the 0 was pushed, then restore the write position
     size_t endIndex = codeGetWritePosition();
     codeSetWritePosition(posIndex);
     CODE(codePushSize(endIndex));
     codeSetWritePosition(endIndex);
 }
 
-static void compileLine(Token t) {
-    if(t == NEWLINE) {
+static void compileLine(Tokenizer* t, Token token) { // compile one line whose first token `token` was already read by the caller
+    if(token.type == NEWLINE) {
         return;
     }
-    if(t != LITERAL) {
-        unexpectedToken(t);
+    if(token.type != LITERAL) {
+        unexpectedToken(t, token);
     }
-    const char* s = readString();
+    const char* s = token.stringValue; // statement keyword; only "print" and "if" are handled below
     if(strcmp(s, "print") == 0) {
-        while(peekToken() != NEWLINE) {
-            compileExpression();
+        while(tokenizerPeek(t).type != NEWLINE) {
+            compileExpression(t);
             CODE(codePushInstruction(PRINT));
         }
-        nextToken();
+        tokenizerNext(t); // consume the NEWLINE that ended the loop above
         CODE(codePushInstruction(PRINT_NEWLINE));
     } else if(strcmp(s, "if") == 0) {
-        compileIf();
+        compileIf(t);
     } else {
-        THROW_ERROR("Unexpected literal(%s) on line %zu", s, lineCounter);
+        THROW_ERROR("Unexpected literal(%s)", s); // THROW_ERROR adds the "Line %zu | " prefix itself
     }
 }
 
-static void parseTokens() {
+static void parseTokens(Tokenizer* t) { // call codeReset(), then compile line by line until the tokenizer yields END
     codeReset();
     while(true) {
-        Token t = nextToken();
-        if(t == END) {
+        Token token = tokenizerNext(t);
+        if(token.type == END) {
             break;
         }
-        compileLine(t);
+        compileLine(t, token); // `token` is the first token of the line
     }
 }
 
-const Error* compileFile(const char* path) {
-    reset();
+const Error* compileFile(Tokenizer* t) { // compile all tokens from `t`; returns the file-static `error`, whose text is set only if THROW_ERROR fired
+    error.text[0] = '\0'; // clear the error text before compiling
     if(setjmp(jumpPosition)) {
-        cleanup();
         return &error;
     }
-    file = fopen(path, "r");
-    if(file == nullptr) {
-        THROW_ERROR("Cannot read file '%s'", path);
-    } else {
-        parseTokens();
-    }
-    cleanup();
+    parseTokens(t); // a THROW_ERROR inside longjmps back to the setjmp above
     return &error;
 }

+ 2 - 1
src/Compiler.h

@@ -2,7 +2,8 @@
 #define BASIC_COMPILER_H
 
 #include "Error.h"
+#include "Tokenizer.h"
 
-const Error* compileFile(const char* path);
+const Error* compileFile(Tokenizer* t);
 
 #endif

+ 0 - 6
src/Constants.h

@@ -2,14 +2,8 @@
 #define BASIC_CONSTANTS_H
 
 #include <stddef.h>
-#include <stdint.h>
-
-typedef uint64_t u64;
-typedef uint8_t u8;
-typedef int64_t i64;
 
 [[maybe_unused]] constexpr size_t MAX_CODE = 1024 * 1024 * 2;
 [[maybe_unused]] constexpr size_t MAX_VALUES = 1024;
-[[maybe_unused]] constexpr size_t MAX_TOKENS = 1024;
 
 #endif

+ 23 - 19
src/Main.c

@@ -7,6 +7,15 @@
 
 static u8 tokens[1000];
 
+static void compileAndRun(Tokenizer* t) { // compile from `t`; on error print the message, otherwise run the compiled code
+    const Error* e = compileFile(t);
+    if(hasError(e)) {
+        puts(e->text);
+        return; // skip codeRun() when compilation reported an error
+    }
+    codeRun();
+}
+
 int main(int argCount, const char** args) {
     if(argCount < 2) {
         return 0;
@@ -18,26 +27,21 @@ int main(int argCount, const char** args) {
         return 0;
     }
 
-    while(true) {
-        Token token = tokenizerNext(&t);
-        if(tokenizerHasError(&t)) {
-            puts(tokenizerGetError(&t));
-            break;
-        }
-        char buffer[256];
-        tokenizerPrintToken(&token, buffer, sizeof(buffer));
-        puts(buffer);
-        if(token.type == END) {
-            break;
-        }
-    }
-
-    // const Error* e = compileFile(args[1]);
-    // if(hasError(e)) {
-    //     puts(e->text);
-    //     return 0;
+    // while(true) {
+    //     Token token = tokenizerNext(&t);
+    //     if(tokenizerHasError(&t)) {
+    //         puts(tokenizerGetError(&t));
+    //         break;
+    //     }
+    //     char buffer[256];
+    //     tokenizerPrintToken(&token, buffer, sizeof(buffer));
+    //     puts(buffer);
+    //     if(token.type == END) {
+    //         break;
+    //     }
     // }
-    // codeRun();
+    compileAndRun(&t);
+    tokenizerDestroy(&t);
 
     //  char line[256];
     //  while(true) {

+ 2 - 1
src/Tokenizer.c

@@ -228,7 +228,8 @@ Token tokenizerNext(Tokenizer* t) {
 }
 
 Token tokenizerPeek(Tokenizer* t) {
-    size_t index = bufferGetReadIndex(&t->buffer);
+    size_t index =
+        bufferIsEmpty(&t->buffer) ? 0 : bufferGetReadIndex(&t->buffer);
     Token token = tokenizerNext(t);
     bufferSetReadIndex(&t->buffer, index);
     return token;

+ 2 - 0
src/Values.c

@@ -1,5 +1,7 @@
 #include "Values.h"
 
+#include "Constants.h"
+
 static Value valueStack[MAX_VALUES];
 static size_t valueStackIndex = 0;
 

+ 1 - 1
src/Values.h

@@ -1,7 +1,7 @@
 #ifndef BASIC_VALUES_H
 #define BASIC_VALUES_H
 
-#include "Constants.h"
+#include "Types.h"
 
 typedef enum : u8 { INT64, CONSTANT_STRING } ValueType;