浏览代码

basic preprocessor define, tokenizer uses long jump for errors

Kajetan Johannes Hammerle 3 年之前
父节点
当前提交
bc537d34ef
共有 5 个文件被更改,包括 252 次插入93 次删除
  1. 8 0
      tests/pre/pre
  2. 5 0
      tests/pre/pre.out
  3. 141 7
      tokenizer/File.c
  4. 3 1
      tokenizer/File.h
  5. 95 85
      tokenizer/Tokenizer.c

+ 8 - 0
tests/pre/pre

@@ -0,0 +1,8 @@
+#define WUSI print 5;
+#define BAUM print 6;
+#define IF if(true) {
+
+void main() {
+    WUSI WUSI BAUM WUSI
+    IF BAUM }
+}

+ 5 - 0
tests/pre/pre.out

@@ -0,0 +1,5 @@
+5
+5
+6
+5
+6

+ 141 - 7
tokenizer/File.c

@@ -1,27 +1,161 @@
 #include <stdio.h>
 #include <stdlib.h>
+#include <string.h>
 
 #include "tokenizer/File.h"
+#include "utils/Utils.h"
 
+#define MAX_DEFINES 50
+
+static FileError fileError = NULL;
 static FILE* file = NULL;
+static int readIndex = 0;
+static int writeIndex = 0;
+static int fileSize = 0;
+static char* fileContent = NULL;
+
+typedef struct {
+    int nameLength;
+    const char* name;
+    const char* code;
+} Defines;
+
+static int defineIndex = 0;
+static Defines defines[MAX_DEFINES];
+static int defineReadIndex = 0;
+static const char* defineCode = NULL;
+
+/* Appends one character to fileContent, doubling the buffer when it is full.
+ * On allocation failure the old buffer is kept, "out of memory" is reported
+ * through fileError and the character is dropped. */
+static void fAdd(int c) {
+    if(writeIndex >= fileSize) {
+        int newSize = fileSize > 0 ? fileSize * 2 : 16;
+        // keep the old pointer until realloc has succeeded, otherwise the
+        // buffer leaks and the write below goes through NULL
+        char* grown = realloc(fileContent, newSize);
+        if(grown == NULL) {
+            fileError("out of memory");
+            return;
+        }
+        fileContent = grown;
+        fileSize = newSize;
+    }
+    fileContent[writeIndex++] = c;
+}
+
+/* Resets all reader and define state and allocates a fresh 16 byte buffer
+ * for the file content. Reports "out of memory" through fileError when the
+ * allocation fails. */
+static void fReset() {
+    readIndex = 0;
+    writeIndex = 0;
+    defineIndex = 0;
+    defineReadIndex = 0;
+    defineCode = NULL;
+    fileSize = 16;
+    fileContent = malloc(fileSize);
+    if(fileContent == NULL) {
+        // keep the recorded capacity consistent with the missing buffer
+        fileSize = 0;
+        fileError("out of memory");
+    }
+}
 
-bool fOpen(const char* path) {
+void fOpen(const char* path, FileError fe) {
+    fileError = fe;
     file = fopen(path, "r");
-    return file == NULL;
+    if(file == NULL) {
+        fileError("cannot read file '%s'", path);
+        return;
+    }
+    fReset();
+    while(true) {
+        int c = fgetc(file);
+        if(c == '#') {
+            while(c != EOF && c != '\n') {
+                fAdd(c);
+                c = fgetc(file);
+            }
+            fAdd('\0');
+        }
+        fAdd(c);
+        if(c == EOF) {
+            break;
+        }
+    }
+    fAdd('\0');
+    fclose(file);
 }
 
 void fClose() {
-    fclose(file);
+    free(fileContent);
+}
+
+/* Moves readIndex forward by `skip` characters and then past any run of
+ * spaces that follows. */
+static void fSkipString(int skip) {
+    for(readIndex += skip; fileContent[readIndex] == ' '; readIndex++) {
+    }
+}
+
+static void fDefine() {
+    fSkipString(7);
+    const char* name = fileContent + readIndex;
+    while(true) {
+        if(fileContent[readIndex] == ' ') {
+            fileContent[readIndex] = '\0';
+            break;
+        } else if(!isLetter(fileContent[readIndex])) {
+            fileError("invalid define name '%s'", name);
+        }
+        readIndex++;
+    }
+    readIndex++;
+    if(defineIndex >= MAX_DEFINES) {
+        fileError("too many defines");
+    }
+    defines[defineIndex].nameLength = strlen(name);
+    defines[defineIndex].name = name;
+    defines[defineIndex].code = fileContent + readIndex;
+    defineIndex++;
+    fSkipString(strlen(fileContent + readIndex));
+}
+
+/* Returns true when the text at `code` starts with the complete name of
+ * `def`. The character after the name must not be a letter, so a define
+ * named "IF" does not fire inside a longer identifier such as "IFFY". */
+static bool fMatch(Defines* def, const char* code) {
+    if(strncmp(def->name, code, def->nameLength) != 0) {
+        return false;
+    }
+    // strncmp matched nameLength non-NUL characters, so this index is at
+    // worst the terminator of `code`
+    return !isLetter(code[def->nameLength]);
+}
+
+static bool fPrepareChar() {
+    if(defineCode != NULL) {
+        if(defineCode[defineReadIndex] == '\0') {
+            defineCode = NULL;
+            defineReadIndex = 0;
+        } else {
+            return true;
+        }
+    }
+    if(!isLetter(fileContent[readIndex])) {
+        defineReadIndex = 0;
+    }
+    if(isLetter(fileContent[readIndex]) && defineReadIndex == 0) {
+        const char* replace = fileContent + readIndex;
+        for(int i = 0; i < defineIndex; i++) {
+            if(fMatch(defines + i, replace)) {
+                defineCode = defines[i].code;
+                readIndex += defines[i].nameLength;
+                return true;
+            }
+        }
+        defineReadIndex = -1;
+    }
+    if(fileContent[readIndex] == '#') {
+        const char* command = fileContent + readIndex + 1;
+        if(strncmp(command, "define", 6) == 0) {
+            fDefine();
+        } else {
+            fileError("unknown preprocessor command '%s'", command);
+        }
+    }
+    if(fileContent[readIndex] == '\0') {
+        readIndex++;
+    }
+    return false;
 }
 
 int fRead() {
-    return fgetc(file);
+    if(fPrepareChar()) {
+        return defineCode[defineReadIndex++];
+    }
+    return fileContent[readIndex++];
 }
 
 int fPeek() {
-    int c = fRead();
-    ungetc(c, file);
-    return c;
+    if(fPrepareChar()) {
+        return defineCode[defineReadIndex];
+    }
+    return fileContent[readIndex];
 }
 
 bool fReadIf(int c) {

+ 3 - 1
tokenizer/File.h

@@ -3,7 +3,9 @@
 
 #include <stdbool.h>
 
-bool fOpen(const char* path);
+typedef void (*FileError)(const char*, ...);
+
+void fOpen(const char* path, FileError fe);
 void fClose();
 
 int fRead();

+ 95 - 85
tokenizer/Tokenizer.c

@@ -1,4 +1,5 @@
 #include <limits.h>
+#include <setjmp.h>
 #include <stdarg.h>
 #include <stdio.h>
 #include <stdlib.h>
@@ -11,6 +12,7 @@
 #define TOKEN_BUFFER_LENGTH (1024 * 1024)
 #define ERROR_LENGTH 256
 
+static jmp_buf errorJump;
 static char tokenBuffer[TOKEN_BUFFER_LENGTH];
 static int writeIndex = 0;
 static int readIndex = 0;
@@ -22,21 +24,21 @@ static void tError(const char* format, ...) {
     va_start(args, format);
     vsnprintf(error, ERROR_LENGTH, format, args);
     va_end(args);
+    longjmp(errorJump, 0);
 }
 
-static bool tAdd(const void* data, int length) {
+static void tAdd(const void* data, int length) {
     if(writeIndex + length > TOKEN_BUFFER_LENGTH) {
         tError("the token buffer is too small");
-        return false;
     }
     memcpy(tokenBuffer + writeIndex, data, length);
     writeIndex += length;
-    return true;
 }
 
-static bool tAddToken(Token token) {
+static void tAddToken(Token token) {
     unsigned char c = token;
-    return tAdd(&c, 1) && tAdd(&line, sizeof(line));
+    tAdd(&c, 1);
+    tAdd(&line, sizeof(line));
 }
 
 static bool tReadTokens(void* dest, int length) {
@@ -48,26 +50,27 @@ static bool tReadTokens(void* dest, int length) {
     return true;
 }
 
-static bool tParseLiteral(int c) {
+static void tParseLiteral(int c) {
     int index = 1;
     char buffer[64];
     buffer[0] = c;
     while(isLetter(fPeek())) {
         if(index >= 63) {
             tError("literal is too long");
-            return false;
         }
         buffer[index++] = fRead();
     }
     buffer[index] = '\0';
     Token t = tFromName(buffer);
     if(t != T_END) {
-        return tAddToken(t);
+        tAddToken(t);
+    } else {
+        tAddToken(T_LITERAL);
+        tAdd(buffer, index + 1);
     }
-    return tAddToken(T_LITERAL) && tAdd(buffer, index + 1);
 }
 
-static bool tParseNumber(int c) {
+static void tParseNumber(int c) {
     int index = 1;
     char buffer[64];
     buffer[0] = c;
@@ -80,7 +83,6 @@ static bool tParseNumber(int c) {
             break;
         } else if(index >= 63) {
             tError("number is too long");
-            return false;
         }
         buffer[index++] = fRead();
     }
@@ -90,25 +92,23 @@ static bool tParseNumber(int c) {
         float f = strtof(buffer, &end);
         if(end[0] != '\0') {
             tError("invalid float on line %d", line);
-            return false;
         }
-        return tAddToken(T_CONST_FLOAT) && tAdd(&f, sizeof(float));
+        tAddToken(T_CONST_FLOAT);
+        tAdd(&f, sizeof(float));
     } else {
         char* end = NULL;
         long l = strtol(buffer, &end, 10);
         if(end[0] != '\0' || l > INT_MAX) {
             tError("invalid int on line %d", line);
-            return false;
         }
         int i = l;
-        return tAddToken(T_CONST_INT) && tAdd(&i, sizeof(int));
+        tAddToken(T_CONST_INT);
+        tAdd(&i, sizeof(int));
     }
 }
 
-static bool tAddString() {
-    if(!tAddToken(T_TEXT)) {
-        return false;
-    }
+static void tAddString() {
+    tAddToken(T_TEXT);
     while(true) {
         int c = fRead();
         if(c == '"') {
@@ -117,114 +117,120 @@ static bool tAddString() {
             switch(fRead()) {
                 case '"': c = '"'; break;
                 case '\\': c = '\\'; break;
-                default:
-                    tError("unknown escaped character at line %d", line);
-                    return false;
+                default: tError("unknown escaped character at line %d", line);
             }
         } else if(c == EOF) {
             tError("unclosed string starting at line %d", line);
-            return false;
-        }
-        if(!tAdd(&c, 1)) {
-            return false;
         }
+        tAdd(&c, 1);
     }
     char c = '\0';
-    return tAdd(&c, 1);
+    tAdd(&c, 1);
 }
 
-static bool tAddToken2(Token te, Token t) {
-    return fReadIf('=') ? tAddToken(te) : tAddToken(t);
+static void tAddToken2(Token te, Token t) {
+    if(fReadIf('=')) {
+        tAddToken(te);
+    } else {
+        tAddToken(t);
+    }
 }
 
-static bool tAddToken3(int c, Token tc, Token te, Token t) {
-    return fReadIf(c) ? tAddToken(tc) : tAddToken2(te, t);
+static void tAddToken3(int c, Token tc, Token te, Token t) {
+    if(fReadIf(c)) {
+        tAddToken(tc);
+    } else {
+        tAddToken2(te, t);
+    }
 }
 
-static bool tAddToken4(int c, Token tce, Token tc, Token te, Token t) {
-    return fReadIf(c) ? tAddToken2(tce, tc) : tAddToken2(te, t);
+static void tAddToken4(int c, Token tce, Token tc, Token te, Token t) {
+    if(fReadIf(c)) {
+        tAddToken2(tce, tc);
+    } else {
+        tAddToken2(te, t);
+    }
 }
 
-static bool tAddTokenMinus() {
-    return tAddToken3('-', T_DECREMENT, T_SUB_SET,
-                      fReadIf('>') ? T_ARROW : T_SUB);
+static void tAddTokenMinus() {
+    tAddToken3('-', T_DECREMENT, T_SUB_SET, fReadIf('>') ? T_ARROW : T_SUB);
 }
 
-static bool tLineComment() {
+static void tLineComment() {
     while(true) {
         int c = fRead();
         if(c == EOF || c == '\n') {
             line++;
-            return true;
+            return;
         }
     }
 }
 
-static bool tMultipleLineComment() {
+static void tMultipleLineComment() {
     while(true) {
         int c = fRead();
         if(c == EOF) {
             tError("unclosed comment at line %d", line);
-            return false;
         } else if(c == '\n') {
             line++;
         } else if(c == '*' && fReadIf('/')) {
-            return true;
+            return;
         }
     }
 }
 
-static bool tSlash() {
+static void tSlash() {
     if(fReadIf('/')) {
-        return tLineComment();
+        tLineComment();
     } else if(fReadIf('*')) {
-        return tMultipleLineComment();
+        tMultipleLineComment();
+    } else {
+        tAddToken2(T_DIV_SET, T_DIV);
     }
-    return tAddToken2(T_DIV_SET, T_DIV);
 }
 
-static bool tParseToken() {
-    int c = fRead();
-    if(c == EOF) {
-        return false;
-    } else if(isLetter(c)) {
-        return tParseLiteral(c);
+static void tParseToken(int c) {
+    if(isLetter(c)) {
+        tParseLiteral(c);
+        return;
     } else if(isNumber(c)) {
-        return tParseNumber(c);
+        tParseNumber(c);
+        return;
     }
     switch(c) {
-        case ' ': return true;
-        case '\n': line++; return true;
-        case '+': return tAddToken3('+', T_INCREMENT, T_ADD_SET, T_ADD);
-        case '-': return tAddTokenMinus();
-        case '*': return tAddToken2(T_MUL_SET, T_MUL);
-        case '/': return tSlash();
-        case '%': return tAddToken2(T_MOD_SET, T_MOD);
+        case ' ': return;
+        case '\n': line++; return;
+        case '+': tAddToken3('+', T_INCREMENT, T_ADD_SET, T_ADD); return;
+        case '-': tAddTokenMinus(); return;
+        case '*': tAddToken2(T_MUL_SET, T_MUL); return;
+        case '/': tSlash(); return;
+        case '%': tAddToken2(T_MOD_SET, T_MOD); return;
         case '<':
-            return tAddToken4('<', T_LEFT_SHIFT_SET, T_LEFT_SHIFT, T_LESS_EQUAL,
-                              T_LESS);
+            tAddToken4('<', T_LEFT_SHIFT_SET, T_LEFT_SHIFT, T_LESS_EQUAL,
+                       T_LESS);
+            return;
         case '>':
-            return tAddToken4('>', T_RIGHT_SHIFT_SET, T_RIGHT_SHIFT,
-                              T_GREATER_EQUAL, T_GREATER);
-        case '=': return tAddToken2(T_EQUAL, T_SET);
-        case '!': return tAddToken2(T_NOT_EQUAL, T_NOT);
-        case '&': return tAddToken3('&', T_AND, T_BIT_AND_SET, T_BIT_AND);
-        case '|': return tAddToken3('|', T_OR, T_BIT_OR_SET, T_BIT_OR);
-        case '~': return tAddToken(T_BIT_NOT);
-        case '^': return tAddToken2(T_BIT_XOR_SET, T_BIT_XOR);
-        case ',': return tAddToken(T_COMMA);
-        case ';': return tAddToken(T_SEMICOLON);
-        case '(': return tAddToken(T_OPEN_BRACKET);
-        case ')': return tAddToken(T_CLOSE_BRACKET);
-        case '{': return tAddToken(T_OPEN_CURVED_BRACKET);
-        case '}': return tAddToken(T_CLOSE_CURVED_BRACKET);
-        case '"': return tAddString();
-        case '.': return tAddToken(T_POINT);
-        case '[': return tAddToken(T_OPEN_SQUARE_BRACKET);
-        case ']': return tAddToken(T_CLOSE_SQUARE_BRACKET);
+            tAddToken4('>', T_RIGHT_SHIFT_SET, T_RIGHT_SHIFT, T_GREATER_EQUAL,
+                       T_GREATER);
+            return;
+        case '=': tAddToken2(T_EQUAL, T_SET); return;
+        case '!': tAddToken2(T_NOT_EQUAL, T_NOT); return;
+        case '&': tAddToken3('&', T_AND, T_BIT_AND_SET, T_BIT_AND); return;
+        case '|': tAddToken3('|', T_OR, T_BIT_OR_SET, T_BIT_OR); return;
+        case '~': tAddToken(T_BIT_NOT); return;
+        case '^': tAddToken2(T_BIT_XOR_SET, T_BIT_XOR); return;
+        case ',': tAddToken(T_COMMA); return;
+        case ';': tAddToken(T_SEMICOLON); return;
+        case '(': tAddToken(T_OPEN_BRACKET); return;
+        case ')': tAddToken(T_CLOSE_BRACKET); return;
+        case '{': tAddToken(T_OPEN_CURVED_BRACKET); return;
+        case '}': tAddToken(T_CLOSE_CURVED_BRACKET); return;
+        case '"': tAddString(); return;
+        case '.': tAddToken(T_POINT); return;
+        case '[': tAddToken(T_OPEN_SQUARE_BRACKET); return;
+        case ']': tAddToken(T_CLOSE_SQUARE_BRACKET); return;
     }
     tError("unknown character on line %d: %c", line, c);
-    return false;
 }
 
 static void tParseFile() {
@@ -232,16 +238,20 @@ static void tParseFile() {
     writeIndex = 0;
     line = 1;
     error[0] = '\0';
-    while(tParseToken()) {
+    while(true) {
+        int c = fRead();
+        if(c == EOF) {
+            return;
+        }
+        tParseToken(c);
     }
 }
 
 bool tTokenize(const char* path) {
-    if(fOpen(path)) {
-        tError("cannot read file '%s'", path);
-        return true;
+    if(!setjmp(errorJump)) {
+        fOpen(path, tError);
+        tParseFile();
     }
-    tParseFile();
     fClose();
     return error[0] != '\0';
 }