Ver Fonte

tokenizer writes tokens into char buffer, tokens are read from char
buffer, exceptions are replaced by returning bools

Kajetan Johannes Hammerle há 4 anos atrás
pai
commit
d1860ddd3d

+ 3 - 6
Main.cpp

@@ -3,14 +3,11 @@
 
 #include "test/Test.h"
 
-int main(int argc, char** argv) 
-{        
-    if(argc <= 0)
-    {
+int main(int argc, char** argv) {
+    if(argc <= 0) {
         return 0;
     }
-    if(argc >= 3 && strcmp(argv[1], "test") == 0)
-    {
+    if(argc >= 3 && strcmp(argv[1], "test") == 0) {
         Test::start(argv[2]);
     }
     return 0;

+ 0 - 5
exceptions/PreScriptException.cpp

@@ -1,5 +0,0 @@
-#include "exceptions/PreScriptException.h"
-
-PreScriptException::PreScriptException(const std::string& message, unsigned int line) : message(message), line(line)
-{
-}

+ 0 - 17
exceptions/PreScriptException.h

@@ -1,17 +0,0 @@
-#ifndef PRESCRIPTEXCEPTION_H
-#define PRESCRIPTEXCEPTION_H
-
-#include <exception>
-#include <string>
-
-class PreScriptException : public std::exception
-{
-public:
-    PreScriptException(const std::string& message, unsigned int line);
-    
-private:
-    std::string message;
-    int line;
-};
-
-#endif

+ 1 - 1
meson.build

@@ -1,6 +1,6 @@
 project('lonely tiger', 'cpp')
 
-src = ['Main.cpp', 'test/Test.cpp', 'test/TestLogger.cpp', 'tokenizer/Token.cpp', 'tokenizer/TokenType.cpp', 'tokenizer/Tokenizer.cpp', 'exceptions/PreScriptException.cpp']
+src = ['Main.cpp', 'test/Test.cpp', 'test/TestLogger.cpp', 'tokenizer/TokenType.cpp', 'tokenizer/Tokenizer.cpp', 'tokenizer/TokenStream.cpp']
 
 executable('lonely_tiger', 
     sources: src,

+ 31 - 48
test/Test.cpp

@@ -7,37 +7,30 @@
 
 #include "test/Test.h"
 #include "test/TestLogger.h"
-#include "tokenizer/Token.h"
 #include "tokenizer/Tokenizer.h"
-    
+#include "tokenizer/TokenStream.h"
+
 static unsigned int done = 0;
 static unsigned int tests = 0;
 static TestLogger logger;
+static bool run = true;
 
-static void forEachFile(const std::string& path, const std::string& ending, void (*f) (const std::string&, const std::string&))
-{
+static void forEachFile(const std::string& path, const std::string& ending, bool (*f) (const std::string&, const std::string&)) {
     DIR* dir;
     dir = opendir(path.c_str());
     struct dirent* entry = nullptr;
-    if(dir != nullptr)
-    {
-        while((entry = readdir(dir)) != nullptr)
-        {
-            if(strcmp(entry->d_name, ".") == 0 || strcmp(entry->d_name, "..") == 0)
-            {
+    if(dir != nullptr) {
+        while(run && (entry = readdir(dir)) != nullptr) {
+            if(strcmp(entry->d_name, ".") == 0 || strcmp(entry->d_name, "..") == 0) {
                 continue;
             }
-            if(entry->d_type == DT_DIR) // Folder
-            {
+            if(entry->d_type == DT_DIR) {
                 forEachFile(path + "/" + entry->d_name, ending, f);
-            }
-            else if(entry->d_type == DT_REG) // File
-            {
-                if(strchr(entry->d_name, '.') == nullptr)
-                {
+            } else if(entry->d_type == DT_REG) {
+                if(strchr(entry->d_name, '.') == nullptr) {
                     std::string pathInputFile = path + "/" + entry->d_name;
                     std::string pathOutputFile = pathInputFile + ending;
-                    f(pathInputFile, pathOutputFile);
+                    run = f(pathInputFile, pathOutputFile);
                 }
             }
         }
@@ -45,46 +38,37 @@ static void forEachFile(const std::string& path, const std::string& ending, void
     }
 }
 
-static void testTokenizer(const char* path)
-{
+static void testTokenizer(const char* path) {
     done = 0;
     tests = 0;
-    forEachFile(path, ".tout", [](const std::string& input, const std::string& output)
-    {
+    run = true;
+    forEachFile(path, ".tout", [](const std::string& input, const std::string & output) {
         tests++;
-        
-        std::ifstream iStream;
+
+        Tokenizer::if32stream iStream;
         iStream.open(input);
-        
+
         std::ifstream oStream;
         oStream.open(output);
-        
-        if(!iStream.good() || !oStream.good())
-        {
-            return;
-        }
-        
-        std::vector<Token> tokens;
-        try
-        {
-            Tokenizer::tokenize(tokens, iStream);
+
+        if(!iStream.good() || !oStream.good()) {
+            return false;
         }
-        catch(std::exception& ex)
-        {
-            return;
+
+        TokenStream tokens;
+        if(Tokenizer::tokenize(tokens, iStream)) {
+            return false;
         }
-        
         logger.reset();
-        for(Token& token : tokens)
-        {
-            std::string s = token.toString();
-            logger.print(&s);
+        while(tokens.hasToken()) {
+            std::string s = tokens.nextTokenString();
+                    logger.print(&s);
         }
-        
-        if(logger.check(input, oStream))
-        {
+
+        if(logger.check(input, oStream)) {
             done++;
         }
+        return true;
     });
     std::cout << done << " / " << tests << " tokenizer tests succeeded" << std::endl;
 }
@@ -148,8 +132,7 @@ static void testTokenizer(const char* path)
 //        System.out.println(String.format("%d / %d output tests succeeded", done, tests));
 //}
 
-void Test::start(const char* path)
-{
+void Test::start(const char* path) {
     testTokenizer(path);
     //testCompiler();
     //testOutput();

+ 1 - 2
test/Test.h

@@ -1,8 +1,7 @@
 #ifndef TEST_H
 #define TEST_H
 
-namespace Test
-{
+namespace Test {
     void start(const char* path);
 }
 

+ 21 - 40
test/TestLogger.cpp

@@ -1,30 +1,23 @@
 #include "TestLogger.h"
 
-TestLogger::TestLogger()
-{
+TestLogger::TestLogger() {
 }
 
-TestLogger::~TestLogger()
-{
+TestLogger::~TestLogger() {
 }
 
-void TestLogger::print(const std::string* message, const std::exception* ex, 
-        const std::string* function, const std::string* scriptname, const Script* sc, int line)
-{
+void TestLogger::print(const std::string* message, const std::exception* ex,
+        const std::string* function, const std::string* scriptname, const Script* sc, int line) {
     (void) function;
     (void) scriptname;
     (void) sc;
     (void) line;
-    if(ex == nullptr)
-    {
-        if(message != nullptr)
-        {
+    if(ex == nullptr) {
+        if(message != nullptr) {
             int start = 0;
-            while(true)
-            {
+            while(true) {
                 int newLine = message->find('\n', start);
-                if(newLine == -1)
-                {
+                if(newLine == -1) {
                     output.push_back(message->substr(start, message->size() - start));
                     break;
                 }
@@ -32,42 +25,33 @@ void TestLogger::print(const std::string* message, const std::exception* ex,
                 start = newLine + 1;
             }
         }
-    }
-    else
-    {
+    } else {
         output.push_back(ex->what());
     }
 }
 
-void TestLogger::reset()
-{
+void TestLogger::reset() {
     output.clear();
 }
 
-bool TestLogger::check(const std::string& name, std::ifstream& check)
-{
+bool TestLogger::check(const std::string& name, std::ifstream& check) {
     std::vector<std::string> file;
-    
-    while(!check.eof())
-    {
+
+    while(!check.eof()) {
         std::string line;
         std::getline(check, line);
-        if(!check.eof())
-        {
+        if(!check.eof()) {
             file.push_back(line);
         }
     }
-    
-    if(file.size() != output.size())
-    {
+
+    if(file.size() != output.size()) {
         std::cout << file.size() << " " << output.size() << std::endl;
         printNoMatch(name, file, 0);
         return false;
     }
-    for(size_t i = 0; i < file.size(); i++)
-    {
-        if(file[i] != output[i])
-        {
+    for(size_t i = 0; i < file.size(); i++) {
+        if(file[i] != output[i]) {
             printNoMatch(name, file, i + 1);
             return false;
         }
@@ -75,17 +59,14 @@ bool TestLogger::check(const std::string& name, std::ifstream& check)
     return true;
 }
 
-void TestLogger::printNoMatch(const std::string& name, std::vector<std::string>& file, unsigned int line)
-{
+void TestLogger::printNoMatch(const std::string& name, std::vector<std::string>& file, unsigned int line) {
     std::cout << "error checking " << name << ", error starting at " << line << "\n";
     std::cout << "Expected ---------------------------------------------\n";
-    for(unsigned int i = 0; i < file.size(); i++)
-    {
+    for(unsigned int i = 0; i < file.size(); i++) {
         std::cout << file[i] << "\n";
     }
     std::cout << "Actual -----------------------------------------------\n";
-    for(unsigned int i = 0; i < output.size(); i++)
-    {
+    for(unsigned int i = 0; i < output.size(); i++) {
         std::cout << output[i] << "\n";
     }
     std::cout << "------------------------------------------------------\n";

+ 0 - 54
tokenizer/Token.cpp

@@ -1,54 +0,0 @@
-#include <sstream>
-
-#include "tokenizer/Token.h"
-
-Token::Token(TokenType tt, unsigned int line, const std::string& text, double number) : 
-    type(tt), line(line), text(text), number(number)
-{
-}
-
-Token::Token(TokenType tt, unsigned int line) : Token(tt, line, "", 0)
-{
-}
-
-Token::Token(TokenType tt, unsigned int line, double number) : Token(tt, line, "", number)
-{
-}
-
-Token::Token(TokenType tt, unsigned int line, const std::string& text) : Token(tt, line, text, 0)
-{
-}
-
-TokenType Token::getType() const
-{
-    return type;
-}
-
-int Token::getLine() const
-{
-    return line;
-}
-
-std::string Token::toString() const
-{
-    std::stringstream ss;
-    ss << '(';
-    ss << line;
-    ss << ", ";
-    ss << TokenTypeUtils::getEnumName(type);
-    if(type == TokenType::LITERAL || type == TokenType::STRING || type == TokenType::LABEL)
-    {
-        ss << ", \"";
-        ss << text;
-        ss << "\"";
-    }
-    else if(type == TokenType::NUMBER)
-    {
-        ss << ", ";
-        char buffer[20];
-        snprintf(buffer, 20, (number == (long) number) ? "%lg.0" : "%lg", number);
-        ss << buffer;
-    }
-    ss << ')';
-    return ss.str();
-}

+ 0 - 26
tokenizer/Token.h

@@ -1,26 +0,0 @@
-#ifndef TOKEN_H
-#define TOKEN_H
-
-#include "tokenizer/TokenType.h"
-
-class Token final
-{
-public:
-    Token(TokenType tt, unsigned int line, const std::string& text, double number);
-    Token(TokenType tt, unsigned int line);
-    Token(TokenType tt, unsigned int line, double number);
-    Token(TokenType tt, unsigned int line, const std::string& text);
-    
-    TokenType getType() const;
-    int getLine() const;
-    
-    std::string toString() const;
-    
-private:
-    TokenType type;
-    unsigned int line;
-    std::string text;
-    double number;
-};
-
-#endif

+ 97 - 0
tokenizer/TokenStream.cpp

@@ -0,0 +1,97 @@
+#include <cstring>
+
+#include "TokenStream.h"
+
+// Creates a stream whose read cursor starts at byte offset 0.
+TokenStream::TokenStream()
+    : nextToken(0) {
+}
+
+// True while the read cursor has not yet reached the end of the byte buffer.
+bool TokenStream::hasToken() const {
+    const size_t stored = bytes.size();
+    return stored > nextToken;
+}
+
+// Consumes the next token from the buffer and renders it as text:
+// "(line, TYPE)" for plain tokens, with the quoted text appended for
+// STRING / LITERAL / LABEL tokens and the value appended for NUMBER tokens.
+std::string TokenStream::nextTokenString() {
+    // the line number is stored in front of the token type
+    const unsigned int tokenLine = nextLine();
+    const TokenType tokenType = nextTokenType();
+
+    std::string result = "(";
+    result += std::to_string(tokenLine);
+    result += ", ";
+    result += TokenTypeUtils::getEnumName(tokenType);
+
+    const bool hasText = tokenType == TokenType::STRING ||
+            tokenType == TokenType::LITERAL ||
+            tokenType == TokenType::LABEL;
+    if(hasText) {
+        result += ", \"";
+        result += nextString();
+        result += "\"";
+    } else if(tokenType == TokenType::NUMBER) {
+        const double value = nextDouble();
+        // values equal to their integer part get an explicit ".0" suffix
+        char formatted[32];
+        snprintf(formatted, sizeof (formatted), (value == (long) value) ? "%lg.0" : "%lg", value);
+        result += ", ";
+        result += formatted;
+    }
+
+    result += ")";
+    return result;
+}
+
+// Reads the raw bytes of one TokenType at the cursor; the local starts out as
+// EOF_TOKEN before the read overwrites it.
+TokenType TokenStream::nextTokenType() {
+    TokenType result = TokenType::EOF_TOKEN;
+    read(&result, sizeof (result));
+    return result;
+}
+
+// Reads the raw bytes of one line number (unsigned int) at the cursor.
+unsigned int TokenStream::nextLine() {
+    unsigned int result = 0;
+    read(&result, sizeof (result));
+    return result;
+}
+
+// Reads a zero-terminated string at the cursor and returns it without the
+// terminator. Reading also stops once the buffer is exhausted, so a missing
+// terminator cannot make the loop run past the stored bytes.
+std::string TokenStream::nextString() {
+    std::string text;
+    while(hasToken()) {
+        char c = '\0';
+        read(&c, 1);
+        if(c == '\0') {
+            break;
+        }
+        text += c;
+    }
+    return text;
+}
+
+// Reads sizeof(double) raw bytes at the cursor and returns them as a double.
+double TokenStream::nextDouble() {
+    double value;
+    read(&value, sizeof (value));
+    return value;
+}
+
+// Appends `length` raw bytes starting at `data` to the end of the byte buffer.
+void TokenStream::write(const void* data, size_t length) {
+    const char* first = reinterpret_cast<const char*> (data);
+    bytes.insert(bytes.end(), first, first + length);
+}
+
+// Copies `length` bytes from the cursor position into `data` and advances the
+// cursor by `length`.
+// If fewer than `length` bytes remain, nothing is copied: `data` is zero-filled
+// and the cursor is moved to the end of the buffer, so hasToken() reports
+// false afterwards instead of memcpy reading past the stored bytes.
+void TokenStream::read(void* data, size_t length) {
+    const size_t stored = bytes.size();
+    const size_t available = nextToken < stored ? stored - nextToken : 0;
+    if(length > available) {
+        memset(data, 0, length);
+        nextToken = stored;
+        return;
+    }
+    memcpy(data, bytes.data() + nextToken, length);
+    nextToken += length;
+}
+
+// Stores a token without payload: the line number first, then the token type.
+void TokenStream::add(TokenType type, unsigned int line) {
+    write(&line, sizeof (line));
+    write(&type, sizeof (type));
+}
+
+// Stores a numeric token: line number, token type, then the raw bytes of `d`.
+void TokenStream::add(TokenType type, unsigned int line, double d) {
+    write(&line, sizeof (line));
+    write(&type, sizeof (type));
+    write(&d, sizeof (d));
+}
+
+// Stores a text token: line number, token type, then the characters of `text`
+// followed by a single '\0' terminator.
+void TokenStream::add(TokenType type, unsigned int line, const std::string& text) {
+    write(&line, sizeof (line));
+    write(&type, sizeof (type));
+    // c_str() is zero-terminated, so length() + 1 bytes include the terminator
+    write(text.c_str(), text.length() + 1);
+}

+ 33 - 0
tokenizer/TokenStream.h

@@ -0,0 +1,33 @@
+#ifndef TOKENSTREAM_H
+#define TOKENSTREAM_H
+
+#include <vector>
+
+#include "tokenizer/TokenType.h"
+
+// Token sequence stored as raw bytes in one flat char buffer.
+// The add() overloads append a token; the next*() functions read the stored
+// fields back starting at an internal read cursor.
+class TokenStream final {
+public:
+    TokenStream();
+
+    // true while the read cursor is in front of the end of the buffer
+    bool hasToken() const;
+    // consumes one whole token (line, type, payload) and formats it as text
+    std::string nextTokenString();
+
+    // field readers; each one consumes the bytes of a single field
+    TokenType nextTokenType();
+    unsigned int nextLine();
+    std::string nextString();
+    double nextDouble();
+
+    // token writers: plain token, NUMBER-style payload, text payload
+    void add(TokenType type, unsigned int line);
+    void add(TokenType type, unsigned int line, double d);
+    void add(TokenType type, unsigned int line, const std::string& text);
+
+private:
+    // raw byte access used by the functions above
+    void write(const void* data, size_t length);
+    void read(void* data, size_t length);
+
+    // offset into `bytes` at which the next read starts
+    size_t nextToken;
+    // serialized tokens
+    std::vector<char> bytes;
+};
+
+#endif

+ 4 - 8
tokenizer/TokenType.cpp

@@ -1,9 +1,7 @@
 #include "TokenType.h"
 
-std::string TokenTypeUtils::getName(TokenType tt)
-{
-    switch(tt)
-    {
+std::string TokenTypeUtils::getName(TokenType tt) {
+    switch(tt) {
         case NUMBER: return "number";
         case STRING: return "string";
         case LITERAL: return "literal";
@@ -68,10 +66,8 @@ std::string TokenTypeUtils::getName(TokenType tt)
     return "Unknown TokenType";
 }
 
-std::string TokenTypeUtils::getEnumName(TokenType tt)
-{
-    switch(tt)
-    {
+std::string TokenTypeUtils::getEnumName(TokenType tt) {
+    switch(tt) {
         case NUMBER: return "NUMBER";
         case STRING: return "STRING";
         case LITERAL: return "LITERAL";

+ 231 - 285
tokenizer/Tokenizer.cpp

@@ -1,220 +1,198 @@
 #include <sstream>
 
 #include "tokenizer/Tokenizer.h"
-#include "exceptions/PreScriptException.h"
 
 static unsigned int line = 1;
-static std::vector<Token>* tokens = nullptr;
-static std::istream* input = nullptr;
-static int buffer = -1;
+static TokenStream* tokens = nullptr;
+static Tokenizer::i32stream* input = nullptr;
+static char32_t buffer = 0;
 
-static bool isLetter(int c)
-{
+// Prints the error message followed by " Line: <line>" to standard output.
+static void onError(const std::string& message, unsigned int line) {
+    std::cout << message;
+    std::cout << " Line: " << line;
+    std::cout << std::endl;
+}
+
+// Encodes the code point `c` as a zero-terminated UTF-8 sequence in `buffer`.
+// Between one and four bytes plus the terminator are written, so `buffer`
+// must provide room for five chars. The value of `c` is not validated; every
+// value above 0xFFFF uses the four byte form.
+static void convertChar(char32_t c, char* buffer) {
+    // number of continuation bytes plus marker and payload mask of the first byte
+    int tail = 3;
+    char32_t marker = 0xF0;
+    char32_t mask = 0x07;
+    if(c <= 0x7F) {
+        tail = 0;
+        marker = 0x00;
+        mask = 0x7F;
+    } else if(c <= 0x7FF) {
+        tail = 1;
+        marker = 0xC0;
+        mask = 0x1F;
+    } else if(c <= 0xFFFF) {
+        tail = 2;
+        marker = 0xE0;
+        mask = 0x0F;
+    }
+
+    buffer[0] = (char) (marker | ((c >> (6 * tail)) & mask));
+    // each continuation byte carries six payload bits, highest bits first
+    for(int i = 1; i <= tail; i++) {
+        buffer[i] = (char) (0x80 | ((c >> (6 * (tail - i))) & 0x3F));
+    }
+    buffer[tail + 1] = '\0';
+}
+
+static bool isLetter(char32_t c) {
     return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z');
 }
 
-static bool isDigit(int c)
-{
+static bool isDigit(char32_t c) {
     return c >= '0' && c <= '9';
 }
 
-static bool isValidNamePart(int c)
-{
-    return isLetter(c) || isDigit(c) || c == '.' || c == '_';
+static bool isValidNameStart(char32_t c) {
+    return isLetter(c) || c == '.' || c == '_';
 }
 
-static int next()
-{
-    if(buffer != -1)
-    {
-        int r = buffer;
-        buffer = -1;
-        return r;
-    }
-    int data = input->get();
-    if(!input->good())
-    {
-        return -1;
-    }
-    if((data & 0x80) != 0 && data != -1) // special char
-    {
-        if((data & 0x40) != 0) // this should always be true
-        {
-            if((data & 0x20) != 0) // 3 byte unicode
-            {
-                int a = input->get();
-                int b = input->get();
-                data = ((data & 0xFF) << 16) | ((a & 0xFF) << 8) | (b & 0xFF);
-            }
-            else // 2 byte unicode
-            {
-                data = ((data & 0xFF) << 8) | (input->get() & 0xFF);
-            }
-        }
-        else
-        {
-            // should not happen as unicode starts with 11
-        }
-    }
-    return data;
+static bool isValidNamePart(char32_t c) {
+    return isDigit(c) || isValidNameStart(c);
 }
 
-static int peek()
-{
-    if(buffer == -1)
-    {
-        buffer = next();
-        return buffer;
+static bool next(char32_t& c) {
+    if(buffer != 0) {
+        c = buffer;
+        buffer = 0;
+        return true;
     }
-    return buffer;
+    c = input->get();
+    return input->good();
 }
 
-static bool next(char c)
-{
-    if(peek() == c)
-    {
-        next();
+static bool peek(char32_t& c) {
+    if(buffer != 0 || next(buffer)) {
+        c = buffer;
         return true;
     }
     return false;
 }
 
-static void add(TokenType type)
-{
-    tokens->push_back(Token(type, line));
+// Consumes the upcoming character only if it equals `c`.
+// Returns true when a character was consumed, false otherwise (including
+// when no further character can be peeked).
+static bool nextIf(char32_t c) {
+    char32_t upcoming;
+    if(!peek(upcoming) || upcoming != c) {
+        return false;
+    }
+    next(upcoming);
+    return true;
+}
 
-static void add(TokenType type, double number)
-{
-    tokens->push_back(Token(type, line, number));
+static void add(TokenType type) {
+    tokens->add(type, line);
 }
 
-static void add(TokenType type, const std::string& text)
-{
-    tokens->push_back(Token(type, line, text));
+static void add(TokenType type, const std::string& text) {
+    tokens->add(type, line, text);
 }
 
-static void add(char c, TokenType t1, TokenType t2, TokenType t3, TokenType t4)
-{
-    int p = peek();
-    if(p == c)
-    {
-        next();
-        if(peek() == '=')
-        {
-            next();
-            add(t1);
+static TokenType chooseTokenType(char c, TokenType aCharEqual, TokenType aChar, TokenType aEqual, TokenType other) {
+    if(nextIf(c)) {
+        if(nextIf('=')) {
+            return aCharEqual;
         }
-        else
-        {
-            add(t2);
-        }
-    }
-    else if(p == '=')
-    {
-        next();
-        add(t3);
-    }
-    else
-    {
-        add(t4);
+        return aChar;
+    } else if(nextIf('=')) {
+        return aEqual;
     }
+    return other;
 }
 
-static void handleLiteral(int c, TokenType type)
-{
+static bool handleLiteral(char32_t c, TokenType type) {
     std::stringstream sBuilder;
     sBuilder << (char) c;
 
-    while(true)
-    {
-        int data = peek();
-        if(!isValidNamePart(data))
-        {
+    while(true) {
+        char32_t data;
+        if(!peek(data) || !isValidNamePart(data)) {
             break;
         }
         sBuilder << (char) data;
-        next();
+        next(data);
     }
 
     std::string s = sBuilder.str();
-    if(s == "if") { add(TokenType::IF); }
-    else if(s == "if") { add(TokenType::IF); }
-    else if(s == "else") { add(TokenType::ELSE); }
-    else if(s == "elseif") { add(TokenType::ELSEIF); }
-    else if(s == "while") { add(TokenType::WHILE); }
-    else if(s == "try") { add(TokenType::TRY); }
-    else if(s == "catch") { add(TokenType::CATCH); }
-    else if(s == "for") { add(TokenType::FOR); }
-    else if(s == "function") { add(TokenType::FUNCTION); }
-    else if(s == "break") { add(TokenType::BREAK); }
-    else if(s == "continue") { add(TokenType::CONTINUE); }
-    else if(s == "return") { add(TokenType::RETURN); }
-    else if(s == "true") { add(TokenType::TRUE); }
-    else if(s == "false") { add(TokenType::FALSE); }
-    else if(s == "null") { add(TokenType::NULL_TOKEN); }
-    else { add(type, s); };
-
+    if(s == "if") {
+        add(TokenType::IF);
+    } else if(s == "if") {
+        add(TokenType::IF);
+    } else if(s == "else") {
+        add(TokenType::ELSE);
+    } else if(s == "elseif") {
+        add(TokenType::ELSEIF);
+    } else if(s == "while") {
+        add(TokenType::WHILE);
+    } else if(s == "try") {
+        add(TokenType::TRY);
+    } else if(s == "catch") {
+        add(TokenType::CATCH);
+    } else if(s == "for") {
+        add(TokenType::FOR);
+    } else if(s == "function") {
+        add(TokenType::FUNCTION);
+    } else if(s == "break") {
+        add(TokenType::BREAK);
+    } else if(s == "continue") {
+        add(TokenType::CONTINUE);
+    } else if(s == "return") {
+        add(TokenType::RETURN);
+    } else if(s == "true") {
+        add(TokenType::TRUE);
+    } else if(s == "false") {
+        add(TokenType::FALSE);
+    } else if(s == "null") {
+        add(TokenType::NULL_TOKEN);
+    } else {
+        add(type, s);
+    }
+    return false;
 }
 
-static void handleNumber(int c)
-{
-    double d = c - '0';
-    while(true)
-    {
-        int data = peek();
-        if(!isDigit(data))
-        {
-            if(data == '.')
-            {
-                next();
-                double factor = 10;
-                while(true)
-                {
-                    int data = peek();
-                    if(!isDigit(data))
-                    {
-                        break;
-                    }
-                    d += (data - '0') / factor;
-                    factor *= 10;
-                    next();
-                }
+static bool handleNumber(char32_t c) {
+    double number = c - '0';
+    char32_t data;
+    while(peek(data)) {
+        if(!isDigit(data)) {
+            if(data != '.') {
+                break;
+            }
+            next(data);
+            double factor = 10;
+            while(peek(data) && isDigit(data)) {
+                number += (data - '0') / factor;
+                factor *= 10;
+                next(data);
             }
             break;
         }
-        d = (d * 10) + (data - '0');
-        next();
+        number = (number * 10) + (data - '0');
+        next(data);
     }
-
-    add(NUMBER, d);
+    tokens->add(NUMBER, line, number);
+    return false;
 }
 
-static void handleString()
-{
+static bool handleString() {
     std::stringstream ss;
-    int oldLine = line;
-    while(true)
-    {
-        int data = next();
-        if(data == -1)
-        {
-            throw PreScriptException("non closed string literal", oldLine);
+    unsigned int oldLine = line;
+    while(true) {
+        char32_t data;
+        if(!next(data)) {
+            onError("non closed string literal", oldLine);
+            return true;
         }
-        if(data == '"')
-        {
+        if(data == '"') {
             add(STRING, ss.str());
-            break;
+            return false;
         }
-        if(data == '\n')
-        {
+        if(data == '\n') {
             line++;
         }
-        if(data == '\\')
-        {
-            int escape = next();
-            switch(escape)
-            {
+        if(data == '\\') {
+            char32_t escape;
+            if(!next(escape)) {
+                onError("missing escaped character", line);
+                return true;
+            }
+            switch(escape) {
                 case 'n': data = '\n';
                     break;
                 case '\\': data = '\\';
@@ -222,168 +200,136 @@ static void handleString()
                 case '"': data = '"';
                     break;
                 default:
-                    throw PreScriptException("invalid escaped character", line);
+                    onError("invalid escaped character", line);
+                    return true;
             }
         }
-        if(data > 0xFFFF)
-        {
-            ss << (char) ((data & 0xFF0000) >> 16);
-            ss << (char) ((data & 0xFF00) >> 8);
-            ss << (char) (data & 0xFF);
-        }
-        else if(data > 0xFF)
-        {
-            ss << (char) ((data & 0xFF00) >> 8);
-            ss << (char) (data & 0xFF);
-        }
-        else
-        {
-            ss << (char) data;
-        }
+        char buffer[5];
+        convertChar(data, buffer);
+        ss << buffer;
     }
 }
 
-static void handleOneLineComment()
-{
-    while(true)
-    {
-        int data = next();
-        if(data == -1 || data == '\n')
-        {
-            line++;
-            break;
-        }
-    }
+static bool handleOneLineComment() {
+    char32_t data;
+    while(next(data) && data != '\n');
+    line++;
+    return false;
 }
 
-static void handleMultiLineComment()
-{
-    int first;
-    int sec = -1;
-    while(true)
-    {
+static bool handleMultiLineComment() {
+    char32_t first;
+    char32_t sec = 0;
+    unsigned int oldLine = line;
+    while(true) {
         first = sec;
-        sec = next();
-        if(sec == -1 || (first == '*' && sec == '/'))
-        {
-            break;
+        if(!next(sec)) {
+            onError("unclosed multiline comment", oldLine);
+            return true;
         }
-        if(sec == '\n')
-        {
-            line++;
+        if(first == '*' && sec == '/') {
+            return false;
         }
+        line += (sec == '\n');
     }
 }
 
-static void handleSlash()
-{
-    switch(peek())
-    {
-        case '/':
-            next();
-            handleOneLineComment();
-            break;
-        case '*':
-            next();
-            handleMultiLineComment();
-            break;
-        case '=':
-            next();
-            add(DIV_SET);
-            break;
-        default:
-            add(DIV);
+static bool handleSlash() {
+    if(nextIf('/')) {
+        return handleOneLineComment();
+    } else if(nextIf('*')) {
+        return handleMultiLineComment();
+    } else if(nextIf('=')) {
+        add(DIV_SET);
+        return false;
     }
+    add(DIV);
+    return false;
 }
 
-static void handleSpecial(int c)
-{
-    switch(c)
-    {
+// Handles a character that starts neither a name nor a number: whitespace,
+// strings, brackets, punctuation, operators and comments.
+// Returns false when the character was handled and true when tokenizing has
+// to stop (an error was reported by this function or by a called handler).
+// Every handled case returns directly; only unknown characters reach the
+// error report behind the switch.
+static bool handleSpecial(char32_t c) {
+    switch(c) {
         case ' ':
         case '\t':
-        case '\r': break;
+        case '\r':
+            return false;
         case '\n': line++;
-            break;
-        case '"': handleString();
-            break;
+            return false;
+        case '"':
+            return handleString();
         case '(': add(OPEN_BRACKET);
-            break;
+            return false;
         case ')': add(CLOSE_BRACKET);
-            break;
+            return false;
         case '[': add(OPEN_SQUARE_BRACKET);
-            break;
+            return false;
         case ']': add(CLOSE_SQUARE_BRACKET);
-            break;
+            return false;
         case '{': add(OPEN_CURVED_BRACKET);
-            break;
+            return false;
         case '}': add(CLOSE_CURVED_BRACKET);
-            break;
-        case '$': handleLiteral(c, LITERAL);
-            break;
-        case '@': handleLiteral(c, LABEL);
-            break;
+            return false;
+        case '$':
+            return handleLiteral(c, LITERAL);
+        case '@':
+            return handleLiteral(c, LABEL);
         case ';': add(SEMICOLON);
-            break;
+            return false;
         case ',': add(COMMA);
-            break;
+            return false;
         case '~': add(BIT_INVERT);
-            break;
-        case '+': add(next('=') ? ADD_SET: (next('+') ? INC: ADD));
-            break;
-        case '-': add(next('=') ? SUB_SET: (next('-') ? DEC: SUB));
-            break;
-        case '!': add(next('=') ? NOT_EQUAL: INVERT);
-            break;
-        case '=': add(next('=') ? EQUAL: SET);
-            break;
-        case '*': add(next('=') ? MUL_SET: MUL);
-            break;
-        case '/': handleSlash();
-            break;
-        case '%': add(next('=') ? MOD_SET: MOD);
-            break;
-        case '&': add(next('=') ? BIT_AND_SET: (next('&') ? AND: BIT_AND));
-            break;
-        case '|': add(next('=') ? BIT_OR_SET: (next('|') ? OR: BIT_OR));
-            break;
-        case '^': add(next('=') ? BIT_XOR_SET: BIT_XOR);
-            break;
-        case '<': add('<', LEFT_SHIFT_SET, LEFT_SHIFT, LESS_EQUAL, LESS);
-            break;
-        case '>': add('>', RIGHT_SHIFT_SET, RIGHT_SHIFT, GREATER_EQUAL, GREATER);
-            break;
-        default: throw PreScriptException("unknown token " + c, line);
+            return false;
+        case '+': add(nextIf('=') ? ADD_SET: (nextIf('+') ? INC: ADD));
+            return false;
+        case '-': add(nextIf('=') ? SUB_SET: (nextIf('-') ? DEC: SUB));
+            return false;
+        case '!': add(nextIf('=') ? NOT_EQUAL: INVERT);
+            return false;
+        case '=': add(nextIf('=') ? EQUAL: SET);
+            return false;
+        case '*': add(nextIf('=') ? MUL_SET: MUL);
+            return false;
+        case '/':
+            return handleSlash();
+        case '%': add(nextIf('=') ? MOD_SET: MOD);
+            return false;
+        case '&': add(nextIf('=') ? BIT_AND_SET: (nextIf('&') ? AND: BIT_AND));
+            return false;
+        case '|': add(nextIf('=') ? BIT_OR_SET: (nextIf('|') ? OR: BIT_OR));
+            return false;
+        case '^': add(nextIf('=') ? BIT_XOR_SET: BIT_XOR);
+            return false;
+        case '<': add(chooseTokenType('<', LEFT_SHIFT_SET, LEFT_SHIFT, LESS_EQUAL, LESS));
+            return false;
+        case '>': add(chooseTokenType('>', RIGHT_SHIFT_SET, RIGHT_SHIFT, GREATER_EQUAL, GREATER));
+            return false;
     }
+    char buffer[5];
+    convertChar(c, buffer);
+    onError(std::string("unknown token '") + buffer + "'", line);
+    return true;
 }
 
-static void handleChar(int c)
-{
-    if(isLetter(c) || c == '_' || c == '.')
-    {
-        handleLiteral(c, TokenType::LITERAL);
-    }
-    else if(isDigit(c))
-    {
-        handleNumber(c);
-    }
-    else
-    {
-        handleSpecial(c);
+static bool handleChar(char32_t c) {
+    if(isValidNameStart(c)) {
+        return handleLiteral(c, TokenType::LITERAL);
+    } else if(isDigit(c)) {
+        return handleNumber(c);
     }
+    return handleSpecial(c);
 }
 
-void Tokenizer::tokenize(std::vector<Token>& inTokens, std::istream& inInput)
-{
+bool Tokenizer::tokenize(TokenStream& inTokens, i32stream& inInput) {
     tokens = &inTokens;
     input = &inInput;
-    
     line = 1;
-    buffer = -1;
-    int c;
-    while((c = next()) != -1)
-    {
-        handleChar(c);
+    buffer = 0;
+    char32_t c;
+    while(next(c)) {
+        if(handleChar(c)) {
+            return true;
+        }
     }
     add(EOF_TOKEN);
+    return false;
 }

+ 5 - 4
tokenizer/Tokenizer.h

@@ -4,11 +4,12 @@
 #include <iostream>
 #include <vector>
 
-#include "tokenizer/Token.h"
+#include "tokenizer/TokenStream.h"
 
-namespace Tokenizer
-{
-    void tokenize(std::vector<Token>& tokens, std::istream& input);
+namespace Tokenizer {
+    typedef std::basic_istream<char32_t> i32stream;
+    typedef std::basic_ifstream<char32_t> if32stream;
+    bool tokenize(TokenStream& tokens, i32stream& input);
 }
 
 #endif