// Tokenizer.cpp
#include "Tokenizer.h"

#include <ctype.h>
#include <memory>
#include <sstream>
#include <utility>

#include "../exceptions/PreScriptException.h"
  5. Tokenizer::Tokenizer()
  6. {
  7. }
  8. void Tokenizer::tokenize(vector<unique_ptr<Token>>& tokens, vector<unique_ptr<istream>>& streams)
  9. {
  10. Tokenizer::tokens = &tokens;
  11. Tokenizer::streams = &streams;
  12. for(streamIndex = 0; streamIndex < streams.size(); streamIndex++)
  13. {
  14. buffer = -1;
  15. line = 1;
  16. int c;
  17. while((c = next()) != -1)
  18. {
  19. handleChar(c);
  20. }
  21. }
  22. add(TokenType::EOF_TOKEN);
  23. }
// Returns the next input unit from the current stream, or -1 at end of input.
// Multi-byte UTF-8 sequences are packed big-endian into a single int
// (2 bytes -> 0xAABB, 3 bytes -> 0xAABBCC), so one return value carries
// all raw bytes of one code point; handleString() unpacks them again.
int Tokenizer::next()
{
    // a character pushed back by peek() takes priority
    if(buffer != -1)
    {
        int r = buffer;
        buffer = -1;
        return r;
    }
    istream& in = *(*streams)[streamIndex].get();
    if(!in.good())
    {
        return -1;
    }
    int data = in.get();
    // high bit set means a UTF-8 lead byte; data != -1 excludes EOF,
    // whose two's-complement pattern also has the high bit set
    if((data & 0x80) != 0 && data != -1) // special char
    {
        if((data & 0x40) != 0) // this should always be true
        {
            // NOTE(review): a 4-byte lead (0xF0..) also has bit 0x20 set and is
            // consumed here as only 3 bytes — confirm inputs stay within the BMP
            if((data & 0x20) != 0) // 3 byte unicode
            {
                int a = in.get();
                int b = in.get();
                data = ((data & 0xFF) << 16) | ((a & 0xFF) << 8) | (b & 0xFF);
            }
            else // 2 byte unicode
            {
                data = ((data & 0xFF) << 8) | (in.get() & 0xFF);
            }
        }
        else
        {
            // should not happen as unicode starts with 11
        }
    }
    return data;
}
  60. int Tokenizer::peek()
  61. {
  62. if(buffer == -1)
  63. {
  64. buffer = next();
  65. return buffer;
  66. }
  67. return buffer;
  68. }
  69. bool Tokenizer::next(char c)
  70. {
  71. if(peek() == c)
  72. {
  73. next();
  74. return true;
  75. }
  76. return false;
  77. }
  78. void Tokenizer::add(TokenType type)
  79. {
  80. tokens->push_back(unique_ptr<Token>(new Token(type, line)));
  81. }
  82. void Tokenizer::add(TokenType type, double data)
  83. {
  84. tokens->push_back(unique_ptr<Token>(new DoubleToken(type, line, data)));
  85. }
  86. void Tokenizer::add(TokenType type, string data)
  87. {
  88. tokens->push_back(unique_ptr<Token>(new StringToken(type, line, data)));
  89. }
  90. void Tokenizer::add(char c, TokenType t1, TokenType t2, TokenType t3, TokenType t4)
  91. {
  92. int peeked = peek();
  93. if(peeked == c)
  94. {
  95. next();
  96. if(peek() == '=')
  97. {
  98. next();
  99. add(t1);
  100. }
  101. else
  102. {
  103. add(t2);
  104. }
  105. }
  106. else if(peeked == '=')
  107. {
  108. next();
  109. add(t3);
  110. }
  111. else
  112. {
  113. add(t4);
  114. }
  115. }
  116. void Tokenizer::handleChar(int c)
  117. {
  118. if(isLetter(c) || c == '_' || c == '.')
  119. {
  120. handleLiteral(c, TokenType::LITERAL);
  121. }
  122. else if(isDigit(c))
  123. {
  124. handleNumber(c);
  125. }
  126. else
  127. {
  128. handleSpecial(c);
  129. }
  130. }
  131. void Tokenizer::handleLiteral(int c, TokenType type)
  132. {
  133. stringstream ss;
  134. ss << (char) c;
  135. while(true)
  136. {
  137. int data = peek();
  138. if(!isValidNamePart(data))
  139. {
  140. break;
  141. }
  142. ss << (char) data;
  143. next();
  144. }
  145. string s = ss.str();
  146. if(s == "if") { add(TokenType::IF); }
  147. else if(s == "if") { add(TokenType::IF); }
  148. else if(s == "else") { add(TokenType::ELSE); }
  149. else if(s == "elseif") { add(TokenType::ELSEIF); }
  150. else if(s == "while") { add(TokenType::WHILE); }
  151. else if(s == "try") { add(TokenType::TRY); }
  152. else if(s == "catch") { add(TokenType::CATCH); }
  153. else if(s == "for") { add(TokenType::FOR); }
  154. else if(s == "function") { add(TokenType::FUNCTION); }
  155. else if(s == "break") { add(TokenType::BREAK); }
  156. else if(s == "continue") { add(TokenType::CONTINUE); }
  157. else if(s == "return") { add(TokenType::RETURN); }
  158. else if(s == "true") { add(TokenType::TRUE); }
  159. else if(s == "false") { add(TokenType::FALSE); }
  160. else if(s == "null") { add(TokenType::NULL_TOKEN); }
  161. else { add(type, s); };
  162. }
  163. void Tokenizer::handleNumber(int c)
  164. {
  165. double d = c - '0';
  166. while(true)
  167. {
  168. int data = peek();
  169. if(!isDigit(data))
  170. {
  171. if(data == '.')
  172. {
  173. next();
  174. double factor = 10;
  175. while(true)
  176. {
  177. int data = peek();
  178. if(!isDigit(data))
  179. {
  180. break;
  181. }
  182. d += (data - '0') / factor;
  183. factor *= 10;
  184. next();
  185. }
  186. }
  187. break;
  188. }
  189. d = (d * 10) + (data - '0');
  190. next();
  191. }
  192. add(NUMBER, d);
  193. }
  194. void Tokenizer::handleSpecial(int c)
  195. {
  196. switch(c)
  197. {
  198. case ' ':
  199. case '\t':
  200. case '\r': break;
  201. case '\n': line++; break;
  202. case '"': handleString(); break;
  203. case '(': add(OPEN_BRACKET); break;
  204. case ')': add(CLOSE_BRACKET); break;
  205. case '[': add(OPEN_SQUARE_BRACKET); break;
  206. case ']': add(CLOSE_SQUARE_BRACKET); break;
  207. case '{': add(OPEN_CURVED_BRACKET); break;
  208. case '}': add(CLOSE_CURVED_BRACKET); break;
  209. case '$': handleLiteral(c, LITERAL); break;
  210. case '@': handleLiteral(c, LABEL); break;
  211. case ';': add(SEMICOLON); break;
  212. case ',': add(COMMA); break;
  213. case '~': add(BIT_INVERT); break;
  214. case '+': add(next('=') ? ADD_SET : (next('+') ? INC : ADD)); break;
  215. case '-': add(next('=') ? SUB_SET : (next('-') ? DEC : SUB)); break;
  216. case '!': add(next('=') ? NOT_EQUAL : INVERT); break;
  217. case '=': add(next('=') ? EQUAL : SET); break;
  218. case '*': add(next('=') ? MUL_SET : MUL); break;
  219. case '/': handleSlash(); break;
  220. case '%': add(next('=') ? MOD_SET : MOD); break;
  221. case '&': add(next('=') ? BIT_AND_SET : (next('&') ? AND : BIT_AND)); break;
  222. case '|': add(next('=') ? BIT_OR_SET : (next('|') ? OR : BIT_OR)); break;
  223. case '^': add(next('=') ? BIT_XOR_SET : BIT_XOR); break;
  224. case '<': add('<', LEFT_SHIFT_SET, LEFT_SHIFT, LESS_EQUAL, LESS); break;
  225. case '>': add('>', RIGHT_SHIFT_SET, RIGHT_SHIFT, GREATER_EQUAL, GREATER); break;
  226. default: throw PreScriptException("unknown token " + c, line);
  227. }
  228. }
  229. void Tokenizer::handleString()
  230. {
  231. stringstream ss;
  232. while(true)
  233. {
  234. int data = next();
  235. if(data == '"')
  236. {
  237. add(STRING, ss.str());
  238. break;
  239. }
  240. if(data == '\n')
  241. {
  242. line++;
  243. }
  244. if(data > 0xFFFF)
  245. {
  246. ss << (char) ((data & 0xFF0000) >> 16);
  247. ss << (char) ((data & 0xFF00) >> 8);
  248. ss << (char) (data & 0xFF);
  249. }
  250. else if(data > 0xFF)
  251. {
  252. ss << (char) ((data & 0xFF00) >> 8);
  253. ss << (char) (data & 0xFF);
  254. }
  255. else
  256. {
  257. ss << (char) data;
  258. }
  259. }
  260. }
  261. void Tokenizer::handleSlash()
  262. {
  263. switch(peek())
  264. {
  265. case '/':
  266. next();
  267. handleOneLineComment();
  268. break;
  269. case '*':
  270. next();
  271. handleMultiLineComment();
  272. break;
  273. case '=':
  274. next();
  275. add(DIV_SET);
  276. break;
  277. default:
  278. add(DIV);
  279. }
  280. }
  281. void Tokenizer::handleOneLineComment()
  282. {
  283. while(true)
  284. {
  285. int data = next();
  286. if(data == -1 || data == '\n')
  287. {
  288. line++;
  289. break;
  290. }
  291. }
  292. }
  293. void Tokenizer::handleMultiLineComment()
  294. {
  295. int first;
  296. int sec = -1;
  297. while(true)
  298. {
  299. first = sec;
  300. sec = next();
  301. if(sec == -1 || (first == '*' && sec == '/'))
  302. {
  303. break;
  304. }
  305. if(sec == '\n')
  306. {
  307. line++;
  308. }
  309. }
  310. }
  311. bool Tokenizer::isLetter(int c)
  312. {
  313. return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z');
  314. }
  315. bool Tokenizer::isDigit(int c)
  316. {
  317. return c >= '0' && c <= '9';
  318. }
  319. bool Tokenizer::isValidNamePart(int c)
  320. {
  321. return isLetter(c) || isDigit(c) || c == '.' || c == '_';
  322. }