// Tokenizer.cpp
#include "Tokenizer.h"

#include <ctype.h>
#include <memory>
#include <sstream>
#include <utility>

#include "../exceptions/PreScriptException.h"
  5. Tokenizer::Tokenizer()
  6. {
  7. }
  8. void Tokenizer::tokenize(vector<unique_ptr<Token>>& tokens, vector<unique_ptr<istream>>& streams)
  9. {
  10. Tokenizer::tokens = &tokens;
  11. Tokenizer::streams = &streams;
  12. for(streamIndex = 0; streamIndex < streams.size(); streamIndex++)
  13. {
  14. buffer = -1;
  15. line = 1;
  16. int c;
  17. while((c = next()) != -1)
  18. {
  19. handleChar(c);
  20. }
  21. }
  22. add(TokenType::EOF_TOKEN);
  23. }
// Returns the next input unit from the current stream, or -1 at end of input.
// Multi-byte UTF-8 sequences are packed big-endian into a single int
// (2 bytes -> 0xAABB, 3 bytes -> 0xAABBCC), so one return value carries
// all raw bytes of one code point; handleString() unpacks them again.
int Tokenizer::next()
{
    // a character pushed back by peek() takes priority
    if(buffer != -1)
    {
        int r = buffer;
        buffer = -1;
        return r;
    }
    istream& in = *(*streams)[streamIndex].get();
    if(!in.good())
    {
        return -1;
    }
    int data = in.get();
    // high bit set means a UTF-8 lead byte; data != -1 excludes EOF,
    // whose two's-complement pattern also has the high bit set
    if((data & 0x80) != 0 && data != -1) // special char
    {
        if((data & 0x40) != 0) // this should always be true
        {
            // NOTE(review): a 4-byte lead (0xF0..) also has bit 0x20 set and is
            // consumed here as only 3 bytes — confirm inputs stay within the BMP
            if((data & 0x20) != 0) // 3 byte unicode
            {
                int a = in.get();
                int b = in.get();
                data = ((data & 0xFF) << 16) | ((a & 0xFF) << 8) | (b & 0xFF);
            }
            else // 2 byte unicode
            {
                data = ((data & 0xFF) << 8) | (in.get() & 0xFF);
            }
        }
        else
        {
            // should not happen as unicode starts with 11
        }
    }
    return data;
}
  60. int Tokenizer::peek()
  61. {
  62. if(buffer == -1)
  63. {
  64. buffer = next();
  65. return buffer;
  66. }
  67. return buffer;
  68. }
  69. bool Tokenizer::next(char c)
  70. {
  71. if(peek() == c)
  72. {
  73. next();
  74. return true;
  75. }
  76. return false;
  77. }
  78. void Tokenizer::add(TokenType type)
  79. {
  80. tokens->push_back(unique_ptr<Token>(new Token(type, line)));
  81. }
  82. void Tokenizer::add(TokenType type, double data)
  83. {
  84. tokens->push_back(unique_ptr<Token>(new DoubleToken(type, line, data)));
  85. }
  86. void Tokenizer::add(TokenType type, string data)
  87. {
  88. tokens->push_back(unique_ptr<Token>(new StringToken(type, line, data)));
  89. }
  90. void Tokenizer::add(char c, TokenType t1, TokenType t2, TokenType t3, TokenType t4)
  91. {
  92. int peeked = peek();
  93. if(peeked == c)
  94. {
  95. next();
  96. if(peek() == '=')
  97. {
  98. next();
  99. add(t1);
  100. }
  101. else
  102. {
  103. add(t2);
  104. }
  105. }
  106. else if(peeked == '=')
  107. {
  108. next();
  109. add(t3);
  110. }
  111. else
  112. {
  113. add(t4);
  114. }
  115. }
  116. void Tokenizer::handleChar(int c)
  117. {
  118. if(isLetter(c) || c == '_' || c == '.')
  119. {
  120. handleLiteral(c, TokenType::LITERAL);
  121. }
  122. else if(isDigit(c))
  123. {
  124. handleNumber(c);
  125. }
  126. else
  127. {
  128. handleSpecial(c);
  129. }
  130. }
  131. void Tokenizer::handleLiteral(int c, TokenType type)
  132. {
  133. stringstream ss;
  134. ss << (char) c;
  135. while(true)
  136. {
  137. int data = peek();
  138. if(!isValidNamePart(data))
  139. {
  140. break;
  141. }
  142. ss << (char) data;
  143. next();
  144. }
  145. string s = ss.str();
  146. if(s == "if") { add(TokenType::IF); }
  147. else if(s == "if") { add(TokenType::IF); }
  148. else if(s == "else") { add(TokenType::ELSE); }
  149. else if(s == "elseif") { add(TokenType::ELSEIF); }
  150. else if(s == "while") { add(TokenType::WHILE); }
  151. else if(s == "try") { add(TokenType::TRY); }
  152. else if(s == "catch") { add(TokenType::CATCH); }
  153. else if(s == "for") { add(TokenType::FOR); }
  154. else if(s == "function") { add(TokenType::FUNCTION); }
  155. else if(s == "break") { add(TokenType::BREAK); }
  156. else if(s == "continue") { add(TokenType::CONTINUE); }
  157. else if(s == "return") { add(TokenType::RETURN); }
  158. else if(s == "true") { add(TokenType::TRUE); }
  159. else if(s == "false") { add(TokenType::FALSE); }
  160. else if(s == "null") { add(TokenType::NULL_TOKEN); }
  161. else { add(type, s); };
  162. }
  163. void Tokenizer::handleNumber(int c)
  164. {
  165. double d = c - '0';
  166. while(true)
  167. {
  168. int data = peek();
  169. if(!isDigit(data))
  170. {
  171. if(data == '.')
  172. {
  173. next();
  174. double factor = 10;
  175. while(true)
  176. {
  177. int data = peek();
  178. if(!isDigit(data))
  179. {
  180. break;
  181. }
  182. d += (data - '0') / factor;
  183. factor *= 10;
  184. next();
  185. }
  186. }
  187. break;
  188. }
  189. d = (d * 10) + (data - '0');
  190. next();
  191. }
  192. add(NUMBER, d);
  193. }
  194. void Tokenizer::handleSpecial(int c)
  195. {
  196. switch(c)
  197. {
  198. case ' ':
  199. case '\t':
  200. case '\r': break;
  201. case '\n': line++; break;
  202. case '"': handleString(); break;
  203. case '(': add(OPEN_BRACKET); break;
  204. case ')': add(CLOSE_BRACKET); break;
  205. case '[': add(OPEN_SQUARE_BRACKET); break;
  206. case ']': add(CLOSE_SQUARE_BRACKET); break;
  207. case '{': add(OPEN_CURVED_BRACKET); break;
  208. case '}': add(CLOSE_CURVED_BRACKET); break;
  209. case '$': handleLiteral(c, LITERAL); break;
  210. case '@': handleLiteral(c, LABEL); break;
  211. case ';': add(SEMICOLON); break;
  212. case ',': add(COMMA); break;
  213. case '~': add(BIT_INVERT); break;
  214. case '+': add(next('=') ? ADD_SET : (next('+') ? INC : ADD)); break;
  215. case '-': add(next('=') ? SUB_SET : (next('-') ? DEC : SUB)); break;
  216. case '!': add(next('=') ? NOT_EQUAL : INVERT); break;
  217. case '=': add(next('=') ? EQUAL : SET); break;
  218. case '*': add(next('=') ? MUL_SET : MUL); break;
  219. case '/': handleSlash(); break;
  220. case '%': add(next('=') ? MOD_SET : MOD); break;
  221. case '&': add(next('=') ? BIT_AND_SET : (next('&') ? AND : BIT_AND)); break;
  222. case '|': add(next('=') ? BIT_OR_SET : (next('|') ? OR : BIT_OR)); break;
  223. case '^': add(next('=') ? BIT_XOR_SET : BIT_XOR); break;
  224. case '<': add('<', LEFT_SHIFT_SET, LEFT_SHIFT, LESS_EQUAL, LESS); break;
  225. case '>': add('>', RIGHT_SHIFT_SET, RIGHT_SHIFT, GREATER_EQUAL, GREATER); break;
  226. default: throw PreScriptException("unknown token " + c, line);
  227. }
  228. }
  229. void Tokenizer::handleString()
  230. {
  231. stringstream ss;
  232. while(true)
  233. {
  234. int data = next();
  235. if(data == '"')
  236. {
  237. add(STRING, ss.str());
  238. break;
  239. }
  240. if(data == '\n')
  241. {
  242. line++;
  243. }
  244. if(data > 0xFFFF)
  245. {
  246. ss << (char) ((data & 0xFF0000) >> 16);
  247. ss << (char) ((data & 0xFF00) >> 8);
  248. ss << (char) (data & 0xFF);
  249. }
  250. else if(data > 0xFF)
  251. {
  252. ss << (char) ((data & 0xFF00) >> 8);
  253. ss << (char) (data & 0xFF);
  254. }
  255. else
  256. {
  257. ss << (char) data;
  258. }
  259. }
  260. }
  261. void Tokenizer::handleSlash()
  262. {
  263. switch(peek())
  264. {
  265. case '/':
  266. next();
  267. handleOneLineComment();
  268. break;
  269. case '*':
  270. next();
  271. handleMultiLineComment();
  272. break;
  273. case '=':
  274. next();
  275. add(DIV_SET);
  276. break;
  277. default:
  278. add(DIV);
  279. }
  280. }
  281. void Tokenizer::handleOneLineComment()
  282. {
  283. while(true)
  284. {
  285. int data = next();
  286. if(data == -1 || data == '\n')
  287. {
  288. line++;
  289. break;
  290. }
  291. }
  292. }
  293. void Tokenizer::handleMultiLineComment()
  294. {
  295. int first;
  296. int sec = -1;
  297. while(true)
  298. {
  299. first = sec;
  300. sec = next();
  301. if(sec == -1 || (first == '*' && sec == '/'))
  302. {
  303. break;
  304. }
  305. if(sec == '\n')
  306. {
  307. line++;
  308. }
  309. }
  310. }
  311. bool Tokenizer::isLetter(int c)
  312. {
  313. return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z');
  314. }
  315. bool Tokenizer::isDigit(int c)
  316. {
  317. return c >= '0' && c <= '9';
  318. }
  319. bool Tokenizer::isValidNamePart(int c)
  320. {
  321. return isLetter(c) || isDigit(c) || c == '.' || c == '_';
  322. }