Tokenizer.cpp 8.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389
  1. #include <sstream>
  2. #include "tokenizer/Tokenizer.h"
  3. #include "exceptions/PreScriptException.h"
  4. static unsigned int line = 1;
  5. static std::vector<Token>* tokens = nullptr;
  6. static std::istream* input = nullptr;
  7. static int buffer = -1;
  8. static bool isLetter(int c)
  9. {
  10. return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z');
  11. }
  12. static bool isDigit(int c)
  13. {
  14. return c >= '0' && c <= '9';
  15. }
  16. static bool isValidNamePart(int c)
  17. {
  18. return isLetter(c) || isDigit(c) || c == '.' || c == '_';
  19. }
  20. static int next()
  21. {
  22. if(buffer != -1)
  23. {
  24. int r = buffer;
  25. buffer = -1;
  26. return r;
  27. }
  28. int data = input->get();
  29. if(!input->good())
  30. {
  31. return -1;
  32. }
  33. if((data & 0x80) != 0 && data != -1) // special char
  34. {
  35. if((data & 0x40) != 0) // this should always be true
  36. {
  37. if((data & 0x20) != 0) // 3 byte unicode
  38. {
  39. int a = input->get();
  40. int b = input->get();
  41. data = ((data & 0xFF) << 16) | ((a & 0xFF) << 8) | (b & 0xFF);
  42. }
  43. else // 2 byte unicode
  44. {
  45. data = ((data & 0xFF) << 8) | (input->get() & 0xFF);
  46. }
  47. }
  48. else
  49. {
  50. // should not happen as unicode starts with 11
  51. }
  52. }
  53. return data;
  54. }
  55. static int peek()
  56. {
  57. if(buffer == -1)
  58. {
  59. buffer = next();
  60. return buffer;
  61. }
  62. return buffer;
  63. }
  64. static bool next(char c)
  65. {
  66. if(peek() == c)
  67. {
  68. next();
  69. return true;
  70. }
  71. return false;
  72. }
  73. static void add(TokenType type)
  74. {
  75. tokens->push_back(Token(type, line));
  76. }
  77. static void add(TokenType type, double number)
  78. {
  79. tokens->push_back(Token(type, line, number));
  80. }
  81. static void add(TokenType type, const std::string& text)
  82. {
  83. tokens->push_back(Token(type, line, text));
  84. }
  85. static void add(char c, TokenType t1, TokenType t2, TokenType t3, TokenType t4)
  86. {
  87. int p = peek();
  88. if(p == c)
  89. {
  90. next();
  91. if(peek() == '=')
  92. {
  93. next();
  94. add(t1);
  95. }
  96. else
  97. {
  98. add(t2);
  99. }
  100. }
  101. else if(p == '=')
  102. {
  103. next();
  104. add(t3);
  105. }
  106. else
  107. {
  108. add(t4);
  109. }
  110. }
  111. static void handleLiteral(int c, TokenType type)
  112. {
  113. std::stringstream sBuilder;
  114. sBuilder << (char) c;
  115. while(true)
  116. {
  117. int data = peek();
  118. if(!isValidNamePart(data))
  119. {
  120. break;
  121. }
  122. sBuilder << (char) data;
  123. next();
  124. }
  125. std::string s = sBuilder.str();
  126. if(s == "if") { add(TokenType::IF); }
  127. else if(s == "if") { add(TokenType::IF); }
  128. else if(s == "else") { add(TokenType::ELSE); }
  129. else if(s == "elseif") { add(TokenType::ELSEIF); }
  130. else if(s == "while") { add(TokenType::WHILE); }
  131. else if(s == "try") { add(TokenType::TRY); }
  132. else if(s == "catch") { add(TokenType::CATCH); }
  133. else if(s == "for") { add(TokenType::FOR); }
  134. else if(s == "function") { add(TokenType::FUNCTION); }
  135. else if(s == "break") { add(TokenType::BREAK); }
  136. else if(s == "continue") { add(TokenType::CONTINUE); }
  137. else if(s == "return") { add(TokenType::RETURN); }
  138. else if(s == "true") { add(TokenType::TRUE); }
  139. else if(s == "false") { add(TokenType::FALSE); }
  140. else if(s == "null") { add(TokenType::NULL_TOKEN); }
  141. else { add(type, s); };
  142. }
  143. static void handleNumber(int c)
  144. {
  145. double d = c - '0';
  146. while(true)
  147. {
  148. int data = peek();
  149. if(!isDigit(data))
  150. {
  151. if(data == '.')
  152. {
  153. next();
  154. double factor = 10;
  155. while(true)
  156. {
  157. int data = peek();
  158. if(!isDigit(data))
  159. {
  160. break;
  161. }
  162. d += (data - '0') / factor;
  163. factor *= 10;
  164. next();
  165. }
  166. }
  167. break;
  168. }
  169. d = (d * 10) + (data - '0');
  170. next();
  171. }
  172. add(NUMBER, d);
  173. }
  174. static void handleString()
  175. {
  176. std::stringstream ss;
  177. int oldLine = line;
  178. while(true)
  179. {
  180. int data = next();
  181. if(data == -1)
  182. {
  183. throw PreScriptException("non closed string literal", oldLine);
  184. }
  185. if(data == '"')
  186. {
  187. add(STRING, ss.str());
  188. break;
  189. }
  190. if(data == '\n')
  191. {
  192. line++;
  193. }
  194. if(data == '\\')
  195. {
  196. int escape = next();
  197. switch(escape)
  198. {
  199. case 'n': data = '\n';
  200. break;
  201. case '\\': data = '\\';
  202. break;
  203. case '"': data = '"';
  204. break;
  205. default:
  206. throw PreScriptException("invalid escaped character", line);
  207. }
  208. }
  209. if(data > 0xFFFF)
  210. {
  211. ss << (char) ((data & 0xFF0000) >> 16);
  212. ss << (char) ((data & 0xFF00) >> 8);
  213. ss << (char) (data & 0xFF);
  214. }
  215. else if(data > 0xFF)
  216. {
  217. ss << (char) ((data & 0xFF00) >> 8);
  218. ss << (char) (data & 0xFF);
  219. }
  220. else
  221. {
  222. ss << (char) data;
  223. }
  224. }
  225. }
  226. static void handleOneLineComment()
  227. {
  228. while(true)
  229. {
  230. int data = next();
  231. if(data == -1 || data == '\n')
  232. {
  233. line++;
  234. break;
  235. }
  236. }
  237. }
  238. static void handleMultiLineComment()
  239. {
  240. int first;
  241. int sec = -1;
  242. while(true)
  243. {
  244. first = sec;
  245. sec = next();
  246. if(sec == -1 || (first == '*' && sec == '/'))
  247. {
  248. break;
  249. }
  250. if(sec == '\n')
  251. {
  252. line++;
  253. }
  254. }
  255. }
  256. static void handleSlash()
  257. {
  258. switch(peek())
  259. {
  260. case '/':
  261. next();
  262. handleOneLineComment();
  263. break;
  264. case '*':
  265. next();
  266. handleMultiLineComment();
  267. break;
  268. case '=':
  269. next();
  270. add(DIV_SET);
  271. break;
  272. default:
  273. add(DIV);
  274. }
  275. }
  276. static void handleSpecial(int c)
  277. {
  278. switch(c)
  279. {
  280. case ' ':
  281. case '\t':
  282. case '\r': break;
  283. case '\n': line++;
  284. break;
  285. case '"': handleString();
  286. break;
  287. case '(': add(OPEN_BRACKET);
  288. break;
  289. case ')': add(CLOSE_BRACKET);
  290. break;
  291. case '[': add(OPEN_SQUARE_BRACKET);
  292. break;
  293. case ']': add(CLOSE_SQUARE_BRACKET);
  294. break;
  295. case '{': add(OPEN_CURVED_BRACKET);
  296. break;
  297. case '}': add(CLOSE_CURVED_BRACKET);
  298. break;
  299. case '$': handleLiteral(c, LITERAL);
  300. break;
  301. case '@': handleLiteral(c, LABEL);
  302. break;
  303. case ';': add(SEMICOLON);
  304. break;
  305. case ',': add(COMMA);
  306. break;
  307. case '~': add(BIT_INVERT);
  308. break;
  309. case '+': add(next('=') ? ADD_SET: (next('+') ? INC: ADD));
  310. break;
  311. case '-': add(next('=') ? SUB_SET: (next('-') ? DEC: SUB));
  312. break;
  313. case '!': add(next('=') ? NOT_EQUAL: INVERT);
  314. break;
  315. case '=': add(next('=') ? EQUAL: SET);
  316. break;
  317. case '*': add(next('=') ? MUL_SET: MUL);
  318. break;
  319. case '/': handleSlash();
  320. break;
  321. case '%': add(next('=') ? MOD_SET: MOD);
  322. break;
  323. case '&': add(next('=') ? BIT_AND_SET: (next('&') ? AND: BIT_AND));
  324. break;
  325. case '|': add(next('=') ? BIT_OR_SET: (next('|') ? OR: BIT_OR));
  326. break;
  327. case '^': add(next('=') ? BIT_XOR_SET: BIT_XOR);
  328. break;
  329. case '<': add('<', LEFT_SHIFT_SET, LEFT_SHIFT, LESS_EQUAL, LESS);
  330. break;
  331. case '>': add('>', RIGHT_SHIFT_SET, RIGHT_SHIFT, GREATER_EQUAL, GREATER);
  332. break;
  333. default: throw PreScriptException("unknown token " + c, line);
  334. }
  335. }
  336. static void handleChar(int c)
  337. {
  338. if(isLetter(c) || c == '_' || c == '.')
  339. {
  340. handleLiteral(c, TokenType::LITERAL);
  341. }
  342. else if(isDigit(c))
  343. {
  344. handleNumber(c);
  345. }
  346. else
  347. {
  348. handleSpecial(c);
  349. }
  350. }
  351. void Tokenizer::tokenize(std::vector<Token>& inTokens, std::istream& inInput)
  352. {
  353. tokens = &inTokens;
  354. input = &inInput;
  355. line = 1;
  356. buffer = -1;
  357. int c;
  358. while((c = next()) != -1)
  359. {
  360. handleChar(c);
  361. }
  362. add(EOF_TOKEN);
  363. }