Tokenizer.cpp 9.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262
#include "Tokenizer.h"

#include <stdexcept>
#include <utility>

#include "../Exception.h"
#include "../Utils.h"
  4. Tokenizer::Tokenizer(string s)
  5. {
  6. data = s;
  7. line = 0;
  8. pos = 0;
  9. length = 0;
  10. }
  11. Tokenizer::Tokenizer(const Tokenizer& orig)
  12. {
  13. }
  14. Tokenizer::~Tokenizer()
  15. {
  16. }
  17. void Tokenizer::tokenize(TokenList& tokens, char c, TokenType type1, TokenType type2)
  18. {
  19. if(pos + 1 < length && data[pos + 1] == c)
  20. {
  21. tokens.add(new Token(type1, line));
  22. pos++;
  23. }
  24. else
  25. {
  26. tokens.add(new Token(type2, line));
  27. }
  28. }
  29. void Tokenizer::tokenize(TokenList& tokens, TokenType type1, char c2, TokenType type2, char c3, TokenType type3)
  30. {
  31. if(pos + 1 >= length)
  32. {
  33. tokens.add(new Token(type1, line));
  34. }
  35. else if(data[pos + 1] == c2)
  36. {
  37. tokens.add(new Token(type2, line));
  38. pos++;
  39. }
  40. else if(data[pos + 1] == c3)
  41. {
  42. tokens.add(new Token(type3, line));
  43. pos++;
  44. }
  45. else
  46. {
  47. tokens.add(new Token(type1, line));
  48. }
  49. }
  50. void Tokenizer::tokenize(TokenList& tokens, TokenType type1, char c2, char c3, TokenType type2, TokenType type3, char c4, TokenType type4)
  51. {
  52. if(pos + 1 >= length)
  53. {
  54. tokens.add(new Token(type1, line));
  55. }
  56. else if(data[pos + 1] == c2)
  57. {
  58. if(pos + 2 < length && data[pos + 2] == c3)
  59. {
  60. tokens.add(new Token(type2, line));
  61. pos += 2;
  62. }
  63. else
  64. {
  65. tokens.add(new Token(type3, line));
  66. pos++;
  67. }
  68. }
  69. else if(data[pos + 1] == c4)
  70. {
  71. tokens.add(new Token(type4, line));
  72. pos++;
  73. }
  74. else
  75. {
  76. tokens.add(new Token(type1, line));
  77. }
  78. }
  79. void Tokenizer::tokenize(TokenList& tokens)
  80. {
  81. line = 1;
  82. pos = 0;
  83. length = data.size();
  84. while(pos < length)
  85. {
  86. if(isLetter(data[pos]))
  87. {
  88. int old = pos;
  89. pos++;
  90. while(pos < length && isAllowedInName(data[pos]))
  91. {
  92. pos++;
  93. }
  94. string s = data.substr(old, pos - old);
  95. if(s == "if") {tokens.add(new Token(Tokens::IF, line));}
  96. else if(s == "elseif") {tokens.add(new Token(Tokens::ELSE_IF, line));}
  97. else if(s == "else") {tokens.add(new Token(Tokens::ELSE, line));}
  98. else if(s == "for") {tokens.add(new Token(Tokens::FOR, line));}
  99. else if(s == "while") {tokens.add(new Token(Tokens::WHILE, line));}
  100. else if(s == "function") {tokens.add(new Token(Tokens::FUNCTION, line));}
  101. else if(s == "break") {tokens.add(new Token(Tokens::BREAK, line));}
  102. else if(s == "continue") {tokens.add(new Token(Tokens::CONTINUE, line));}
  103. else if(s == "return") {tokens.add(new Token(Tokens::RETURN, line));}
  104. else if(s == "try") {tokens.add(new Token(Tokens::TRY, line));}
  105. else if(s == "catch") {tokens.add(new Token(Tokens::CATCH, line));}
  106. else if(s == "true") {tokens.add(new Token(Tokens::TRUE, line));}
  107. else if(s == "false") {tokens.add(new Token(Tokens::FALSE, line));}
  108. else if(s == "null") {tokens.add(new Token(Tokens::TNULL, line));}
  109. else
  110. {
  111. Token* t = new Token(Tokens::VAR, line);
  112. t->setString(s);
  113. tokens.add(t);
  114. }
  115. pos--;
  116. }
  117. else if(isDigit(data[pos]))
  118. {
  119. int old = pos;
  120. pos++;
  121. while(pos < length && isDigit(data[pos]))
  122. {
  123. pos++;
  124. }
  125. if(pos < length && data[pos] == '.')
  126. {
  127. pos++;
  128. while(pos < length && isDigit(data[pos]))
  129. {
  130. pos++;
  131. }
  132. }
  133. string s = data.substr(old, pos - old);
  134. try
  135. {
  136. float f = stof(s);
  137. Token* t = new Token(Tokens::FLOAT, line);
  138. t->setFloat(f);
  139. tokens.add(t);
  140. }
  141. catch(std::out_of_range ex)
  142. {
  143. throw Exception("invalid float", line);
  144. }
  145. catch(std::invalid_argument ex)
  146. {
  147. throw Exception("invalid float", line);
  148. }
  149. pos--;
  150. }
  151. else
  152. {
  153. switch(data[pos])
  154. {
  155. case '@':
  156. {
  157. int old = pos;
  158. pos++;
  159. while(pos < length && isAllowedInName(data[pos]))
  160. {
  161. pos++;
  162. }
  163. string s = data.substr(old, pos - old);
  164. Token* t = new Token(Tokens::LABEL, line);
  165. t->setString(s);
  166. tokens.add(t);
  167. pos--;
  168. break;
  169. }
  170. case '"':
  171. {
  172. pos++;
  173. int old = pos;
  174. while(pos < length && data[pos] != '"')
  175. {
  176. pos++;
  177. }
  178. string s = data.substr(old, pos - old);
  179. Token* t = new Token(Tokens::TEXT, line);
  180. t->setString(s);
  181. tokens.add(t);
  182. break;
  183. }
  184. case '/':
  185. {
  186. if(pos + 1 >= length)
  187. {
  188. tokens.add(new Token(Tokens::DIV, line));
  189. }
  190. else
  191. {
  192. switch(data[pos + 1])
  193. {
  194. case '/':
  195. pos += 2;
  196. while(pos < length && data[pos] != '\n')
  197. {
  198. pos++;
  199. }
  200. pos--;
  201. break;
  202. case '*':
  203. pos += 2;
  204. while(pos + 1 < length && (data[pos] != '*' || data[pos + 1] != '/'))
  205. {
  206. if(data[pos] == '\n')
  207. {
  208. line++;
  209. }
  210. pos++;
  211. }
  212. pos++;
  213. break;
  214. case '=':
  215. tokens.add(new Token(Tokens::DIV_SET, line));
  216. pos++;
  217. break;
  218. default:
  219. tokens.add(new Token(Tokens::DIV, line));
  220. }
  221. }
  222. break;
  223. }
  224. case '<': tokenize(tokens, Tokens::LESS, '<', '=', Tokens::LEFT_SHIFT_SET, Tokens::LEFT_SHIFT, '=', Tokens::LESS_EQUAL); break;
  225. case '>': tokenize(tokens, Tokens::GREATER, '>', '=', Tokens::RIGHT_SHIFT_SET, Tokens::RIGHT_SHIFT, '=', Tokens::GREATER_EQUAL); break;
  226. case '&': tokenize(tokens, Tokens::BIT_AND, '&', Tokens::AND, '=', Tokens::BIT_AND_SET); break;
  227. case '|': tokenize(tokens, Tokens::BIT_OR, '|', Tokens::OR, '=', Tokens::BIT_OR_SET); break;
  228. case '+': tokenize(tokens, Tokens::ADD, '+', Tokens::INC, '=', Tokens::ADD_SET); break;
  229. case '-': tokenize(tokens, Tokens::SUB, '-', Tokens::DEC, '=', Tokens::SUB_SET); break;
  230. case '*': tokenize(tokens, '=', Tokens::MUL_SET, Tokens::MUL); break;
  231. case '\n': line++; break;
  232. case '!': tokenize(tokens, '=', Tokens::NOT_EQUAL, Tokens::INVERT); break;
  233. case '%': tokenize(tokens, '=', Tokens::MOD_SET, Tokens::MOD); break;
  234. case '=': tokenize(tokens, '=', Tokens::EQUAL, Tokens::SET); break;
  235. case '^': tokenize(tokens, '=', Tokens::BIT_XOR_SET, Tokens::BIT_XOR); break;
  236. case '~': tokens.add(new Token(Tokens::BIT_INVERT, line)); break;
  237. case ',': tokens.add(new Token(Tokens::COMMA, line)); break;
  238. case '(': tokens.add(new Token(Tokens::OPEN_BRACKET, line)); break;
  239. case ')': tokens.add(new Token(Tokens::CLOSE_BRACKET, line)); break;
  240. case '[': tokens.add(new Token(Tokens::OPEN_SQUARE_BRACKET, line)); break;
  241. case ']': tokens.add(new Token(Tokens::CLOSE_SQUARE_BRACKET, line)); break;
  242. case '{': tokens.add(new Token(Tokens::OPEN_CURVED_BRACKET, line)); break;
  243. case '}': tokens.add(new Token(Tokens::CLOSE_CURVED_BRACKET, line)); break;
  244. case ';': tokens.add(new Token(Tokens::SEMICOLON, line)); break;
  245. case '$': tokens.add(new Token(Tokens::GLOBAL, line)); break;
  246. case ' ': break;
  247. default: throw Exception(string("invalid token ") + data[pos], line);
  248. }
  249. }
  250. pos++;
  251. }
  252. tokens.add(new Token(Tokens::END_OF_FILE, line));
  253. }