// Tokenizer.cpp
  1. #include <sstream>
  2. #include "tokenizer/Tokenizer.h"
  3. static unsigned int line = 1;
  4. static TokenStream* tokens = nullptr;
  5. static Tokenizer::i32stream* input = nullptr;
  6. static char32_t buffer = 0;
  // Reports a tokenizer error on standard output in the form
  // "<message> Line: <line>", terminated by a newline, and flushes the stream.
  static void onError(const std::string& message, unsigned int line) {
      std::cout << message;
      std::cout << " Line: ";
      std::cout << line;
      std::cout << '\n' << std::flush;
  }
  // Encodes the code point `c` as a NUL-terminated UTF-8 sequence.
  //
  // c      - code point to encode
  // buffer - output array with room for at least 5 chars
  //          (up to 4 encoded bytes plus the terminating '\0')
  //
  // Surrogates (U+D800..U+DFFF) and values above U+10FFFF have no well-formed
  // UTF-8 encoding; they are written as U+FFFD (REPLACEMENT CHARACTER) so the
  // output is always valid UTF-8.
  static void convertChar(char32_t c, char* buffer) {
      const bool isSurrogate = c >= 0xD800 && c <= 0xDFFF;
      if(isSurrogate || c > 0x10FFFF) {
          c = 0xFFFD;
      }
      if(c <= 0x7F) {
          // 1 byte: 0xxxxxxx
          buffer[0] = static_cast<char>(c);
          buffer[1] = '\0';
      } else if(c <= 0x7FF) {
          // 2 bytes: 110xxxxx 10xxxxxx
          buffer[0] = static_cast<char>(0xC0 | ((c >> 6) & 0x1F));
          buffer[1] = static_cast<char>(0x80 | (c & 0x3F));
          buffer[2] = '\0';
      } else if(c <= 0xFFFF) {
          // 3 bytes: 1110xxxx 10xxxxxx 10xxxxxx
          buffer[0] = static_cast<char>(0xE0 | ((c >> 12) & 0x0F));
          buffer[1] = static_cast<char>(0x80 | ((c >> 6) & 0x3F));
          buffer[2] = static_cast<char>(0x80 | (c & 0x3F));
          buffer[3] = '\0';
      } else {
          // 4 bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
          buffer[0] = static_cast<char>(0xF0 | ((c >> 18) & 0x07));
          buffer[1] = static_cast<char>(0x80 | ((c >> 12) & 0x3F));
          buffer[2] = static_cast<char>(0x80 | ((c >> 6) & 0x3F));
          buffer[3] = static_cast<char>(0x80 | (c & 0x3F));
          buffer[4] = '\0';
      }
  }
  // True for the ASCII letters a-z and A-Z, false for everything else.
  static bool isLetter(char32_t c) {
      const bool lowerCase = 'a' <= c && c <= 'z';
      const bool upperCase = 'A' <= c && c <= 'Z';
      return lowerCase || upperCase;
  }
  // True for the ASCII decimal digits 0-9, false for everything else.
  static bool isDigit(char32_t c) {
      if(c < '0') {
          return false;
      }
      return c <= '9';
  }
  37. static bool isValidNameStart(char32_t c) {
  38. return isLetter(c) || c == '.' || c == '_';
  39. }
  40. static bool isValidNamePart(char32_t c) {
  41. return isDigit(c) || isValidNameStart(c);
  42. }
  43. static bool next(char32_t& c) {
  44. if(buffer != 0) {
  45. c = buffer;
  46. buffer = 0;
  47. return true;
  48. }
  49. c = input->get();
  50. return input->good();
  51. }
  52. static bool peek(char32_t& c) {
  53. if(buffer != 0 || next(buffer)) {
  54. c = buffer;
  55. return true;
  56. }
  57. return false;
  58. }
  59. static bool nextIf(char32_t c) {
  60. char32_t nextChar;
  61. if(peek(nextChar) && c == nextChar) {
  62. next(nextChar);
  63. return true;
  64. }
  65. return false;
  66. }
  67. static void add(TokenType type) {
  68. tokens->add(type, line);
  69. }
  70. static void add(TokenType type, const std::string& text) {
  71. tokens->add(type, line, text);
  72. }
  73. static TokenType chooseTokenType(char c, TokenType aCharEqual, TokenType aChar, TokenType aEqual, TokenType other) {
  74. if(nextIf(c)) {
  75. if(nextIf('=')) {
  76. return aCharEqual;
  77. }
  78. return aChar;
  79. } else if(nextIf('=')) {
  80. return aEqual;
  81. }
  82. return other;
  83. }
  84. static bool handleLiteral(char32_t c, TokenType type) {
  85. std::stringstream sBuilder;
  86. sBuilder << (char) c;
  87. while(true) {
  88. char32_t data;
  89. if(!peek(data) || !isValidNamePart(data)) {
  90. break;
  91. }
  92. sBuilder << (char) data;
  93. next(data);
  94. }
  95. std::string s = sBuilder.str();
  96. if(s == "if") {
  97. add(TokenType::IF);
  98. } else if(s == "if") {
  99. add(TokenType::IF);
  100. } else if(s == "else") {
  101. add(TokenType::ELSE);
  102. } else if(s == "elseif") {
  103. add(TokenType::ELSEIF);
  104. } else if(s == "while") {
  105. add(TokenType::WHILE);
  106. } else if(s == "try") {
  107. add(TokenType::TRY);
  108. } else if(s == "catch") {
  109. add(TokenType::CATCH);
  110. } else if(s == "for") {
  111. add(TokenType::FOR);
  112. } else if(s == "function") {
  113. add(TokenType::FUNCTION);
  114. } else if(s == "break") {
  115. add(TokenType::BREAK);
  116. } else if(s == "continue") {
  117. add(TokenType::CONTINUE);
  118. } else if(s == "return") {
  119. add(TokenType::RETURN);
  120. } else if(s == "true") {
  121. add(TokenType::TRUE);
  122. } else if(s == "false") {
  123. add(TokenType::FALSE);
  124. } else if(s == "null") {
  125. add(TokenType::NULL_TOKEN);
  126. } else {
  127. add(type, s);
  128. }
  129. return false;
  130. }
  131. static bool handleNumber(char32_t c) {
  132. double number = c - '0';
  133. char32_t data;
  134. while(peek(data)) {
  135. if(!isDigit(data)) {
  136. if(data != '.') {
  137. break;
  138. }
  139. next(data);
  140. double factor = 10;
  141. while(peek(data) && isDigit(data)) {
  142. number += (data - '0') / factor;
  143. factor *= 10;
  144. next(data);
  145. }
  146. break;
  147. }
  148. number = (number * 10) + (data - '0');
  149. next(data);
  150. }
  151. tokens->add(NUMBER, line, number);
  152. return false;
  153. }
  154. static bool handleString() {
  155. std::stringstream ss;
  156. unsigned int oldLine = line;
  157. while(true) {
  158. char32_t data;
  159. if(!next(data)) {
  160. onError("non closed string literal", oldLine);
  161. return true;
  162. }
  163. if(data == '"') {
  164. add(STRING, ss.str());
  165. return false;
  166. }
  167. if(data == '\n') {
  168. line++;
  169. }
  170. if(data == '\\') {
  171. char32_t escape;
  172. if(!next(escape)) {
  173. onError("missing escaped character", line);
  174. return true;
  175. }
  176. switch(escape) {
  177. case 'n': data = '\n';
  178. break;
  179. case '\\': data = '\\';
  180. break;
  181. case '"': data = '"';
  182. break;
  183. default:
  184. onError("invalid escaped character", line);
  185. return true;
  186. }
  187. }
  188. char buffer[5];
  189. convertChar(data, buffer);
  190. ss << buffer;
  191. }
  192. }
  193. static bool handleOneLineComment() {
  194. char32_t data;
  195. while(next(data) && data != '\n');
  196. line++;
  197. return false;
  198. }
  199. static bool handleMultiLineComment() {
  200. char32_t first;
  201. char32_t sec = 0;
  202. unsigned int oldLine = line;
  203. while(true) {
  204. first = sec;
  205. if(!next(sec)) {
  206. onError("unclosed multiline comment", oldLine);
  207. return true;
  208. }
  209. if(first == '*' && sec == '/') {
  210. return false;
  211. }
  212. line += (sec == '\n');
  213. }
  214. }
  215. static bool handleSlash() {
  216. if(nextIf('/')) {
  217. return handleOneLineComment();
  218. } else if(nextIf('*')) {
  219. return handleMultiLineComment();
  220. } else if(nextIf('=')) {
  221. add(DIV_SET);
  222. return false;
  223. }
  224. add(DIV);
  225. return false;
  226. }
  227. static bool handleSpecial(char32_t c) {
  228. switch(c) {
  229. case ' ':
  230. case '\t':
  231. case '\r':
  232. return false;
  233. case '\n': line++;
  234. return false;
  235. case '"':
  236. return handleString();
  237. case '(': add(OPEN_BRACKET);
  238. return false;
  239. case ')': add(CLOSE_BRACKET);
  240. return false;
  241. case '[': add(OPEN_SQUARE_BRACKET);
  242. return false;
  243. case ']': add(CLOSE_SQUARE_BRACKET);
  244. return false;
  245. case '{': add(OPEN_CURVED_BRACKET);
  246. return false;
  247. case '}': add(CLOSE_CURVED_BRACKET);
  248. return false;
  249. case '$':
  250. return handleLiteral(c, LITERAL);
  251. case '@':
  252. return handleLiteral(c, LABEL);
  253. case ';': add(SEMICOLON);
  254. return false;
  255. case ',': add(COMMA);
  256. return false;
  257. case '~': add(BIT_INVERT);
  258. return false;
  259. case '+': add(nextIf('=') ? ADD_SET: (nextIf('+') ? INC: ADD));
  260. return false;
  261. case '-': add(nextIf('=') ? SUB_SET: (nextIf('-') ? DEC: SUB));
  262. return false;
  263. case '!': add(nextIf('=') ? NOT_EQUAL: INVERT);
  264. break;
  265. case '=': add(nextIf('=') ? EQUAL: SET);
  266. return false;
  267. case '*': add(nextIf('=') ? MUL_SET: MUL);
  268. return false;
  269. case '/':
  270. return handleSlash();
  271. case '%': add(nextIf('=') ? MOD_SET: MOD);
  272. return false;
  273. case '&': add(nextIf('=') ? BIT_AND_SET: (nextIf('&') ? AND: BIT_AND));
  274. return false;
  275. case '|': add(nextIf('=') ? BIT_OR_SET: (nextIf('|') ? OR: BIT_OR));
  276. return false;
  277. case '^': add(nextIf('=') ? BIT_XOR_SET: BIT_XOR);
  278. return false;
  279. case '<': add(chooseTokenType('<', LEFT_SHIFT_SET, LEFT_SHIFT, LESS_EQUAL, LESS));
  280. return false;
  281. case '>': add(chooseTokenType('>', RIGHT_SHIFT_SET, RIGHT_SHIFT, GREATER_EQUAL, GREATER));
  282. return false;
  283. }
  284. char buffer[5];
  285. convertChar(c, buffer);
  286. onError(std::string("unknown token '") + buffer + "'", line);
  287. return true;
  288. }
  289. static bool handleChar(char32_t c) {
  290. if(isValidNameStart(c)) {
  291. return handleLiteral(c, TokenType::LITERAL);
  292. } else if(isDigit(c)) {
  293. return handleNumber(c);
  294. }
  295. return handleSpecial(c);
  296. }
  297. bool Tokenizer::tokenize(TokenStream& inTokens, i32stream& inInput) {
  298. tokens = &inTokens;
  299. input = &inInput;
  300. line = 1;
  301. buffer = 0;
  302. char32_t c;
  303. while(next(c)) {
  304. if(handleChar(c)) {
  305. return true;
  306. }
  307. }
  308. add(EOF_TOKEN);
  309. return false;
  310. }