Tokenizer.cpp

#include <stdio.h>
#include <uchar.h>
#include <string.h>
#include "tokenizer/Tokenizer.h"
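// Tokenizer state: the input file, the token stream that receives the output,
// the current line number, and a one-character pushback buffer used by tokenizer_peek().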
static FILE* input = NULL;
static TokenStream* tokens = NULL;
static unsigned int line = 1;
static char32_t buffer = 0;
static void tokenizer_onError(const char* message, unsigned int line) {
    printf("%s Line: %u\n", message, line);
}
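// Writes the raw UTF-8 bytes packed into c back into buffer (1 to 4 bytes)
// and returns the number of bytes written.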
static size_t tokenizer_printChar(char32_t c, char* buffer) {
    if(c <= 0x7F) {
        buffer[0] = (char) c;
        return 1;
    } else if(c < 0xE00000) {
        buffer[0] = (char) ((c >> 8) & 0xFF);
        buffer[1] = (char) ((c >> 0) & 0xFF);
        return 2;
    } else if(c <= 0xF0000000) {
        buffer[0] = (char) ((c >> 16) & 0xFF);
        buffer[1] = (char) ((c >> 8) & 0xFF);
        buffer[2] = (char) ((c >> 0) & 0xFF);
        return 3;
    }
    buffer[0] = (char) ((c >> 24) & 0xFF);
    buffer[1] = (char) ((c >> 16) & 0xFF);
    buffer[2] = (char) ((c >> 8) & 0xFF);
    buffer[3] = (char) ((c >> 0) & 0xFF);
    return 4;
}
static bool tokenizer_isLetter(char32_t c) {
    return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z');
}
static bool tokenizer_isDigit(char32_t c) {
    return c >= '0' && c <= '9';
}
static bool tokenizer_isValidNameStart(char32_t c) {
    return tokenizer_isLetter(c) || c == '.' || c == '_';
}
static bool tokenizer_isValidNamePart(char32_t c) {
    return tokenizer_isDigit(c) || tokenizer_isValidNameStart(c);
}
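// Reads the next UTF-8 sequence from the input and packs its raw bytes into a
// single char32_t (not a decoded code point); honors the pushback buffer.
// Returns false at end of input.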
static bool tokenizer_next(char32_t* c) {
    if(buffer != 0) {
        *c = buffer;
        buffer = 0;
        return true;
    }
    int in = fgetc(input);
    if(in == EOF) {
        return false;
    }
    if((in & 0x80) == 0) {
        *c = in;
        return true;
    }
    if((in >> 5) == 0x6) {
        *c = (in << 8) | fgetc(input);
        return true;
    }
    if((in >> 4) == 0xE) {
        *c = (in << 16) | (fgetc(input) << 8) | fgetc(input);
        return true;
    }
    if((in >> 3) == 0x1E) {
        *c = ((char32_t) in << 24) | (fgetc(input) << 16) | (fgetc(input) << 8) | fgetc(input);
        return true;
    }
    *c = in; // invalid UTF-8 lead byte: pass it through so it is reported as an unknown token
    return true;
}
static bool tokenizer_peek(char32_t* c) {
    if(buffer != 0 || tokenizer_next(&buffer)) {
        *c = buffer;
        return true;
    }
    return false;
}
static bool tokenizer_nextIf(char32_t c) {
    char32_t nextChar;
    if(tokenizer_peek(&nextChar) && c == nextChar) {
        tokenizer_next(&nextChar);
        return true;
    }
    return false;
}
static void tokenizer_addToken(Token token) {
    tokens->add(token, line);
}
static void tokenizer_addStringToken(Token token, const char* text) {
    tokens->add(token, line, text);
}
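// Resolves compound operators that share a prefix: after the leading character
// has been consumed, picks between "<leading>c=", "<leading>c", "<leading>=" and
// the plain leading character, e.g. "<<=", "<<", "<=" and "<".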
static Token tokenizer_chooseToken(char c, Token aCharEqual, Token aChar, Token aEqual, Token other) {
    if(tokenizer_nextIf(c)) {
        if(tokenizer_nextIf('=')) {
            return aCharEqual;
        }
        return aChar;
    } else if(tokenizer_nextIf('=')) {
        return aEqual;
    }
    return other;
}
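// Collects a name starting with c into a fixed-size buffer, emits a dedicated
// token for keywords and otherwise the given token with the text attached.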
static bool tokenizer_handleLiteral(char32_t c, Token token) {
    const size_t bufferSize = 1024;
    char buffer[bufferSize];
    size_t index = 1;
    buffer[0] = (char) c;
    while(index < bufferSize - 1) {
        char32_t data;
        if(!tokenizer_peek(&data) || !tokenizer_isValidNamePart(data)) {
            break;
        }
        buffer[index++] = (char) data;
        tokenizer_next(&data);
    }
    buffer[index] = '\0';
    if(strcmp(buffer, "if") == 0) {
        tokenizer_addToken(Token::IF);
    } else if(strcmp(buffer, "else") == 0) {
        tokenizer_addToken(Token::ELSE);
    } else if(strcmp(buffer, "elseif") == 0) {
        tokenizer_addToken(Token::ELSEIF);
    } else if(strcmp(buffer, "while") == 0) {
        tokenizer_addToken(Token::WHILE);
    } else if(strcmp(buffer, "try") == 0) {
        tokenizer_addToken(Token::TRY);
    } else if(strcmp(buffer, "catch") == 0) {
        tokenizer_addToken(Token::CATCH);
    } else if(strcmp(buffer, "for") == 0) {
        tokenizer_addToken(Token::FOR);
    } else if(strcmp(buffer, "function") == 0) {
        tokenizer_addToken(Token::FUNCTION);
    } else if(strcmp(buffer, "break") == 0) {
        tokenizer_addToken(Token::BREAK);
    } else if(strcmp(buffer, "continue") == 0) {
        tokenizer_addToken(Token::CONTINUE);
    } else if(strcmp(buffer, "return") == 0) {
        tokenizer_addToken(Token::RETURN);
    } else if(strcmp(buffer, "true") == 0) {
        tokenizer_addToken(Token::TRUE);
    } else if(strcmp(buffer, "false") == 0) {
        tokenizer_addToken(Token::FALSE);
    } else if(strcmp(buffer, "null") == 0) {
        tokenizer_addToken(Token::NULL_TOKEN);
    } else {
        tokenizer_addStringToken(token, buffer);
    }
    return false;
}
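// Parses a number literal: an integer part followed by an optional fractional
// part after '.', accumulated into a double.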
static bool tokenizer_handleNumber(char32_t c) {
    double number = c - '0';
    char32_t data;
    while(tokenizer_peek(&data)) {
        if(!tokenizer_isDigit(data)) {
            if(data != '.') {
                break;
            }
            tokenizer_next(&data);
            double factor = 10;
            while(tokenizer_peek(&data) && tokenizer_isDigit(data)) {
                number += (data - '0') / factor;
                factor *= 10;
                tokenizer_next(&data);
            }
            break;
        }
        number = (number * 10) + (data - '0');
        tokenizer_next(&data);
    }
    tokens->add(Token::NUMBER, line, number);
    return false;
}
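// Reads a string literal up to the closing '"', translating the escape
// sequences \n, \\ and \" and keeping the line counter in sync with embedded
// newlines. Returns true on error.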
static bool tokenizer_handleString() {
    const size_t bufferSize = 1024;
    char buffer[bufferSize];
    size_t index = 0;
    unsigned int oldLine = line;
    while(index + 4 < bufferSize) {
        char32_t data;
        if(!tokenizer_next(&data)) {
            tokenizer_onError("unclosed string literal", oldLine);
            return true;
        }
        if(data == '"') {
            buffer[index] = '\0';
            tokenizer_addStringToken(Token::STRING, buffer);
            return false;
        }
        if(data == '\n') {
            line++;
        }
        if(data == '\\') {
            char32_t escape;
            if(!tokenizer_next(&escape)) {
                tokenizer_onError("missing escaped character", line);
                return true;
            }
            switch(escape) {
                case 'n': data = '\n';
                    break;
                case '\\': data = '\\';
                    break;
                case '"': data = '"';
                    break;
                default:
                    tokenizer_onError("invalid escaped character", line);
                    return true;
            }
        }
        index += tokenizer_printChar(data, buffer + index);
    }
    tokenizer_onError("string buffer too small", line);
    return true;
}
static bool tokenizer_handleOneLineComment() {
    char32_t data;
    while(tokenizer_next(&data) && data != '\n');
    line++;
    return false;
}
static bool tokenizer_handleMultiLineComment() {
    char32_t first;
    char32_t sec = 0;
    unsigned int oldLine = line;
    while(true) {
        first = sec;
        if(!tokenizer_next(&sec)) {
            tokenizer_onError("unclosed multiline comment", oldLine);
            return true;
        }
        if(first == '*' && sec == '/') {
            return false;
        }
        line += (sec == '\n');
    }
}
static bool tokenizer_handleSlash() {
    if(tokenizer_nextIf('/')) {
        return tokenizer_handleOneLineComment();
    } else if(tokenizer_nextIf('*')) {
        return tokenizer_handleMultiLineComment();
    } else if(tokenizer_nextIf('=')) {
        tokenizer_addToken(Token::DIV_SET);
        return false;
    }
    tokenizer_addToken(Token::DIV);
    return false;
}
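// Handles whitespace, brackets and operators, including the compound forms
// resolved via tokenizer_nextIf and tokenizer_chooseToken; any character that
// matches no case is reported as an unknown token.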
static bool tokenizer_handleSpecial(char32_t c) {
    switch(c) {
        case ' ':
        case '\t':
        case '\r':
            return false;
        case '\n': line++;
            return false;
        case '"':
            return tokenizer_handleString();
        case '(': tokenizer_addToken(Token::OPEN_BRACKET);
            return false;
        case ')': tokenizer_addToken(Token::CLOSE_BRACKET);
            return false;
        case '[': tokenizer_addToken(Token::OPEN_SQUARE_BRACKET);
            return false;
        case ']': tokenizer_addToken(Token::CLOSE_SQUARE_BRACKET);
            return false;
        case '{': tokenizer_addToken(Token::OPEN_CURVED_BRACKET);
            return false;
        case '}': tokenizer_addToken(Token::CLOSE_CURVED_BRACKET);
            return false;
        case '$':
            return tokenizer_handleLiteral(c, Token::LITERAL);
        case '@':
            return tokenizer_handleLiteral(c, Token::LABEL);
        case ';': tokenizer_addToken(Token::SEMICOLON);
            return false;
        case ',': tokenizer_addToken(Token::COMMA);
            return false;
        case '~': tokenizer_addToken(Token::BIT_INVERT);
            return false;
        case '+': tokenizer_addToken(tokenizer_nextIf('=') ? Token::ADD_SET : (tokenizer_nextIf('+') ? Token::INC : Token::ADD));
            return false;
        case '-': tokenizer_addToken(tokenizer_nextIf('=') ? Token::SUB_SET : (tokenizer_nextIf('-') ? Token::DEC : Token::SUB));
            return false;
        case '!': tokenizer_addToken(tokenizer_nextIf('=') ? Token::NOT_EQUAL : Token::INVERT);
            return false;
        case '=': tokenizer_addToken(tokenizer_nextIf('=') ? Token::EQUAL : Token::SET);
            return false;
        case '*': tokenizer_addToken(tokenizer_nextIf('=') ? Token::MUL_SET : Token::MUL);
            return false;
        case '/':
            return tokenizer_handleSlash();
        case '%': tokenizer_addToken(tokenizer_nextIf('=') ? Token::MOD_SET : Token::MOD);
            return false;
        case '&': tokenizer_addToken(tokenizer_nextIf('=') ? Token::BIT_AND_SET : (tokenizer_nextIf('&') ? Token::AND : Token::BIT_AND));
            return false;
        case '|': tokenizer_addToken(tokenizer_nextIf('=') ? Token::BIT_OR_SET : (tokenizer_nextIf('|') ? Token::OR : Token::BIT_OR));
            return false;
        case '^': tokenizer_addToken(tokenizer_nextIf('=') ? Token::BIT_XOR_SET : Token::BIT_XOR);
            return false;
        case '<': tokenizer_addToken(tokenizer_chooseToken('<', Token::LEFT_SHIFT_SET, Token::LEFT_SHIFT, Token::LESS_EQUAL, Token::LESS));
            return false;
        case '>': tokenizer_addToken(tokenizer_chooseToken('>', Token::RIGHT_SHIFT_SET, Token::RIGHT_SHIFT, Token::GREATER_EQUAL, Token::GREATER));
            return false;
    }
    char buffer[32];
    strncpy(buffer, "unknown token '", 32);
    size_t index = strlen(buffer);
    index += tokenizer_printChar(c, buffer + index);
    buffer[index] = '\'';
    buffer[index + 1] = '\0';
    tokenizer_onError(buffer, line);
    return true;
}
static bool tokenizer_handleChar(char32_t c) {
    if(tokenizer_isValidNameStart(c)) {
        return tokenizer_handleLiteral(c, Token::LITERAL);
    } else if(tokenizer_isDigit(c)) {
        return tokenizer_handleNumber(c);
    }
    return tokenizer_handleSpecial(c);
}
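// Entry point: opens the input file, feeds every character through the handlers
// above, appends an EOF token and closes the file. Returns true on error,
// including when the file cannot be opened.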
bool tokenize(TokenStream* tokenStream, const char* inputPath) {
    input = fopen(inputPath, "r");
    if(input == NULL) {
        return true;
    }
    tokens = tokenStream;
    line = 1;
    buffer = 0;
    char32_t c;
    while(tokenizer_next(&c)) {
        if(tokenizer_handleChar(c)) {
            fclose(input);
            input = NULL;
            return true;
        }
    }
    tokenizer_addToken(Token::EOF_TOKEN);
    fclose(input);
    input = NULL;
    return false;
}
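
A minimal usage sketch of the exported entry point. It assumes TokenStream is default-constructible and that the collected tokens are consumed elsewhere (e.g. by a parser); both points and the input path "script.txt" are assumptions for illustration, not confirmed by tokenizer/Tokenizer.h.

    TokenStream stream;
    if(tokenize(&stream, "script.txt")) {
        // an error message was already printed via tokenizer_onError
    }
    // on success the stream ends with Token::EOF_TOKEN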