/* Tokenizer.c — converts a UTF-8 source file into a TokenStream. */
  1. #include <stdio.h>
  2. #include <uchar.h>
  3. #include <string.h>
  4. #include "tokenizer/Tokenizer.h"
// Tokenizer state shared by the helpers below; reset at the top of tokenize().
static FILE* input = NULL;          // source file currently being tokenized
static TokenStream* tokens = NULL;  // output stream tokens are appended to
static unsigned int line = 1;       // 1-based line number used for diagnostics
static char32_t buffer = 0;         // one-character lookahead (0 means "empty")
  9. static void tokenizer_onError(const char* message, unsigned int line) {
  10. printf("%s Line: %u\n", message, line);
  11. }
  12. static size_t tokenizer_printChar(char32_t c, char* buffer) {
  13. if(c <= 0x7F) {
  14. buffer[0] = (char) c;
  15. return 1;
  16. } else if(c < 0xE00000) {
  17. buffer[0] = (char) ((c >> 8) & 0xFF);
  18. buffer[1] = (char) ((c >> 0) & 0xFF);
  19. return 2;
  20. } else if(c <= 0xF0000000) {
  21. buffer[0] = (char) ((c >> 16) & 0xFF);
  22. buffer[1] = (char) ((c >> 8) & 0xFF);
  23. buffer[2] = (char) ((c >> 0) & 0xFF);
  24. return 3;
  25. }
  26. buffer[0] = (char) ((c >> 24) & 0xFF);
  27. buffer[1] = (char) ((c >> 16) & 0xFF);
  28. buffer[2] = (char) ((c >> 8) & 0xFF);
  29. buffer[3] = (char) ((c >> 0) & 0xFF);
  30. return 4;
  31. }
  32. static bool tokenizer_isLetter(char32_t c) {
  33. return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z');
  34. }
  35. static bool tokenizer_isDigit(char32_t c) {
  36. return c >= '0' && c <= '9';
  37. }
  38. static bool tokenizer_isValidNameStart(char32_t c) {
  39. return tokenizer_isLetter(c) || c == '.' || c == '_';
  40. }
  41. static bool tokenizer_isValidNamePart(char32_t c) {
  42. return tokenizer_isDigit(c) || tokenizer_isValidNameStart(c);
  43. }
  44. static bool tokenizer_next(char32_t* c) {
  45. if(buffer != 0) {
  46. *c = buffer;
  47. buffer = 0;
  48. return true;
  49. }
  50. int in = fgetc(input);
  51. if(in == EOF) {
  52. return false;
  53. }
  54. if((in & 0x80) == 0) {
  55. *c = in;
  56. return true;
  57. }
  58. if((in >> 5) == 0x6) {
  59. *c = (in << 8) | fgetc(input);
  60. return true;
  61. }
  62. if((in >> 4) == 0xE) {
  63. *c = (in << 16) | (fgetc(input) << 8) | fgetc(input);
  64. return true;
  65. }
  66. if((in >> 3) == 0x1E) {
  67. *c = (in << 24) | (fgetc(input) << 16) | (fgetc(input) << 8) | fgetc(input);
  68. return true;
  69. }
  70. return true;
  71. }
  72. static bool tokenizer_peek(char32_t* c) {
  73. if(buffer != 0 || tokenizer_next(&buffer)) {
  74. *c = buffer;
  75. return true;
  76. }
  77. return false;
  78. }
  79. static bool tokenizer_nextIf(char32_t c) {
  80. char32_t nextChar;
  81. if(tokenizer_peek(&nextChar) && c == nextChar) {
  82. tokenizer_next(&nextChar);
  83. return true;
  84. }
  85. return false;
  86. }
// Appends a plain token to the output stream, tagged with the current line.
static void tokenizer_addToken(Token token) {
    addToken(tokens, token, line);
}
// Appends a token carrying string payload (names, labels, string literals)
// to the output stream, tagged with the current line.
static void tokenizer_addStringToken(Token token, const char* text) {
    addStringToken(tokens, token, line, text);
}
  93. static Token tokenizer_chooseToken(char c, Token aCharEqual, Token aChar, Token aEqual, Token other) {
  94. if(tokenizer_nextIf(c)) {
  95. if(tokenizer_nextIf('=')) {
  96. return aCharEqual;
  97. }
  98. return aChar;
  99. } else if(tokenizer_nextIf('=')) {
  100. return aEqual;
  101. }
  102. return other;
  103. }
  104. static bool tokenizer_handleLiteral(char32_t c, Token token) {
  105. const size_t bufferSize = 1024;
  106. char buffer[bufferSize];
  107. size_t index = 1;
  108. buffer[0] = c;
  109. while(index < bufferSize - 1) {
  110. char32_t data;
  111. if(!tokenizer_peek(&data) || !tokenizer_isValidNamePart(data)) {
  112. break;
  113. }
  114. buffer[index++] = data;
  115. tokenizer_next(&data);
  116. }
  117. buffer[index] = '\0';
  118. if(strcmp(buffer, "if") == 0) {
  119. tokenizer_addToken(IF);
  120. } else if(strcmp(buffer, "if") == 0) {
  121. tokenizer_addToken(IF);
  122. } else if(strcmp(buffer, "else") == 0) {
  123. tokenizer_addToken(ELSE);
  124. } else if(strcmp(buffer, "elseif") == 0) {
  125. tokenizer_addToken(ELSEIF);
  126. } else if(strcmp(buffer, "while") == 0) {
  127. tokenizer_addToken(WHILE);
  128. } else if(strcmp(buffer, "try") == 0) {
  129. tokenizer_addToken(TRY);
  130. } else if(strcmp(buffer, "catch") == 0) {
  131. tokenizer_addToken(CATCH);
  132. } else if(strcmp(buffer, "for") == 0) {
  133. tokenizer_addToken(FOR);
  134. } else if(strcmp(buffer, "function") == 0) {
  135. tokenizer_addToken(FUNCTION);
  136. } else if(strcmp(buffer, "break") == 0) {
  137. tokenizer_addToken(BREAK);
  138. } else if(strcmp(buffer, "continue") == 0) {
  139. tokenizer_addToken(CONTINUE);
  140. } else if(strcmp(buffer, "return") == 0) {
  141. tokenizer_addToken(RETURN);
  142. } else if(strcmp(buffer, "true") == 0) {
  143. tokenizer_addToken(TRUE);
  144. } else if(strcmp(buffer, "false") == 0) {
  145. tokenizer_addToken(FALSE);
  146. } else if(strcmp(buffer, "null") == 0) {
  147. tokenizer_addToken(NULL_TOKEN);
  148. } else {
  149. tokenizer_addStringToken(token, buffer);
  150. }
  151. return false;
  152. }
  153. static bool tokenizer_handleNumber(char32_t c) {
  154. double number = c - '0';
  155. char32_t data;
  156. while(tokenizer_peek(&data)) {
  157. if(!tokenizer_isDigit(data)) {
  158. if(data != '.') {
  159. break;
  160. }
  161. tokenizer_next(&data);
  162. double factor = 10;
  163. while(tokenizer_peek(&data) && tokenizer_isDigit(data)) {
  164. number += (data - '0') / factor;
  165. factor *= 10;
  166. tokenizer_next(&data);
  167. }
  168. break;
  169. }
  170. number = (number * 10) + (data - '0');
  171. tokenizer_next(&data);
  172. }
  173. addDoubleToken(tokens, NUMBER, line, number);
  174. return false;
  175. }
  176. static bool tokenizer_handleString() {
  177. const size_t bufferSize = 1024;
  178. char buffer[bufferSize];
  179. size_t index = 0;
  180. unsigned int oldLine = line;
  181. while(index + 4 < bufferSize) {
  182. char32_t data;
  183. if(!tokenizer_next(&data)) {
  184. tokenizer_onError("non closed string literal", oldLine);
  185. return true;
  186. }
  187. if(data == '"') {
  188. buffer[index] = '\0';
  189. tokenizer_addStringToken(STRING, buffer);
  190. return false;
  191. }
  192. if(data == '\n') {
  193. line++;
  194. }
  195. if(data == '\\') {
  196. char32_t escape;
  197. if(!tokenizer_next(&escape)) {
  198. tokenizer_onError("missing escaped character", line);
  199. return true;
  200. }
  201. switch(escape) {
  202. case 'n': data = '\n';
  203. break;
  204. case '\\': data = '\\';
  205. break;
  206. case '"': data = '"';
  207. break;
  208. default:
  209. tokenizer_onError("invalid escaped character", line);
  210. return true;
  211. }
  212. }
  213. index += tokenizer_printChar(data, buffer + index);
  214. }
  215. tokenizer_onError("string buffer to small", line);
  216. return true;
  217. }
  218. static bool tokenizer_handleOneLineComment() {
  219. char32_t data;
  220. while(tokenizer_next(&data) && data != '\n');
  221. line++;
  222. return false;
  223. }
  224. static bool tokenizer_handleMultiLineComment() {
  225. char32_t first;
  226. char32_t sec = 0;
  227. unsigned int oldLine = line;
  228. while(true) {
  229. first = sec;
  230. if(!tokenizer_next(&sec)) {
  231. tokenizer_onError("unclosed multiline comment", oldLine);
  232. return true;
  233. }
  234. if(first == '*' && sec == '/') {
  235. return false;
  236. }
  237. line += (sec == '\n');
  238. }
  239. }
  240. static bool tokenizer_handleSlash() {
  241. if(tokenizer_nextIf('/')) {
  242. return tokenizer_handleOneLineComment();
  243. } else if(tokenizer_nextIf('*')) {
  244. return tokenizer_handleMultiLineComment();
  245. } else if(tokenizer_nextIf('=')) {
  246. tokenizer_addToken(DIV_SET);
  247. return false;
  248. }
  249. tokenizer_addToken(DIV);
  250. return false;
  251. }
  252. static bool tokenizer_handleSpecial(char32_t c) {
  253. switch(c) {
  254. case ' ':
  255. case '\t':
  256. case '\r':
  257. return false;
  258. case '\n': line++;
  259. return false;
  260. case '"':
  261. return tokenizer_handleString();
  262. case '(': tokenizer_addToken(OPEN_BRACKET);
  263. return false;
  264. case ')': tokenizer_addToken(CLOSE_BRACKET);
  265. return false;
  266. case '[': tokenizer_addToken(OPEN_SQUARE_BRACKET);
  267. return false;
  268. case ']': tokenizer_addToken(CLOSE_SQUARE_BRACKET);
  269. return false;
  270. case '{': tokenizer_addToken(OPEN_CURVED_BRACKET);
  271. return false;
  272. case '}': tokenizer_addToken(CLOSE_CURVED_BRACKET);
  273. return false;
  274. case '$':
  275. return tokenizer_handleLiteral(c, LITERAL);
  276. case '@':
  277. return tokenizer_handleLiteral(c, LABEL);
  278. case ';': tokenizer_addToken(SEMICOLON);
  279. return false;
  280. case ',': tokenizer_addToken(COMMA);
  281. return false;
  282. case '~': tokenizer_addToken(BIT_INVERT);
  283. return false;
  284. case '+': tokenizer_addToken(tokenizer_nextIf('=') ? ADD_SET: (tokenizer_nextIf('+') ? INC: ADD));
  285. return false;
  286. case '-': tokenizer_addToken(tokenizer_nextIf('=') ? SUB_SET: (tokenizer_nextIf('-') ? DEC: SUB));
  287. return false;
  288. case '!': tokenizer_addToken(tokenizer_nextIf('=') ? NOT_EQUAL: INVERT);
  289. break;
  290. case '=': tokenizer_addToken(tokenizer_nextIf('=') ? EQUAL: SET);
  291. return false;
  292. case '*': tokenizer_addToken(tokenizer_nextIf('=') ? MUL_SET: MUL);
  293. return false;
  294. case '/':
  295. return tokenizer_handleSlash();
  296. case '%': tokenizer_addToken(tokenizer_nextIf('=') ? MOD_SET: MOD);
  297. return false;
  298. case '&': tokenizer_addToken(tokenizer_nextIf('=') ? BIT_AND_SET: (tokenizer_nextIf('&') ? AND: BIT_AND));
  299. return false;
  300. case '|': tokenizer_addToken(tokenizer_nextIf('=') ? BIT_OR_SET: (tokenizer_nextIf('|') ? OR: BIT_OR));
  301. return false;
  302. case '^': tokenizer_addToken(tokenizer_nextIf('=') ? BIT_XOR_SET: BIT_XOR);
  303. return false;
  304. case '<': tokenizer_addToken(tokenizer_chooseToken('<', LEFT_SHIFT_SET, LEFT_SHIFT, LESS_EQUAL, LESS));
  305. return false;
  306. case '>': tokenizer_addToken(tokenizer_chooseToken('>', RIGHT_SHIFT_SET, RIGHT_SHIFT, GREATER_EQUAL, GREATER));
  307. return false;
  308. }
  309. char buffer[32];
  310. strncpy(buffer, "unknown token '", 32);
  311. size_t index = strlen(buffer);
  312. index += tokenizer_printChar(c, buffer + index);
  313. buffer[index] = '\'';
  314. buffer[index + 1] = '\0';
  315. tokenizer_onError(buffer, line);
  316. return true;
  317. }
  318. static bool tokenizer_handleChar(char32_t c) {
  319. if(tokenizer_isValidNameStart(c)) {
  320. return tokenizer_handleLiteral(c, LITERAL);
  321. } else if(tokenizer_isDigit(c)) {
  322. return tokenizer_handleNumber(c);
  323. }
  324. return tokenizer_handleSpecial(c);
  325. }
  326. bool tokenize(TokenStream* tokenStream, const char* inputPath) {
  327. input = fopen(inputPath, "r");
  328. if(input == NULL) {
  329. return true;
  330. }
  331. tokens = tokenStream;
  332. line = 1;
  333. buffer = 0;
  334. char32_t c;
  335. while(tokenizer_next(&c)) {
  336. if(tokenizer_handleChar(c)) {
  337. return true;
  338. }
  339. }
  340. tokenizer_addToken(EOF_TOKEN);
  341. fclose(input);
  342. input = NULL;
  343. return false;
  344. }