Tokenizer.c 5.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233
  1. #include <limits.h>
  2. #include <stdarg.h>
  3. #include <stdio.h>
  4. #include <stdlib.h>
  5. #include <string.h>
  6. #include "Tokenizer.h"
  7. #include "Utils.h"
  /* Capacity in bytes of the serialized token stream. */
  #define TOKEN_BUFFER_LENGTH (1024 * 1024)
  /* Capacity in bytes of the error message buffer, terminator included. */
  #define ERROR_LENGTH 256

  /* Input file that is currently being tokenized and its current line. */
  static FILE* file = NULL;
  static int line = 1;

  /* Serialized tokens together with the write and read positions in them. */
  static char tokenBuffer[TOKEN_BUFFER_LENGTH];
  static int writeIndex = 0;
  static int readIndex = 0;

  /* Most recent error message; an empty string means no error is stored. */
  static char error[ERROR_LENGTH] = "";
  16. static void tError(const char* format, ...) {
  17. va_list args;
  18. va_start(args, format);
  19. vsnprintf(error, ERROR_LENGTH, format, args);
  20. va_end(args);
  21. }
  22. static bool tAdd(const void* data, int length) {
  23. if(writeIndex + length > TOKEN_BUFFER_LENGTH) {
  24. return false;
  25. }
  26. memcpy(tokenBuffer + writeIndex, data, length);
  27. writeIndex += length;
  28. return true;
  29. }
  30. static bool tAddToken(Token token) {
  31. unsigned char c = token;
  32. return tAdd(&c, 1) && tAdd(&line, sizeof(int));
  33. }
  34. static bool tReadTokens(void* dest, int length) {
  35. if(readIndex + length > writeIndex) {
  36. return false;
  37. }
  38. memcpy(dest, tokenBuffer + readIndex, length);
  39. readIndex += length;
  40. return true;
  41. }
  42. static int tRead() {
  43. return fgetc(file);
  44. }
  45. static int tPeek() {
  46. int c = tRead();
  47. ungetc(c, file);
  48. return c;
  49. }
  50. static bool tParseLiteral(int c) {
  51. int index = 1;
  52. char buffer[64];
  53. buffer[0] = c;
  54. while(isLetter(tPeek())) {
  55. if(index >= 63) {
  56. tError("literal is too long");
  57. return false;
  58. }
  59. buffer[index++] = tRead();
  60. }
  61. buffer[index] = '\0';
  62. if(strcmp(buffer, "print") == 0) {
  63. return tAddToken(T_PRINT);
  64. } else if(strcmp(buffer, "null") == 0) {
  65. return tAddToken(T_NULL);
  66. } else if(strcmp(buffer, "true") == 0) {
  67. return tAddToken(T_TRUE);
  68. } else if(strcmp(buffer, "false") == 0) {
  69. return tAddToken(T_FALSE);
  70. }
  71. return true;
  72. }
  73. static bool tParseNumber(int c) {
  74. int index = 1;
  75. char buffer[64];
  76. buffer[0] = c;
  77. bool point = false;
  78. while(true) {
  79. int c = tPeek();
  80. if(c == '.') {
  81. point = true;
  82. } else if(!isNumber(c)) {
  83. break;
  84. } else if(index >= 63) {
  85. tError("number is too long");
  86. return false;
  87. }
  88. buffer[index++] = tRead();
  89. }
  90. buffer[index] = '\0';
  91. if(point) {
  92. char* end = NULL;
  93. float f = strtof(buffer, &end);
  94. if(end[0] != '\0') {
  95. tError("invalid float on line %d", line);
  96. return false;
  97. }
  98. return tAddToken(T_FLOAT) && tAdd(&f, sizeof(float));
  99. } else {
  100. char* end = NULL;
  101. long l = strtol(buffer, &end, 10);
  102. if(end[0] != '\0' || l > INT_MAX) {
  103. tError("invalid int on line %d", line);
  104. return false;
  105. }
  106. int i = l;
  107. return tAddToken(T_INT) && tAdd(&i, sizeof(int));
  108. }
  109. }
  110. static bool tParseToken() {
  111. int c = tRead();
  112. if(c == EOF) {
  113. return false;
  114. } else if(isLetter(c)) {
  115. return tParseLiteral(c);
  116. } else if(isNumber(c)) {
  117. return tParseNumber(c);
  118. }
  119. switch(c) {
  120. case ' ': return true;
  121. case '\n': line++; return true;
  122. case '+': return tAddToken(T_ADD);
  123. case '*': return tAddToken(T_MUL);
  124. case ';': return tAddToken(T_SEMICOLON);
  125. case '(': return tAddToken(T_OPEN_BRACKET);
  126. case ')': return tAddToken(T_CLOSE_BRACKET);
  127. }
  128. tError("unknown character on line %d: %c", line, c);
  129. return false;
  130. }
  131. static void tParseFile() {
  132. readIndex = 0;
  133. writeIndex = 0;
  134. line = 1;
  135. error[0] = '\0';
  136. while(tParseToken()) {
  137. }
  138. }
  139. bool tTokenize(const char* path) {
  140. file = fopen(path, "r");
  141. if(file == NULL) {
  142. tError("cannot read file '%s'", path);
  143. return true;
  144. }
  145. tParseFile();
  146. fclose(file);
  147. return error[0] != '\0';
  148. }
  149. const char* tGetError() {
  150. return error;
  151. }
  152. void tResetReader() {
  153. readIndex = 0;
  154. }
  155. Token tPeekToken() {
  156. if(readIndex >= writeIndex) {
  157. return T_END;
  158. }
  159. return tokenBuffer[readIndex];
  160. }
  161. Token tReadToken() {
  162. if(readIndex >= writeIndex) {
  163. return T_END;
  164. }
  165. return tokenBuffer[readIndex++];
  166. }
  /*
   * Reads the next sizeof(int) bytes of the token buffer into `*i`.
   * Returns false and leaves `*i` untouched if not enough bytes remain.
   */
  bool tReadInt(int* i) {
      return tReadTokens(i, sizeof(*i));
  }
  /*
   * Reads the next sizeof(float) bytes of the token buffer into `*f`.
   * Returns false and leaves `*f` untouched if not enough bytes remain.
   */
  bool tReadFloat(float* f) {
      return tReadTokens(f, sizeof(*f));
  }
  179. const char* tGetTokenName(Token token) {
  180. switch(token) {
  181. case T_INT: return "int";
  182. case T_FLOAT: return "float";
  183. case T_NULL: return "null";
  184. case T_TRUE: return "true";
  185. case T_FALSE: return "false";
  186. case T_ADD: return "+";
  187. case T_MUL: return "*";
  188. case T_PRINT: return "print";
  189. case T_SEMICOLON: return ";";
  190. case T_OPEN_BRACKET: return "(";
  191. case T_CLOSE_BRACKET: return ")";
  192. case T_END: return "end";
  193. }
  194. return "Unknown";
  195. }
  196. void tPrint() {
  197. puts("----------------");
  198. while(true) {
  199. Token t = tReadToken();
  200. if(t == T_END) {
  201. break;
  202. }
  203. int line;
  204. tReadInt(&line);
  205. printf("%d: %s\n", line, tGetTokenName(t));
  206. if(t == T_INT) {
  207. tReadInt(&line);
  208. }
  209. }
  210. tResetReader();
  211. }