/* Tokenizer.c */

#include <limits.h>
#include <stdio.h>
#include <string.h>
#include "Tokenizer.h"
#include "Utils.h"
  5. #define TOKEN_BUFFER_LENGTH (1024 * 1024)
  6. static char tokenBuffer[TOKEN_BUFFER_LENGTH];
  7. static int writeIndex = 0;
  8. static int readIndex = 0;
  9. static const char* TOO_LONG_LITERAL = "literal is too long";
  10. static const char* UNKNOWN_CHARACTER = "unknown character";
  11. static FILE* file = NULL;
  12. static const char* error = NULL;
  13. static bool tAdd(const void* data, int length) {
  14. if(writeIndex + length > TOKEN_BUFFER_LENGTH) {
  15. return false;
  16. }
  17. memcpy(tokenBuffer + writeIndex, data, length);
  18. writeIndex += length;
  19. return true;
  20. }
  21. static bool tAddToken(Token token) {
  22. unsigned char c = token;
  23. return tAdd(&c, 1);
  24. }
  25. static bool tReadTokens(void* dest, int length) {
  26. if(readIndex + length > writeIndex) {
  27. return false;
  28. }
  29. memcpy(dest, tokenBuffer + readIndex, length);
  30. readIndex += length;
  31. return true;
  32. }
  33. static int tRead() {
  34. return fgetc(file);
  35. }
  36. static int tPeek() {
  37. int c = tRead();
  38. ungetc(c, file);
  39. return c;
  40. }
  41. static bool tParseLiteral(int c) {
  42. int index = 1;
  43. char buffer[64];
  44. buffer[0] = c;
  45. while(isLetter(tPeek())) {
  46. if(index >= 63) {
  47. error = TOO_LONG_LITERAL;
  48. return false;
  49. }
  50. buffer[index++] = tRead();
  51. }
  52. buffer[index] = '\0';
  53. if(strcmp(buffer, "print") == 0) {
  54. return tAddToken(T_PRINT);
  55. }
  56. return true;
  57. }
  58. static bool tParseNumber(int c) {
  59. int sum = c - '0';
  60. while(isNumber(tPeek())) {
  61. sum = sum * 10 + (tRead() - '0');
  62. }
  63. return tAddToken(T_INT) && tAdd(&sum, sizeof(int));
  64. }
  65. static bool tParseToken() {
  66. int c = tRead();
  67. if(c == EOF) {
  68. return false;
  69. } else if(isLetter(c)) {
  70. return tParseLiteral(c);
  71. } else if(isNumber(c)) {
  72. return tParseNumber(c);
  73. }
  74. switch(c) {
  75. case ' ':
  76. case '\n': return true;
  77. case '+': return tAddToken(T_ADD);
  78. case ';': return tAddToken(T_SEMICOLON);
  79. }
  80. error = UNKNOWN_CHARACTER;
  81. return false;
  82. }
  83. static void tParseFile() {
  84. readIndex = 0;
  85. writeIndex = 0;
  86. error = NULL;
  87. while(tParseToken()) {
  88. }
  89. }
  90. bool tTokenize(const char* path) {
  91. file = fopen(path, "r");
  92. if(file == NULL) {
  93. error = "cannot read file";
  94. return true;
  95. }
  96. tParseFile();
  97. fclose(file);
  98. return error != NULL;
  99. }
  100. const char* tGetError() {
  101. return error;
  102. }
  103. void tResetReader() {
  104. readIndex = 0;
  105. }
  106. Token tPeekToken() {
  107. if(readIndex >= writeIndex) {
  108. return T_END;
  109. }
  110. return tokenBuffer[readIndex];
  111. }
  112. Token tReadToken() {
  113. if(readIndex >= writeIndex) {
  114. return T_END;
  115. }
  116. return tokenBuffer[readIndex++];
  117. }
  118. bool tReadInt(int* i) {
  119. if(tReadTokens(i, sizeof(int))) {
  120. return true;
  121. }
  122. return false;
  123. }
  124. const char* tGetTokenName(Token token) {
  125. switch(token) {
  126. case T_INT: return "int";
  127. case T_ADD: return "+";
  128. case T_PRINT: return "print";
  129. case T_SEMICOLON: return ";";
  130. case T_END: return "end";
  131. }
  132. return "Unknown";
  133. }