refactor: separate tokenize() into preprocessing and tokenizing

author: nsfisis <nsfisis@gmail.com> 2025-06-21 03:16:32 +0900
committer: nsfisis <nsfisis@gmail.com> 2025-08-15 10:04:06 +0900
commit: 64a77c44a5316c19bbc2486e462095f36c718314 (patch)
tree: e1bb33c457a8170520b89a90a379cbb2b97b4537
parent: 45c4953d796cecf1f4c81b52f95dc562aaffbd0f (diff)
download: ducc-64a77c44a5316c19bbc2486e462095f36c718314.tar.gz
ducc-64a77c44a5316c19bbc2486e462095f36c718314.tar.zst
ducc-64a77c44a5316c19bbc2486e462095f36c718314.zip
1 files changed, 475 insertions, 231 deletions
diff --git a/main.c b/main.c
index 8aae51f..2067de1 100644
--- a/main.c
+++ b/main.c
@@ -66,350 +66,593 @@ int string_equals(const String* s1, const String* s2) {
     return s1->len == s2->len && strncmp(s1->data, s2->data, s1->len) == 0;
 }
 
-enum TokenKind {
-    TokenKind_eof,
+int string_equals_cstr(const String* s1, const char* s2) {
+    size_t s2_len = strlen(s2);
+    return s1->len == s2_len && strncmp(s1->data, s2, s1->len) == 0;
+}
 
-    TokenKind_and,
-    TokenKind_andand,
-    TokenKind_arrow,
-    TokenKind_assign,
-    TokenKind_assign_add,
-    TokenKind_assign_sub,
-    TokenKind_brace_l,
-    TokenKind_brace_r,
-    TokenKind_bracket_l,
-    TokenKind_bracket_r,
-    TokenKind_comma,
-    TokenKind_dot,
-    TokenKind_ellipsis,
-    TokenKind_eq,
-    TokenKind_ge,
-    TokenKind_gt,
-    TokenKind_ident,
-    TokenKind_keyword_break,
-    TokenKind_keyword_char,
-    TokenKind_keyword_const,
-    TokenKind_keyword_continue,
-    TokenKind_keyword_do,
-    TokenKind_keyword_else,
-    TokenKind_keyword_enum,
-    TokenKind_keyword_extern,
-    TokenKind_keyword_for,
-    TokenKind_keyword_if,
-    TokenKind_keyword_int,
-    TokenKind_keyword_long,
-    TokenKind_keyword_return,
-    TokenKind_keyword_sizeof,
-    TokenKind_keyword_struct,
-    TokenKind_keyword_typeof,
-    TokenKind_keyword_void,
-    TokenKind_keyword_while,
-    TokenKind_le,
-    TokenKind_lt,
-    TokenKind_literal_int,
-    TokenKind_literal_str,
-    TokenKind_minus,
-    TokenKind_minusminus,
-    TokenKind_ne,
-    TokenKind_not,
-    TokenKind_oror,
-    TokenKind_paren_l,
-    TokenKind_paren_r,
-    TokenKind_percent,
-    TokenKind_plus,
-    TokenKind_plusplus,
-    TokenKind_semicolon,
-    TokenKind_slash,
-    TokenKind_star,
+enum PpTokenKind {
+    PpTokenKind_eof,
+
+    PpTokenKind_header_name,
+    PpTokenKind_identifier,
+    PpTokenKind_pp_number,
+    PpTokenKind_character_constant,
+    PpTokenKind_string_literal,
+    PpTokenKind_punctuator,
+    PpTokenKind_other,
+    PpTokenKind_whitespace,
 };
-typedef enum TokenKind TokenKind;
+typedef enum PpTokenKind PpTokenKind;
 
-struct Token {
-    TokenKind kind;
+struct PpToken {
+    PpTokenKind kind;
     String raw;
 };
-typedef struct Token Token;
+typedef struct PpToken PpToken;
 
 struct PpDefine {
     String name;
-    Token* tokens;
+    PpToken* tokens;
 };
 typedef struct PpDefine PpDefine;
 
-struct Lexer {
+struct Preprocessor {
     char* src;
     int pos;
-    Token* tokens;
-    int n_tokens;
+    PpToken* pp_tokens;
+    int n_pp_tokens;
     PpDefine* pp_defines;
     int n_pp_defines;
 };
-typedef struct Lexer Lexer;
+typedef struct Preprocessor Preprocessor;
 
-Lexer* lexer_new(char* src) {
-    Lexer* l = calloc(1, sizeof(Lexer));
-    l->src = src;
-    l->tokens = calloc(1024 * 1024, sizeof(Token));
-    l->pp_defines = calloc(1024, sizeof(PpDefine));
-    return l;
+Preprocessor* preprocessor_new(char* src) {
+    Preprocessor* pp = calloc(1, sizeof(Preprocessor));
+    pp->src = src;
+    pp->pp_tokens = calloc(1024 * 1024, sizeof(PpToken));
+    pp->pp_defines = calloc(1024, sizeof(PpDefine));
+    return pp;
 }
 
-int find_pp_define(Lexer* l, String* name) {
+int find_pp_define(Preprocessor* pp, String* name) {
     int i;
-    for (i = 0; i < l->n_pp_defines; ++i) {
-        if (string_equals(&l->pp_defines[i].name, name)) {
+    for (i = 0; i < pp->n_pp_defines; ++i) {
+        if (string_equals(&pp->pp_defines[i].name, name)) {
             return i;
         }
     }
     return -1;
 }
 
-void tokenize_all(Lexer* l) {
+void pp_tokenize_all(Preprocessor* pp) {
     char* buf;
     int ch;
     int start;
-    while (l->src[l->pos]) {
-        Token* tok = l->tokens + l->n_tokens;
-        char c = l->src[l->pos];
-        ++l->pos;
+    while (pp->src[pp->pos]) {
+        PpToken* tok = pp->pp_tokens + pp->n_pp_tokens;
+        char c = pp->src[pp->pos];
+        ++pp->pos;
         if (c == '(') {
-            tok->kind = TokenKind_paren_l;
+            tok->kind = PpTokenKind_punctuator;
+            tok->raw.len = 1;
+            tok->raw.data = pp->src + pp->pos - tok->raw.len;
         } else if (c == ')') {
-            tok->kind = TokenKind_paren_r;
+            tok->kind = PpTokenKind_punctuator;
+            tok->raw.len = 1;
+            tok->raw.data = pp->src + pp->pos - tok->raw.len;
         } else if (c == '{') {
-            tok->kind = TokenKind_brace_l;
+            tok->kind = PpTokenKind_punctuator;
+            tok->raw.len = 1;
+            tok->raw.data = pp->src + pp->pos - tok->raw.len;
         } else if (c == '}') {
-            tok->kind = TokenKind_brace_r;
+            tok->kind = PpTokenKind_punctuator;
+            tok->raw.len = 1;
+            tok->raw.data = pp->src + pp->pos - tok->raw.len;
         } else if (c == '[') {
-            tok->kind = TokenKind_bracket_l;
+            tok->kind = PpTokenKind_punctuator;
+            tok->raw.len = 1;
+            tok->raw.data = pp->src + pp->pos - tok->raw.len;
         } else if (c == ']') {
-            tok->kind = TokenKind_bracket_r;
+            tok->kind = PpTokenKind_punctuator;
+            tok->raw.len = 1;
+            tok->raw.data = pp->src + pp->pos - tok->raw.len;
         } else if (c == ',') {
-            tok->kind = TokenKind_comma;
+            tok->kind = PpTokenKind_punctuator;
+            tok->raw.len = 1;
+            tok->raw.data = pp->src + pp->pos - tok->raw.len;
         } else if (c == ';') {
-            tok->kind = TokenKind_semicolon;
+            tok->kind = PpTokenKind_punctuator;
+            tok->raw.len = 1;
+            tok->raw.data = pp->src + pp->pos - tok->raw.len;
         } else if (c == '+') {
-            if (l->src[l->pos] == '=') {
-                ++l->pos;
-                tok->kind = TokenKind_assign_add;
-            } else if (l->src[l->pos] == '+') {
-                ++l->pos;
-                tok->kind = TokenKind_plusplus;
+            if (pp->src[pp->pos] == '=') {
+                ++pp->pos;
+                tok->kind = PpTokenKind_punctuator;
+                tok->raw.len = 2;
+                tok->raw.data = pp->src + pp->pos - tok->raw.len;
+            } else if (pp->src[pp->pos] == '+') {
+                ++pp->pos;
+                tok->kind = PpTokenKind_punctuator;
+                tok->raw.len = 2;
+                tok->raw.data = pp->src + pp->pos - tok->raw.len;
             } else {
-                tok->kind = TokenKind_plus;
+                tok->kind = PpTokenKind_punctuator;
+                tok->raw.len = 1;
+                tok->raw.data = pp->src + pp->pos - tok->raw.len;
             }
         } else if (c == '|') {
-            ++l->pos;
-            tok->kind = TokenKind_oror;
+            if (pp->src[pp->pos] == '|') {
+                ++pp->pos;
+                tok->kind = PpTokenKind_punctuator;
+                tok->raw.len = 2;
+                tok->raw.data = pp->src + pp->pos - tok->raw.len;
+            } else {
+                tok->kind = PpTokenKind_punctuator;
+                tok->raw.len = 1;
+                tok->raw.data = pp->src + pp->pos - tok->raw.len;
+            }
         } else if (c == '&') {
-            if (l->src[l->pos] == '&') {
-                ++l->pos;
-                tok->kind = TokenKind_andand;
+            if (pp->src[pp->pos] == '&') {
+                ++pp->pos;
+                tok->kind = PpTokenKind_punctuator;
+                tok->raw.len = 2;
+                tok->raw.data = pp->src + pp->pos - tok->raw.len;
             } else {
-                tok->kind = TokenKind_and;
+                tok->kind = PpTokenKind_punctuator;
+                tok->raw.len = 1;
+                tok->raw.data = pp->src + pp->pos - tok->raw.len;
             }
         } else if (c == '-') {
-            if (l->src[l->pos] == '>') {
-                ++l->pos;
-                tok->kind = TokenKind_arrow;
-            } else if (l->src[l->pos] == '=') {
-                ++l->pos;
-                tok->kind = TokenKind_assign_sub;
-            } else if (l->src[l->pos] == '-') {
-                ++l->pos;
-                tok->kind = TokenKind_minusminus;
+            if (pp->src[pp->pos] == '>') {
+                ++pp->pos;
+                tok->kind = PpTokenKind_punctuator;
+                tok->raw.len = 2;
+                tok->raw.data = pp->src + pp->pos - tok->raw.len;
+            } else if (pp->src[pp->pos] == '=') {
+                ++pp->pos;
+                tok->kind = PpTokenKind_punctuator;
+                tok->raw.len = 2;
+                tok->raw.data = pp->src + pp->pos - tok->raw.len;
+            } else if (pp->src[pp->pos] == '-') {
+                ++pp->pos;
+                tok->kind = PpTokenKind_punctuator;
+                tok->raw.len = 2;
+                tok->raw.data = pp->src + pp->pos - tok->raw.len;
             } else {
-                tok->kind = TokenKind_minus;
+                tok->kind = PpTokenKind_punctuator;
+                tok->raw.len = 1;
+                tok->raw.data = pp->src + pp->pos - tok->raw.len;
             }
         } else if (c == '*') {
-            tok->kind = TokenKind_star;
+            tok->kind = PpTokenKind_punctuator;
+            tok->raw.len = 1;
+            tok->raw.data = pp->src + pp->pos - tok->raw.len;
         } else if (c == '/') {
-            tok->kind = TokenKind_slash;
+            tok->kind = PpTokenKind_punctuator;
+            tok->raw.len = 1;
+            tok->raw.data = pp->src + pp->pos - tok->raw.len;
         } else if (c == '%') {
-            tok->kind = TokenKind_percent;
+            tok->kind = PpTokenKind_punctuator;
+            tok->raw.len = 1;
+            tok->raw.data = pp->src + pp->pos - tok->raw.len;
         } else if (c == '.') {
-            if (l->src[l->pos] == '.') {
-                ++l->pos;
-                if (l->src[l->pos] == '.') {
-                    ++l->pos;
-                    tok->kind = TokenKind_ellipsis;
+            if (pp->src[pp->pos] == '.') {
+                ++pp->pos;
+                if (pp->src[pp->pos] == '.') {
+                    ++pp->pos;
+                    tok->kind = PpTokenKind_punctuator;
+                    tok->raw.len = 3;
+                    tok->raw.data = pp->src + pp->pos - tok->raw.len;
                 } else {
-                    fatal_error("unknown token: ..");
+                    --pp->pos;
+                    tok->kind = PpTokenKind_punctuator;
+                    tok->raw.len = 1;
+                    tok->raw.data = pp->src + pp->pos - tok->raw.len;
                 }
             } else {
-                tok->kind = TokenKind_dot;
+                tok->kind = PpTokenKind_punctuator;
+                tok->raw.len = 1;
+                tok->raw.data = pp->src + pp->pos - tok->raw.len;
             }
         } else if (c == '!') {
-            if (l->src[l->pos] == '=') {
-                ++l->pos;
-                tok->kind = TokenKind_ne;
+            if (pp->src[pp->pos] == '=') {
+                ++pp->pos;
+                tok->kind = PpTokenKind_punctuator;
+                tok->raw.len = 2;
+                tok->raw.data = pp->src + pp->pos - tok->raw.len;
             } else {
-                tok->kind = TokenKind_not;
+                tok->kind = PpTokenKind_punctuator;
+                tok->raw.len = 1;
+                tok->raw.data = pp->src + pp->pos - tok->raw.len;
             }
         } else if (c == '=') {
-            if (l->src[l->pos] == '=') {
-                ++l->pos;
-                tok->kind = TokenKind_eq;
+            if (pp->src[pp->pos] == '=') {
+                ++pp->pos;
+                tok->kind = PpTokenKind_punctuator;
+                tok->raw.len = 2;
+                tok->raw.data = pp->src + pp->pos - tok->raw.len;
             } else {
-                tok->kind = TokenKind_assign;
+                tok->kind = PpTokenKind_punctuator;
+                tok->raw.len = 1;
+                tok->raw.data = pp->src + pp->pos - tok->raw.len;
             }
         } else if (c == '<') {
-            if (l->src[l->pos] == '=') {
-                ++l->pos;
-                tok->kind = TokenKind_le;
+            if (pp->src[pp->pos] == '=') {
+                ++pp->pos;
+                tok->kind = PpTokenKind_punctuator;
+                tok->raw.len = 2;
+                tok->raw.data = pp->src + pp->pos - tok->raw.len;
             } else {
-                tok->kind = TokenKind_lt;
+                tok->kind = PpTokenKind_punctuator;
+                tok->raw.len = 1;
+                tok->raw.data = pp->src + pp->pos - tok->raw.len;
             }
         } else if (c == '>') {
-            if (l->src[l->pos] == '=') {
-                ++l->pos;
-                tok->kind = TokenKind_ge;
+            if (pp->src[pp->pos] == '=') {
+                ++pp->pos;
+                tok->kind = PpTokenKind_punctuator;
+                tok->raw.len = 2;
+                tok->raw.data = pp->src + pp->pos - tok->raw.len;
             } else {
-                tok->kind = TokenKind_gt;
+                tok->kind = PpTokenKind_punctuator;
+                tok->raw.len = 1;
+                tok->raw.data = pp->src + pp->pos - tok->raw.len;
+            }
+        } else if (c == '#') {
+            if (pp->src[pp->pos] == '#') {
+                ++pp->pos;
+                tok->kind = PpTokenKind_punctuator;
+                tok->raw.len = 2;
+                tok->raw.data = pp->src + pp->pos - tok->raw.len;
+            } else {
+                tok->kind = PpTokenKind_punctuator;
+                tok->raw.len = 1;
+                tok->raw.data = pp->src + pp->pos - tok->raw.len;
             }
         } else if (c == '\'') {
-            ch = l->src[l->pos];
+            start = pp->pos - 1;
+            ch = pp->src[pp->pos];
             if (ch == '\\') {
-                ++l->pos;
-                ch = l->src[l->pos];
+                ++pp->pos;
+                ch = pp->src[pp->pos];
                 if (ch == 'n') {
                     ch = '\n';
                 }
             }
-            l->pos += 2;
-            tok->kind = TokenKind_literal_int;
-            buf = calloc(4, sizeof(char));
-            sprintf(buf, "%d", ch);
-            tok->raw.data = buf;
-            tok->raw.len = strlen(buf);
+            pp->pos += 2;
+            tok->kind = PpTokenKind_character_constant;
+            tok->raw.data = pp->src + start;
+            tok->raw.len = pp->pos - start;
         } else if (c == '"') {
-            start = l->pos;
+            start = pp->pos - 1;
             while (1) {
-                ch = l->src[l->pos];
+                ch = pp->src[pp->pos];
                 if (ch == '\\') {
-                    ++l->pos;
+                    ++pp->pos;
                 } else if (ch == '"') {
                     break;
                 }
-                ++l->pos;
+                ++pp->pos;
             }
-            tok->kind = TokenKind_literal_str;
-            tok->raw.data = l->src + start;
-            tok->raw.len = l->pos - start;
-            ++l->pos;
+            ++pp->pos;
+            tok->kind = PpTokenKind_string_literal;
+            tok->raw.data = pp->src + start;
+            tok->raw.len = pp->pos - start;
         } else if (isdigit(c)) {
-            --l->pos;
-            start = l->pos;
-            while (isdigit(l->src[l->pos])) {
-                ++l->pos;
+            --pp->pos;
+            start = pp->pos;
+            while (isdigit(pp->src[pp->pos])) {
+                ++pp->pos;
             }
-            tok->kind = TokenKind_literal_int;
-            tok->raw.data = l->src + start;
-            tok->raw.len = l->pos - start;
+            tok->kind = PpTokenKind_pp_number;
+            tok->raw.data = pp->src + start;
+            tok->raw.len = pp->pos - start;
         } else if (isalpha(c) || c == '_') {
-            --l->pos;
-            start = l->pos;
-            while (isalnum(l->src[l->pos]) || l->src[l->pos] == '_') {
-                ++l->pos;
+            --pp->pos;
+            start = pp->pos;
+            while (isalnum(pp->src[pp->pos]) || pp->src[pp->pos] == '_') {
+                ++pp->pos;
+            }
+            tok->raw.data = pp->src + start;
+            tok->raw.len = pp->pos - start;
+            tok->kind = PpTokenKind_identifier;
+        } else if (isspace(c)) {
+            tok->raw.data = pp->src + pp->pos - 1;
+            tok->raw.len = 1;
+            tok->kind = PpTokenKind_whitespace;
+        } else {
+            tok->raw.data = pp->src + pp->pos - 1;
+            tok->raw.len = 1;
+            tok->kind = PpTokenKind_other;
+        }
+        ++pp->n_pp_tokens;
+    }
+}
+
+void pp_execute_pp_directive(Preprocessor* pp) {
+    PpToken* tok = pp->pp_tokens;
+    PpToken* define_dest;
+    while (tok->kind != PpTokenKind_eof) {
+        "TODO: check if the token is at the beginning of line.";
+        "TODO: check if skipped whitespaces do not contain line breaks.";
+        if (tok->kind == PpTokenKind_punctuator && string_equals_cstr(&tok->raw, "#")) {
+            PpToken* tok2 = tok + 1;
+            while (tok2->kind != PpTokenKind_eof && tok2->kind == PpTokenKind_whitespace)
+                ++tok2;
+            if (tok2->kind == PpTokenKind_identifier && string_equals_cstr(&tok2->raw, "define")) {
+                ++tok2;
+                while (tok2->kind != PpTokenKind_eof && tok2->kind == PpTokenKind_whitespace)
+                    ++tok2;
+                if (tok2->kind == PpTokenKind_identifier) {
+                    PpToken* define_name = tok2;
+                    ++tok2;
+                    while (tok2->kind != PpTokenKind_eof && tok2->kind == PpTokenKind_whitespace)
+                        ++tok2;
+                    if (tok2->kind == PpTokenKind_identifier || tok2->kind == PpTokenKind_pp_number) {
+                        define_dest = tok2;
+                        pp->pp_defines[pp->n_pp_defines].name.len = define_name->raw.len;
+                        pp->pp_defines[pp->n_pp_defines].name.data = define_name->raw.data;
+                        pp->pp_defines[pp->n_pp_defines].tokens = calloc(1, sizeof(PpToken));
+                        pp->pp_defines[pp->n_pp_defines].tokens[0].kind = define_dest->kind;
+                        pp->pp_defines[pp->n_pp_defines].tokens[0].raw.len = define_dest->raw.len;
+                        pp->pp_defines[pp->n_pp_defines].tokens[0].raw.data = define_dest->raw.data;
+                        ++pp->n_pp_defines;
+                    }
+                }
+            }
+            while (tok != tok2 + 1 && tok->kind != PpTokenKind_eof) {
+                tok->kind = PpTokenKind_whitespace;
+                tok->raw.len = 0;
+                tok->raw.data = NULL;
+                ++tok;
             }
-            int ident_len = l->pos - start;
-            if (ident_len == 5 && strstr(l->src + start, "break") == l->src + start) {
+            continue;
+        } else if (tok->kind == PpTokenKind_identifier) {
+            int pp_define_idx = find_pp_define(pp, &tok->raw);
+            if (pp_define_idx != -1) {
+                define_dest = pp->pp_defines[pp_define_idx].tokens;
+                tok->kind = define_dest->kind;
+                tok->raw.data = define_dest->raw.data;
+                tok->raw.len = define_dest->raw.len;
+            }
+        }
+        ++tok;
+    }
+}
+
+PpToken* preprocess(char* src) {
+    Preprocessor* pp = preprocessor_new(src);
+    pp_tokenize_all(pp);
+    pp_execute_pp_directive(pp);
+    return pp->pp_tokens;
+}
+
+enum TokenKind {
+    TokenKind_eof,
+
+    TokenKind_and,
+    TokenKind_andand,
+    TokenKind_arrow,
+    TokenKind_assign,
+    TokenKind_assign_add,
+    TokenKind_assign_sub,
+    TokenKind_brace_l,
+    TokenKind_brace_r,
+    TokenKind_bracket_l,
+    TokenKind_bracket_r,
+    TokenKind_comma,
+    TokenKind_dot,
+    TokenKind_ellipsis,
+    TokenKind_eq,
+    TokenKind_ge,
+    TokenKind_gt,
+    TokenKind_ident,
+    TokenKind_keyword_break,
+    TokenKind_keyword_char,
+    TokenKind_keyword_const,
+    TokenKind_keyword_continue,
+    TokenKind_keyword_do,
+    TokenKind_keyword_else,
+    TokenKind_keyword_enum,
+    TokenKind_keyword_extern,
+    TokenKind_keyword_for,
+    TokenKind_keyword_if,
+    TokenKind_keyword_int,
+    TokenKind_keyword_long,
+    TokenKind_keyword_return,
+    TokenKind_keyword_sizeof,
+    TokenKind_keyword_struct,
+    TokenKind_keyword_typeof,
+    TokenKind_keyword_void,
+    TokenKind_keyword_while,
+    TokenKind_le,
+    TokenKind_lt,
+    TokenKind_literal_int,
+    TokenKind_literal_str,
+    TokenKind_minus,
+    TokenKind_minusminus,
+    TokenKind_ne,
+    TokenKind_not,
+    TokenKind_oror,
+    TokenKind_paren_l,
+    TokenKind_paren_r,
+    TokenKind_percent,
+    TokenKind_plus,
+    TokenKind_plusplus,
+    TokenKind_semicolon,
+    TokenKind_slash,
+    TokenKind_star,
+};
+typedef enum TokenKind TokenKind;
+
+struct Token {
+    TokenKind kind;
+    String raw;
+};
+typedef struct Token Token;
+
+struct Lexer {
+    PpToken* src;
+    int pos;
+    Token* tokens;
+    int n_tokens;
+};
+typedef struct Lexer Lexer;
+
+Lexer* lexer_new(PpToken* pp_tokens) {
+    Lexer* l = calloc(1, sizeof(Lexer));
+    l->src = pp_tokens;
+    l->tokens = calloc(1024 * 1024, sizeof(Token));
+    return l;
+}
+
+void tokenize_all(Lexer* l) {
+    char* buf;
+    int ch;
+    int start;
+    while (l->src[l->pos].kind != PpTokenKind_eof) {
+        PpToken* pp_tok = l->src + l->pos;
+        Token* tok = l->tokens + l->n_tokens;
+        PpTokenKind k = pp_tok->kind;
+        ++l->pos;
+        if (k == PpTokenKind_header_name) {
+            fatal_error("not implemented yet");
+        } else if (k == PpTokenKind_identifier) {
+            if (string_equals_cstr(&pp_tok->raw, "break")) {
                 tok->kind = TokenKind_keyword_break;
-            } else if (ident_len == 4 && strstr(l->src + start, "char") == l->src + start) {
+            } else if (string_equals_cstr(&pp_tok->raw, "char")) {
                 tok->kind = TokenKind_keyword_char;
-            } else if (ident_len == 5 && strstr(l->src + start, "const") == l->src + start) {
+            } else if (string_equals_cstr(&pp_tok->raw, "const")) {
                 tok->kind = TokenKind_keyword_const;
-            } else if (ident_len == 8 && strstr(l->src + start, "continue") == l->src + start) {
+            } else if (string_equals_cstr(&pp_tok->raw, "continue")) {
                 tok->kind = TokenKind_keyword_continue;
-            } else if (ident_len == 2 && strstr(l->src + start, "do") == l->src + start) {
+            } else if (string_equals_cstr(&pp_tok->raw, "do")) {
                 tok->kind = TokenKind_keyword_do;
-            } else if (ident_len == 4 && strstr(l->src + start, "else") == l->src + start) {
+            } else if (string_equals_cstr(&pp_tok->raw, "else")) {
                 tok->kind = TokenKind_keyword_else;
-            } else if (ident_len == 4 && strstr(l->src + start, "enum") == l->src + start) {
+            } else if (string_equals_cstr(&pp_tok->raw, "enum")) {
                 tok->kind = TokenKind_keyword_enum;
-            } else if (ident_len == 6 && strstr(l->src + start, "extern") == l->src + start) {
+            } else if (string_equals_cstr(&pp_tok->raw, "extern")) {
                 tok->kind = TokenKind_keyword_extern;
-            } else if (ident_len == 3 && strstr(l->src + start, "for") == l->src + start) {
+            } else if (string_equals_cstr(&pp_tok->raw, "for")) {
                 tok->kind = TokenKind_keyword_for;
-            } else if (ident_len == 2 && strstr(l->src + start, "if") == l->src + start) {
+            } else if (string_equals_cstr(&pp_tok->raw, "if")) {
                 tok->kind = TokenKind_keyword_if;
-            } else if (ident_len == 3 && strstr(l->src + start, "int") == l->src + start) {
+            } else if (string_equals_cstr(&pp_tok->raw, "int")) {
                 tok->kind = TokenKind_keyword_int;
-            } else if (ident_len == 4 && strstr(l->src + start, "long") == l->src + start) {
+            } else if (string_equals_cstr(&pp_tok->raw, "long")) {
                 tok->kind = TokenKind_keyword_long;
-            } else if (ident_len == 6 && strstr(l->src + start, "return") == l->src + start) {
+            } else if (string_equals_cstr(&pp_tok->raw, "return")) {
                 tok->kind = TokenKind_keyword_return;
-            } else if (ident_len == 6 && strstr(l->src + start, "sizeof") == l->src + start) {
+            } else if (string_equals_cstr(&pp_tok->raw, "sizeof")) {
                 tok->kind = TokenKind_keyword_sizeof;
-            } else if (ident_len == 6 && strstr(l->src + start, "struct") == l->src + start) {
+            } else if (string_equals_cstr(&pp_tok->raw, "struct")) {
                 tok->kind = TokenKind_keyword_struct;
-            } else if (ident_len == 7 && strstr(l->src + start, "typedef") == l->src + start) {
+            } else if (string_equals_cstr(&pp_tok->raw, "typedef")) {
                 tok->kind = TokenKind_keyword_typeof;
-            } else if (ident_len == 4 && strstr(l->src + start, "void") == l->src + start) {
+            } else if (string_equals_cstr(&pp_tok->raw, "void")) {
                 tok->kind = TokenKind_keyword_void;
-            } else if (ident_len == 5 && strstr(l->src + start, "while") == l->src + start) {
+            } else if (string_equals_cstr(&pp_tok->raw, "while")) {
                 tok->kind = TokenKind_keyword_while;
             } else {
-                tok->raw.data = l->src + start;
-                tok->raw.len = ident_len;
-                int pp_define_idx = find_pp_define(l, &tok->raw);
-                if (pp_define_idx == -1) {
-                    tok->kind = TokenKind_ident;
-                } else {
-                    tok->kind = l->pp_defines[pp_define_idx].tokens->kind;
-                    tok->raw.data = l->pp_defines[pp_define_idx].tokens->raw.data;
-                    tok->raw.len = l->pp_defines[pp_define_idx].tokens->raw.len;
-                }
+                tok->kind = TokenKind_ident;
             }
-        } else if (isspace(c)) {
-            continue;
-        } else if (c == '#') {
-            l->pos += 6;
-            while (isspace(l->src[l->pos])) {
-                ++l->pos;
-            }
-            start = l->pos;
-            while (isalnum(l->src[l->pos]) || l->src[l->pos] == '_') {
-                ++l->pos;
-            }
-            PpDefine* def = l->pp_defines + l->n_pp_defines;
-            ++l->n_pp_defines;
-            def->name.data = l->src + start;
-            def->name.len = l->pos - start;
-            while (isspace(l->src[l->pos])) {
-                ++l->pos;
-            }
-            int start2 = l->pos;
-            int is_digit = isdigit(l->src[l->pos]);
-            if (is_digit) {
-                while (isdigit(l->src[l->pos])) {
-                    ++l->pos;
-                }
-            } else {
-                while (isalnum(l->src[l->pos]) || l->src[l->pos] == '_') {
-                    ++l->pos;
+            tok->raw.data = pp_tok->raw.data;
+            tok->raw.len = pp_tok->raw.len;
+        } else if (k == PpTokenKind_pp_number) {
+            tok->kind = TokenKind_literal_int;
+            tok->raw.data = pp_tok->raw.data;
+            tok->raw.len = pp_tok->raw.len;
+        } else if (k == PpTokenKind_character_constant) {
+            tok->kind = TokenKind_literal_int;
+            ch = pp_tok->raw.data[1];
+            if (ch == '\\') {
+                ch = pp_tok->raw.data[2];
+                if (ch == 'n') {
+                    ch = '\n';
                 }
             }
-            def->tokens = calloc(1, sizeof(Token));
-            if (is_digit) {
-                def->tokens->kind = TokenKind_literal_int;
+            buf = calloc(4, sizeof(char));
+            sprintf(buf, "%d", ch);
+            tok->raw.data = buf;
+            tok->raw.len = strlen(buf);
+        } else if (k == PpTokenKind_string_literal) {
+            tok->kind = TokenKind_literal_str;
+            tok->raw.data = pp_tok->raw.data + 1;
+            tok->raw.len = pp_tok->raw.len - 2;
+        } else if (k == PpTokenKind_punctuator || k == PpTokenKind_other) {
+            if (string_equals_cstr(&pp_tok->raw, "(")) {
+                tok->kind = TokenKind_paren_l;
+            } else if (string_equals_cstr(&pp_tok->raw, ")")) {
+                tok->kind = TokenKind_paren_r;
+            } else if (string_equals_cstr(&pp_tok->raw, "{")) {
+                tok->kind = TokenKind_brace_l;
+            } else if (string_equals_cstr(&pp_tok->raw, "}")) {
+                tok->kind = TokenKind_brace_r;
+            } else if (string_equals_cstr(&pp_tok->raw, "[")) {
+                tok->kind = TokenKind_bracket_l;
+            } else if (string_equals_cstr(&pp_tok->raw, "]")) {
+                tok->kind = TokenKind_bracket_r;
+            } else if (string_equals_cstr(&pp_tok->raw, ",")) {
+                tok->kind = TokenKind_comma;
+            } else if (string_equals_cstr(&pp_tok->raw, ";")) {
+                tok->kind = TokenKind_semicolon;
+            } else if (string_equals_cstr(&pp_tok->raw, "+=")) {
+                tok->kind = TokenKind_assign_add;
+            } else if (string_equals_cstr(&pp_tok->raw, "++")) {
+                tok->kind = TokenKind_plusplus;
+            } else if (string_equals_cstr(&pp_tok->raw, "+")) {
+                tok->kind = TokenKind_plus;
+            } else if (string_equals_cstr(&pp_tok->raw, "||")) {
+                tok->kind = TokenKind_oror;
+            } else if (string_equals_cstr(&pp_tok->raw, "&&")) {
+                tok->kind = TokenKind_andand;
+            } else if (string_equals_cstr(&pp_tok->raw, "&")) {
+                tok->kind = TokenKind_and;
+            } else if (string_equals_cstr(&pp_tok->raw, "->")) {
+                tok->kind = TokenKind_arrow;
+            } else if (string_equals_cstr(&pp_tok->raw, "-=")) {
+                tok->kind = TokenKind_assign_sub;
+            } else if (string_equals_cstr(&pp_tok->raw, "--")) {
+                tok->kind = TokenKind_minusminus;
+            } else if (string_equals_cstr(&pp_tok->raw, "-")) {
+                tok->kind = TokenKind_minus;
+            } else if (string_equals_cstr(&pp_tok->raw, "*")) {
+                tok->kind = TokenKind_star;
+            } else if (string_equals_cstr(&pp_tok->raw, "/")) {
+                tok->kind = TokenKind_slash;
+            } else if (string_equals_cstr(&pp_tok->raw, "%")) {
+                tok->kind = TokenKind_percent;
+            } else if (string_equals_cstr(&pp_tok->raw, "...")) {
+                tok->kind = TokenKind_ellipsis;
+            } else if (string_equals_cstr(&pp_tok->raw, ".")) {
+                tok->kind = TokenKind_dot;
+            } else if (string_equals_cstr(&pp_tok->raw, "!=")) {
+                tok->kind = TokenKind_ne;
+            } else if (string_equals_cstr(&pp_tok->raw, "!")) {
+                tok->kind = TokenKind_not;
+            } else if (string_equals_cstr(&pp_tok->raw, "==")) {
+                tok->kind = TokenKind_eq;
+            } else if (string_equals_cstr(&pp_tok->raw, "=")) {
+                tok->kind = TokenKind_assign;
+            } else if (string_equals_cstr(&pp_tok->raw, "<=")) {
+                tok->kind = TokenKind_le;
+            } else if (string_equals_cstr(&pp_tok->raw, "<")) {
+                tok->kind = TokenKind_lt;
+            } else if (string_equals_cstr(&pp_tok->raw, ">=")) {
+                tok->kind = TokenKind_ge;
+            } else if (string_equals_cstr(&pp_tok->raw, ">")) {
+                tok->kind = TokenKind_gt;
             } else {
-                def->tokens->kind = TokenKind_ident;
+                buf = calloc(1024, sizeof(char));
+                sprintf(buf, "unknown token: %.*s", pp_tok->raw.len, pp_tok->raw.data);
+                fatal_error(buf);
             }
-            def->tokens->raw.data = l->src + start2;
-            def->tokens->raw.len = l->pos - start2;
+            tok->raw.data = pp_tok->raw.data;
+            tok->raw.len = pp_tok->raw.len;
+        } else if (k == PpTokenKind_whitespace) {
             continue;
-        } else {
-            buf = calloc(1024, sizeof(char));
-            sprintf(buf, "unknown token char(%d)", c);
-            fatal_error(buf);
         }
         ++l->n_tokens;
     }
 }
 
-Token* tokenize(char* src) {
-    Lexer* l = lexer_new(src);
+Token* tokenize(PpToken* pp_tokens) {
+    Lexer* l = lexer_new(pp_tokens);
     tokenize_all(l);
     return l->tokens;
 }
@@ -2223,7 +2466,8 @@ int main(int argc, char** argv) {
         in = fopen(argv[1], "rb");
     }
     char* source = read_all(in);
-    Token* tokens = tokenize(source);
+    PpToken* pp_tokens = preprocess(source);
+    Token* tokens = tokenize(pp_tokens);
     Program* prog = parse(tokens);
     analyze(prog);
     codegen(prog);
author	nsfisis <nsfisis@gmail.com>	2025-06-21 03:16:32 +0900
committer	nsfisis <nsfisis@gmail.com>	2025-08-15 10:04:06 +0900
commit	64a77c44a5316c19bbc2486e462095f36c718314 (patch)
tree	e1bb33c457a8170520b89a90a379cbb2b97b4537
parent	45c4953d796cecf1f4c81b52f95dc562aaffbd0f (diff)
download	ducc-64a77c44a5316c19bbc2486e462095f36c718314.tar.gz ducc-64a77c44a5316c19bbc2486e462095f36c718314.tar.zst ducc-64a77c44a5316c19bbc2486e462095f36c718314.zip