refactor: extract tokenization from Preprocessor

author: nsfisis <nsfisis@gmail.com> 2025-08-16 00:47:59 +0900
committer: nsfisis <nsfisis@gmail.com> 2025-08-16 02:01:20 +0900
commit: c1f7732c1902745180e77d0abcf73714cb2e2ead (patch)
tree: 7f93f3556fabff3d6d3dc1998bac0b900a99d76c /preprocess.c
parent: e2064554b6d653439fbbb2bcde00e6f1a1079cb4 (diff)
download: ducc-c1f7732c1902745180e77d0abcf73714cb2e2ead.tar.gz
ducc-c1f7732c1902745180e77d0abcf73714cb2e2ead.tar.zst
ducc-c1f7732c1902745180e77d0abcf73714cb2e2ead.zip
1 file changed, 355 insertions, 337 deletions
diff --git a/preprocess.c b/preprocess.c
index d82bbc4..468ff3e 100644
--- a/preprocess.c
+++ b/preprocess.c
@@ -444,76 +444,34 @@ void add_predefined_macros(MacroArray* macros) {
     m->name.data = "__LINE__";
 }
 
-struct Preprocessor {
+struct PpLexer {
     const char* filename;
     int line;
     char* src;
     int pos;
     TokenArray* pp_tokens;
-    MacroArray* macros;
-    int include_depth;
-    BOOL skip_pp_tokens;
-    String* include_paths;
-    int n_include_paths;
 };
-typedef struct Preprocessor Preprocessor;
-
-TokenArray* do_preprocess(InFile* src, int depth, MacroArray* macros);
-
-Preprocessor* preprocessor_new(InFile* src, int include_depth, MacroArray* macros) {
-    if (include_depth >= 32) {
-        fatal_error("include depth limit exceeded");
-    }
-
-    Preprocessor* pp = calloc(1, sizeof(Preprocessor));
-    pp->filename = src->filename;
-    pp->line = 1;
-    pp->src = src->buf;
-    pp->pp_tokens = calloc(1, sizeof(TokenArray));
-    tokens_init(pp->pp_tokens, 1024 * 16);
-    pp->macros = macros;
-    pp->include_depth = include_depth;
-    pp->include_paths = calloc(16, sizeof(String));
+typedef struct PpLexer PpLexer;
 
-    return pp;
-}
+PpLexer* pplexer_new(InFile* src) {
+    PpLexer* ppl = calloc(1, sizeof(PpLexer));
 
-Token* pp_token_at(Preprocessor* pp, int i) {
-    return &pp->pp_tokens->data[i];
-}
+    ppl->filename = src->filename;
+    ppl->line = 1;
+    ppl->src = src->buf;
+    ppl->pp_tokens = calloc(1, sizeof(TokenArray));
+    tokens_init(ppl->pp_tokens, 1024 * 16);
 
-int find_macro(Preprocessor* pp, String* name) {
-    for (int i = 0; i < pp->macros->len; ++i) {
-        if (string_equals(&pp->macros->data[i].name, name)) {
-            return i;
-        }
-    }
-    return -1;
+    return ppl;
 }
 
-void undef_macro(Preprocessor* pp, int idx) {
-    pp->macros->data[idx].name.len = 0;
-    // TODO: Can predefined macro like __FILE__ be undefined?
-}
-
-void add_include_path(Preprocessor* pp, char* include_path) {
-    pp->include_paths[pp->n_include_paths].data = include_path;
-    pp->include_paths[pp->n_include_paths].len = strlen(include_path);
-    ++pp->n_include_paths;
-}
-
-BOOL skip_pp_tokens(Preprocessor* pp) {
-    // TODO: support nested #if
-    return pp->skip_pp_tokens;
-}
-
-void pp_tokenize_all(Preprocessor* pp) {
-    while (pp->src[pp->pos]) {
-        Token* tok = tokens_push_new(pp->pp_tokens);
-        tok->loc.filename = pp->filename;
-        tok->loc.line = pp->line;
-        char c = pp->src[pp->pos];
-        ++pp->pos;
+void pplexer_tokenize_all(PpLexer* ppl) {
+    while (ppl->src[ppl->pos]) {
+        Token* tok = tokens_push_new(ppl->pp_tokens);
+        tok->loc.filename = ppl->filename;
+        tok->loc.line = ppl->line;
+        char c = ppl->src[ppl->pos];
+        ++ppl->pos;
         if (c == '(') {
             tok->kind = TokenKind_paren_l;
         } else if (c == ')') {
@@ -533,8 +491,8 @@ void pp_tokenize_all(Preprocessor* pp) {
         } else if (c == ';') {
             tok->kind = TokenKind_semicolon;
         } else if (c == '^') {
-            if (pp->src[pp->pos] == '=') {
-                ++pp->pos;
+            if (ppl->src[ppl->pos] == '=') {
+                ++ppl->pos;
                 tok->kind = TokenKind_assign_xor;
             } else {
                 tok->kind = TokenKind_xor;
@@ -544,132 +502,132 @@ void pp_tokenize_all(Preprocessor* pp) {
         } else if (c == '~') {
             tok->kind = TokenKind_tilde;
         } else if (c == '+') {
-            if (pp->src[pp->pos] == '=') {
-                ++pp->pos;
+            if (ppl->src[ppl->pos] == '=') {
+                ++ppl->pos;
                 tok->kind = TokenKind_assign_add;
-            } else if (pp->src[pp->pos] == '+') {
-                ++pp->pos;
+            } else if (ppl->src[ppl->pos] == '+') {
+                ++ppl->pos;
                 tok->kind = TokenKind_plusplus;
             } else {
                 tok->kind = TokenKind_plus;
             }
         } else if (c == '|') {
-            if (pp->src[pp->pos] == '=') {
-                ++pp->pos;
+            if (ppl->src[ppl->pos] == '=') {
+                ++ppl->pos;
                 tok->kind = TokenKind_assign_or;
-            } else if (pp->src[pp->pos] == '|') {
-                ++pp->pos;
+            } else if (ppl->src[ppl->pos] == '|') {
+                ++ppl->pos;
                 tok->kind = TokenKind_oror;
             } else {
                 tok->kind = TokenKind_or;
             }
         } else if (c == '&') {
-            if (pp->src[pp->pos] == '=') {
-                ++pp->pos;
+            if (ppl->src[ppl->pos] == '=') {
+                ++ppl->pos;
                 tok->kind = TokenKind_assign_and;
-            } else if (pp->src[pp->pos] == '&') {
-                ++pp->pos;
+            } else if (ppl->src[ppl->pos] == '&') {
+                ++ppl->pos;
                 tok->kind = TokenKind_andand;
             } else {
                 tok->kind = TokenKind_and;
             }
         } else if (c == '-') {
-            if (pp->src[pp->pos] == '>') {
-                ++pp->pos;
+            if (ppl->src[ppl->pos] == '>') {
+                ++ppl->pos;
                 tok->kind = TokenKind_arrow;
-            } else if (pp->src[pp->pos] == '=') {
-                ++pp->pos;
+            } else if (ppl->src[ppl->pos] == '=') {
+                ++ppl->pos;
                 tok->kind = TokenKind_assign_sub;
-            } else if (pp->src[pp->pos] == '-') {
-                ++pp->pos;
+            } else if (ppl->src[ppl->pos] == '-') {
+                ++ppl->pos;
                 tok->kind = TokenKind_minusminus;
             } else {
                 tok->kind = TokenKind_minus;
             }
         } else if (c == '*') {
-            if (pp->src[pp->pos] == '=') {
-                ++pp->pos;
+            if (ppl->src[ppl->pos] == '=') {
+                ++ppl->pos;
                 tok->kind = TokenKind_assign_mul;
             } else {
                 tok->kind = TokenKind_star;
             }
         } else if (c == '/') {
-            if (pp->src[pp->pos] == '=') {
-                ++pp->pos;
+            if (ppl->src[ppl->pos] == '=') {
+                ++ppl->pos;
                 tok->kind = TokenKind_assign_div;
-            } else if (pp->src[pp->pos] == '/') {
-                int start = pp->pos - 1;
-                ++pp->pos;
-                while (pp->src[pp->pos] && pp->src[pp->pos] != '\n' && pp->src[pp->pos] != '\r') {
-                    ++pp->pos;
+            } else if (ppl->src[ppl->pos] == '/') {
+                int start = ppl->pos - 1;
+                ++ppl->pos;
+                while (ppl->src[ppl->pos] && ppl->src[ppl->pos] != '\n' && ppl->src[ppl->pos] != '\r') {
+                    ++ppl->pos;
                 }
                 tok->kind = TokenKind_whitespace;
-                tok->raw.len = pp->pos - start;
-                tok->raw.data = pp->src + pp->pos - tok->raw.len;
-            } else if (pp->src[pp->pos] == '*') {
-                int start = pp->pos - 1;
-                ++pp->pos;
-                while (pp->src[pp->pos]) {
-                    if (pp->src[pp->pos] == '*' && pp->src[pp->pos + 1] == '/') {
-                        pp->pos += 2;
+                tok->raw.len = ppl->pos - start;
+                tok->raw.data = ppl->src + ppl->pos - tok->raw.len;
+            } else if (ppl->src[ppl->pos] == '*') {
+                int start = ppl->pos - 1;
+                ++ppl->pos;
+                while (ppl->src[ppl->pos]) {
+                    if (ppl->src[ppl->pos] == '*' && ppl->src[ppl->pos + 1] == '/') {
+                        ppl->pos += 2;
                         break;
                     }
-                    if (pp->src[pp->pos] == '\n') {
-                        ++pp->line;
+                    if (ppl->src[ppl->pos] == '\n') {
+                        ++ppl->line;
                     }
-                    ++pp->pos;
+                    ++ppl->pos;
                 }
                 tok->kind = TokenKind_whitespace;
-                tok->raw.len = pp->pos - start;
-                tok->raw.data = pp->src + pp->pos - tok->raw.len;
+                tok->raw.len = ppl->pos - start;
+                tok->raw.data = ppl->src + ppl->pos - tok->raw.len;
             } else {
                 tok->kind = TokenKind_slash;
             }
         } else if (c == '%') {
-            if (pp->src[pp->pos] == '=') {
-                ++pp->pos;
+            if (ppl->src[ppl->pos] == '=') {
+                ++ppl->pos;
                 tok->kind = TokenKind_assign_mod;
             } else {
                 tok->kind = TokenKind_percent;
             }
         } else if (c == '.') {
-            if (pp->src[pp->pos] == '.') {
-                ++pp->pos;
-                if (pp->src[pp->pos] == '.') {
-                    ++pp->pos;
+            if (ppl->src[ppl->pos] == '.') {
+                ++ppl->pos;
+                if (ppl->src[ppl->pos] == '.') {
+                    ++ppl->pos;
                     tok->kind = TokenKind_ellipsis;
                 } else {
                     tok->kind = TokenKind_other;
                     tok->raw.len = 2;
-                    tok->raw.data = pp->src + pp->pos - tok->raw.len;
+                    tok->raw.data = ppl->src + ppl->pos - tok->raw.len;
                 }
             } else {
                 tok->kind = TokenKind_dot;
                 tok->raw.len = 1;
-                tok->raw.data = pp->src + pp->pos - tok->raw.len;
+                tok->raw.data = ppl->src + ppl->pos - tok->raw.len;
             }
         } else if (c == '!') {
-            if (pp->src[pp->pos] == '=') {
-                ++pp->pos;
+            if (ppl->src[ppl->pos] == '=') {
+                ++ppl->pos;
                 tok->kind = TokenKind_ne;
             } else {
                 tok->kind = TokenKind_not;
             }
         } else if (c == '=') {
-            if (pp->src[pp->pos] == '=') {
-                ++pp->pos;
+            if (ppl->src[ppl->pos] == '=') {
+                ++ppl->pos;
                 tok->kind = TokenKind_eq;
             } else {
                 tok->kind = TokenKind_assign;
             }
         } else if (c == '<') {
-            if (pp->src[pp->pos] == '=') {
-                ++pp->pos;
+            if (ppl->src[ppl->pos] == '=') {
+                ++ppl->pos;
                 tok->kind = TokenKind_le;
-            } else if (pp->src[pp->pos] == '<') {
-                ++pp->pos;
-                if (pp->src[pp->pos] == '=') {
-                    ++pp->pos;
+            } else if (ppl->src[ppl->pos] == '<') {
+                ++ppl->pos;
+                if (ppl->src[ppl->pos] == '=') {
+                    ++ppl->pos;
                     tok->kind = TokenKind_assign_lshift;
                 } else {
                     tok->kind = TokenKind_lshift;
@@ -678,13 +636,13 @@ void pp_tokenize_all(Preprocessor* pp) {
                 tok->kind = TokenKind_lt;
             }
         } else if (c == '>') {
-            if (pp->src[pp->pos] == '=') {
-                ++pp->pos;
+            if (ppl->src[ppl->pos] == '=') {
+                ++ppl->pos;
                 tok->kind = TokenKind_ge;
-            } else if (pp->src[pp->pos] == '>') {
-                ++pp->pos;
-                if (pp->src[pp->pos] == '=') {
-                    ++pp->pos;
+            } else if (ppl->src[ppl->pos] == '>') {
+                ++ppl->pos;
+                if (ppl->src[ppl->pos] == '=') {
+                    ++ppl->pos;
                     tok->kind = TokenKind_assign_rshift;
                 } else {
                     tok->kind = TokenKind_rshift;
@@ -693,53 +651,53 @@ void pp_tokenize_all(Preprocessor* pp) {
                 tok->kind = TokenKind_gt;
             }
         } else if (c == '#') {
-            if (pp->src[pp->pos] == '#') {
-                ++pp->pos;
+            if (ppl->src[ppl->pos] == '#') {
+                ++ppl->pos;
                 tok->kind = TokenKind_hashhash;
             } else {
                 tok->kind = TokenKind_hash;
             }
         } else if (c == '\'') {
-            int start = pp->pos - 1;
-            if (pp->src[pp->pos] == '\\') {
-                ++pp->pos;
+            int start = ppl->pos - 1;
+            if (ppl->src[ppl->pos] == '\\') {
+                ++ppl->pos;
             }
-            pp->pos += 2;
+            ppl->pos += 2;
             tok->kind = TokenKind_character_constant;
-            tok->raw.data = pp->src + start;
-            tok->raw.len = pp->pos - start;
+            tok->raw.data = ppl->src + start;
+            tok->raw.len = ppl->pos - start;
         } else if (c == '"') {
-            int start = pp->pos - 1;
+            int start = ppl->pos - 1;
             while (1) {
-                char ch = pp->src[pp->pos];
+                char ch = ppl->src[ppl->pos];
                 if (ch == '\\') {
-                    ++pp->pos;
+                    ++ppl->pos;
                 } else if (ch == '"') {
                     break;
                 }
-                ++pp->pos;
+                ++ppl->pos;
             }
-            ++pp->pos;
+            ++ppl->pos;
             tok->kind = TokenKind_literal_str;
-            tok->raw.data = pp->src + start;
-            tok->raw.len = pp->pos - start;
+            tok->raw.data = ppl->src + start;
+            tok->raw.len = ppl->pos - start;
         } else if (isdigit(c)) {
-            --pp->pos;
-            int start = pp->pos;
-            while (isdigit(pp->src[pp->pos])) {
-                ++pp->pos;
+            --ppl->pos;
+            int start = ppl->pos;
+            while (isdigit(ppl->src[ppl->pos])) {
+                ++ppl->pos;
             }
             tok->kind = TokenKind_literal_int;
-            tok->raw.data = pp->src + start;
-            tok->raw.len = pp->pos - start;
+            tok->raw.data = ppl->src + start;
+            tok->raw.len = ppl->pos - start;
         } else if (isalpha(c) || c == '_') {
-            --pp->pos;
-            int start = pp->pos;
-            while (isalnum(pp->src[pp->pos]) || pp->src[pp->pos] == '_') {
-                ++pp->pos;
+            --ppl->pos;
+            int start = ppl->pos;
+            while (isalnum(ppl->src[ppl->pos]) || ppl->src[ppl->pos] == '_') {
+                ++ppl->pos;
             }
-            tok->raw.data = pp->src + start;
-            tok->raw.len = pp->pos - start;
+            tok->raw.data = ppl->src + start;
+            tok->raw.len = ppl->pos - start;
             if (string_equals_cstr(&tok->raw, "auto")) {
                 tok->kind = TokenKind_keyword_auto;
             } else if (string_equals_cstr(&tok->raw, "break")) {
@@ -821,30 +779,101 @@ void pp_tokenize_all(Preprocessor* pp) {
             }
         } else if (isspace(c)) {
             if (c == '\n' || c == '\r') {
-                ++pp->line;
+                ++ppl->line;
             }
             tok->kind = TokenKind_whitespace;
             tok->raw.len = 1;
-            tok->raw.data = pp->src + pp->pos - tok->raw.len;
+            tok->raw.data = ppl->src + ppl->pos - tok->raw.len;
         } else {
             tok->kind = TokenKind_other;
             tok->raw.len = 1;
-            tok->raw.data = pp->src + pp->pos - tok->raw.len;
+            tok->raw.data = ppl->src + ppl->pos - tok->raw.len;
         }
     }
-    Token* eof_tok = tokens_push_new(pp->pp_tokens);
-    eof_tok->loc.filename = pp->filename;
-    eof_tok->loc.line = pp->line;
+    Token* eof_tok = tokens_push_new(ppl->pp_tokens);
+    eof_tok->loc.filename = ppl->filename;
+    eof_tok->loc.line = ppl->line;
     eof_tok->kind = TokenKind_eof;
 }
 
-int skip_whitespace(Preprocessor* pp, int pos) {
-    for (; pos < pp->pp_tokens->len; ++pos) {
-        if (pp_token_at(pp, pos)->kind != TokenKind_whitespace) {
-            break;
+TokenArray* pp_tokenize(InFile* src) {
+    PpLexer* ppl = pplexer_new(src);
+    pplexer_tokenize_all(ppl);
+    return ppl->pp_tokens;
+}
+
+struct Preprocessor {
+    TokenArray* pp_tokens;
+    int pos;
+    MacroArray* macros;
+    int include_depth;
+    BOOL skip_pp_tokens;
+    String* include_paths;
+    int n_include_paths;
+};
+typedef struct Preprocessor Preprocessor;
+
+TokenArray* do_preprocess(InFile* src, int depth, MacroArray* macros);
+
+Preprocessor* preprocessor_new(TokenArray* pp_tokens, int include_depth, MacroArray* macros) {
+    if (include_depth >= 32) {
+        fatal_error("include depth limit exceeded");
+    }
+
+    Preprocessor* pp = calloc(1, sizeof(Preprocessor));
+    pp->pp_tokens = pp_tokens;
+    pp->macros = macros;
+    pp->include_depth = include_depth;
+    pp->include_paths = calloc(16, sizeof(String));
+
+    return pp;
+}
+
+Token* pp_token_at(Preprocessor* pp, int i) {
+    return &pp->pp_tokens->data[i];
+}
+
+Token* peek_pp_token(Preprocessor* pp) {
+    return pp_token_at(pp, pp->pos);
+}
+
+Token* next_pp_token(Preprocessor* pp) {
+    return pp_token_at(pp, pp->pos++);
+}
+
+BOOL pp_eof(Preprocessor* pp) {
+    return peek_pp_token(pp)->kind == TokenKind_eof;
+}
+
+int find_macro(Preprocessor* pp, String* name) {
+    for (int i = 0; i < pp->macros->len; ++i) {
+        if (string_equals(&pp->macros->data[i].name, name)) {
+            return i;
         }
     }
-    return pos;
+    return -1;
+}
+
+void undef_macro(Preprocessor* pp, int idx) {
+    pp->macros->data[idx].name.len = 0;
+    // TODO: Can predefined macro like __FILE__ be undefined?
+}
+
+void add_include_path(Preprocessor* pp, char* include_path) {
+    pp->include_paths[pp->n_include_paths].data = include_path;
+    pp->include_paths[pp->n_include_paths].len = strlen(include_path);
+    ++pp->n_include_paths;
+}
+
+BOOL skip_pp_tokens(Preprocessor* pp) {
+    // TODO: support nested #if
+    return pp->skip_pp_tokens;
+}
+
+void skip_whitespaces(Preprocessor* pp) {
+    while (!pp_eof(pp) && peek_pp_token(pp)->kind == TokenKind_whitespace) {
+        next_pp_token(pp);
+    }
 }
 
 BOOL string_contains_newline(String* s) {
@@ -856,13 +885,14 @@ BOOL string_contains_newline(String* s) {
     return FALSE;
 }
 
-int find_next_newline(Preprocessor* pp, int pos) {
-    for (; pos < pp->pp_tokens->len; ++pos) {
-        if (pp_token_at(pp, pos)->kind == TokenKind_whitespace && string_contains_newline(&pp_token_at(pp, pos)->raw)) {
-            return pos;
+void seek_to_next_newline(Preprocessor* pp) {
+    while (!pp_eof(pp)) {
+        Token* tok = peek_pp_token(pp);
+        if (tok->kind == TokenKind_whitespace && string_contains_newline(&tok->raw)) {
+            break;
         }
+        next_pp_token(pp);
     }
-    return -1;
 }
 
 void make_token_whitespace(Token* tok) {
@@ -877,75 +907,72 @@ void remove_directive_tokens(Preprocessor* pp, int start, int end) {
     }
 }
 
-int process_endif_directive(Preprocessor* pp, int tok, int tok2) {
-    ++tok2;
+void process_endif_directive(Preprocessor* pp, int hash_pos) {
+    next_pp_token(pp);
     pp->skip_pp_tokens = FALSE;
-    remove_directive_tokens(pp, tok, tok2);
-    return tok2;
+    remove_directive_tokens(pp, hash_pos, pp->pos);
 }
 
-int process_else_directive(Preprocessor* pp, int tok, int tok2) {
-    ++tok2;
+void process_else_directive(Preprocessor* pp, int hash_pos) {
+    next_pp_token(pp);
     pp->skip_pp_tokens = !pp->skip_pp_tokens;
-    remove_directive_tokens(pp, tok, tok2);
-    return tok2;
+    remove_directive_tokens(pp, hash_pos, pp->pos);
 }
 
-int process_elif_directive(Preprocessor* pp, int tok, int tok2) {
+void process_elif_directive(Preprocessor* pp, int hash_pos) {
     unimplemented();
 }
 
-int process_if_directive(Preprocessor* pp, int tok, int tok2) {
+void process_if_directive(Preprocessor* pp, int hash_pos) {
     unimplemented();
 }
 
-int process_ifdef_directive(Preprocessor* pp, int tok, int tok2) {
-    ++tok2;
-    tok2 = skip_whitespace(pp, tok2);
-    if (pp_token_at(pp, tok2)->kind == TokenKind_ident) {
-        Token* name = pp_token_at(pp, tok2);
-        ++tok2;
-        pp->skip_pp_tokens = find_macro(pp, &name->raw) == -1;
+void process_ifdef_directive(Preprocessor* pp, int hash_pos) {
+    next_pp_token(pp);
+    skip_whitespaces(pp);
+    Token* macro_name = peek_pp_token(pp);
+    if (macro_name->kind == TokenKind_ident) {
+        next_pp_token(pp);
+        pp->skip_pp_tokens = find_macro(pp, &macro_name->raw) == -1;
     }
-    remove_directive_tokens(pp, tok, tok2);
-    return tok2;
+    remove_directive_tokens(pp, hash_pos, pp->pos);
 }
 
-int process_ifndef_directive(Preprocessor* pp, int tok, int tok2) {
-    ++tok2;
-    tok2 = skip_whitespace(pp, tok2);
-    if (pp_token_at(pp, tok2)->kind == TokenKind_ident) {
-        Token* name = pp_token_at(pp, tok2);
-        ++tok2;
-        pp->skip_pp_tokens = find_macro(pp, &name->raw) != -1;
+void process_ifndef_directive(Preprocessor* pp, int hash_pos) {
+    next_pp_token(pp);
+    skip_whitespaces(pp);
+    Token* macro_name = peek_pp_token(pp);
+    if (macro_name->kind == TokenKind_ident) {
+        next_pp_token(pp);
+        pp->skip_pp_tokens = find_macro(pp, &macro_name->raw) != -1;
     }
-    remove_directive_tokens(pp, tok, tok2);
-    return tok2;
+    remove_directive_tokens(pp, hash_pos, pp->pos);
 }
 
-int read_include_header_name(Preprocessor* pp, int tok2, String* include_name) {
-    if (pp_token_at(pp, tok2)->kind == TokenKind_literal_str) {
-        *include_name = pp_token_at(pp, tok2)->raw;
-        ++tok2;
-        return tok2;
-    } else if (pp_token_at(pp, tok2)->kind == TokenKind_lt) {
-        ++tok2;
-        char* include_name_start = pp_token_at(pp, tok2)->raw.data;
+String* read_include_header_name(Preprocessor* pp) {
+    Token* tok = next_pp_token(pp);
+    if (tok->kind == TokenKind_literal_str) {
+        return &tok->raw;
+    } else if (tok->kind == TokenKind_lt) {
+        char* include_name_start = peek_pp_token(pp)->raw.data;
         int include_name_len = 0;
-        while (pp_token_at(pp, tok2)->kind != TokenKind_eof) {
-            if (pp_token_at(pp, tok2)->kind == TokenKind_gt) {
+        while (!pp_eof(pp)) {
+            if (peek_pp_token(pp)->kind == TokenKind_gt) {
                 break;
             }
-            include_name_len += pp_token_at(pp, tok2)->raw.len;
-            ++tok2;
+            include_name_len += peek_pp_token(pp)->raw.len;
+            next_pp_token(pp);
         }
-        if (pp_token_at(pp, tok2)->kind == TokenKind_eof) {
+        if (pp_eof(pp)) {
             fatal_error("invalid #include: <> not balanced");
         }
-        ++tok2;
+        next_pp_token(pp);
+        String* include_name = calloc(1, sizeof(String));
         include_name->data = include_name_start;
         include_name->len = include_name_len;
-        return tok2;
+        return include_name;
+    } else {
+        unreachable();
     }
 }
 
@@ -975,24 +1002,24 @@ int replace_pp_tokens(Preprocessor* pp, int dest_start, int dest_end, TokenArray
         // Move existing tokens backward to make room.
         shift_amount = source_tokens->len - n_tokens_to_remove;
         tokens_reserve(pp->pp_tokens, pp->pp_tokens->len + shift_amount);
-        memmove(pp->pp_tokens->data + dest_end + shift_amount, pp->pp_tokens->data + dest_end,
+        memmove(pp_token_at(pp, dest_end + shift_amount), pp_token_at(pp, dest_end),
                 n_tokens_after_dest * sizeof(Token));
         pp->pp_tokens->len += shift_amount;
     } else if (source_tokens->len < n_tokens_to_remove) {
         // Move existing tokens forward to reduce room.
         shift_amount = n_tokens_to_remove - source_tokens->len;
-        memmove(pp->pp_tokens->data + dest_start + source_tokens->len, pp->pp_tokens->data + dest_end,
+        memmove(pp_token_at(pp, dest_start + source_tokens->len), pp_token_at(pp, dest_end),
                 n_tokens_after_dest * sizeof(Token));
         pp->pp_tokens->len -= shift_amount;
-        memset(pp->pp_tokens->data + pp->pp_tokens->len, 0, shift_amount * sizeof(Token));
+        memset(pp_token_at(pp, pp->pp_tokens->len), 0, shift_amount * sizeof(Token));
     }
 
-    memcpy(pp->pp_tokens->data + dest_start, source_tokens->data, source_tokens->len * sizeof(Token));
+    memcpy(pp_token_at(pp, dest_start), source_tokens->data, source_tokens->len * sizeof(Token));
 
     return dest_start + source_tokens->len;
 }
 
-int expand_include_directive(Preprocessor* pp, int tok, int tok2, const char* include_name_buf) {
+void expand_include_directive(Preprocessor* pp, int hash_pos, const char* include_name_buf) {
     InFile* include_source = read_all(include_name_buf);
     if (!include_source) {
         fatal_error("cannot open include file: %s", include_name_buf);
@@ -1000,137 +1027,133 @@ int expand_include_directive(Preprocessor* pp, int tok, int tok2, const char* in
 
     TokenArray* include_pp_tokens = do_preprocess(include_source, pp->include_depth + 1, pp->macros);
     tokens_pop(include_pp_tokens); // pop EOF token
-    return replace_pp_tokens(pp, tok, tok2 + 1, include_pp_tokens);
+    pp->pos = replace_pp_tokens(pp, hash_pos, pp->pos, include_pp_tokens);
 }
 
-int process_include_directive(Preprocessor* pp, int tok, int tok2) {
-    ++tok2;
-    tok2 = skip_whitespace(pp, tok2);
-    String* include_name = calloc(1, sizeof(String));
-    tok2 = read_include_header_name(pp, tok2, include_name);
+void process_include_directive(Preprocessor* pp, int hash_pos) {
+    next_pp_token(pp);
+    skip_whitespaces(pp);
+    String* include_name = read_include_header_name(pp);
     const char* include_name_buf = resolve_include_name(pp, include_name);
     if (include_name_buf == NULL) {
         fatal_error("cannot resolve include file name: %.*s", include_name->len, include_name->data);
     }
-    return expand_include_directive(pp, tok, tok2, include_name_buf);
+    expand_include_directive(pp, hash_pos, include_name_buf);
 }
 
-int process_define_directive(Preprocessor* pp, int tok, int tok2) {
-    int tok3 = -1;
-    ++tok2;
-    tok2 = skip_whitespace(pp, tok2);
-    if (pp_token_at(pp, tok2)->kind != TokenKind_ident) {
-        fatal_error("%s:%s: invalid #define syntax", pp_token_at(pp, tok2)->loc.filename,
-                    pp_token_at(pp, tok2)->loc.line);
+void process_define_directive(Preprocessor* pp, int hash_pos) {
+    next_pp_token(pp);
+    skip_whitespaces(pp);
+    Token* macro_name = next_pp_token(pp);
+
+    if (macro_name->kind != TokenKind_ident) {
+        fatal_error("%s:%s: invalid #define syntax", macro_name->loc.filename, macro_name->loc.line);
     }
 
-    Token* macro_name = pp_token_at(pp, tok2);
-    ++tok2;
-    if (pp_token_at(pp, tok2)->kind == TokenKind_paren_l) {
-        ++tok2;
-        if (pp_token_at(pp, tok2)->kind == TokenKind_paren_r) {
-            ++tok2;
-        } else {
-            fatal_error("%s:%d: invalid function-like macro syntax (#define %.*s)", macro_name->loc.filename,
-                        macro_name->loc.line, macro_name->raw.len, macro_name->raw.data);
+    if (peek_pp_token(pp)->kind == TokenKind_paren_l) {
+        next_pp_token(pp);
+        if (peek_pp_token(pp)->kind != TokenKind_paren_r) {
+            unimplemented();
         }
-        tok3 = find_next_newline(pp, tok2);
-        if (tok3 == -1) {
-            fatal_error("%s:%s: invalid #define syntax", pp_token_at(pp, tok3)->loc.filename,
-                        pp_token_at(pp, tok3)->loc.line);
+        next_pp_token(pp);
+        int replacements_start_pos = pp->pos;
+        seek_to_next_newline(pp);
+        if (pp_eof(pp)) {
+            fatal_error("%s:%s: invalid #define syntax");
         }
         Macro* macro = macros_push_new(pp->macros);
         macro->kind = MacroKind_func;
         macro->name = macro_name->raw;
-        int n_replacements = tok3 - tok2;
+        int n_replacements = pp->pos - replacements_start_pos;
         tokens_init(&macro->replacements, n_replacements);
         for (int i = 0; i < n_replacements; ++i) {
-            *tokens_push_new(&macro->replacements) = *pp_token_at(pp, tok2 + i);
+            *tokens_push_new(&macro->replacements) = *pp_token_at(pp, replacements_start_pos + i);
         }
     } else {
-        tok3 = find_next_newline(pp, tok2);
-        if (tok3 == -1) {
-            fatal_error("%s:%s: invalid #define syntax", pp_token_at(pp, tok3)->loc.filename,
-                        pp_token_at(pp, tok3)->loc.line);
+        int replacements_start_pos = pp->pos;
+        seek_to_next_newline(pp);
+        if (pp_eof(pp)) {
+            fatal_error("%s:%s: invalid #define syntax");
         }
         Macro* macro = macros_push_new(pp->macros);
         macro->kind = MacroKind_obj;
         macro->name = macro_name->raw;
-        int n_replacements = tok3 - tok2;
+        int n_replacements = pp->pos - replacements_start_pos;
         tokens_init(&macro->replacements, n_replacements);
         for (int i = 0; i < n_replacements; ++i) {
-            *tokens_push_new(&macro->replacements) = *pp_token_at(pp, tok2 + i);
+            *tokens_push_new(&macro->replacements) = *pp_token_at(pp, replacements_start_pos + i);
         }
     }
-    remove_directive_tokens(pp, tok, tok3);
-    return tok3;
+    remove_directive_tokens(pp, hash_pos, pp->pos);
 }
 
-int process_undef_directive(Preprocessor* pp, int tok, int tok2) {
-    tok2 = skip_whitespace(pp, tok2 + 1);
-    if (pp_token_at(pp, tok2)->kind == TokenKind_ident) {
-        Token* macro_name = pp_token_at(pp, tok2);
-        ++tok2;
+void process_undef_directive(Preprocessor* pp, int hash_pos) {
+    next_pp_token(pp);
+    skip_whitespaces(pp);
+    Token* macro_name = peek_pp_token(pp);
+    if (macro_name->kind == TokenKind_ident) {
+        next_pp_token(pp);
         int macro_idx = find_macro(pp, &macro_name->raw);
         if (macro_idx != -1) {
             undef_macro(pp, macro_idx);
         }
     }
-    remove_directive_tokens(pp, tok, tok2);
-    return tok2;
+    remove_directive_tokens(pp, hash_pos, pp->pos);
 }
 
-int process_line_directive(Preprocessor* pp, int tok, int tok2) {
+void process_line_directive(Preprocessor* pp, int hash_pos) {
     unimplemented();
 }
 
-int process_error_directive(Preprocessor* pp, int tok, int tok2) {
+void process_error_directive(Preprocessor* pp, int hash_pos) {
     unimplemented();
 }
 
-int process_pragma_directive(Preprocessor* pp, int tok, int tok2) {
+void process_pragma_directive(Preprocessor* pp, int hash_pos) {
     unimplemented();
 }
 
-BOOL expand_macro(Preprocessor* pp, int tok) {
-    int macro_idx = find_macro(pp, &pp_token_at(pp, tok)->raw);
+BOOL expand_macro(Preprocessor* pp) {
+    int macro_name_pos = pp->pos;
+    Token* macro_name = next_pp_token(pp);
+    int macro_idx = find_macro(pp, &macro_name->raw);
     if (macro_idx == -1) {
         return FALSE;
     }
 
-    SourceLocation original_loc = pp_token_at(pp, tok)->loc;
-    Macro* macro = pp->macros->data + macro_idx;
+    SourceLocation original_loc = macro_name->loc;
+    Macro* macro = &pp->macros->data[macro_idx];
     if (macro->kind == MacroKind_func) {
         // also consume '(' and ')'
-        replace_pp_tokens(pp, tok, tok + 3, &macro->replacements);
+        replace_pp_tokens(pp, macro_name_pos, macro_name_pos + 3, &macro->replacements);
         // Inherit a source location from the original macro token.
         for (int i = 0; i < macro->replacements.len; ++i) {
-            pp_token_at(pp, tok + i)->loc = original_loc;
+            pp_token_at(pp, macro_name_pos + i)->loc = original_loc;
         }
     } else if (macro->kind == MacroKind_obj) {
-        replace_pp_tokens(pp, tok, tok + 1, &macro->replacements);
+        replace_pp_tokens(pp, macro_name_pos, macro_name_pos + 1, &macro->replacements);
         // Inherit a source location from the original macro token.
         for (int i = 0; i < macro->replacements.len; ++i) {
-            pp_token_at(pp, tok + i)->loc = original_loc;
+            pp_token_at(pp, macro_name_pos + i)->loc = original_loc;
         }
     } else if (macro->kind == MacroKind_builtin_file) {
         TokenArray tokens;
         tokens_init(&tokens, 1);
         Token* file_tok = tokens_push_new(&tokens);
         file_tok->kind = TokenKind_literal_str;
-        file_tok->raw.len = strlen(pp_token_at(pp, tok)->loc.filename) + 2;
+        file_tok->raw.len = strlen(macro_name->loc.filename) + 2;
         file_tok->raw.data = calloc(file_tok->raw.len, sizeof(char));
-        sprintf(file_tok->raw.data, "\"%s\"", pp_token_at(pp, tok)->loc.filename);
-        replace_pp_tokens(pp, tok, tok + 1, &tokens);
+        sprintf(file_tok->raw.data, "\"%s\"", macro_name->loc.filename);
+        replace_pp_tokens(pp, macro_name_pos, macro_name_pos + 1, &tokens);
     } else if (macro->kind == MacroKind_builtin_line) {
         TokenArray tokens;
         tokens_init(&tokens, 1);
         Token* line_tok = tokens_push_new(&tokens);
         line_tok->kind = TokenKind_literal_int;
         line_tok->raw.data = calloc(10, sizeof(char));
-        sprintf(line_tok->raw.data, "%d", pp_token_at(pp, tok)->loc.line);
+        sprintf(line_tok->raw.data, "%d", macro_name->loc.line);
         line_tok->raw.len = strlen(line_tok->raw.data);
-        replace_pp_tokens(pp, tok, tok + 1, &tokens);
+        replace_pp_tokens(pp, macro_name_pos, macro_name_pos + 1, &tokens);
     } else {
         unreachable();
     }
@@ -1142,68 +1165,63 @@ BOOL is_pp_hash(Token* t) {
     return t->kind == TokenKind_hash;
 }
 
-void process_pp_directives(Preprocessor* pp) {
-    int tok = 0;
-
-    while (pp_token_at(pp, tok)->kind != TokenKind_eof) {
-        if (is_pp_hash(pp_token_at(pp, tok))) {
-            // TODO: don't skip newline after '#'.
-            int tok2 = skip_whitespace(pp, tok + 1);
-            if (pp_token_at(pp, tok2)->kind == TokenKind_ident &&
-                string_equals_cstr(&pp_token_at(pp, tok2)->raw, "endif")) {
-                tok = process_endif_directive(pp, tok, tok2);
-            } else if (pp_token_at(pp, tok2)->kind == TokenKind_keyword_else) {
-                tok = process_else_directive(pp, tok, tok2);
-            } else if (pp_token_at(pp, tok2)->kind == TokenKind_ident &&
-                       string_equals_cstr(&pp_token_at(pp, tok2)->raw, "elif")) {
-                tok = process_elif_directive(pp, tok, tok2);
-            } else if (skip_pp_tokens(pp)) {
-                make_token_whitespace(pp_token_at(pp, tok));
-                ++tok;
-            } else if (pp_token_at(pp, tok2)->kind == TokenKind_keyword_if) {
-                tok = process_if_directive(pp, tok, tok2);
-            } else if (pp_token_at(pp, tok2)->kind == TokenKind_ident &&
-                       string_equals_cstr(&pp_token_at(pp, tok2)->raw, "ifdef")) {
-                tok = process_ifdef_directive(pp, tok, tok2);
-            } else if (pp_token_at(pp, tok2)->kind == TokenKind_ident &&
-                       string_equals_cstr(&pp_token_at(pp, tok2)->raw, "ifndef")) {
-                tok = process_ifndef_directive(pp, tok, tok2);
-            } else if (pp_token_at(pp, tok2)->kind == TokenKind_ident &&
-                       string_equals_cstr(&pp_token_at(pp, tok2)->raw, "include")) {
-                tok = process_include_directive(pp, tok, tok2);
-            } else if (pp_token_at(pp, tok2)->kind == TokenKind_ident &&
-                       string_equals_cstr(&pp_token_at(pp, tok2)->raw, "define")) {
-                tok = process_define_directive(pp, tok, tok2);
-            } else if (pp_token_at(pp, tok2)->kind == TokenKind_ident &&
-                       string_equals_cstr(&pp_token_at(pp, tok2)->raw, "undef")) {
-                tok = process_undef_directive(pp, tok, tok2);
-            } else if (pp_token_at(pp, tok2)->kind == TokenKind_ident &&
-                       string_equals_cstr(&pp_token_at(pp, tok2)->raw, "line")) {
-                tok = process_line_directive(pp, tok, tok2);
-            } else if (pp_token_at(pp, tok2)->kind == TokenKind_ident &&
-                       string_equals_cstr(&pp_token_at(pp, tok2)->raw, "error")) {
-                tok = process_error_directive(pp, tok, tok2);
-            } else if (pp_token_at(pp, tok2)->kind == TokenKind_ident &&
-                       string_equals_cstr(&pp_token_at(pp, tok2)->raw, "pragma")) {
-                tok = process_pragma_directive(pp, tok, tok2);
-            } else {
-                fatal_error("%s:%d: unknown preprocessor directive (%s)", pp_token_at(pp, tok2)->loc.filename,
-                            pp_token_at(pp, tok2)->loc.line, token_stringify(pp_token_at(pp, tok2)));
-            }
+void process_pp_directive(Preprocessor* pp) {
+    int first_token_pos = pp->pos;
+    Token* first_token = peek_pp_token(pp);
+    if (is_pp_hash(first_token)) {
+        next_pp_token(pp);
+        // TODO: don't skip newline after '#'.
+        skip_whitespaces(pp);
+        Token* next_tok = peek_pp_token(pp);
+        if (next_tok->kind == TokenKind_ident && string_equals_cstr(&next_tok->raw, "endif")) {
+            process_endif_directive(pp, first_token_pos);
+        } else if (next_tok->kind == TokenKind_keyword_else) {
+            process_else_directive(pp, first_token_pos);
+        } else if (next_tok->kind == TokenKind_ident && string_equals_cstr(&next_tok->raw, "elif")) {
+            process_elif_directive(pp, first_token_pos);
         } else if (skip_pp_tokens(pp)) {
-            make_token_whitespace(pp_token_at(pp, tok));
-            ++tok;
-        } else if (pp_token_at(pp, tok)->kind == TokenKind_ident) {
-            BOOL expanded = expand_macro(pp, tok);
-            if (expanded) {
-                // A macro may expand to another macro. Re-scan the expanded tokens.
-                // TODO: if the macro is defined recursively, it causes infinite loop.
-            } else {
-                ++tok;
-            }
+            make_token_whitespace(pp_token_at(pp, first_token_pos));
+            make_token_whitespace(next_pp_token(pp));
+        } else if (next_tok->kind == TokenKind_keyword_if) {
+            process_if_directive(pp, first_token_pos);
+        } else if (next_tok->kind == TokenKind_ident && string_equals_cstr(&next_tok->raw, "ifdef")) {
+            process_ifdef_directive(pp, first_token_pos);
+        } else if (next_tok->kind == TokenKind_ident && string_equals_cstr(&next_tok->raw, "ifndef")) {
+            process_ifndef_directive(pp, first_token_pos);
+        } else if (next_tok->kind == TokenKind_ident && string_equals_cstr(&next_tok->raw, "include")) {
+            process_include_directive(pp, first_token_pos);
+        } else if (next_tok->kind == TokenKind_ident && string_equals_cstr(&next_tok->raw, "define")) {
+            process_define_directive(pp, first_token_pos);
+        } else if (next_tok->kind == TokenKind_ident && string_equals_cstr(&next_tok->raw, "undef")) {
+            process_undef_directive(pp, first_token_pos);
+        } else if (next_tok->kind == TokenKind_ident && string_equals_cstr(&next_tok->raw, "line")) {
+            process_line_directive(pp, first_token_pos);
+        } else if (next_tok->kind == TokenKind_ident && string_equals_cstr(&next_tok->raw, "error")) {
+            process_error_directive(pp, first_token_pos);
+        } else if (next_tok->kind == TokenKind_ident && string_equals_cstr(&next_tok->raw, "pragma")) {
+            process_pragma_directive(pp, first_token_pos);
         } else {
-            ++tok;
+            fatal_error("%s:%d: unknown preprocessor directive (%s)", next_tok->loc.filename, next_tok->loc.line,
+                        token_stringify(next_tok));
         }
+    } else if (skip_pp_tokens(pp)) {
+        make_token_whitespace(next_pp_token(pp));
+    } else if (first_token->kind == TokenKind_ident) {
+        BOOL expanded = expand_macro(pp);
+        if (expanded) {
+            // A macro may expand to another macro. Re-scan the expanded tokens.
+            // TODO: if the macro is defined recursively, it causes infinite loop.
+        } else {
+            next_pp_token(pp);
+        }
+    } else {
+        next_pp_token(pp);
+    }
+}
+
+void process_pp_directives(Preprocessor* pp) {
+    while (!pp_eof(pp)) {
+        process_pp_directive(pp);
     }
 }
 
@@ -1224,11 +1242,11 @@ char* get_ducc_include_path() {
 }
 
 TokenArray* do_preprocess(InFile* src, int depth, MacroArray* macros) {
-    Preprocessor* pp = preprocessor_new(src, depth, macros);
+    TokenArray* pp_tokens = pp_tokenize(src);
+    Preprocessor* pp = preprocessor_new(pp_tokens, depth, macros);
     add_include_path(pp, get_ducc_include_path());
     add_include_path(pp, "/usr/include/x86_64-linux-gnu");
     add_include_path(pp, "/usr/include");
-    pp_tokenize_all(pp);
     process_pp_directives(pp);
     return pp->pp_tokens;
 }
author	nsfisis <nsfisis@gmail.com>	2025-08-16 00:47:59 +0900
committer	nsfisis <nsfisis@gmail.com>	2025-08-16 02:01:20 +0900
commit	c1f7732c1902745180e77d0abcf73714cb2e2ead (patch)
tree	7f93f3556fabff3d6d3dc1998bac0b900a99d76c /preprocess.c
parent	e2064554b6d653439fbbb2bcde00e6f1a1079cb4 (diff)
download	ducc-c1f7732c1902745180e77d0abcf73714cb2e2ead.tar.gz ducc-c1f7732c1902745180e77d0abcf73714cb2e2ead.tar.zst ducc-c1f7732c1902745180e77d0abcf73714cb2e2ead.zip