diff options
| author | nsfisis <nsfisis@gmail.com> | 2025-07-21 10:09:00 +0900 |
|---|---|---|
| committer | nsfisis <nsfisis@gmail.com> | 2025-08-15 10:04:28 +0900 |
| commit | 6daa56323634e1142f2d22a756a77a74382cf3a7 (patch) | |
| tree | 85355ad1ad0cf5dca837f8b4cd7e6b09eaa7bd9d /preprocess.c | |
| parent | 16bbc2e0d72f94f8061ba294d1776edfe5bf8a55 (diff) | |
| download | ducc-6daa56323634e1142f2d22a756a77a74382cf3a7.tar.gz ducc-6daa56323634e1142f2d22a756a77a74382cf3a7.tar.zst ducc-6daa56323634e1142f2d22a756a77a74382cf3a7.zip | |
feat: separate main.c
Diffstat (limited to 'preprocess.c')
| -rw-r--r-- | preprocess.c | 431 |
1 files changed, 431 insertions, 0 deletions
diff --git a/preprocess.c b/preprocess.c new file mode 100644 index 0000000..202ff40 --- /dev/null +++ b/preprocess.c @@ -0,0 +1,431 @@ +enum PpTokenKind { + PpTokenKind_eof, + + PpTokenKind_header_name, + PpTokenKind_identifier, + PpTokenKind_pp_number, + PpTokenKind_character_constant, + PpTokenKind_string_literal, + PpTokenKind_punctuator, + PpTokenKind_other, + PpTokenKind_whitespace, +}; +typedef enum PpTokenKind PpTokenKind; + +struct PpToken { + PpTokenKind kind; + String raw; +}; +typedef struct PpToken PpToken; + +struct PpDefine { + String name; + PpToken* tokens; +}; +typedef struct PpDefine PpDefine; + +struct PpDefines { + PpDefine* data; + size_t len; +}; +typedef struct PpDefines PpDefines; + +struct Preprocessor { + char* src; + int pos; + PpToken* pp_tokens; + int n_pp_tokens; + PpDefines* pp_defines; + int include_depth; +}; +typedef struct Preprocessor Preprocessor; + +PpToken* do_preprocess(char* src, int depth, PpDefines* pp_defines); + +PpDefines* pp_defines_new() { + PpDefines* pp_defines = calloc(1, sizeof(PpDefines)); + pp_defines->data = calloc(1024, sizeof(PpDefine)); + return pp_defines; +} + +Preprocessor* preprocessor_new(char* src, int include_depth, PpDefines* pp_defines) { + if (include_depth >= 32) { + fatal_error("include depth limit exceeded"); + } + + Preprocessor* pp = calloc(1, sizeof(Preprocessor)); + pp->src = src; + pp->pp_tokens = calloc(1024 * 1024, sizeof(PpToken)); + pp->pp_defines = pp_defines; + pp->include_depth = include_depth; + + return pp; +} + +int find_pp_define(Preprocessor* pp, String* name) { + int i; + for (i = 0; i < pp->pp_defines->len; ++i) { + if (string_equals(&pp->pp_defines->data[i].name, name)) { + return i; + } + } + return -1; +} + +void pp_tokenize_all(Preprocessor* pp) { + char* buf; + int ch; + int start; + while (pp->src[pp->pos]) { + PpToken* tok = pp->pp_tokens + pp->n_pp_tokens; + char c = pp->src[pp->pos]; + ++pp->pos; + if (c == '(') { + tok->kind = PpTokenKind_punctuator; + tok->raw.len = 1; + tok->raw.data = pp->src + pp->pos - tok->raw.len; + } else if (c == ')') { + tok->kind = PpTokenKind_punctuator; + tok->raw.len = 1; + tok->raw.data = pp->src + pp->pos - tok->raw.len; + } else if (c == '{') { + tok->kind = PpTokenKind_punctuator; + tok->raw.len = 1; + tok->raw.data = pp->src + pp->pos - tok->raw.len; + } else if (c == '}') { + tok->kind = PpTokenKind_punctuator; + tok->raw.len = 1; + tok->raw.data = pp->src + pp->pos - tok->raw.len; + } else if (c == '[') { + tok->kind = PpTokenKind_punctuator; + tok->raw.len = 1; + tok->raw.data = pp->src + pp->pos - tok->raw.len; + } else if (c == ']') { + tok->kind = PpTokenKind_punctuator; + tok->raw.len = 1; + tok->raw.data = pp->src + pp->pos - tok->raw.len; + } else if (c == ',') { + tok->kind = PpTokenKind_punctuator; + tok->raw.len = 1; + tok->raw.data = pp->src + pp->pos - tok->raw.len; + } else if (c == ';') { + tok->kind = PpTokenKind_punctuator; + tok->raw.len = 1; + tok->raw.data = pp->src + pp->pos - tok->raw.len; + } else if (c == '+') { + if (pp->src[pp->pos] == '=') { + ++pp->pos; + tok->kind = PpTokenKind_punctuator; + tok->raw.len = 2; + tok->raw.data = pp->src + pp->pos - tok->raw.len; + } else if (pp->src[pp->pos] == '+') { + ++pp->pos; + tok->kind = PpTokenKind_punctuator; + tok->raw.len = 2; + tok->raw.data = pp->src + pp->pos - tok->raw.len; + } else { + tok->kind = PpTokenKind_punctuator; + tok->raw.len = 1; + tok->raw.data = pp->src + pp->pos - tok->raw.len; + } + } else if (c == '|') { + ++pp->pos; + tok->kind = PpTokenKind_punctuator; + tok->raw.len = 2; + tok->raw.data = pp->src + pp->pos - tok->raw.len; + } else if (c == '&') { + if (pp->src[pp->pos] == '&') { + ++pp->pos; + tok->kind = PpTokenKind_punctuator; + tok->raw.len = 2; + tok->raw.data = pp->src + pp->pos - tok->raw.len; + } else { + tok->kind = PpTokenKind_punctuator; + tok->raw.len = 1; + tok->raw.data = pp->src + pp->pos - tok->raw.len; + } + } else if (c == '-') { + if (pp->src[pp->pos] == '>') { + ++pp->pos; + tok->kind = PpTokenKind_punctuator; + tok->raw.len = 2; + tok->raw.data = pp->src + pp->pos - tok->raw.len; + } else if (pp->src[pp->pos] == '=') { + ++pp->pos; + tok->kind = PpTokenKind_punctuator; + tok->raw.len = 2; + tok->raw.data = pp->src + pp->pos - tok->raw.len; + } else if (pp->src[pp->pos] == '-') { + ++pp->pos; + tok->kind = PpTokenKind_punctuator; + tok->raw.len = 2; + tok->raw.data = pp->src + pp->pos - tok->raw.len; + } else { + tok->kind = PpTokenKind_punctuator; + tok->raw.len = 1; + tok->raw.data = pp->src + pp->pos - tok->raw.len; + } + } else if (c == '*') { + tok->kind = PpTokenKind_punctuator; + tok->raw.len = 1; + tok->raw.data = pp->src + pp->pos - tok->raw.len; + } else if (c == '/') { + if (pp->src[pp->pos] == '/') { + start = pp->pos - 1; + ++pp->pos; + while (pp->src[pp->pos] && pp->src[pp->pos] != '\n' && pp->src[pp->pos] != '\r') { + ++pp->pos; + } + tok->kind = PpTokenKind_whitespace; + tok->raw.len = pp->pos - start; + tok->raw.data = pp->src + pp->pos - tok->raw.len; + } else if (pp->src[pp->pos] == '*') { + fatal_error("unimplemented"); + } else { + tok->kind = PpTokenKind_punctuator; + tok->raw.len = 1; + tok->raw.data = pp->src + pp->pos - tok->raw.len; + } + } else if (c == '%') { + tok->kind = PpTokenKind_punctuator; + tok->raw.len = 1; + tok->raw.data = pp->src + pp->pos - tok->raw.len; + } else if (c == '.') { + if (pp->src[pp->pos] == '.') { + ++pp->pos; + if (pp->src[pp->pos] == '.') { + ++pp->pos; + tok->kind = PpTokenKind_punctuator; + tok->raw.len = 3; + tok->raw.data = pp->src + pp->pos - tok->raw.len; + } else { + --pp->pos; + tok->kind = PpTokenKind_punctuator; + tok->raw.len = 1; + tok->raw.data = pp->src + pp->pos - tok->raw.len; + } + } else { + tok->kind = PpTokenKind_punctuator; + tok->raw.len = 1; + tok->raw.data = pp->src + pp->pos - tok->raw.len; + } + } else if (c == '!') { + if (pp->src[pp->pos] == '=') { + ++pp->pos; + tok->kind = PpTokenKind_punctuator; + tok->raw.len = 2; + tok->raw.data = pp->src + pp->pos - tok->raw.len; + } else { + tok->kind = PpTokenKind_punctuator; + tok->raw.len = 1; + tok->raw.data = pp->src + pp->pos - tok->raw.len; + } + } else if (c == '=') { + if (pp->src[pp->pos] == '=') { + ++pp->pos; + tok->kind = PpTokenKind_punctuator; + tok->raw.len = 2; + tok->raw.data = pp->src + pp->pos - tok->raw.len; + } else { + tok->kind = PpTokenKind_punctuator; + tok->raw.len = 1; + tok->raw.data = pp->src + pp->pos - tok->raw.len; + } + } else if (c == '<') { + if (pp->src[pp->pos] == '=') { + ++pp->pos; + tok->kind = PpTokenKind_punctuator; + tok->raw.len = 2; + tok->raw.data = pp->src + pp->pos - tok->raw.len; + } else { + tok->kind = PpTokenKind_punctuator; + tok->raw.len = 1; + tok->raw.data = pp->src + pp->pos - tok->raw.len; + } + } else if (c == '>') { + if (pp->src[pp->pos] == '=') { + ++pp->pos; + tok->kind = PpTokenKind_punctuator; + tok->raw.len = 2; + tok->raw.data = pp->src + pp->pos - tok->raw.len; + } else { + tok->kind = PpTokenKind_punctuator; + tok->raw.len = 1; + tok->raw.data = pp->src + pp->pos - tok->raw.len; + } + } else if (c == '#') { + if (pp->src[pp->pos] == '#') { + ++pp->pos; + tok->kind = PpTokenKind_punctuator; + tok->raw.len = 2; + tok->raw.data = pp->src + pp->pos - tok->raw.len; + } else { + tok->kind = PpTokenKind_punctuator; + tok->raw.len = 1; + tok->raw.data = pp->src + pp->pos - tok->raw.len; + } + } else if (c == '\'') { + start = pp->pos - 1; + if (pp->src[pp->pos] == '\\') { + ++pp->pos; + } + pp->pos += 2; + tok->kind = PpTokenKind_character_constant; + tok->raw.data = pp->src + start; + tok->raw.len = pp->pos - start; + } else if (c == '"') { + start = pp->pos - 1; + while (1) { + ch = pp->src[pp->pos]; + if (ch == '\\') { + ++pp->pos; + } else if (ch == '"') { + break; + } + ++pp->pos; + } + ++pp->pos; + tok->kind = PpTokenKind_string_literal; + tok->raw.data = pp->src + start; + tok->raw.len = pp->pos - start; + } else if (isdigit(c)) { + --pp->pos; + start = pp->pos; + while (isdigit(pp->src[pp->pos])) { + ++pp->pos; + } + tok->kind = PpTokenKind_pp_number; + tok->raw.data = pp->src + start; + tok->raw.len = pp->pos - start; + } else if (isalpha(c) || c == '_') { + --pp->pos; + start = pp->pos; + while (isalnum(pp->src[pp->pos]) || pp->src[pp->pos] == '_') { + ++pp->pos; + } + tok->raw.data = pp->src + start; + tok->raw.len = pp->pos - start; + tok->kind = PpTokenKind_identifier; + } else if (isspace(c)) { + tok->raw.data = pp->src; + tok->raw.len = 1; + tok->kind = PpTokenKind_whitespace; + } else { + tok->raw.data = pp->src; + tok->raw.len = 1; + tok->kind = PpTokenKind_other; + } + ++pp->n_pp_tokens; + } +} + +void process_pp_directives(Preprocessor* pp) { + PpToken* tok = pp->pp_tokens; + PpToken* define_dest; + while (tok->kind != PpTokenKind_eof) { + // TODO: check if the token is at the beginning of line. + // TODO: check if skipped whitespaces do not contain line breaks. + if (tok->kind == PpTokenKind_punctuator && string_equals_cstr(&tok->raw, "#")) { + PpToken* tok2 = tok + 1; + while (tok2->kind != PpTokenKind_eof && tok2->kind == PpTokenKind_whitespace) + ++tok2; + if (tok2->kind == PpTokenKind_identifier && string_equals_cstr(&tok2->raw, "define")) { + ++tok2; + while (tok2->kind != PpTokenKind_eof && tok2->kind == PpTokenKind_whitespace) + ++tok2; + if (tok2->kind == PpTokenKind_identifier) { + PpToken* define_name = tok2; + ++tok2; + while (tok2->kind != PpTokenKind_eof && tok2->kind == PpTokenKind_whitespace) + ++tok2; + if (tok2->kind == PpTokenKind_identifier || tok2->kind == PpTokenKind_pp_number) { + define_dest = tok2; + + PpDefine* pp_define = pp->pp_defines->data + pp->pp_defines->len; + pp_define->name.len = define_name->raw.len; + pp_define->name.data = define_name->raw.data; + pp_define->tokens = calloc(1, sizeof(PpToken)); + pp_define->tokens[0].kind = define_dest->kind; + pp_define->tokens[0].raw.len = define_dest->raw.len; + pp_define->tokens[0].raw.data = define_dest->raw.data; + ++pp->pp_defines->len; + } + } + // Remove #define directive. + while (tok != tok2 + 1) { + tok->kind = PpTokenKind_whitespace; + tok->raw.len = 0; + tok->raw.data = NULL; + ++tok; + } + } else if (tok2->kind == PpTokenKind_identifier && string_equals_cstr(&tok2->raw, "include")) { + ++tok2; + while (tok2->kind != PpTokenKind_eof && tok2->kind == PpTokenKind_whitespace) + ++tok2; + if (tok2->kind == PpTokenKind_string_literal) { + // Process #include directive. + PpToken* include_name = tok2; + ++tok2; + char* include_name_buf = calloc(include_name->raw.len - 2 + 1, sizeof(char)); + sprintf(include_name_buf, "%.*s", include_name->raw.len - 2, include_name->raw.data + 1); + + // Remove #include directive itself. + while (tok != tok2 + 1) { + tok->kind = PpTokenKind_whitespace; + tok->raw.len = 0; + tok->raw.data = NULL; + ++tok; + } + + // Read and preprocess included file. + FILE* include_file = fopen(include_name_buf, "rb"); + if (include_file == NULL) { + char* buf = calloc(1024, sizeof(char)); + sprintf(buf, "cannot open include file: %s", include_name_buf); + fatal_error(buf); + } + char* include_source = read_all(include_file); + fclose(include_file); + + PpToken* include_pp_tokens = do_preprocess(include_source, pp->include_depth + 1, pp->pp_defines); + + // Insert preprocessed tokens into the current token stream. + int n_include_pp_tokens = 0; + while (include_pp_tokens[n_include_pp_tokens].kind != PpTokenKind_eof) { + ++n_include_pp_tokens; + } + + // Shift existing tokens to make room for include tokens + int n_pp_tokens_after_include = pp->n_pp_tokens - (tok - pp->pp_tokens); + + memmove(tok + n_include_pp_tokens, tok, n_pp_tokens_after_include * sizeof(PpToken)); + + // Copy include tokens into the current position + memcpy(tok, include_pp_tokens, n_include_pp_tokens * sizeof(PpToken)); + + pp->n_pp_tokens += n_include_pp_tokens; + } + } + } else if (tok->kind == PpTokenKind_identifier) { + int pp_define_idx = find_pp_define(pp, &tok->raw); + if (pp_define_idx != -1) { + define_dest = pp->pp_defines->data[pp_define_idx].tokens; + tok->kind = define_dest->kind; + tok->raw.data = define_dest->raw.data; + tok->raw.len = define_dest->raw.len; + } + } + ++tok; + } +} + +PpToken* do_preprocess(char* src, int depth, PpDefines* pp_defines) { + Preprocessor* pp = preprocessor_new(src, depth, pp_defines); + pp_tokenize_all(pp); + process_pp_directives(pp); + return pp->pp_tokens; +} + +PpToken* preprocess(char* src) { + return do_preprocess(src, 0, pp_defines_new()); +} |
