diff options
| author | nsfisis <nsfisis@gmail.com> | 2025-06-21 03:16:32 +0900 |
|---|---|---|
| committer | nsfisis <nsfisis@gmail.com> | 2025-08-15 10:04:06 +0900 |
| commit | 64a77c44a5316c19bbc2486e462095f36c718314 (patch) | |
| tree | e1bb33c457a8170520b89a90a379cbb2b97b4537 | |
| parent | 45c4953d796cecf1f4c81b52f95dc562aaffbd0f (diff) | |
| download | ducc-64a77c44a5316c19bbc2486e462095f36c718314.tar.gz ducc-64a77c44a5316c19bbc2486e462095f36c718314.tar.zst ducc-64a77c44a5316c19bbc2486e462095f36c718314.zip | |
refactor: separate tokenize() into preprocessing and tokenizing
| -rw-r--r-- | main.c | 708 |
1 files changed, 476 insertions, 232 deletions
@@ -66,350 +66,593 @@ int string_equals(const String* s1, const String* s2) { return s1->len == s2->len && strncmp(s1->data, s2->data, s1->len) == 0; } -enum TokenKind { - TokenKind_eof, - - TokenKind_and, - TokenKind_andand, - TokenKind_arrow, - TokenKind_assign, - TokenKind_assign_add, - TokenKind_assign_sub, - TokenKind_brace_l, - TokenKind_brace_r, - TokenKind_bracket_l, - TokenKind_bracket_r, - TokenKind_comma, - TokenKind_dot, - TokenKind_ellipsis, - TokenKind_eq, - TokenKind_ge, - TokenKind_gt, - TokenKind_ident, - TokenKind_keyword_break, - TokenKind_keyword_char, - TokenKind_keyword_const, - TokenKind_keyword_continue, - TokenKind_keyword_do, - TokenKind_keyword_else, - TokenKind_keyword_enum, - TokenKind_keyword_extern, - TokenKind_keyword_for, - TokenKind_keyword_if, - TokenKind_keyword_int, - TokenKind_keyword_long, - TokenKind_keyword_return, - TokenKind_keyword_sizeof, - TokenKind_keyword_struct, - TokenKind_keyword_typeof, - TokenKind_keyword_void, - TokenKind_keyword_while, - TokenKind_le, - TokenKind_lt, - TokenKind_literal_int, - TokenKind_literal_str, - TokenKind_minus, - TokenKind_minusminus, - TokenKind_ne, - TokenKind_not, - TokenKind_oror, - TokenKind_paren_l, - TokenKind_paren_r, - TokenKind_percent, - TokenKind_plus, - TokenKind_plusplus, - TokenKind_semicolon, - TokenKind_slash, - TokenKind_star, +int string_equals_cstr(const String* s1, const char* s2) { + size_t s2_len = strlen(s2); + return s1->len == s2_len && strncmp(s1->data, s2, s1->len) == 0; +} + +enum PpTokenKind { + PpTokenKind_eof, + + PpTokenKind_header_name, + PpTokenKind_identifier, + PpTokenKind_pp_number, + PpTokenKind_character_constant, + PpTokenKind_string_literal, + PpTokenKind_punctuator, + PpTokenKind_other, + PpTokenKind_whitespace, }; -typedef enum TokenKind TokenKind; +typedef enum PpTokenKind PpTokenKind; -struct Token { - TokenKind kind; +struct PpToken { + PpTokenKind kind; String raw; }; -typedef struct Token Token; +typedef struct PpToken PpToken; struct PpDefine { String name; - Token* tokens; + PpToken* tokens; }; typedef struct PpDefine PpDefine; -struct Lexer { +struct Preprocessor { char* src; int pos; - Token* tokens; - int n_tokens; + PpToken* pp_tokens; + int n_pp_tokens; PpDefine* pp_defines; int n_pp_defines; }; -typedef struct Lexer Lexer; +typedef struct Preprocessor Preprocessor; -Lexer* lexer_new(char* src) { - Lexer* l = calloc(1, sizeof(Lexer)); - l->src = src; - l->tokens = calloc(1024 * 1024, sizeof(Token)); - l->pp_defines = calloc(1024, sizeof(PpDefine)); - return l; +Preprocessor* preprocessor_new(char* src) { + Preprocessor* pp = calloc(1, sizeof(Preprocessor)); + pp->src = src; + pp->pp_tokens = calloc(1024 * 1024, sizeof(PpToken)); + pp->pp_defines = calloc(1024, sizeof(PpDefine)); + return pp; } -int find_pp_define(Lexer* l, String* name) { +int find_pp_define(Preprocessor* pp, String* name) { int i; - for (i = 0; i < l->n_pp_defines; ++i) { - if (string_equals(&l->pp_defines[i].name, name)) { + for (i = 0; i < pp->n_pp_defines; ++i) { + if (string_equals(&pp->pp_defines[i].name, name)) { return i; } } return -1; } -void tokenize_all(Lexer* l) { +void pp_tokenize_all(Preprocessor* pp) { char* buf; int ch; int start; - while (l->src[l->pos]) { - Token* tok = l->tokens + l->n_tokens; - char c = l->src[l->pos]; - ++l->pos; + while (pp->src[pp->pos]) { + PpToken* tok = pp->pp_tokens + pp->n_pp_tokens; + char c = pp->src[pp->pos]; + ++pp->pos; if (c == '(') { - tok->kind = TokenKind_paren_l; + tok->kind = PpTokenKind_punctuator; + tok->raw.len = 1; + tok->raw.data = pp->src + pp->pos - tok->raw.len; } else if (c == ')') { - tok->kind = TokenKind_paren_r; + tok->kind = PpTokenKind_punctuator; + tok->raw.len = 1; + tok->raw.data = pp->src + pp->pos - tok->raw.len; } else if (c == '{') { - tok->kind = TokenKind_brace_l; + tok->kind = PpTokenKind_punctuator; + tok->raw.len = 1; + tok->raw.data = pp->src + pp->pos - tok->raw.len; } else if (c == '}') { - tok->kind = TokenKind_brace_r; + tok->kind = PpTokenKind_punctuator; + tok->raw.len = 1; + tok->raw.data = pp->src + pp->pos - tok->raw.len; } else if (c == '[') { - tok->kind = TokenKind_bracket_l; + tok->kind = PpTokenKind_punctuator; + tok->raw.len = 1; + tok->raw.data = pp->src + pp->pos - tok->raw.len; } else if (c == ']') { - tok->kind = TokenKind_bracket_r; + tok->kind = PpTokenKind_punctuator; + tok->raw.len = 1; + tok->raw.data = pp->src + pp->pos - tok->raw.len; } else if (c == ',') { - tok->kind = TokenKind_comma; + tok->kind = PpTokenKind_punctuator; + tok->raw.len = 1; + tok->raw.data = pp->src + pp->pos - tok->raw.len; } else if (c == ';') { - tok->kind = TokenKind_semicolon; + tok->kind = PpTokenKind_punctuator; + tok->raw.len = 1; + tok->raw.data = pp->src + pp->pos - tok->raw.len; } else if (c == '+') { - if (l->src[l->pos] == '=') { - ++l->pos; - tok->kind = TokenKind_assign_add; - } else if (l->src[l->pos] == '+') { - ++l->pos; - tok->kind = TokenKind_plusplus; + if (pp->src[pp->pos] == '=') { + ++pp->pos; + tok->kind = PpTokenKind_punctuator; + tok->raw.len = 2; + tok->raw.data = pp->src + pp->pos - tok->raw.len; + } else if (pp->src[pp->pos] == '+') { + ++pp->pos; + tok->kind = PpTokenKind_punctuator; + tok->raw.len = 2; + tok->raw.data = pp->src + pp->pos - tok->raw.len; } else { - tok->kind = TokenKind_plus; + tok->kind = PpTokenKind_punctuator; + tok->raw.len = 1; + tok->raw.data = pp->src + pp->pos - tok->raw.len; } } else if (c == '|') { - ++l->pos; - tok->kind = TokenKind_oror; + ++pp->pos; + tok->kind = PpTokenKind_punctuator; + tok->raw.len = 2; + tok->raw.data = pp->src + pp->pos - tok->raw.len; } else if (c == '&') { - if (l->src[l->pos] == '&') { - ++l->pos; - tok->kind = TokenKind_andand; + if (pp->src[pp->pos] == '&') { + ++pp->pos; + tok->kind = PpTokenKind_punctuator; + tok->raw.len = 2; + tok->raw.data = pp->src + pp->pos - tok->raw.len; } else { - tok->kind = TokenKind_and; + tok->kind = PpTokenKind_punctuator; + tok->raw.len = 1; + tok->raw.data = pp->src + pp->pos - tok->raw.len; } } else if (c == '-') { - if (l->src[l->pos] == '>') { - ++l->pos; - tok->kind = TokenKind_arrow; - } else if (l->src[l->pos] == '=') { - ++l->pos; - tok->kind = TokenKind_assign_sub; - } else if (l->src[l->pos] == '-') { - ++l->pos; - tok->kind = TokenKind_minusminus; + if (pp->src[pp->pos] == '>') { + ++pp->pos; + tok->kind = PpTokenKind_punctuator; + tok->raw.len = 2; + tok->raw.data = pp->src + pp->pos - tok->raw.len; + } else if (pp->src[pp->pos] == '=') { + ++pp->pos; + tok->kind = PpTokenKind_punctuator; + tok->raw.len = 2; + tok->raw.data = pp->src + pp->pos - tok->raw.len; + } else if (pp->src[pp->pos] == '-') { + ++pp->pos; + tok->kind = PpTokenKind_punctuator; + tok->raw.len = 2; + tok->raw.data = pp->src + pp->pos - tok->raw.len; } else { - tok->kind = TokenKind_minus; + tok->kind = PpTokenKind_punctuator; + tok->raw.len = 1; + tok->raw.data = pp->src + pp->pos - tok->raw.len; } } else if (c == '*') { - tok->kind = TokenKind_star; + tok->kind = PpTokenKind_punctuator; + tok->raw.len = 1; + tok->raw.data = pp->src + pp->pos - tok->raw.len; } else if (c == '/') { - tok->kind = TokenKind_slash; + tok->kind = PpTokenKind_punctuator; + tok->raw.len = 1; + tok->raw.data = pp->src + pp->pos - tok->raw.len; } else if (c == '%') { - tok->kind = TokenKind_percent; + tok->kind = PpTokenKind_punctuator; + tok->raw.len = 1; + tok->raw.data = pp->src + pp->pos - tok->raw.len; } else if (c == '.') { - if (l->src[l->pos] == '.') { - ++l->pos; - if (l->src[l->pos] == '.') { - ++l->pos; - tok->kind = TokenKind_ellipsis; + if (pp->src[pp->pos] == '.') { + ++pp->pos; + if (pp->src[pp->pos] == '.') { + ++pp->pos; + tok->kind = PpTokenKind_punctuator; + tok->raw.len = 3; + tok->raw.data = pp->src + pp->pos - tok->raw.len; } else { - fatal_error("unknown token: .."); + --pp->pos; + tok->kind = PpTokenKind_punctuator; + tok->raw.len = 1; + tok->raw.data = pp->src + pp->pos - tok->raw.len; } } else { - tok->kind = TokenKind_dot; + tok->kind = PpTokenKind_punctuator; + tok->raw.len = 1; + tok->raw.data = pp->src + pp->pos - tok->raw.len; } } else if (c == '!') { - if (l->src[l->pos] == '=') { - ++l->pos; - tok->kind = TokenKind_ne; + if (pp->src[pp->pos] == '=') { + ++pp->pos; + tok->kind = PpTokenKind_punctuator; + tok->raw.len = 2; + tok->raw.data = pp->src + pp->pos - tok->raw.len; } else { - tok->kind = TokenKind_not; + tok->kind = PpTokenKind_punctuator; + tok->raw.len = 1; + tok->raw.data = pp->src + pp->pos - tok->raw.len; } } else if (c == '=') { - if (l->src[l->pos] == '=') { - ++l->pos; - tok->kind = TokenKind_eq; + if (pp->src[pp->pos] == '=') { + ++pp->pos; + tok->kind = PpTokenKind_punctuator; + tok->raw.len = 2; + tok->raw.data = pp->src + pp->pos - tok->raw.len; } else { - tok->kind = TokenKind_assign; + tok->kind = PpTokenKind_punctuator; + tok->raw.len = 1; + tok->raw.data = pp->src + pp->pos - tok->raw.len; } } else if (c == '<') { - if (l->src[l->pos] == '=') { - ++l->pos; - tok->kind = TokenKind_le; + if (pp->src[pp->pos] == '=') { + ++pp->pos; + tok->kind = PpTokenKind_punctuator; + tok->raw.len = 2; + tok->raw.data = pp->src + pp->pos - tok->raw.len; } else { - tok->kind = TokenKind_lt; + tok->kind = PpTokenKind_punctuator; + tok->raw.len = 1; + tok->raw.data = pp->src + pp->pos - tok->raw.len; } } else if (c == '>') { - if (l->src[l->pos] == '=') { - ++l->pos; - tok->kind = TokenKind_ge; + if (pp->src[pp->pos] == '=') { + ++pp->pos; + tok->kind = PpTokenKind_punctuator; + tok->raw.len = 2; + tok->raw.data = pp->src + pp->pos - tok->raw.len; } else { - tok->kind = TokenKind_gt; + tok->kind = PpTokenKind_punctuator; + tok->raw.len = 1; + tok->raw.data = pp->src + pp->pos - tok->raw.len; + } + } else if (c == '#') { + if (pp->src[pp->pos] == '#') { + ++pp->pos; + tok->kind = PpTokenKind_punctuator; + tok->raw.len = 2; + tok->raw.data = pp->src + pp->pos - tok->raw.len; + } else { + tok->kind = PpTokenKind_punctuator; + tok->raw.len = 1; + tok->raw.data = pp->src + pp->pos - tok->raw.len; } } else if (c == '\'') { - ch = l->src[l->pos]; + start = pp->pos - 1; + ch = pp->src[pp->pos]; if (ch == '\\') { - ++l->pos; - ch = l->src[l->pos]; + ++pp->pos; + ch = pp->src[pp->pos]; if (ch == 'n') { ch = '\n'; } } - l->pos += 2; - tok->kind = TokenKind_literal_int; - buf = calloc(4, sizeof(char)); - sprintf(buf, "%d", ch); - tok->raw.data = buf; - tok->raw.len = strlen(buf); + pp->pos += 2; + tok->kind = PpTokenKind_character_constant; + tok->raw.data = pp->src + start; + tok->raw.len = pp->pos - start; } else if (c == '"') { - start = l->pos; + start = pp->pos - 1; while (1) { - ch = l->src[l->pos]; + ch = pp->src[pp->pos]; if (ch == '\\') { - ++l->pos; + ++pp->pos; } else if (ch == '"') { break; } - ++l->pos; + ++pp->pos; } - tok->kind = TokenKind_literal_str; - tok->raw.data = l->src + start; - tok->raw.len = l->pos - start; - ++l->pos; + ++pp->pos; + tok->kind = PpTokenKind_string_literal; + tok->raw.data = pp->src + start; + tok->raw.len = pp->pos - start; } else if (isdigit(c)) { - --l->pos; - start = l->pos; - while (isdigit(l->src[l->pos])) { - ++l->pos; + --pp->pos; + start = pp->pos; + while (isdigit(pp->src[pp->pos])) { + ++pp->pos; } - tok->kind = TokenKind_literal_int; - tok->raw.data = l->src + start; - tok->raw.len = l->pos - start; + tok->kind = PpTokenKind_pp_number; + tok->raw.data = pp->src + start; + tok->raw.len = pp->pos - start; } else if (isalpha(c) || c == '_') { - --l->pos; - start = l->pos; - while (isalnum(l->src[l->pos]) || l->src[l->pos] == '_') { - ++l->pos; + --pp->pos; + start = pp->pos; + while (isalnum(pp->src[pp->pos]) || pp->src[pp->pos] == '_') { + ++pp->pos; + } + tok->raw.data = pp->src + start; + tok->raw.len = pp->pos - start; + tok->kind = PpTokenKind_identifier; + } else if (isspace(c)) { + tok->raw.data = pp->src; + tok->raw.len = 1; + tok->kind = PpTokenKind_whitespace; + } else { + tok->raw.data = pp->src; + tok->raw.len = 1; + tok->kind = PpTokenKind_other; + } + ++pp->n_pp_tokens; + } +} + +void pp_execute_pp_directive(Preprocessor* pp) { + PpToken* tok = pp->pp_tokens; + PpToken* define_dest; + while (tok->kind != PpTokenKind_eof) { + "TODO: check if the token is at the beginning of line."; + "TODO: check if skipped whitespaces do not contain line breaks."; + if (tok->kind == PpTokenKind_punctuator && string_equals_cstr(&tok->raw, "#")) { + PpToken* tok2 = tok + 1; + while (tok2->kind != PpTokenKind_eof && tok2->kind == PpTokenKind_whitespace) + ++tok2; + if (tok2->kind == PpTokenKind_identifier && string_equals_cstr(&tok2->raw, "define")) { + ++tok2; + while (tok2->kind != PpTokenKind_eof && tok2->kind == PpTokenKind_whitespace) + ++tok2; + if (tok2->kind == PpTokenKind_identifier) { + PpToken* define_name = tok2; + ++tok2; + while (tok2->kind != PpTokenKind_eof && tok2->kind == PpTokenKind_whitespace) + ++tok2; + if (tok2->kind == PpTokenKind_identifier || tok2->kind == PpTokenKind_pp_number) { + define_dest = tok2; + + pp->pp_defines[pp->n_pp_defines].name.len = define_name->raw.len; + pp->pp_defines[pp->n_pp_defines].name.data = define_name->raw.data; + pp->pp_defines[pp->n_pp_defines].tokens = calloc(1, sizeof(PpToken)); + pp->pp_defines[pp->n_pp_defines].tokens[0].kind = define_dest->kind; + pp->pp_defines[pp->n_pp_defines].tokens[0].raw.len = define_dest->raw.len; + pp->pp_defines[pp->n_pp_defines].tokens[0].raw.data = define_dest->raw.data; + ++pp->n_pp_defines; + } + } + } + while (tok != tok2 + 1) { + tok->kind = PpTokenKind_whitespace; + tok->raw.len = 0; + tok->raw.data = NULL; + ++tok; } - int ident_len = l->pos - start; - if (ident_len == 5 && strstr(l->src + start, "break") == l->src + start) { + } else if (tok->kind == PpTokenKind_identifier) { + int pp_define_idx = find_pp_define(pp, &tok->raw); + if (pp_define_idx != -1) { + define_dest = pp->pp_defines[pp_define_idx].tokens; + tok->kind = define_dest->kind; + tok->raw.data = define_dest->raw.data; + tok->raw.len = define_dest->raw.len; + } + } + ++tok; + } +} + +PpToken* preprocess(char* src) { + Preprocessor* pp = preprocessor_new(src); + pp_tokenize_all(pp); + pp_execute_pp_directive(pp); + return pp->pp_tokens; +} + +enum TokenKind { + TokenKind_eof, + + TokenKind_and, + TokenKind_andand, + TokenKind_arrow, + TokenKind_assign, + TokenKind_assign_add, + TokenKind_assign_sub, + TokenKind_brace_l, + TokenKind_brace_r, + TokenKind_bracket_l, + TokenKind_bracket_r, + TokenKind_comma, + TokenKind_dot, + TokenKind_ellipsis, + TokenKind_eq, + TokenKind_ge, + TokenKind_gt, + TokenKind_ident, + TokenKind_keyword_break, + TokenKind_keyword_char, + TokenKind_keyword_const, + TokenKind_keyword_continue, + TokenKind_keyword_do, + TokenKind_keyword_else, + TokenKind_keyword_enum, + TokenKind_keyword_extern, + TokenKind_keyword_for, + TokenKind_keyword_if, + TokenKind_keyword_int, + TokenKind_keyword_long, + TokenKind_keyword_return, + TokenKind_keyword_sizeof, + TokenKind_keyword_struct, + TokenKind_keyword_typeof, + TokenKind_keyword_void, + TokenKind_keyword_while, + TokenKind_le, + TokenKind_lt, + TokenKind_literal_int, + TokenKind_literal_str, + TokenKind_minus, + TokenKind_minusminus, + TokenKind_ne, + TokenKind_not, + TokenKind_oror, + TokenKind_paren_l, + TokenKind_paren_r, + TokenKind_percent, + TokenKind_plus, + TokenKind_plusplus, + TokenKind_semicolon, + TokenKind_slash, + TokenKind_star, +}; +typedef enum TokenKind TokenKind; + +struct Token { + TokenKind kind; + String raw; +}; +typedef struct Token Token; + +struct Lexer { + PpToken* src; + int pos; + Token* tokens; + int n_tokens; +}; +typedef struct Lexer Lexer; + +Lexer* lexer_new(PpToken* pp_tokens) { + Lexer* l = calloc(1, sizeof(Lexer)); + l->src = pp_tokens; + l->tokens = calloc(1024 * 1024, sizeof(Token)); + return l; +} + +void tokenize_all(Lexer* l) { + char* buf; + int ch; + int start; + while (l->src[l->pos].kind != PpTokenKind_eof) { + PpToken* pp_tok = l->src + l->pos; + Token* tok = l->tokens + l->n_tokens; + PpTokenKind k = pp_tok->kind; + ++l->pos; + if (k == PpTokenKind_header_name) { + fatal_error("not implemented yet"); + } else if (k == PpTokenKind_identifier) { + if (string_equals_cstr(&pp_tok->raw, "break")) { tok->kind = TokenKind_keyword_break; - } else if (ident_len == 4 && strstr(l->src + start, "char") == l->src + start) { + } else if (string_equals_cstr(&pp_tok->raw, "char")) { tok->kind = TokenKind_keyword_char; - } else if (ident_len == 5 && strstr(l->src + start, "const") == l->src + start) { + } else if (string_equals_cstr(&pp_tok->raw, "const")) { tok->kind = TokenKind_keyword_const; - } else if (ident_len == 8 && strstr(l->src + start, "continue") == l->src + start) { + } else if (string_equals_cstr(&pp_tok->raw, "continue")) { tok->kind = TokenKind_keyword_continue; - } else if (ident_len == 2 && strstr(l->src + start, "do") == l->src + start) { + } else if (string_equals_cstr(&pp_tok->raw, "do")) { tok->kind = TokenKind_keyword_do; - } else if (ident_len == 4 && strstr(l->src + start, "else") == l->src + start) { + } else if (string_equals_cstr(&pp_tok->raw, "else")) { tok->kind = TokenKind_keyword_else; - } else if (ident_len == 4 && strstr(l->src + start, "enum") == l->src + start) { + } else if (string_equals_cstr(&pp_tok->raw, "enum")) { tok->kind = TokenKind_keyword_enum; - } else if (ident_len == 6 && strstr(l->src + start, "extern") == l->src + start) { + } else if (string_equals_cstr(&pp_tok->raw, "extern")) { tok->kind = TokenKind_keyword_extern; - } else if (ident_len == 3 && strstr(l->src + start, "for") == l->src + start) { + } else if (string_equals_cstr(&pp_tok->raw, "for")) { tok->kind = TokenKind_keyword_for; - } else if (ident_len == 2 && strstr(l->src + start, "if") == l->src + start) { + } else if (string_equals_cstr(&pp_tok->raw, "if")) { tok->kind = TokenKind_keyword_if; - } else if (ident_len == 3 && strstr(l->src + start, "int") == l->src + start) { + } else if (string_equals_cstr(&pp_tok->raw, "int")) { tok->kind = TokenKind_keyword_int; - } else if (ident_len == 4 && strstr(l->src + start, "long") == l->src + start) { + } else if (string_equals_cstr(&pp_tok->raw, "long")) { tok->kind = TokenKind_keyword_long; - } else if (ident_len == 6 && strstr(l->src + start, "return") == l->src + start) { + } else if (string_equals_cstr(&pp_tok->raw, "return")) { tok->kind = TokenKind_keyword_return; - } else if (ident_len == 6 && strstr(l->src + start, "sizeof") == l->src + start) { + } else if (string_equals_cstr(&pp_tok->raw, "sizeof")) { tok->kind = TokenKind_keyword_sizeof; - } else if (ident_len == 6 && strstr(l->src + start, "struct") == l->src + start) { + } else if (string_equals_cstr(&pp_tok->raw, "struct")) { tok->kind = TokenKind_keyword_struct; - } else if (ident_len == 7 && strstr(l->src + start, "typedef") == l->src + start) { + } else if (string_equals_cstr(&pp_tok->raw, "typedef")) { tok->kind = TokenKind_keyword_typeof; - } else if (ident_len == 4 && strstr(l->src + start, "void") == l->src + start) { + } else if (string_equals_cstr(&pp_tok->raw, "void")) { tok->kind = TokenKind_keyword_void; - } else if (ident_len == 5 && strstr(l->src + start, "while") == l->src + start) { + } else if (string_equals_cstr(&pp_tok->raw, "while")) { tok->kind = TokenKind_keyword_while; } else { - tok->raw.data = l->src + start; - tok->raw.len = ident_len; - int pp_define_idx = find_pp_define(l, &tok->raw); - if (pp_define_idx == -1) { - tok->kind = TokenKind_ident; - } else { - tok->kind = l->pp_defines[pp_define_idx].tokens->kind; - tok->raw.data = l->pp_defines[pp_define_idx].tokens->raw.data; - tok->raw.len = l->pp_defines[pp_define_idx].tokens->raw.len; - } - } - } else if (isspace(c)) { - continue; - } else if (c == '#') { - l->pos += 6; - while (isspace(l->src[l->pos])) { - ++l->pos; + tok->kind = TokenKind_ident; } - start = l->pos; - while (isalnum(l->src[l->pos]) || l->src[l->pos] == '_') { - ++l->pos; - } - PpDefine* def = l->pp_defines + l->n_pp_defines; - ++l->n_pp_defines; - def->name.data = l->src + start; - def->name.len = l->pos - start; - while (isspace(l->src[l->pos])) { - ++l->pos; - } - int start2 = l->pos; - int is_digit = isdigit(l->src[l->pos]); - if (is_digit) { - while (isdigit(l->src[l->pos])) { - ++l->pos; - } - } else { - while (isalnum(l->src[l->pos]) || l->src[l->pos] == '_') { - ++l->pos; + tok->raw.data = pp_tok->raw.data; + tok->raw.len = pp_tok->raw.len; + } else if (k == PpTokenKind_pp_number) { + tok->kind = TokenKind_literal_int; + tok->raw.data = pp_tok->raw.data; + tok->raw.len = pp_tok->raw.len; + } else if (k == PpTokenKind_character_constant) { + tok->kind = TokenKind_literal_int; + ch = pp_tok->raw.data[1]; + if (ch == '\\') { + ch = pp_tok->raw.data[2]; + if (ch == 'n') { + ch = '\n'; } } - def->tokens = calloc(1, sizeof(Token)); - if (is_digit) { - def->tokens->kind = TokenKind_literal_int; + buf = calloc(4, sizeof(char)); + sprintf(buf, "%d", ch); + tok->raw.data = buf; + tok->raw.len = strlen(buf); + } else if (k == PpTokenKind_string_literal) { + tok->kind = TokenKind_literal_str; + tok->raw.data = pp_tok->raw.data + 1; + tok->raw.len = pp_tok->raw.len - 2; + } else if (k == PpTokenKind_punctuator || k == PpTokenKind_other) { + if (string_equals_cstr(&pp_tok->raw, "(")) { + tok->kind = TokenKind_paren_l; + } else if (string_equals_cstr(&pp_tok->raw, ")")) { + tok->kind = TokenKind_paren_r; + } else if (string_equals_cstr(&pp_tok->raw, "{")) { + tok->kind = TokenKind_brace_l; + } else if (string_equals_cstr(&pp_tok->raw, "}")) { + tok->kind = TokenKind_brace_r; + } else if (string_equals_cstr(&pp_tok->raw, "[")) { + tok->kind = TokenKind_bracket_l; + } else if (string_equals_cstr(&pp_tok->raw, "]")) { + tok->kind = TokenKind_bracket_r; + } else if (string_equals_cstr(&pp_tok->raw, ",")) { + tok->kind = TokenKind_comma; + } else if (string_equals_cstr(&pp_tok->raw, ";")) { + tok->kind = TokenKind_semicolon; + } else if (string_equals_cstr(&pp_tok->raw, "+=")) { + tok->kind = TokenKind_assign_add; + } else if (string_equals_cstr(&pp_tok->raw, "++")) { + tok->kind = TokenKind_plusplus; + } else if (string_equals_cstr(&pp_tok->raw, "+")) { + tok->kind = TokenKind_plus; + } else if (string_equals_cstr(&pp_tok->raw, "||")) { + tok->kind = TokenKind_oror; + } else if (string_equals_cstr(&pp_tok->raw, "&&")) { + tok->kind = TokenKind_andand; + } else if (string_equals_cstr(&pp_tok->raw, "&")) { + tok->kind = TokenKind_and; + } else if (string_equals_cstr(&pp_tok->raw, "->")) { + tok->kind = TokenKind_arrow; + } else if (string_equals_cstr(&pp_tok->raw, "-=")) { + tok->kind = TokenKind_assign_sub; + } else if (string_equals_cstr(&pp_tok->raw, "--")) { + tok->kind = TokenKind_minusminus; + } else if (string_equals_cstr(&pp_tok->raw, "-")) { + tok->kind = TokenKind_minus; + } else if (string_equals_cstr(&pp_tok->raw, "*")) { + tok->kind = TokenKind_star; + } else if (string_equals_cstr(&pp_tok->raw, "/")) { + tok->kind = TokenKind_slash; + } else if (string_equals_cstr(&pp_tok->raw, "%")) { + tok->kind = TokenKind_percent; + } else if (string_equals_cstr(&pp_tok->raw, "...")) { + tok->kind = TokenKind_ellipsis; + } else if (string_equals_cstr(&pp_tok->raw, ".")) { + tok->kind = TokenKind_dot; + } else if (string_equals_cstr(&pp_tok->raw, "!=")) { + tok->kind = TokenKind_ne; + } else if (string_equals_cstr(&pp_tok->raw, "!")) { + tok->kind = TokenKind_not; + } else if (string_equals_cstr(&pp_tok->raw, "==")) { + tok->kind = TokenKind_eq; + } else if (string_equals_cstr(&pp_tok->raw, "=")) { + tok->kind = TokenKind_assign; + } else if (string_equals_cstr(&pp_tok->raw, "<=")) { + tok->kind = TokenKind_le; + } else if (string_equals_cstr(&pp_tok->raw, "<")) { + tok->kind = TokenKind_lt; + } else if (string_equals_cstr(&pp_tok->raw, ">=")) { + tok->kind = TokenKind_ge; + } else if (string_equals_cstr(&pp_tok->raw, ">")) { + tok->kind = TokenKind_gt; } else { - def->tokens->kind = TokenKind_ident; + sprintf(buf, "unknown token: %.*s", pp_tok->raw.len, pp_tok->raw.data); + fatal_error(buf); } - def->tokens->raw.data = l->src + start2; - def->tokens->raw.len = l->pos - start2; + tok->raw.data = pp_tok->raw.data; + tok->raw.len = pp_tok->raw.len; + } else if (k == PpTokenKind_whitespace) { continue; - } else { - buf = calloc(1024, sizeof(char)); - sprintf(buf, "unknown token char(%d)", c); - fatal_error(buf); } ++l->n_tokens; } } -Token* tokenize(char* src) { - Lexer* l = lexer_new(src); +Token* tokenize(PpToken* pp_tokens) { + Lexer* l = lexer_new(pp_tokens); tokenize_all(l); return l->tokens; } @@ -2223,7 +2466,8 @@ int main(int argc, char** argv) { in = fopen(argv[1], "rb"); } char* source = read_all(in); - Token* tokens = tokenize(source); + PpToken* pp_tokens = preprocess(source); + Token* tokens = tokenize(pp_tokens); Program* prog = parse(tokens); analyze(prog); codegen(prog); |
