diff options
| author | nsfisis <nsfisis@gmail.com> | 2026-05-03 17:29:12 +0900 |
|---|---|---|
| committer | nsfisis <nsfisis@gmail.com> | 2026-05-03 18:42:58 +0900 |
| commit | 3654ce578e6fff53950874adf7e0e4ae0a6eb956 (patch) | |
| tree | 5b6c04273de38dba70b7c25e55da144f5f7c37da /src/cc1/tokenize.c | |
| parent | 1b406b13b03055d2b2d08e8279a4a80c41ca7c20 (diff) | |
| download | ducc-main.tar.gz ducc-main.tar.zst ducc-main.zip | |
Diffstat (limited to 'src/cc1/tokenize.c')
| -rw-r--r-- | src/cc1/tokenize.c | 597 |
1 files changed, 597 insertions, 0 deletions
diff --git a/src/cc1/tokenize.c b/src/cc1/tokenize.c new file mode 100644 index 0000000..78b1acb --- /dev/null +++ b/src/cc1/tokenize.c @@ -0,0 +1,597 @@ +#include "tokenize.h" +#include <ctype.h> +#include "../lib/common.h" + +typedef struct { + InFile* src; + bool at_bol; + bool expect_header_name; + TokenArray* tokens; +} Lexer; + +static Lexer* lexer_new(InFile* src) { + Lexer* l = calloc(1, sizeof(Lexer)); + + l->src = src; + l->at_bol = true; + l->expect_header_name = false; + l->tokens = calloc(1, sizeof(TokenArray)); + tokens_init(l->tokens, 1024 * 16); + + return l; +} + +static void pplexer_tokenize_pp_directive(Lexer* l, Token* tok) { + // Skip whitespaces after '#'. + char c; + while (isspace((c = infile_peek_char(l->src)))) { + if (c == '\n') + break; + infile_next_char(l->src); + } + // '#' new-line + if (c == '\n') { + tok->kind = TokenKind_pp_directive_nop; + return; + } + + StrBuilder builder; + strbuilder_init(&builder); + while (isalnum(infile_peek_char(l->src)) || infile_peek_char(l->src) == '_') { + strbuilder_append_char(&builder, infile_peek_char(l->src)); + infile_next_char(l->src); + } + const char* pp_directive_name = builder.buf; + + if (builder.len == 0) { + tok->kind = TokenKind_hash; + } else if (strcmp(pp_directive_name, "define") == 0) { + tok->kind = TokenKind_pp_directive_define; + } else if (strcmp(pp_directive_name, "elif") == 0) { + tok->kind = TokenKind_pp_directive_elif; + } else if (strcmp(pp_directive_name, "elifdef") == 0) { + tok->kind = TokenKind_pp_directive_elifdef; + } else if (strcmp(pp_directive_name, "elifndef") == 0) { + tok->kind = TokenKind_pp_directive_elifndef; + } else if (strcmp(pp_directive_name, "else") == 0) { + tok->kind = TokenKind_pp_directive_else; + } else if (strcmp(pp_directive_name, "embed") == 0) { + tok->kind = TokenKind_pp_directive_embed; + } else if (strcmp(pp_directive_name, "endif") == 0) { + tok->kind = TokenKind_pp_directive_endif; + } else if (strcmp(pp_directive_name, "error") == 0) { + tok->kind = TokenKind_pp_directive_error; + } else if (strcmp(pp_directive_name, "if") == 0) { + tok->kind = TokenKind_pp_directive_if; + } else if (strcmp(pp_directive_name, "ifdef") == 0) { + tok->kind = TokenKind_pp_directive_ifdef; + } else if (strcmp(pp_directive_name, "ifndef") == 0) { + tok->kind = TokenKind_pp_directive_ifndef; + } else if (strcmp(pp_directive_name, "include") == 0) { + l->expect_header_name = true; + tok->kind = TokenKind_pp_directive_include; + } else if (strcmp(pp_directive_name, "include_next") == 0) { + l->expect_header_name = true; + tok->kind = TokenKind_pp_directive_include_next; + } else if (strcmp(pp_directive_name, "line") == 0) { + tok->kind = TokenKind_pp_directive_line; + } else if (strcmp(pp_directive_name, "pragma") == 0) { + tok->kind = TokenKind_pp_directive_pragma; + } else if (strcmp(pp_directive_name, "undef") == 0) { + tok->kind = TokenKind_pp_directive_undef; + } else if (strcmp(pp_directive_name, "warning") == 0) { + tok->kind = TokenKind_pp_directive_warning; + } else { + tok->kind = TokenKind_pp_directive_non_directive; + tok->value.string = pp_directive_name; + } +} + +static void do_tokenize_all(Lexer* l) { + while (!infile_eof(l->src)) { + Token* tok = tokens_push_new(l->tokens); + tok->loc = l->src->loc; + char c = infile_peek_char(l->src); + + if (l->expect_header_name && c == '"') { + infile_next_char(l->src); + StrBuilder builder; + strbuilder_init(&builder); + strbuilder_append_char(&builder, '"'); + while (1) { + char ch = infile_peek_char(l->src); + if (ch == '"') + break; + strbuilder_append_char(&builder, ch); + if (ch == '\\') { + infile_next_char(l->src); + strbuilder_append_char(&builder, infile_peek_char(l->src)); + } + infile_next_char(l->src); + } + strbuilder_append_char(&builder, '"'); + infile_next_char(l->src); + tok->kind = TokenKind_header_name; + tok->value.string = builder.buf; + l->expect_header_name = false; + } else if (l->expect_header_name && c == '<') { + infile_next_char(l->src); + StrBuilder builder; + strbuilder_init(&builder); + strbuilder_append_char(&builder, '<'); + while (1) { + char ch = infile_peek_char(l->src); + if (ch == '>') + break; + strbuilder_append_char(&builder, ch); + infile_next_char(l->src); + } + strbuilder_append_char(&builder, '>'); + infile_next_char(l->src); + tok->kind = TokenKind_header_name; + tok->value.string = builder.buf; + l->expect_header_name = false; + } else if (c == '(') { + infile_next_char(l->src); + tok->kind = TokenKind_paren_l; + } else if (c == ')') { + infile_next_char(l->src); + tok->kind = TokenKind_paren_r; + } else if (c == '{') { + infile_next_char(l->src); + tok->kind = TokenKind_brace_l; + } else if (c == '}') { + infile_next_char(l->src); + tok->kind = TokenKind_brace_r; + } else if (c == '[') { + infile_next_char(l->src); + tok->kind = TokenKind_bracket_l; + } else if (c == ']') { + infile_next_char(l->src); + tok->kind = TokenKind_bracket_r; + } else if (c == ',') { + infile_next_char(l->src); + tok->kind = TokenKind_comma; + } else if (c == ':') { + infile_next_char(l->src); + tok->kind = TokenKind_colon; + } else if (c == ';') { + infile_next_char(l->src); + tok->kind = TokenKind_semicolon; + } else if (c == '^') { + infile_next_char(l->src); + if (infile_consume_if(l->src, '=')) { + tok->kind = TokenKind_assign_xor; + } else { + tok->kind = TokenKind_xor; + } + } else if (c == '?') { + infile_next_char(l->src); + tok->kind = TokenKind_question; + } else if (c == '~') { + infile_next_char(l->src); + tok->kind = TokenKind_tilde; + } else if (c == '+') { + infile_next_char(l->src); + if (infile_consume_if(l->src, '=')) { + tok->kind = TokenKind_assign_add; + } else if (infile_consume_if(l->src, '+')) { + tok->kind = TokenKind_plusplus; + } else { + tok->kind = TokenKind_plus; + } + } else if (c == '|') { + infile_next_char(l->src); + if (infile_consume_if(l->src, '=')) { + tok->kind = TokenKind_assign_or; + } else if (infile_consume_if(l->src, '|')) { + tok->kind = TokenKind_oror; + } else { + tok->kind = TokenKind_or; + } + } else if (c == '&') { + infile_next_char(l->src); + if (infile_consume_if(l->src, '=')) { + tok->kind = TokenKind_assign_and; + } else if (infile_consume_if(l->src, '&')) { + tok->kind = TokenKind_andand; + } else { + tok->kind = TokenKind_and; + } + } else if (c == '-') { + infile_next_char(l->src); + if (infile_consume_if(l->src, '>')) { + tok->kind = TokenKind_arrow; + } else if (infile_consume_if(l->src, '=')) { + tok->kind = TokenKind_assign_sub; + } else if (infile_consume_if(l->src, '-')) { + tok->kind = TokenKind_minusminus; + } else { + tok->kind = TokenKind_minus; + } + } else if (c == '*') { + infile_next_char(l->src); + if (infile_consume_if(l->src, '=')) { + tok->kind = TokenKind_assign_mul; + } else { + tok->kind = TokenKind_star; + } + } else if (c == '/') { + infile_next_char(l->src); + if (infile_consume_if(l->src, '=')) { + tok->kind = TokenKind_assign_div; + } else if (infile_consume_if(l->src, '/')) { + while (!infile_eof(l->src) && infile_peek_char(l->src) != '\n') { + infile_next_char(l->src); + } + tok->kind = TokenKind_whitespace; + } else if (infile_consume_if(l->src, '*')) { + while (infile_peek_char(l->src)) { + if (infile_consume_if(l->src, '*')) { + if (infile_consume_if(l->src, '/')) { + break; + } + continue; + } + infile_next_char(l->src); + } + tok->kind = TokenKind_whitespace; + } else { + tok->kind = TokenKind_slash; + } + } else if (c == '%') { + infile_next_char(l->src); + if (infile_consume_if(l->src, '=')) { + tok->kind = TokenKind_assign_mod; + } else { + tok->kind = TokenKind_percent; + } + } else if (c == '.') { + infile_next_char(l->src); + if (infile_consume_if(l->src, '.')) { + if (infile_consume_if(l->src, '.')) { + tok->kind = TokenKind_ellipsis; + } else { + tok->kind = TokenKind_other; + tok->value.string = ".."; + } + } else { + tok->kind = TokenKind_dot; + } + } else if (c == '!') { + infile_next_char(l->src); + if (infile_consume_if(l->src, '=')) { + tok->kind = TokenKind_ne; + } else { + tok->kind = TokenKind_not; + } + } else if (c == '=') { + infile_next_char(l->src); + if (infile_consume_if(l->src, '=')) { + tok->kind = TokenKind_eq; + } else { + tok->kind = TokenKind_assign; + } + } else if (c == '<') { + infile_next_char(l->src); + if (infile_consume_if(l->src, '=')) { + tok->kind = TokenKind_le; + } else if (infile_consume_if(l->src, '<')) { + if (infile_consume_if(l->src, '=')) { + tok->kind = TokenKind_assign_lshift; + } else { + tok->kind = TokenKind_lshift; + } + } else { + tok->kind = TokenKind_lt; + } + } else if (c == '>') { + infile_next_char(l->src); + if (infile_consume_if(l->src, '=')) { + tok->kind = TokenKind_ge; + } else if (infile_consume_if(l->src, '>')) { + if (infile_consume_if(l->src, '=')) { + tok->kind = TokenKind_assign_rshift; + } else { + tok->kind = TokenKind_rshift; + } + } else { + tok->kind = TokenKind_gt; + } + } else if (c == '#') { + infile_next_char(l->src); + if (infile_consume_if(l->src, '#')) { + tok->kind = TokenKind_hashhash; + } else { + if (l->at_bol) { + pplexer_tokenize_pp_directive(l, tok); + } else { + tok->kind = TokenKind_hash; + } + } + } else if (c == '\'') { + infile_next_char(l->src); + StrBuilder builder; + strbuilder_init(&builder); + strbuilder_append_char(&builder, '\''); + strbuilder_append_char(&builder, infile_peek_char(l->src)); + if (infile_peek_char(l->src) == '\\') { + infile_next_char(l->src); + strbuilder_append_char(&builder, infile_peek_char(l->src)); + } + strbuilder_append_char(&builder, '\''); + infile_next_char(l->src); + infile_next_char(l->src); + tok->kind = TokenKind_character_constant; + tok->value.string = builder.buf; + } else if (c == '"') { + infile_next_char(l->src); + StrBuilder builder; + strbuilder_init(&builder); + while (1) { + char ch = infile_peek_char(l->src); + if (ch == '"') + break; + strbuilder_append_char(&builder, ch); + if (ch == '\\') { + infile_next_char(l->src); + strbuilder_append_char(&builder, infile_peek_char(l->src)); + } + infile_next_char(l->src); + } + infile_next_char(l->src); + tok->kind = TokenKind_literal_str; + tok->value.string = builder.buf; + } else if (isdigit(c)) { + // TODO: implement tokenization of pp-number. + StrBuilder builder; + strbuilder_init(&builder); + while (isalnum(infile_peek_char(l->src))) { + strbuilder_append_char(&builder, infile_peek_char(l->src)); + infile_next_char(l->src); + } + if (infile_peek_char(l->src) == '.' && isdigit(infile_peek_char2(l->src))) { + strbuilder_append_char(&builder, infile_peek_char(l->src)); + infile_next_char(l->src); + while (isdigit(infile_peek_char(l->src))) { + strbuilder_append_char(&builder, infile_peek_char(l->src)); + infile_next_char(l->src); + } + tok->kind = TokenKind_literal_double; + tok->value.floating = strtod(builder.buf, NULL); + } else { + tok->kind = TokenKind_literal_int; + tok->value.integer = strtol(builder.buf, NULL, 0); + } + } else if (isalpha(c) || c == '_') { + StrBuilder builder; + strbuilder_init(&builder); + while (isalnum(infile_peek_char(l->src)) || infile_peek_char(l->src) == '_') { + strbuilder_append_char(&builder, infile_peek_char(l->src)); + infile_next_char(l->src); + } + tok->kind = TokenKind_ident; + tok->value.string = builder.buf; + } else if (c == '\n') { + infile_next_char(l->src); + tok->kind = TokenKind_newline; + + // Reset expect_header_name at the end of line. It handles cases like: + // + // #ifdef ADDITIONAL_HEADER + // #include ADDITIONAL_HEADER + // #endif + // + // Even if ADDITIONAL_HEADER is undefined, this include directive line is tokenized. If the flag were not + // reset, the next occurrence of '<' or '"' would be recognized as part of a header name. + l->expect_header_name = false; + } else if (isspace(c)) { + while (isspace((c = infile_peek_char(l->src)))) { + if (c == '\n') + break; + infile_next_char(l->src); + } + if (l->at_bol && infile_peek_char(l->src) == '#') { + infile_next_char(l->src); + pplexer_tokenize_pp_directive(l, tok); + } else { + tok->kind = TokenKind_whitespace; + } + } else { + infile_next_char(l->src); + tok->kind = TokenKind_other; + char* buf = calloc(2, sizeof(char)); + buf[0] = c; + tok->value.string = buf; + } + l->at_bol = tok->kind == TokenKind_newline; + } + Token* eof_tok = tokens_push_new(l->tokens); + eof_tok->loc = l->src->loc; + eof_tok->kind = TokenKind_eof; +} + +TokenArray* tokenize(InFile* src) { + Lexer* l = lexer_new(src); + do_tokenize_all(l); + return l->tokens; +} + +TokenArray* convert_pp_tokens_to_tokens(TokenArray* pp_tokens) { + TokenArray* tokens = calloc(1, sizeof(TokenArray)); + // tokens need not store whitespace tokens. + tokens_init(tokens, pp_tokens->len / 2); + + for (size_t pos = 0; pos < pp_tokens->len; ++pos) { + Token* pp_tok = &pp_tokens->data[pos]; + TokenKind k = pp_tok->kind; + if (k == TokenKind_removed || k == TokenKind_whitespace || k == TokenKind_newline) { + continue; + } + Token* tok = tokens_push_new(tokens); + tok->loc = pp_tok->loc; + if (k == TokenKind_character_constant) { + tok->kind = TokenKind_literal_int; + int ch = pp_tok->value.string[1]; + if (ch == '\\') { + ch = pp_tok->value.string[2]; + if (ch == 'a') { + ch = '\a'; + } else if (ch == 'b') { + ch = '\b'; + } else if (ch == 'f') { + ch = '\f'; + } else if (ch == 'n') { + ch = '\n'; + } else if (ch == 'r') { + ch = '\r'; + } else if (ch == 't') { + ch = '\t'; + } else if (ch == 'v') { + ch = '\v'; + } else if (ch == '0') { + ch = '\0'; + } else if (ch == 'e') { + // \e is not a part of Standard C, but commonly supported. + ch = 27; + } + } + tok->value.integer = ch; + } else if (k == TokenKind_literal_str) { + tok->kind = pp_tok->kind; + + size_t len = strlen(pp_tok->value.string); + char* buf = calloc(len + 1, sizeof(char)); + for (size_t i = 0, j = 0; i < len; i++, j++) { + if (pp_tok->value.string[i] == '\\' && pp_tok->value.string[i + 1] == 'e') { + // \e is not a part of Standard C, but commonly supported. + buf[j] = 033; + i++; + } else { + buf[j] = pp_tok->value.string[i]; + } + } + tok->value.string = buf; + } else if (k == TokenKind_ident) { + if (strcmp(pp_tok->value.string, "alignas") == 0) { + tok->kind = TokenKind_keyword_alignas; + } else if (strcmp(pp_tok->value.string, "alignof") == 0) { + tok->kind = TokenKind_keyword_alignof; + } else if (strcmp(pp_tok->value.string, "auto") == 0) { + tok->kind = TokenKind_keyword_auto; + } else if (strcmp(pp_tok->value.string, "bool") == 0) { + tok->kind = TokenKind_keyword_bool; + } else if (strcmp(pp_tok->value.string, "break") == 0) { + tok->kind = TokenKind_keyword_break; + } else if (strcmp(pp_tok->value.string, "case") == 0) { + tok->kind = TokenKind_keyword_case; + } else if (strcmp(pp_tok->value.string, "char") == 0) { + tok->kind = TokenKind_keyword_char; + } else if (strcmp(pp_tok->value.string, "const") == 0) { + tok->kind = TokenKind_keyword_const; + } else if (strcmp(pp_tok->value.string, "constexpr") == 0) { + tok->kind = TokenKind_keyword_constexpr; + } else if (strcmp(pp_tok->value.string, "continue") == 0) { + tok->kind = TokenKind_keyword_continue; + } else if (strcmp(pp_tok->value.string, "default") == 0) { + tok->kind = TokenKind_keyword_default; + } else if (strcmp(pp_tok->value.string, "do") == 0) { + tok->kind = TokenKind_keyword_do; + } else if (strcmp(pp_tok->value.string, "double") == 0) { + tok->kind = TokenKind_keyword_double; + } else if (strcmp(pp_tok->value.string, "else") == 0) { + tok->kind = TokenKind_keyword_else; + } else if (strcmp(pp_tok->value.string, "enum") == 0) { + tok->kind = TokenKind_keyword_enum; + } else if (strcmp(pp_tok->value.string, "extern") == 0) { + tok->kind = TokenKind_keyword_extern; + } else if (strcmp(pp_tok->value.string, "false") == 0) { + tok->kind = TokenKind_keyword_false; + } else if (strcmp(pp_tok->value.string, "float") == 0) { + tok->kind = TokenKind_keyword_float; + } else if (strcmp(pp_tok->value.string, "for") == 0) { + tok->kind = TokenKind_keyword_for; + } else if (strcmp(pp_tok->value.string, "goto") == 0) { + tok->kind = TokenKind_keyword_goto; + } else if (strcmp(pp_tok->value.string, "if") == 0) { + tok->kind = TokenKind_keyword_if; + } else if (strcmp(pp_tok->value.string, "inline") == 0) { + tok->kind = TokenKind_keyword_inline; + } else if (strcmp(pp_tok->value.string, "int") == 0) { + tok->kind = TokenKind_keyword_int; + } else if (strcmp(pp_tok->value.string, "long") == 0) { + tok->kind = TokenKind_keyword_long; + } else if (strcmp(pp_tok->value.string, "nullptr") == 0) { + tok->kind = TokenKind_keyword_nullptr; + } else if (strcmp(pp_tok->value.string, "register") == 0) { + tok->kind = TokenKind_keyword_register; + } else if (strcmp(pp_tok->value.string, "restrict") == 0) { + tok->kind = TokenKind_keyword_restrict; + } else if (strcmp(pp_tok->value.string, "return") == 0) { + tok->kind = TokenKind_keyword_return; + } else if (strcmp(pp_tok->value.string, "short") == 0) { + tok->kind = TokenKind_keyword_short; + } else if (strcmp(pp_tok->value.string, "signed") == 0) { + tok->kind = TokenKind_keyword_signed; + } else if (strcmp(pp_tok->value.string, "sizeof") == 0) { + tok->kind = TokenKind_keyword_sizeof; + } else if (strcmp(pp_tok->value.string, "static") == 0) { + tok->kind = TokenKind_keyword_static; + } else if (strcmp(pp_tok->value.string, "static_assert") == 0) { + tok->kind = TokenKind_keyword_static_assert; + } else if (strcmp(pp_tok->value.string, "struct") == 0) { + tok->kind = TokenKind_keyword_struct; + } else if (strcmp(pp_tok->value.string, "switch") == 0) { + tok->kind = TokenKind_keyword_switch; + } else if (strcmp(pp_tok->value.string, "thread_local") == 0) { + tok->kind = TokenKind_keyword_thread_local; + } else if (strcmp(pp_tok->value.string, "true") == 0) { + tok->kind = TokenKind_keyword_true; + } else if (strcmp(pp_tok->value.string, "typedef") == 0) { + tok->kind = TokenKind_keyword_typedef; + } else if (strcmp(pp_tok->value.string, "typeof") == 0) { + tok->kind = TokenKind_keyword_typeof; + } else if (strcmp(pp_tok->value.string, "typeof_unqual") == 0) { + tok->kind = TokenKind_keyword_typeof_unqual; + } else if (strcmp(pp_tok->value.string, "union") == 0) { + tok->kind = TokenKind_keyword_union; + } else if (strcmp(pp_tok->value.string, "unsigned") == 0) { + tok->kind = TokenKind_keyword_unsigned; + } else if (strcmp(pp_tok->value.string, "void") == 0) { + tok->kind = TokenKind_keyword_void; + } else if (strcmp(pp_tok->value.string, "volatile") == 0) { + tok->kind = TokenKind_keyword_volatile; + } else if (strcmp(pp_tok->value.string, "while") == 0) { + tok->kind = TokenKind_keyword_while; + } else if (strcmp(pp_tok->value.string, "_Atomic") == 0) { + tok->kind = TokenKind_keyword__Atomic; + } else if (strcmp(pp_tok->value.string, "_BitInt") == 0) { + tok->kind = TokenKind_keyword__BitInt; + } else if (strcmp(pp_tok->value.string, "_Complex") == 0) { + tok->kind = TokenKind_keyword__Complex; + } else if (strcmp(pp_tok->value.string, "_Decimal128") == 0) { + tok->kind = TokenKind_keyword__Decimal128; + } else if (strcmp(pp_tok->value.string, "_Decimal32") == 0) { + tok->kind = TokenKind_keyword__Decimal32; + } else if (strcmp(pp_tok->value.string, "_Decimal64") == 0) { + tok->kind = TokenKind_keyword__Decimal64; + } else if (strcmp(pp_tok->value.string, "_Generic") == 0) { + tok->kind = TokenKind_keyword__Generic; + } else if (strcmp(pp_tok->value.string, "_Imaginary") == 0) { + tok->kind = TokenKind_keyword__Imaginary; + } else if (strcmp(pp_tok->value.string, "_Noreturn") == 0) { + tok->kind = TokenKind_keyword__Noreturn; + } else { + tok->kind = TokenKind_ident; + tok->value = pp_tok->value; + } + } else if (k == TokenKind_other) { + unreachable(); + } else { + tok->kind = pp_tok->kind; + tok->value = pp_tok->value; + } + } + + return tokens; +} |
