diff options
Diffstat (limited to 'tokenize.c')
| -rw-r--r-- | tokenize.c | 340 |
1 files changed, 14 insertions, 326 deletions
@@ -1,210 +1,12 @@ -enum TokenKind { - TokenKind_eof, - - TokenKind_and, - TokenKind_andand, - TokenKind_arrow, - TokenKind_assign, - TokenKind_assign_add, - TokenKind_assign_sub, - TokenKind_brace_l, - TokenKind_brace_r, - TokenKind_bracket_l, - TokenKind_bracket_r, - TokenKind_comma, - TokenKind_dot, - TokenKind_ellipsis, - TokenKind_eq, - TokenKind_ge, - TokenKind_gt, - TokenKind_ident, - TokenKind_keyword_break, - TokenKind_keyword_char, - TokenKind_keyword_const, - TokenKind_keyword_continue, - TokenKind_keyword_do, - TokenKind_keyword_else, - TokenKind_keyword_enum, - TokenKind_keyword_extern, - TokenKind_keyword_for, - TokenKind_keyword_if, - TokenKind_keyword_int, - TokenKind_keyword_long, - TokenKind_keyword_return, - TokenKind_keyword_short, - TokenKind_keyword_sizeof, - TokenKind_keyword_struct, - TokenKind_keyword_typeof, - TokenKind_keyword_void, - TokenKind_keyword_while, - TokenKind_le, - TokenKind_lt, - TokenKind_literal_int, - TokenKind_literal_str, - TokenKind_minus, - TokenKind_minusminus, - TokenKind_ne, - TokenKind_not, - TokenKind_or, - TokenKind_oror, - TokenKind_paren_l, - TokenKind_paren_r, - TokenKind_percent, - TokenKind_plus, - TokenKind_plusplus, - TokenKind_semicolon, - TokenKind_slash, - TokenKind_star, - - // va_start() is currently implemented as a special form due to the current limitation of #define macro. - TokenKind_va_start, -}; -typedef enum TokenKind TokenKind; - -struct Token { - TokenKind kind; - String raw; -}; -typedef struct Token Token; - -const char* token_kind_stringify(TokenKind k) { - if (k == TokenKind_eof) - return "<eof>"; - else if (k == TokenKind_and) - return "&"; - else if (k == TokenKind_andand) - return "&&"; - else if (k == TokenKind_arrow) - return "->"; - else if (k == TokenKind_assign) - return "="; - else if (k == TokenKind_assign_add) - return "+="; - else if (k == TokenKind_assign_sub) - return "-="; - else if (k == TokenKind_brace_l) - return "{"; - else if (k == TokenKind_brace_r) - return "}"; - else if (k == TokenKind_bracket_l) - return "["; - else if (k == TokenKind_bracket_r) - return "]"; - else if (k == TokenKind_comma) - return ","; - else if (k == TokenKind_dot) - return "."; - else if (k == TokenKind_ellipsis) - return "..."; - else if (k == TokenKind_eq) - return "=="; - else if (k == TokenKind_ge) - return ">="; - else if (k == TokenKind_gt) - return ">"; - else if (k == TokenKind_ident) - return "<identifier>"; - else if (k == TokenKind_keyword_break) - return "break"; - else if (k == TokenKind_keyword_char) - return "char"; - else if (k == TokenKind_keyword_const) - return "const"; - else if (k == TokenKind_keyword_continue) - return "continue"; - else if (k == TokenKind_keyword_do) - return "do"; - else if (k == TokenKind_keyword_else) - return "else"; - else if (k == TokenKind_keyword_enum) - return "enum"; - else if (k == TokenKind_keyword_extern) - return "extern"; - else if (k == TokenKind_keyword_for) - return "for"; - else if (k == TokenKind_keyword_if) - return "if"; - else if (k == TokenKind_keyword_int) - return "int"; - else if (k == TokenKind_keyword_long) - return "long"; - else if (k == TokenKind_keyword_return) - return "return"; - else if (k == TokenKind_keyword_short) - return "short"; - else if (k == TokenKind_keyword_sizeof) - return "sizeof"; - else if (k == TokenKind_keyword_struct) - return "struct"; - else if (k == TokenKind_keyword_typeof) - return "typeof"; - else if (k == TokenKind_keyword_void) - return "void"; - else if (k == TokenKind_keyword_while) - return "while"; - else if (k == TokenKind_le) - return "le"; - else if (k == TokenKind_lt) - return "lt"; - else if (k == TokenKind_literal_int) - return "<integer>"; - else if (k == TokenKind_literal_str) - return "<string>"; - else if (k == TokenKind_minus) - return "-"; - else if (k == TokenKind_minusminus) - return "--"; - else if (k == TokenKind_ne) - return "!="; - else if (k == TokenKind_not) - return "!"; - else if (k == TokenKind_or) - return "|"; - else if (k == TokenKind_oror) - return "||"; - else if (k == TokenKind_paren_l) - return "("; - else if (k == TokenKind_paren_r) - return ")"; - else if (k == TokenKind_percent) - return "%"; - else if (k == TokenKind_plus) - return "+"; - else if (k == TokenKind_plusplus) - return "++"; - else if (k == TokenKind_semicolon) - return ";"; - else if (k == TokenKind_slash) - return "/"; - else if (k == TokenKind_star) - return "*"; - else if (k == TokenKind_va_start) - return "va_start"; - else - unreachable(); -} - -const char* token_stringify(Token* t) { - TokenKind k = t->kind; - if (k == TokenKind_ident || k == TokenKind_literal_int || k == TokenKind_literal_str) { - const char* kind_str = token_kind_stringify(k); - char* buf = calloc(t->raw.len + strlen(kind_str) + 3 + 1, sizeof(char)); - sprintf(buf, "%.*s (%s)", t->raw.len, t->raw.data, kind_str); - return buf; - } else { - return token_kind_stringify(k); - } -} - struct Lexer { - PpToken* src; + Token* src; int pos; Token* tokens; int n_tokens; }; typedef struct Lexer Lexer; -Lexer* lexer_new(PpToken* pp_tokens) { +Lexer* lexer_new(Token* pp_tokens) { Lexer* l = calloc(1, sizeof(Lexer)); l->src = pp_tokens; l->tokens = calloc(1024 * 1024, sizeof(Token)); @@ -214,62 +16,12 @@ Lexer* lexer_new(PpToken* pp_tokens) { void tokenize_all(Lexer* l) { int ch; int start; - while (l->src[l->pos].kind != PpTokenKind_eof) { - PpToken* pp_tok = l->src + l->pos; + while (l->src[l->pos].kind != TokenKind_eof) { + Token* pp_tok = l->src + l->pos; Token* tok = l->tokens + l->n_tokens; - PpTokenKind k = pp_tok->kind; + TokenKind k = pp_tok->kind; ++l->pos; - if (k == PpTokenKind_header_name) { - unimplemented(); - } else if (k == PpTokenKind_identifier) { - if (string_equals_cstr(&pp_tok->raw, "break")) { - tok->kind = TokenKind_keyword_break; - } else if (string_equals_cstr(&pp_tok->raw, "char")) { - tok->kind = TokenKind_keyword_char; - } else if (string_equals_cstr(&pp_tok->raw, "const")) { - tok->kind = TokenKind_keyword_const; - } else if (string_equals_cstr(&pp_tok->raw, "continue")) { - tok->kind = TokenKind_keyword_continue; - } else if (string_equals_cstr(&pp_tok->raw, "do")) { - tok->kind = TokenKind_keyword_do; - } else if (string_equals_cstr(&pp_tok->raw, "else")) { - tok->kind = TokenKind_keyword_else; - } else if (string_equals_cstr(&pp_tok->raw, "enum")) { - tok->kind = TokenKind_keyword_enum; - } else if (string_equals_cstr(&pp_tok->raw, "extern")) { - tok->kind = TokenKind_keyword_extern; - } else if (string_equals_cstr(&pp_tok->raw, "for")) { - tok->kind = TokenKind_keyword_for; - } else if (string_equals_cstr(&pp_tok->raw, "if")) { - tok->kind = TokenKind_keyword_if; - } else if (string_equals_cstr(&pp_tok->raw, "int")) { - tok->kind = TokenKind_keyword_int; - } else if (string_equals_cstr(&pp_tok->raw, "long")) { - tok->kind = TokenKind_keyword_long; - } else if (string_equals_cstr(&pp_tok->raw, "return")) { - tok->kind = TokenKind_keyword_return; - } else if (string_equals_cstr(&pp_tok->raw, "short")) { - tok->kind = TokenKind_keyword_short; - } else if (string_equals_cstr(&pp_tok->raw, "sizeof")) { - tok->kind = TokenKind_keyword_sizeof; - } else if (string_equals_cstr(&pp_tok->raw, "struct")) { - tok->kind = TokenKind_keyword_struct; - } else if (string_equals_cstr(&pp_tok->raw, "typedef")) { - tok->kind = TokenKind_keyword_typeof; - } else if (string_equals_cstr(&pp_tok->raw, "void")) { - tok->kind = TokenKind_keyword_void; - } else if (string_equals_cstr(&pp_tok->raw, "while")) { - tok->kind = TokenKind_keyword_while; - } else if (string_equals_cstr(&pp_tok->raw, "va_start")) { - tok->kind = TokenKind_va_start; - } else { - tok->kind = TokenKind_ident; - } - tok->raw = pp_tok->raw; - } else if (k == PpTokenKind_pp_number) { - tok->kind = TokenKind_literal_int; - tok->raw = pp_tok->raw; - } else if (k == PpTokenKind_character_constant) { + if (k == TokenKind_character_constant) { tok->kind = TokenKind_literal_int; ch = pp_tok->raw.data[1]; if (ch == '\\') { @@ -296,87 +48,23 @@ void tokenize_all(Lexer* l) { sprintf(buf, "%d", ch); tok->raw.data = buf; tok->raw.len = strlen(buf); - } else if (k == PpTokenKind_string_literal) { + } else if (k == TokenKind_literal_str) { tok->kind = TokenKind_literal_str; tok->raw.data = pp_tok->raw.data + 1; tok->raw.len = pp_tok->raw.len - 2; - } else if (k == PpTokenKind_punctuator || k == PpTokenKind_other) { - if (string_equals_cstr(&pp_tok->raw, "(")) { - tok->kind = TokenKind_paren_l; - } else if (string_equals_cstr(&pp_tok->raw, ")")) { - tok->kind = TokenKind_paren_r; - } else if (string_equals_cstr(&pp_tok->raw, "{")) { - tok->kind = TokenKind_brace_l; - } else if (string_equals_cstr(&pp_tok->raw, "}")) { - tok->kind = TokenKind_brace_r; - } else if (string_equals_cstr(&pp_tok->raw, "[")) { - tok->kind = TokenKind_bracket_l; - } else if (string_equals_cstr(&pp_tok->raw, "]")) { - tok->kind = TokenKind_bracket_r; - } else if (string_equals_cstr(&pp_tok->raw, ",")) { - tok->kind = TokenKind_comma; - } else if (string_equals_cstr(&pp_tok->raw, ";")) { - tok->kind = TokenKind_semicolon; - } else if (string_equals_cstr(&pp_tok->raw, "+=")) { - tok->kind = TokenKind_assign_add; - } else if (string_equals_cstr(&pp_tok->raw, "++")) { - tok->kind = TokenKind_plusplus; - } else if (string_equals_cstr(&pp_tok->raw, "+")) { - tok->kind = TokenKind_plus; - } else if (string_equals_cstr(&pp_tok->raw, "||")) { - tok->kind = TokenKind_oror; - } else if (string_equals_cstr(&pp_tok->raw, "|")) { - tok->kind = TokenKind_or; - } else if (string_equals_cstr(&pp_tok->raw, "&&")) { - tok->kind = TokenKind_andand; - } else if (string_equals_cstr(&pp_tok->raw, "&")) { - tok->kind = TokenKind_and; - } else if (string_equals_cstr(&pp_tok->raw, "->")) { - tok->kind = TokenKind_arrow; - } else if (string_equals_cstr(&pp_tok->raw, "-=")) { - tok->kind = TokenKind_assign_sub; - } else if (string_equals_cstr(&pp_tok->raw, "--")) { - tok->kind = TokenKind_minusminus; - } else if (string_equals_cstr(&pp_tok->raw, "-")) { - tok->kind = TokenKind_minus; - } else if (string_equals_cstr(&pp_tok->raw, "*")) { - tok->kind = TokenKind_star; - } else if (string_equals_cstr(&pp_tok->raw, "/")) { - tok->kind = TokenKind_slash; - } else if (string_equals_cstr(&pp_tok->raw, "%")) { - tok->kind = TokenKind_percent; - } else if (string_equals_cstr(&pp_tok->raw, "...")) { - tok->kind = TokenKind_ellipsis; - } else if (string_equals_cstr(&pp_tok->raw, ".")) { - tok->kind = TokenKind_dot; - } else if (string_equals_cstr(&pp_tok->raw, "!=")) { - tok->kind = TokenKind_ne; - } else if (string_equals_cstr(&pp_tok->raw, "!")) { - tok->kind = TokenKind_not; - } else if (string_equals_cstr(&pp_tok->raw, "==")) { - tok->kind = TokenKind_eq; - } else if (string_equals_cstr(&pp_tok->raw, "=")) { - tok->kind = TokenKind_assign; - } else if (string_equals_cstr(&pp_tok->raw, "<=")) { - tok->kind = TokenKind_le; - } else if (string_equals_cstr(&pp_tok->raw, "<")) { - tok->kind = TokenKind_lt; - } else if (string_equals_cstr(&pp_tok->raw, ">=")) { - tok->kind = TokenKind_ge; - } else if (string_equals_cstr(&pp_tok->raw, ">")) { - tok->kind = TokenKind_gt; - } else { - fatal_error("unknown token: %.*s", pp_tok->raw.len, pp_tok->raw.data); - } - tok->raw = pp_tok->raw; - } else if (k == PpTokenKind_whitespace) { + } else if (k == TokenKind_other) { + unreachable(); + } else if (k == TokenKind_whitespace) { continue; + } else { + tok->kind = pp_tok->kind; + tok->raw = pp_tok->raw; } ++l->n_tokens; } } -Token* tokenize(PpToken* pp_tokens) { +Token* tokenize(Token* pp_tokens) { Lexer* l = lexer_new(pp_tokens); tokenize_all(l); return l->tokens; |
