aboutsummaryrefslogtreecommitdiffhomepage
diff options
context:
space:
mode:
authornsfisis <nsfisis@gmail.com>2025-08-03 14:02:54 +0900
committernsfisis <nsfisis@gmail.com>2025-08-15 10:06:21 +0900
commit1364b1303e96221c52568aed0726adc83aee1dc4 (patch)
treed4569d48d543e83593fdd4a8fcfc919a65eb47cb
parente1de8fc36f11ac932707c7113eb4bf3ebc4b1f74 (diff)
downloadducc-1364b1303e96221c52568aed0726adc83aee1dc4.tar.gz
ducc-1364b1303e96221c52568aed0726adc83aee1dc4.tar.zst
ducc-1364b1303e96221c52568aed0726adc83aee1dc4.zip
refactor: merge PpToken and Token
-rw-r--r--main.c2
-rw-r--r--preprocess.c525
-rw-r--r--tokenize.c340
3 files changed, 381 insertions, 486 deletions
diff --git a/main.c b/main.c
index 4b2143d..733f6c1 100644
--- a/main.c
+++ b/main.c
@@ -16,7 +16,7 @@ int main(int argc, char** argv) {
fatal_error("usage: ducc <FILE>");
}
InFile* source = read_all(argv[1]);
- PpToken* pp_tokens = preprocess(source);
+ Token* pp_tokens = preprocess(source);
Token* tokens = tokenize(pp_tokens);
Program* prog = parse(tokens);
codegen(prog);
diff --git a/preprocess.c b/preprocess.c
index 0978dd3..2caa37c 100644
--- a/preprocess.c
+++ b/preprocess.c
@@ -1,36 +1,196 @@
-enum PpTokenKind {
- PpTokenKind_eof,
-
- PpTokenKind_header_name,
- PpTokenKind_identifier,
- PpTokenKind_pp_number,
- PpTokenKind_character_constant,
- PpTokenKind_string_literal,
- PpTokenKind_punctuator,
- PpTokenKind_other,
- PpTokenKind_whitespace,
+enum TokenKind {
+ TokenKind_eof,
+
+ // Only preprocessing phase.
+ TokenKind_hash,
+ TokenKind_hashhash,
+ TokenKind_whitespace,
+ TokenKind_other,
+ TokenKind_character_constant,
+
+ TokenKind_and,
+ TokenKind_andand,
+ TokenKind_arrow,
+ TokenKind_assign,
+ TokenKind_assign_add,
+ TokenKind_assign_sub,
+ TokenKind_brace_l,
+ TokenKind_brace_r,
+ TokenKind_bracket_l,
+ TokenKind_bracket_r,
+ TokenKind_comma,
+ TokenKind_dot,
+ TokenKind_ellipsis,
+ TokenKind_eq,
+ TokenKind_ge,
+ TokenKind_gt,
+ TokenKind_ident,
+ TokenKind_keyword_break,
+ TokenKind_keyword_char,
+ TokenKind_keyword_const,
+ TokenKind_keyword_continue,
+ TokenKind_keyword_do,
+ TokenKind_keyword_else,
+ TokenKind_keyword_enum,
+ TokenKind_keyword_extern,
+ TokenKind_keyword_for,
+ TokenKind_keyword_if,
+ TokenKind_keyword_int,
+ TokenKind_keyword_long,
+ TokenKind_keyword_return,
+ TokenKind_keyword_short,
+ TokenKind_keyword_sizeof,
+ TokenKind_keyword_struct,
+ TokenKind_keyword_typeof,
+ TokenKind_keyword_void,
+ TokenKind_keyword_while,
+ TokenKind_le,
+ TokenKind_lt,
+ TokenKind_literal_int,
+ TokenKind_literal_str,
+ TokenKind_minus,
+ TokenKind_minusminus,
+ TokenKind_ne,
+ TokenKind_not,
+ TokenKind_or,
+ TokenKind_oror,
+ TokenKind_paren_l,
+ TokenKind_paren_r,
+ TokenKind_percent,
+ TokenKind_plus,
+ TokenKind_plusplus,
+ TokenKind_semicolon,
+ TokenKind_slash,
+ TokenKind_star,
+
+ // va_start() is currently implemented as a special form due to the current limitation of #define macro.
+ TokenKind_va_start,
};
-typedef enum PpTokenKind PpTokenKind;
+typedef enum TokenKind TokenKind;
-const char* pp_token_kind_stringify(PpTokenKind kind) {
- if (kind == PpTokenKind_eof)
+const char* token_kind_stringify(TokenKind k) {
+ if (k == TokenKind_eof)
return "<eof>";
- else if (kind == PpTokenKind_header_name)
- return "<header-name>";
- else if (kind == PpTokenKind_identifier)
- return "<identifier>";
- else if (kind == PpTokenKind_pp_number)
- return "<pp-number>";
- else if (kind == PpTokenKind_character_constant)
- return "<character-constant>";
- else if (kind == PpTokenKind_string_literal)
- return "<string-literal>";
- else if (kind == PpTokenKind_punctuator)
- return "<punctuator>";
- else if (kind == PpTokenKind_other)
- return "<other>";
- else if (kind == PpTokenKind_whitespace)
+ else if (k == TokenKind_hash)
+ return "#";
+ else if (k == TokenKind_hashhash)
+ return "##";
+ else if (k == TokenKind_whitespace)
return "<whitespace>";
+ else if (k == TokenKind_other)
+ return "<other>";
+ else if (k == TokenKind_character_constant)
+ return "<character-constant>";
+ else if (k == TokenKind_and)
+ return "&";
+ else if (k == TokenKind_andand)
+ return "&&";
+ else if (k == TokenKind_arrow)
+ return "->";
+ else if (k == TokenKind_assign)
+ return "=";
+ else if (k == TokenKind_assign_add)
+ return "+=";
+ else if (k == TokenKind_assign_sub)
+ return "-=";
+ else if (k == TokenKind_brace_l)
+ return "{";
+ else if (k == TokenKind_brace_r)
+ return "}";
+ else if (k == TokenKind_bracket_l)
+ return "[";
+ else if (k == TokenKind_bracket_r)
+ return "]";
+ else if (k == TokenKind_comma)
+ return ",";
+ else if (k == TokenKind_dot)
+ return ".";
+ else if (k == TokenKind_ellipsis)
+ return "...";
+ else if (k == TokenKind_eq)
+ return "==";
+ else if (k == TokenKind_ge)
+ return ">=";
+ else if (k == TokenKind_gt)
+ return ">";
+ else if (k == TokenKind_ident)
+ return "<identifier>";
+ else if (k == TokenKind_keyword_break)
+ return "break";
+ else if (k == TokenKind_keyword_char)
+ return "char";
+ else if (k == TokenKind_keyword_const)
+ return "const";
+ else if (k == TokenKind_keyword_continue)
+ return "continue";
+ else if (k == TokenKind_keyword_do)
+ return "do";
+ else if (k == TokenKind_keyword_else)
+ return "else";
+ else if (k == TokenKind_keyword_enum)
+ return "enum";
+ else if (k == TokenKind_keyword_extern)
+ return "extern";
+ else if (k == TokenKind_keyword_for)
+ return "for";
+ else if (k == TokenKind_keyword_if)
+ return "if";
+ else if (k == TokenKind_keyword_int)
+ return "int";
+ else if (k == TokenKind_keyword_long)
+ return "long";
+ else if (k == TokenKind_keyword_return)
+ return "return";
+ else if (k == TokenKind_keyword_short)
+ return "short";
+ else if (k == TokenKind_keyword_sizeof)
+ return "sizeof";
+ else if (k == TokenKind_keyword_struct)
+ return "struct";
+ else if (k == TokenKind_keyword_typeof)
+ return "typedef";
+ else if (k == TokenKind_keyword_void)
+ return "void";
+ else if (k == TokenKind_keyword_while)
+ return "while";
+ else if (k == TokenKind_le)
+ return "<=";
+ else if (k == TokenKind_lt)
+ return "<";
+ else if (k == TokenKind_literal_int)
+ return "<integer>";
+ else if (k == TokenKind_literal_str)
+ return "<string>";
+ else if (k == TokenKind_minus)
+ return "-";
+ else if (k == TokenKind_minusminus)
+ return "--";
+ else if (k == TokenKind_ne)
+ return "!=";
+ else if (k == TokenKind_not)
+ return "!";
+ else if (k == TokenKind_or)
+ return "|";
+ else if (k == TokenKind_oror)
+ return "||";
+ else if (k == TokenKind_paren_l)
+ return "(";
+ else if (k == TokenKind_paren_r)
+ return ")";
+ else if (k == TokenKind_percent)
+ return "%";
+ else if (k == TokenKind_plus)
+ return "+";
+ else if (k == TokenKind_plusplus)
+ return "++";
+ else if (k == TokenKind_semicolon)
+ return ";";
+ else if (k == TokenKind_slash)
+ return "/";
+ else if (k == TokenKind_star)
+ return "*";
+ else if (k == TokenKind_va_start)
+ return "va_start";
else
unreachable();
}
@@ -41,18 +201,24 @@ struct SourceLocation {
};
typedef struct SourceLocation SourceLocation;
-struct PpToken {
- PpTokenKind kind;
+struct Token {
+ TokenKind kind;
String raw;
SourceLocation loc;
};
-typedef struct PpToken PpToken;
+typedef struct Token Token;
-const char* pp_token_stringify(PpToken* tok) {
- const char* kind_str = pp_token_kind_stringify(tok->kind);
- char* buf = calloc(tok->raw.len + strlen(kind_str) + 3 + 1, sizeof(char));
- sprintf(buf, "%.*s (%s)", tok->raw.len, tok->raw.data, kind_str);
- return buf;
+const char* token_stringify(Token* t) {
+ TokenKind k = t->kind;
+ if (k == TokenKind_other || k == TokenKind_character_constant || k == TokenKind_whitespace ||
+ k == TokenKind_ident || k == TokenKind_literal_int || k == TokenKind_literal_str) {
+ const char* kind_str = token_kind_stringify(k);
+ char* buf = calloc(t->raw.len + strlen(kind_str) + 3 + 1, sizeof(char));
+ sprintf(buf, "%.*s (%s)", t->raw.len, t->raw.data, kind_str);
+ return buf;
+ } else {
+ return token_kind_stringify(k);
+ }
}
enum PpMacroKind {
@@ -80,7 +246,7 @@ struct PpMacro {
PpMacroKind kind;
String name;
size_t n_replacements;
- PpToken* replacements;
+ Token* replacements;
};
typedef struct PpMacro PpMacro;
@@ -95,7 +261,7 @@ struct Preprocessor {
int line;
char* src;
int pos;
- PpToken* pp_tokens;
+ Token* pp_tokens;
int n_pp_tokens;
PpMacros* pp_macros;
int include_depth;
@@ -105,7 +271,7 @@ struct Preprocessor {
};
typedef struct Preprocessor Preprocessor;
-PpToken* do_preprocess(InFile* src, int depth, PpMacros* pp_macros);
+Token* do_preprocess(InFile* src, int depth, PpMacros* pp_macros);
PpMacros* pp_macros_new() {
PpMacros* pp_macros = calloc(1, sizeof(PpMacros));
@@ -139,8 +305,8 @@ void add_predefined_macros(PpMacros* pp_macros) {
m->name.len = strlen("__ducc__");
m->name.data = "__ducc__";
m->n_replacements = 1;
- m->replacements = calloc(1, sizeof(PpToken));
- m->replacements[0].kind = PpTokenKind_pp_number;
+ m->replacements = calloc(1, sizeof(Token));
+ m->replacements[0].kind = TokenKind_literal_int;
m->replacements[0].raw.len = strlen("1");
m->replacements[0].raw.data = "1";
pp_macros->len += 1;
@@ -158,9 +324,9 @@ void add_predefined_macros(PpMacros* pp_macros) {
pp_macros->len += 1;
}
-int count_pp_tokens(PpToken* pp_tokens) {
+int count_pp_tokens(Token* pp_tokens) {
int n = 0;
- while (pp_tokens[n].kind != PpTokenKind_eof) {
+ while (pp_tokens[n].kind != TokenKind_eof) {
++n;
}
return n;
@@ -175,7 +341,7 @@ Preprocessor* preprocessor_new(InFile* src, int include_depth, PpMacros* pp_macr
pp->filename = src->filename;
pp->line = 1;
pp->src = src->buf;
- pp->pp_tokens = calloc(1024 * 1024, sizeof(PpToken));
+ pp->pp_tokens = calloc(1024 * 1024, sizeof(Token));
pp->pp_macros = pp_macros;
pp->include_depth = include_depth;
pp->include_paths = calloc(16, sizeof(String));
@@ -209,104 +375,104 @@ void pp_tokenize_all(Preprocessor* pp) {
int ch;
int start;
while (pp->src[pp->pos]) {
- PpToken* tok = pp->pp_tokens + pp->n_pp_tokens;
+ Token* tok = pp->pp_tokens + pp->n_pp_tokens;
tok->loc.filename = pp->filename;
tok->loc.line = pp->line;
char c = pp->src[pp->pos];
++pp->pos;
if (c == '(') {
- tok->kind = PpTokenKind_punctuator;
+ tok->kind = TokenKind_paren_l;
tok->raw.len = 1;
tok->raw.data = pp->src + pp->pos - tok->raw.len;
} else if (c == ')') {
- tok->kind = PpTokenKind_punctuator;
+ tok->kind = TokenKind_paren_r;
tok->raw.len = 1;
tok->raw.data = pp->src + pp->pos - tok->raw.len;
} else if (c == '{') {
- tok->kind = PpTokenKind_punctuator;
+ tok->kind = TokenKind_brace_l;
tok->raw.len = 1;
tok->raw.data = pp->src + pp->pos - tok->raw.len;
} else if (c == '}') {
- tok->kind = PpTokenKind_punctuator;
+ tok->kind = TokenKind_brace_r;
tok->raw.len = 1;
tok->raw.data = pp->src + pp->pos - tok->raw.len;
} else if (c == '[') {
- tok->kind = PpTokenKind_punctuator;
+ tok->kind = TokenKind_bracket_l;
tok->raw.len = 1;
tok->raw.data = pp->src + pp->pos - tok->raw.len;
} else if (c == ']') {
- tok->kind = PpTokenKind_punctuator;
+ tok->kind = TokenKind_bracket_r;
tok->raw.len = 1;
tok->raw.data = pp->src + pp->pos - tok->raw.len;
} else if (c == ',') {
- tok->kind = PpTokenKind_punctuator;
+ tok->kind = TokenKind_comma;
tok->raw.len = 1;
tok->raw.data = pp->src + pp->pos - tok->raw.len;
} else if (c == ';') {
- tok->kind = PpTokenKind_punctuator;
+ tok->kind = TokenKind_semicolon;
tok->raw.len = 1;
tok->raw.data = pp->src + pp->pos - tok->raw.len;
} else if (c == '+') {
if (pp->src[pp->pos] == '=') {
++pp->pos;
- tok->kind = PpTokenKind_punctuator;
+ tok->kind = TokenKind_assign_add;
tok->raw.len = 2;
tok->raw.data = pp->src + pp->pos - tok->raw.len;
} else if (pp->src[pp->pos] == '+') {
++pp->pos;
- tok->kind = PpTokenKind_punctuator;
+ tok->kind = TokenKind_plusplus;
tok->raw.len = 2;
tok->raw.data = pp->src + pp->pos - tok->raw.len;
} else {
- tok->kind = PpTokenKind_punctuator;
+ tok->kind = TokenKind_plus;
tok->raw.len = 1;
tok->raw.data = pp->src + pp->pos - tok->raw.len;
}
} else if (c == '|') {
if (pp->src[pp->pos] == '|') {
++pp->pos;
- tok->kind = PpTokenKind_punctuator;
+ tok->kind = TokenKind_oror;
tok->raw.len = 2;
tok->raw.data = pp->src + pp->pos - tok->raw.len;
} else {
- tok->kind = PpTokenKind_punctuator;
+ tok->kind = TokenKind_or;
tok->raw.len = 1;
tok->raw.data = pp->src + pp->pos - tok->raw.len;
}
} else if (c == '&') {
if (pp->src[pp->pos] == '&') {
++pp->pos;
- tok->kind = PpTokenKind_punctuator;
+ tok->kind = TokenKind_andand;
tok->raw.len = 2;
tok->raw.data = pp->src + pp->pos - tok->raw.len;
} else {
- tok->kind = PpTokenKind_punctuator;
+ tok->kind = TokenKind_and;
tok->raw.len = 1;
tok->raw.data = pp->src + pp->pos - tok->raw.len;
}
} else if (c == '-') {
if (pp->src[pp->pos] == '>') {
++pp->pos;
- tok->kind = PpTokenKind_punctuator;
+ tok->kind = TokenKind_arrow;
tok->raw.len = 2;
tok->raw.data = pp->src + pp->pos - tok->raw.len;
} else if (pp->src[pp->pos] == '=') {
++pp->pos;
- tok->kind = PpTokenKind_punctuator;
+ tok->kind = TokenKind_assign_sub;
tok->raw.len = 2;
tok->raw.data = pp->src + pp->pos - tok->raw.len;
} else if (pp->src[pp->pos] == '-') {
++pp->pos;
- tok->kind = PpTokenKind_punctuator;
+ tok->kind = TokenKind_minusminus;
tok->raw.len = 2;
tok->raw.data = pp->src + pp->pos - tok->raw.len;
} else {
- tok->kind = PpTokenKind_punctuator;
+ tok->kind = TokenKind_minus;
tok->raw.len = 1;
tok->raw.data = pp->src + pp->pos - tok->raw.len;
}
} else if (c == '*') {
- tok->kind = PpTokenKind_punctuator;
+ tok->kind = TokenKind_star;
tok->raw.len = 1;
tok->raw.data = pp->src + pp->pos - tok->raw.len;
} else if (c == '/') {
@@ -316,7 +482,7 @@ void pp_tokenize_all(Preprocessor* pp) {
while (pp->src[pp->pos] && pp->src[pp->pos] != '\n' && pp->src[pp->pos] != '\r') {
++pp->pos;
}
- tok->kind = PpTokenKind_whitespace;
+ tok->kind = TokenKind_whitespace;
tok->raw.len = pp->pos - start;
tok->raw.data = pp->src + pp->pos - tok->raw.len;
} else if (pp->src[pp->pos] == '*') {
@@ -332,16 +498,16 @@ void pp_tokenize_all(Preprocessor* pp) {
}
++pp->pos;
}
- tok->kind = PpTokenKind_whitespace;
+ tok->kind = TokenKind_whitespace;
tok->raw.len = pp->pos - start;
tok->raw.data = pp->src + pp->pos - tok->raw.len;
} else {
- tok->kind = PpTokenKind_punctuator;
+ tok->kind = TokenKind_slash;
tok->raw.len = 1;
tok->raw.data = pp->src + pp->pos - tok->raw.len;
}
} else if (c == '%') {
- tok->kind = PpTokenKind_punctuator;
+ tok->kind = TokenKind_percent;
tok->raw.len = 1;
tok->raw.data = pp->src + pp->pos - tok->raw.len;
} else if (c == '.') {
@@ -349,72 +515,71 @@ void pp_tokenize_all(Preprocessor* pp) {
++pp->pos;
if (pp->src[pp->pos] == '.') {
++pp->pos;
- tok->kind = PpTokenKind_punctuator;
+ tok->kind = TokenKind_ellipsis;
tok->raw.len = 3;
tok->raw.data = pp->src + pp->pos - tok->raw.len;
} else {
- --pp->pos;
- tok->kind = PpTokenKind_punctuator;
- tok->raw.len = 1;
+ tok->kind = TokenKind_other;
+ tok->raw.len = 2;
tok->raw.data = pp->src + pp->pos - tok->raw.len;
}
} else {
- tok->kind = PpTokenKind_punctuator;
+ tok->kind = TokenKind_dot;
tok->raw.len = 1;
tok->raw.data = pp->src + pp->pos - tok->raw.len;
}
} else if (c == '!') {
if (pp->src[pp->pos] == '=') {
++pp->pos;
- tok->kind = PpTokenKind_punctuator;
+ tok->kind = TokenKind_ne;
tok->raw.len = 2;
tok->raw.data = pp->src + pp->pos - tok->raw.len;
} else {
- tok->kind = PpTokenKind_punctuator;
+ tok->kind = TokenKind_not;
tok->raw.len = 1;
tok->raw.data = pp->src + pp->pos - tok->raw.len;
}
} else if (c == '=') {
if (pp->src[pp->pos] == '=') {
++pp->pos;
- tok->kind = PpTokenKind_punctuator;
+ tok->kind = TokenKind_eq;
tok->raw.len = 2;
tok->raw.data = pp->src + pp->pos - tok->raw.len;
} else {
- tok->kind = PpTokenKind_punctuator;
+ tok->kind = TokenKind_assign;
tok->raw.len = 1;
tok->raw.data = pp->src + pp->pos - tok->raw.len;
}
} else if (c == '<') {
if (pp->src[pp->pos] == '=') {
++pp->pos;
- tok->kind = PpTokenKind_punctuator;
+ tok->kind = TokenKind_le;
tok->raw.len = 2;
tok->raw.data = pp->src + pp->pos - tok->raw.len;
} else {
- tok->kind = PpTokenKind_punctuator;
+ tok->kind = TokenKind_lt;
tok->raw.len = 1;
tok->raw.data = pp->src + pp->pos - tok->raw.len;
}
} else if (c == '>') {
if (pp->src[pp->pos] == '=') {
++pp->pos;
- tok->kind = PpTokenKind_punctuator;
+ tok->kind = TokenKind_ge;
tok->raw.len = 2;
tok->raw.data = pp->src + pp->pos - tok->raw.len;
} else {
- tok->kind = PpTokenKind_punctuator;
+ tok->kind = TokenKind_gt;
tok->raw.len = 1;
tok->raw.data = pp->src + pp->pos - tok->raw.len;
}
} else if (c == '#') {
if (pp->src[pp->pos] == '#') {
++pp->pos;
- tok->kind = PpTokenKind_punctuator;
+ tok->kind = TokenKind_hashhash;
tok->raw.len = 2;
tok->raw.data = pp->src + pp->pos - tok->raw.len;
} else {
- tok->kind = PpTokenKind_punctuator;
+ tok->kind = TokenKind_hash;
tok->raw.len = 1;
tok->raw.data = pp->src + pp->pos - tok->raw.len;
}
@@ -424,7 +589,7 @@ void pp_tokenize_all(Preprocessor* pp) {
++pp->pos;
}
pp->pos += 2;
- tok->kind = PpTokenKind_character_constant;
+ tok->kind = TokenKind_character_constant;
tok->raw.data = pp->src + start;
tok->raw.len = pp->pos - start;
} else if (c == '"') {
@@ -439,7 +604,7 @@ void pp_tokenize_all(Preprocessor* pp) {
++pp->pos;
}
++pp->pos;
- tok->kind = PpTokenKind_string_literal;
+ tok->kind = TokenKind_literal_str;
tok->raw.data = pp->src + start;
tok->raw.len = pp->pos - start;
} else if (isdigit(c)) {
@@ -448,7 +613,7 @@ void pp_tokenize_all(Preprocessor* pp) {
while (isdigit(pp->src[pp->pos])) {
++pp->pos;
}
- tok->kind = PpTokenKind_pp_number;
+ tok->kind = TokenKind_literal_int;
tok->raw.data = pp->src + start;
tok->raw.len = pp->pos - start;
} else if (isalpha(c) || c == '_') {
@@ -459,16 +624,58 @@ void pp_tokenize_all(Preprocessor* pp) {
}
tok->raw.data = pp->src + start;
tok->raw.len = pp->pos - start;
- tok->kind = PpTokenKind_identifier;
+ if (string_equals_cstr(&tok->raw, "break")) {
+ tok->kind = TokenKind_keyword_break;
+ } else if (string_equals_cstr(&tok->raw, "char")) {
+ tok->kind = TokenKind_keyword_char;
+ } else if (string_equals_cstr(&tok->raw, "const")) {
+ tok->kind = TokenKind_keyword_const;
+ } else if (string_equals_cstr(&tok->raw, "continue")) {
+ tok->kind = TokenKind_keyword_continue;
+ } else if (string_equals_cstr(&tok->raw, "do")) {
+ tok->kind = TokenKind_keyword_do;
+ } else if (string_equals_cstr(&tok->raw, "else")) {
+ tok->kind = TokenKind_keyword_else;
+ } else if (string_equals_cstr(&tok->raw, "enum")) {
+ tok->kind = TokenKind_keyword_enum;
+ } else if (string_equals_cstr(&tok->raw, "extern")) {
+ tok->kind = TokenKind_keyword_extern;
+ } else if (string_equals_cstr(&tok->raw, "for")) {
+ tok->kind = TokenKind_keyword_for;
+ } else if (string_equals_cstr(&tok->raw, "if")) {
+ tok->kind = TokenKind_keyword_if;
+ } else if (string_equals_cstr(&tok->raw, "int")) {
+ tok->kind = TokenKind_keyword_int;
+ } else if (string_equals_cstr(&tok->raw, "long")) {
+ tok->kind = TokenKind_keyword_long;
+ } else if (string_equals_cstr(&tok->raw, "return")) {
+ tok->kind = TokenKind_keyword_return;
+ } else if (string_equals_cstr(&tok->raw, "short")) {
+ tok->kind = TokenKind_keyword_short;
+ } else if (string_equals_cstr(&tok->raw, "sizeof")) {
+ tok->kind = TokenKind_keyword_sizeof;
+ } else if (string_equals_cstr(&tok->raw, "struct")) {
+ tok->kind = TokenKind_keyword_struct;
+ } else if (string_equals_cstr(&tok->raw, "typedef")) {
+ tok->kind = TokenKind_keyword_typeof;
+ } else if (string_equals_cstr(&tok->raw, "void")) {
+ tok->kind = TokenKind_keyword_void;
+ } else if (string_equals_cstr(&tok->raw, "while")) {
+ tok->kind = TokenKind_keyword_while;
+ } else if (string_equals_cstr(&tok->raw, "va_start")) {
+ tok->kind = TokenKind_va_start;
+ } else {
+ tok->kind = TokenKind_ident;
+ }
} else if (isspace(c)) {
if (c == '\n' || c == '\r') {
++pp->line;
}
- tok->kind = PpTokenKind_whitespace;
+ tok->kind = TokenKind_whitespace;
tok->raw.len = 1;
tok->raw.data = pp->src + pp->pos - tok->raw.len;
} else {
- tok->kind = PpTokenKind_other;
+ tok->kind = TokenKind_other;
tok->raw.len = 1;
tok->raw.data = pp->src + pp->pos - tok->raw.len;
}
@@ -476,8 +683,8 @@ void pp_tokenize_all(Preprocessor* pp) {
}
}
-PpToken* skip_whitespace(PpToken* tok) {
- while (tok->kind != PpTokenKind_eof && tok->kind == PpTokenKind_whitespace) {
+Token* skip_whitespace(Token* tok) {
+ while (tok->kind != TokenKind_eof && tok->kind == TokenKind_whitespace) {
++tok;
}
return tok;
@@ -493,9 +700,9 @@ int string_contains_newline(String* s) {
return 0;
}
-PpToken* find_next_newline(PpToken* tok) {
- while (tok->kind != PpTokenKind_eof) {
- if (tok->kind == PpTokenKind_whitespace && string_contains_newline(&tok->raw)) {
+Token* find_next_newline(Token* tok) {
+ while (tok->kind != TokenKind_eof) {
+ if (tok->kind == TokenKind_whitespace && string_contains_newline(&tok->raw)) {
return tok;
}
++tok;
@@ -503,23 +710,23 @@ PpToken* find_next_newline(PpToken* tok) {
return NULL;
}
-void make_token_whitespace(PpToken* tok) {
- tok->kind = PpTokenKind_whitespace;
+void make_token_whitespace(Token* tok) {
+ tok->kind = TokenKind_whitespace;
tok->raw.len = 0;
tok->raw.data = NULL;
}
-void remove_directive_tokens(PpToken* start, PpToken* end) {
- PpToken* tok = start;
+void remove_directive_tokens(Token* start, Token* end) {
+ Token* tok = start;
while (tok != end) {
make_token_whitespace(tok);
++tok;
}
}
-PpToken* process_endif_directive(Preprocessor* pp, PpToken* tok) {
- PpToken* tok2 = skip_whitespace(tok + 1);
- if (tok2->kind == PpTokenKind_identifier && string_equals_cstr(&tok2->raw, "endif")) {
+Token* process_endif_directive(Preprocessor* pp, Token* tok) {
+ Token* tok2 = skip_whitespace(tok + 1);
+ if (tok2->kind == TokenKind_ident && string_equals_cstr(&tok2->raw, "endif")) {
++tok2;
pp->skip_pp_tokens = 0;
remove_directive_tokens(tok, tok2);
@@ -528,9 +735,9 @@ PpToken* process_endif_directive(Preprocessor* pp, PpToken* tok) {
return NULL;
}
-PpToken* process_else_directive(Preprocessor* pp, PpToken* tok) {
- PpToken* tok2 = skip_whitespace(tok + 1);
- if (tok2->kind == PpTokenKind_identifier && string_equals_cstr(&tok2->raw, "else")) {
+Token* process_else_directive(Preprocessor* pp, Token* tok) {
+ Token* tok2 = skip_whitespace(tok + 1);
+ if (tok2->kind == TokenKind_keyword_else) {
++tok2;
pp->skip_pp_tokens = 1 - pp->skip_pp_tokens;
remove_directive_tokens(tok, tok2);
@@ -539,13 +746,13 @@ PpToken* process_else_directive(Preprocessor* pp, PpToken* tok) {
return NULL;
}
-PpToken* process_ifdef_directive(Preprocessor* pp, PpToken* tok) {
- PpToken* tok2 = skip_whitespace(tok + 1);
- if (tok2->kind == PpTokenKind_identifier && string_equals_cstr(&tok2->raw, "ifdef")) {
+Token* process_ifdef_directive(Preprocessor* pp, Token* tok) {
+ Token* tok2 = skip_whitespace(tok + 1);
+ if (tok2->kind == TokenKind_ident && string_equals_cstr(&tok2->raw, "ifdef")) {
++tok2;
tok2 = skip_whitespace(tok2);
- if (tok2->kind == PpTokenKind_identifier) {
- PpToken* name = tok2;
+ if (tok2->kind == TokenKind_ident) {
+ Token* name = tok2;
++tok2;
pp->skip_pp_tokens = find_pp_macro(pp, &name->raw) == -1;
}
@@ -555,13 +762,13 @@ PpToken* process_ifdef_directive(Preprocessor* pp, PpToken* tok) {
return NULL;
}
-PpToken* process_ifndef_directive(Preprocessor* pp, PpToken* tok) {
- PpToken* tok2 = skip_whitespace(tok + 1);
- if (tok2->kind == PpTokenKind_identifier && string_equals_cstr(&tok2->raw, "ifndef")) {
+Token* process_ifndef_directive(Preprocessor* pp, Token* tok) {
+ Token* tok2 = skip_whitespace(tok + 1);
+ if (tok2->kind == TokenKind_ident && string_equals_cstr(&tok2->raw, "ifndef")) {
++tok2;
tok2 = skip_whitespace(tok2);
- if (tok2->kind == PpTokenKind_identifier) {
- PpToken* name = tok2;
+ if (tok2->kind == TokenKind_ident) {
+ Token* name = tok2;
++tok2;
pp->skip_pp_tokens = find_pp_macro(pp, &name->raw) != -1;
}
@@ -571,23 +778,23 @@ PpToken* process_ifndef_directive(Preprocessor* pp, PpToken* tok) {
return NULL;
}
-PpToken* read_include_header_name(PpToken* tok2, String* include_name) {
- if (tok2->kind == PpTokenKind_string_literal) {
+Token* read_include_header_name(Token* tok2, String* include_name) {
+ if (tok2->kind == TokenKind_literal_str) {
*include_name = tok2->raw;
++tok2;
return tok2;
- } else if (tok2->kind == PpTokenKind_punctuator && string_equals_cstr(&tok2->raw, "<")) {
+ } else if (tok2->kind == TokenKind_lt) {
char* include_name_start = tok2->raw.data;
++tok2;
int include_name_len = 0;
- while (tok2->kind != PpTokenKind_eof) {
- if (tok2->kind == PpTokenKind_punctuator && string_equals_cstr(&tok2->raw, ">")) {
+ while (tok2->kind != TokenKind_eof) {
+ if (tok2->kind == TokenKind_gt) {
break;
}
include_name_len += tok2->raw.len;
++tok2;
}
- if (tok2->kind == PpTokenKind_eof) {
+ if (tok2->kind == TokenKind_eof) {
fatal_error("invalid #include: <> not balanced");
}
++tok2;
@@ -616,8 +823,8 @@ const char* resolve_include_name(Preprocessor* pp, String* include_name) {
}
}
-PpToken* replace_pp_tokens(Preprocessor* pp, PpToken* dest_start, PpToken* dest_end, int n_source_tokens,
- PpToken* source_tokens) {
+Token* replace_pp_tokens(Preprocessor* pp, Token* dest_start, Token* dest_end, int n_source_tokens,
+ Token* source_tokens) {
int n_tokens_to_remove = dest_end - dest_start;
int n_tokens_after_dest = (pp->pp_tokens + pp->n_pp_tokens) - dest_end;
int shift_amount;
@@ -625,35 +832,35 @@ PpToken* replace_pp_tokens(Preprocessor* pp, PpToken* dest_start, PpToken* dest_
if (n_tokens_to_remove < n_source_tokens) {
// Move existing tokens backward to make room.
shift_amount = n_source_tokens - n_tokens_to_remove;
- memmove(dest_end + shift_amount, dest_end, n_tokens_after_dest * sizeof(PpToken));
+ memmove(dest_end + shift_amount, dest_end, n_tokens_after_dest * sizeof(Token));
pp->n_pp_tokens += shift_amount;
} else if (n_source_tokens < n_tokens_to_remove) {
// Move existing tokens forward to reduce room.
shift_amount = n_tokens_to_remove - n_source_tokens;
- memmove(dest_start + n_source_tokens, dest_end, n_tokens_after_dest * sizeof(PpToken));
+ memmove(dest_start + n_source_tokens, dest_end, n_tokens_after_dest * sizeof(Token));
pp->n_pp_tokens -= shift_amount;
- memset(pp->pp_tokens + pp->n_pp_tokens, 0, shift_amount * sizeof(PpToken));
+ memset(pp->pp_tokens + pp->n_pp_tokens, 0, shift_amount * sizeof(Token));
}
- memcpy(dest_start, source_tokens, n_source_tokens * sizeof(PpToken));
+ memcpy(dest_start, source_tokens, n_source_tokens * sizeof(Token));
return dest_start + n_source_tokens;
}
-PpToken* expand_include_directive(Preprocessor* pp, PpToken* tok, PpToken* tok2, const char* include_name_buf) {
+Token* expand_include_directive(Preprocessor* pp, Token* tok, Token* tok2, const char* include_name_buf) {
InFile* include_source = read_all(include_name_buf);
if (!include_source) {
fatal_error("cannot open include file: %s", include_name_buf);
}
- PpToken* include_pp_tokens = do_preprocess(include_source, pp->include_depth + 1, pp->pp_macros);
+ Token* include_pp_tokens = do_preprocess(include_source, pp->include_depth + 1, pp->pp_macros);
return replace_pp_tokens(pp, tok, tok2 + 1, count_pp_tokens(include_pp_tokens), include_pp_tokens);
}
-PpToken* process_include_directive(Preprocessor* pp, PpToken* tok) {
- PpToken* tok2 = skip_whitespace(tok + 1);
+Token* process_include_directive(Preprocessor* pp, Token* tok) {
+ Token* tok2 = skip_whitespace(tok + 1);
- if (tok2->kind == PpTokenKind_identifier && string_equals_cstr(&tok2->raw, "include")) {
+ if (tok2->kind == TokenKind_ident && string_equals_cstr(&tok2->raw, "include")) {
++tok2;
tok2 = skip_whitespace(tok2);
String* include_name = calloc(1, sizeof(String));
@@ -667,20 +874,20 @@ PpToken* process_include_directive(Preprocessor* pp, PpToken* tok) {
return NULL;
}
-PpToken* process_define_directive(Preprocessor* pp, PpToken* tok) {
- PpToken* tok2 = skip_whitespace(tok + 1);
- PpToken* tok3 = NULL;
+Token* process_define_directive(Preprocessor* pp, Token* tok) {
+ Token* tok2 = skip_whitespace(tok + 1);
+ Token* tok3 = NULL;
PpMacro* pp_macro;
int i;
- if (tok2->kind == PpTokenKind_identifier && string_equals_cstr(&tok2->raw, "define")) {
+ if (tok2->kind == TokenKind_ident && string_equals_cstr(&tok2->raw, "define")) {
++tok2;
tok2 = skip_whitespace(tok2);
- if (tok2->kind == PpTokenKind_identifier) {
- PpToken* macro_name = tok2;
+ if (tok2->kind == TokenKind_ident) {
+ Token* macro_name = tok2;
++tok2;
- if (tok2->kind == PpTokenKind_punctuator && string_equals_cstr(&tok2->raw, "(")) {
+ if (tok2->kind == TokenKind_paren_l) {
++tok2;
- if (tok2->kind == PpTokenKind_punctuator && string_equals_cstr(&tok2->raw, ")")) {
+ if (tok2->kind == TokenKind_paren_r) {
++tok2;
} else {
fatal_error("%s:%d: invalid function-like macro syntax (#define %.*s)", macro_name->loc.filename,
@@ -692,7 +899,7 @@ PpToken* process_define_directive(Preprocessor* pp, PpToken* tok) {
pp_macro->kind = PpMacroKind_func;
pp_macro->name = macro_name->raw;
pp_macro->n_replacements = tok3 - tok2;
- pp_macro->replacements = calloc(pp_macro->n_replacements, sizeof(PpToken));
+ pp_macro->replacements = calloc(pp_macro->n_replacements, sizeof(Token));
for (i = 0; i < pp_macro->n_replacements; ++i) {
pp_macro->replacements[i] = tok2[i];
}
@@ -705,7 +912,7 @@ PpToken* process_define_directive(Preprocessor* pp, PpToken* tok) {
pp_macro->kind = PpMacroKind_obj;
pp_macro->name = macro_name->raw;
pp_macro->n_replacements = tok3 - tok2;
- pp_macro->replacements = calloc(pp_macro->n_replacements, sizeof(PpToken));
+ pp_macro->replacements = calloc(pp_macro->n_replacements, sizeof(Token));
for (i = 0; i < pp_macro->n_replacements; ++i) {
pp_macro->replacements[i] = tok2[i];
}
@@ -721,7 +928,7 @@ PpToken* process_define_directive(Preprocessor* pp, PpToken* tok) {
return NULL;
}
-int expand_macro(Preprocessor* pp, PpToken* tok) {
+int expand_macro(Preprocessor* pp, Token* tok) {
int pp_macro_idx = find_pp_macro(pp, &tok->raw);
if (pp_macro_idx == -1) {
return 0;
@@ -744,15 +951,15 @@ int expand_macro(Preprocessor* pp, PpToken* tok) {
tok[i].loc = original_loc;
}
} else if (pp_macro->kind == PpMacroKind_builtin_file) {
- PpToken* file_tok = calloc(1, sizeof(PpToken));
- file_tok->kind = PpTokenKind_string_literal;
+ Token* file_tok = calloc(1, sizeof(Token));
+ file_tok->kind = TokenKind_literal_str;
file_tok->raw.len = strlen(tok->loc.filename) + 2;
file_tok->raw.data = calloc(file_tok->raw.len, sizeof(char));
sprintf(file_tok->raw.data, "\"%s\"", tok->loc.filename);
replace_pp_tokens(pp, tok, tok + 1, 1, file_tok);
} else if (pp_macro->kind == PpMacroKind_builtin_line) {
- PpToken* line_tok = calloc(1, sizeof(PpToken));
- line_tok->kind = PpTokenKind_pp_number;
+ Token* line_tok = calloc(1, sizeof(Token));
+ line_tok->kind = TokenKind_literal_int;
line_tok->raw.data = calloc(10, sizeof(char));
sprintf(line_tok->raw.data, "%d", tok->loc.line);
line_tok->raw.len = strlen(line_tok->raw.data);
@@ -764,11 +971,11 @@ int expand_macro(Preprocessor* pp, PpToken* tok) {
}
void process_pp_directives(Preprocessor* pp) {
- PpToken* tok = pp->pp_tokens;
+ Token* tok = pp->pp_tokens;
- while (tok->kind != PpTokenKind_eof) {
- if (tok->kind == PpTokenKind_punctuator && string_equals_cstr(&tok->raw, "#")) {
- PpToken* next_tok;
+ while (tok->kind != TokenKind_eof) {
+ if (tok->kind == TokenKind_hash) {
+ Token* next_tok;
if ((next_tok = process_endif_directive(pp, tok)) != NULL) {
tok = next_tok;
@@ -797,7 +1004,7 @@ void process_pp_directives(Preprocessor* pp) {
}
} else if (skip_pp_tokens(pp)) {
make_token_whitespace(tok);
- } else if (tok->kind == PpTokenKind_identifier) {
+ } else if (tok->kind == TokenKind_ident) {
int expanded = expand_macro(pp, tok);
if (expanded) {
// A macro may expand to another macro. Re-scan the expanded tokens.
@@ -809,12 +1016,12 @@ void process_pp_directives(Preprocessor* pp) {
}
}
-void pp_dump(PpToken* t, int include_whitespace) {
- for (; t->kind != PpTokenKind_eof; ++t) {
- if (t->kind == PpTokenKind_whitespace && !include_whitespace) {
+void pp_dump(Token* t, int include_whitespace) {
+ for (; t->kind != TokenKind_eof; ++t) {
+ if (t->kind == TokenKind_whitespace && !include_whitespace) {
continue;
}
- fprintf(stderr, "%s\n", pp_token_stringify(t));
+ fprintf(stderr, "%s\n", token_stringify(t));
}
}
@@ -825,7 +1032,7 @@ char* get_ducc_include_path() {
return buf;
}
-PpToken* do_preprocess(InFile* src, int depth, PpMacros* pp_macros) {
+Token* do_preprocess(InFile* src, int depth, PpMacros* pp_macros) {
Preprocessor* pp = preprocessor_new(src, depth, pp_macros);
add_include_path(pp, get_ducc_include_path());
add_include_path(pp, "/usr/include/x86_64-linux-gnu");
@@ -835,7 +1042,7 @@ PpToken* do_preprocess(InFile* src, int depth, PpMacros* pp_macros) {
return pp->pp_tokens;
}
-PpToken* preprocess(InFile* src) {
+Token* preprocess(InFile* src) {
PpMacros* pp_macros = pp_macros_new();
add_predefined_macros(pp_macros);
return do_preprocess(src, 0, pp_macros);
diff --git a/tokenize.c b/tokenize.c
index 9bc14d6..ff66525 100644
--- a/tokenize.c
+++ b/tokenize.c
@@ -1,210 +1,12 @@
-enum TokenKind {
- TokenKind_eof,
-
- TokenKind_and,
- TokenKind_andand,
- TokenKind_arrow,
- TokenKind_assign,
- TokenKind_assign_add,
- TokenKind_assign_sub,
- TokenKind_brace_l,
- TokenKind_brace_r,
- TokenKind_bracket_l,
- TokenKind_bracket_r,
- TokenKind_comma,
- TokenKind_dot,
- TokenKind_ellipsis,
- TokenKind_eq,
- TokenKind_ge,
- TokenKind_gt,
- TokenKind_ident,
- TokenKind_keyword_break,
- TokenKind_keyword_char,
- TokenKind_keyword_const,
- TokenKind_keyword_continue,
- TokenKind_keyword_do,
- TokenKind_keyword_else,
- TokenKind_keyword_enum,
- TokenKind_keyword_extern,
- TokenKind_keyword_for,
- TokenKind_keyword_if,
- TokenKind_keyword_int,
- TokenKind_keyword_long,
- TokenKind_keyword_return,
- TokenKind_keyword_short,
- TokenKind_keyword_sizeof,
- TokenKind_keyword_struct,
- TokenKind_keyword_typeof,
- TokenKind_keyword_void,
- TokenKind_keyword_while,
- TokenKind_le,
- TokenKind_lt,
- TokenKind_literal_int,
- TokenKind_literal_str,
- TokenKind_minus,
- TokenKind_minusminus,
- TokenKind_ne,
- TokenKind_not,
- TokenKind_or,
- TokenKind_oror,
- TokenKind_paren_l,
- TokenKind_paren_r,
- TokenKind_percent,
- TokenKind_plus,
- TokenKind_plusplus,
- TokenKind_semicolon,
- TokenKind_slash,
- TokenKind_star,
-
- // va_start() is currently implemented as a special form due to the current limitation of #define macro.
- TokenKind_va_start,
-};
-typedef enum TokenKind TokenKind;
-
-struct Token {
- TokenKind kind;
- String raw;
-};
-typedef struct Token Token;
-
-const char* token_kind_stringify(TokenKind k) {
- if (k == TokenKind_eof)
- return "<eof>";
- else if (k == TokenKind_and)
- return "&";
- else if (k == TokenKind_andand)
- return "&&";
- else if (k == TokenKind_arrow)
- return "->";
- else if (k == TokenKind_assign)
- return "=";
- else if (k == TokenKind_assign_add)
- return "+=";
- else if (k == TokenKind_assign_sub)
- return "-=";
- else if (k == TokenKind_brace_l)
- return "{";
- else if (k == TokenKind_brace_r)
- return "}";
- else if (k == TokenKind_bracket_l)
- return "[";
- else if (k == TokenKind_bracket_r)
- return "]";
- else if (k == TokenKind_comma)
- return ",";
- else if (k == TokenKind_dot)
- return ".";
- else if (k == TokenKind_ellipsis)
- return "...";
- else if (k == TokenKind_eq)
- return "==";
- else if (k == TokenKind_ge)
- return ">=";
- else if (k == TokenKind_gt)
- return ">";
- else if (k == TokenKind_ident)
- return "<identifier>";
- else if (k == TokenKind_keyword_break)
- return "break";
- else if (k == TokenKind_keyword_char)
- return "char";
- else if (k == TokenKind_keyword_const)
- return "const";
- else if (k == TokenKind_keyword_continue)
- return "continue";
- else if (k == TokenKind_keyword_do)
- return "do";
- else if (k == TokenKind_keyword_else)
- return "else";
- else if (k == TokenKind_keyword_enum)
- return "enum";
- else if (k == TokenKind_keyword_extern)
- return "extern";
- else if (k == TokenKind_keyword_for)
- return "for";
- else if (k == TokenKind_keyword_if)
- return "if";
- else if (k == TokenKind_keyword_int)
- return "int";
- else if (k == TokenKind_keyword_long)
- return "long";
- else if (k == TokenKind_keyword_return)
- return "return";
- else if (k == TokenKind_keyword_short)
- return "short";
- else if (k == TokenKind_keyword_sizeof)
- return "sizeof";
- else if (k == TokenKind_keyword_struct)
- return "struct";
- else if (k == TokenKind_keyword_typeof)
- return "typeof";
- else if (k == TokenKind_keyword_void)
- return "void";
- else if (k == TokenKind_keyword_while)
- return "while";
- else if (k == TokenKind_le)
- return "le";
- else if (k == TokenKind_lt)
- return "lt";
- else if (k == TokenKind_literal_int)
- return "<integer>";
- else if (k == TokenKind_literal_str)
- return "<string>";
- else if (k == TokenKind_minus)
- return "-";
- else if (k == TokenKind_minusminus)
- return "--";
- else if (k == TokenKind_ne)
- return "!=";
- else if (k == TokenKind_not)
- return "!";
- else if (k == TokenKind_or)
- return "|";
- else if (k == TokenKind_oror)
- return "||";
- else if (k == TokenKind_paren_l)
- return "(";
- else if (k == TokenKind_paren_r)
- return ")";
- else if (k == TokenKind_percent)
- return "%";
- else if (k == TokenKind_plus)
- return "+";
- else if (k == TokenKind_plusplus)
- return "++";
- else if (k == TokenKind_semicolon)
- return ";";
- else if (k == TokenKind_slash)
- return "/";
- else if (k == TokenKind_star)
- return "*";
- else if (k == TokenKind_va_start)
- return "va_start";
- else
- unreachable();
-}
-
-const char* token_stringify(Token* t) {
- TokenKind k = t->kind;
- if (k == TokenKind_ident || k == TokenKind_literal_int || k == TokenKind_literal_str) {
- const char* kind_str = token_kind_stringify(k);
- char* buf = calloc(t->raw.len + strlen(kind_str) + 3 + 1, sizeof(char));
- sprintf(buf, "%.*s (%s)", t->raw.len, t->raw.data, kind_str);
- return buf;
- } else {
- return token_kind_stringify(k);
- }
-}
-
struct Lexer {
- PpToken* src;
+ Token* src;
int pos;
Token* tokens;
int n_tokens;
};
typedef struct Lexer Lexer;
-Lexer* lexer_new(PpToken* pp_tokens) {
+Lexer* lexer_new(Token* pp_tokens) {
Lexer* l = calloc(1, sizeof(Lexer));
l->src = pp_tokens;
l->tokens = calloc(1024 * 1024, sizeof(Token));
@@ -214,62 +16,12 @@ Lexer* lexer_new(PpToken* pp_tokens) {
void tokenize_all(Lexer* l) {
int ch;
int start;
- while (l->src[l->pos].kind != PpTokenKind_eof) {
- PpToken* pp_tok = l->src + l->pos;
+ while (l->src[l->pos].kind != TokenKind_eof) {
+ Token* pp_tok = l->src + l->pos;
Token* tok = l->tokens + l->n_tokens;
- PpTokenKind k = pp_tok->kind;
+ TokenKind k = pp_tok->kind;
++l->pos;
- if (k == PpTokenKind_header_name) {
- unimplemented();
- } else if (k == PpTokenKind_identifier) {
- if (string_equals_cstr(&pp_tok->raw, "break")) {
- tok->kind = TokenKind_keyword_break;
- } else if (string_equals_cstr(&pp_tok->raw, "char")) {
- tok->kind = TokenKind_keyword_char;
- } else if (string_equals_cstr(&pp_tok->raw, "const")) {
- tok->kind = TokenKind_keyword_const;
- } else if (string_equals_cstr(&pp_tok->raw, "continue")) {
- tok->kind = TokenKind_keyword_continue;
- } else if (string_equals_cstr(&pp_tok->raw, "do")) {
- tok->kind = TokenKind_keyword_do;
- } else if (string_equals_cstr(&pp_tok->raw, "else")) {
- tok->kind = TokenKind_keyword_else;
- } else if (string_equals_cstr(&pp_tok->raw, "enum")) {
- tok->kind = TokenKind_keyword_enum;
- } else if (string_equals_cstr(&pp_tok->raw, "extern")) {
- tok->kind = TokenKind_keyword_extern;
- } else if (string_equals_cstr(&pp_tok->raw, "for")) {
- tok->kind = TokenKind_keyword_for;
- } else if (string_equals_cstr(&pp_tok->raw, "if")) {
- tok->kind = TokenKind_keyword_if;
- } else if (string_equals_cstr(&pp_tok->raw, "int")) {
- tok->kind = TokenKind_keyword_int;
- } else if (string_equals_cstr(&pp_tok->raw, "long")) {
- tok->kind = TokenKind_keyword_long;
- } else if (string_equals_cstr(&pp_tok->raw, "return")) {
- tok->kind = TokenKind_keyword_return;
- } else if (string_equals_cstr(&pp_tok->raw, "short")) {
- tok->kind = TokenKind_keyword_short;
- } else if (string_equals_cstr(&pp_tok->raw, "sizeof")) {
- tok->kind = TokenKind_keyword_sizeof;
- } else if (string_equals_cstr(&pp_tok->raw, "struct")) {
- tok->kind = TokenKind_keyword_struct;
- } else if (string_equals_cstr(&pp_tok->raw, "typedef")) {
- tok->kind = TokenKind_keyword_typeof;
- } else if (string_equals_cstr(&pp_tok->raw, "void")) {
- tok->kind = TokenKind_keyword_void;
- } else if (string_equals_cstr(&pp_tok->raw, "while")) {
- tok->kind = TokenKind_keyword_while;
- } else if (string_equals_cstr(&pp_tok->raw, "va_start")) {
- tok->kind = TokenKind_va_start;
- } else {
- tok->kind = TokenKind_ident;
- }
- tok->raw = pp_tok->raw;
- } else if (k == PpTokenKind_pp_number) {
- tok->kind = TokenKind_literal_int;
- tok->raw = pp_tok->raw;
- } else if (k == PpTokenKind_character_constant) {
+ if (k == TokenKind_character_constant) {
tok->kind = TokenKind_literal_int;
ch = pp_tok->raw.data[1];
if (ch == '\\') {
@@ -296,87 +48,23 @@ void tokenize_all(Lexer* l) {
sprintf(buf, "%d", ch);
tok->raw.data = buf;
tok->raw.len = strlen(buf);
- } else if (k == PpTokenKind_string_literal) {
+ } else if (k == TokenKind_literal_str) {
tok->kind = TokenKind_literal_str;
tok->raw.data = pp_tok->raw.data + 1;
tok->raw.len = pp_tok->raw.len - 2;
- } else if (k == PpTokenKind_punctuator || k == PpTokenKind_other) {
- if (string_equals_cstr(&pp_tok->raw, "(")) {
- tok->kind = TokenKind_paren_l;
- } else if (string_equals_cstr(&pp_tok->raw, ")")) {
- tok->kind = TokenKind_paren_r;
- } else if (string_equals_cstr(&pp_tok->raw, "{")) {
- tok->kind = TokenKind_brace_l;
- } else if (string_equals_cstr(&pp_tok->raw, "}")) {
- tok->kind = TokenKind_brace_r;
- } else if (string_equals_cstr(&pp_tok->raw, "[")) {
- tok->kind = TokenKind_bracket_l;
- } else if (string_equals_cstr(&pp_tok->raw, "]")) {
- tok->kind = TokenKind_bracket_r;
- } else if (string_equals_cstr(&pp_tok->raw, ",")) {
- tok->kind = TokenKind_comma;
- } else if (string_equals_cstr(&pp_tok->raw, ";")) {
- tok->kind = TokenKind_semicolon;
- } else if (string_equals_cstr(&pp_tok->raw, "+=")) {
- tok->kind = TokenKind_assign_add;
- } else if (string_equals_cstr(&pp_tok->raw, "++")) {
- tok->kind = TokenKind_plusplus;
- } else if (string_equals_cstr(&pp_tok->raw, "+")) {
- tok->kind = TokenKind_plus;
- } else if (string_equals_cstr(&pp_tok->raw, "||")) {
- tok->kind = TokenKind_oror;
- } else if (string_equals_cstr(&pp_tok->raw, "|")) {
- tok->kind = TokenKind_or;
- } else if (string_equals_cstr(&pp_tok->raw, "&&")) {
- tok->kind = TokenKind_andand;
- } else if (string_equals_cstr(&pp_tok->raw, "&")) {
- tok->kind = TokenKind_and;
- } else if (string_equals_cstr(&pp_tok->raw, "->")) {
- tok->kind = TokenKind_arrow;
- } else if (string_equals_cstr(&pp_tok->raw, "-=")) {
- tok->kind = TokenKind_assign_sub;
- } else if (string_equals_cstr(&pp_tok->raw, "--")) {
- tok->kind = TokenKind_minusminus;
- } else if (string_equals_cstr(&pp_tok->raw, "-")) {
- tok->kind = TokenKind_minus;
- } else if (string_equals_cstr(&pp_tok->raw, "*")) {
- tok->kind = TokenKind_star;
- } else if (string_equals_cstr(&pp_tok->raw, "/")) {
- tok->kind = TokenKind_slash;
- } else if (string_equals_cstr(&pp_tok->raw, "%")) {
- tok->kind = TokenKind_percent;
- } else if (string_equals_cstr(&pp_tok->raw, "...")) {
- tok->kind = TokenKind_ellipsis;
- } else if (string_equals_cstr(&pp_tok->raw, ".")) {
- tok->kind = TokenKind_dot;
- } else if (string_equals_cstr(&pp_tok->raw, "!=")) {
- tok->kind = TokenKind_ne;
- } else if (string_equals_cstr(&pp_tok->raw, "!")) {
- tok->kind = TokenKind_not;
- } else if (string_equals_cstr(&pp_tok->raw, "==")) {
- tok->kind = TokenKind_eq;
- } else if (string_equals_cstr(&pp_tok->raw, "=")) {
- tok->kind = TokenKind_assign;
- } else if (string_equals_cstr(&pp_tok->raw, "<=")) {
- tok->kind = TokenKind_le;
- } else if (string_equals_cstr(&pp_tok->raw, "<")) {
- tok->kind = TokenKind_lt;
- } else if (string_equals_cstr(&pp_tok->raw, ">=")) {
- tok->kind = TokenKind_ge;
- } else if (string_equals_cstr(&pp_tok->raw, ">")) {
- tok->kind = TokenKind_gt;
- } else {
- fatal_error("unknown token: %.*s", pp_tok->raw.len, pp_tok->raw.data);
- }
- tok->raw = pp_tok->raw;
- } else if (k == PpTokenKind_whitespace) {
+ } else if (k == TokenKind_other) {
+ unreachable();
+ } else if (k == TokenKind_whitespace) {
continue;
+ } else {
+ tok->kind = pp_tok->kind;
+ tok->raw = pp_tok->raw;
}
++l->n_tokens;
}
}
-Token* tokenize(PpToken* pp_tokens) {
+Token* tokenize(Token* pp_tokens) {
Lexer* l = lexer_new(pp_tokens);
tokenize_all(l);
return l->tokens;