aboutsummaryrefslogtreecommitdiffhomepage
path: root/src
diff options
context:
space:
mode:
Diffstat (limited to 'src')
-rw-r--r--src/main.c2
-rw-r--r--src/parse.c2
-rw-r--r--src/preprocess.c395
-rw-r--r--src/tokenize.c406
-rw-r--r--src/tokenize.h6
5 files changed, 400 insertions, 411 deletions
diff --git a/src/main.c b/src/main.c
index bcb3697..a72b4f0 100644
--- a/src/main.c
+++ b/src/main.c
@@ -34,7 +34,7 @@ int main(int argc, char** argv) {
return 0;
}
- TokenArray* tokens = tokenize(pp_tokens);
+ TokenArray* tokens = convert_pp_tokens_to_tokens(pp_tokens);
Program* prog = parse(tokens);
const char* assembly_filename;
diff --git a/src/parse.c b/src/parse.c
index 0a23aaa..99db2b5 100644
--- a/src/parse.c
+++ b/src/parse.c
@@ -2342,7 +2342,7 @@ static int eval(AstNode* e) {
}
bool pp_eval_constant_expression(TokenArray* pp_tokens) {
- TokenArray* tokens = tokenize(pp_tokens);
+ TokenArray* tokens = convert_pp_tokens_to_tokens(pp_tokens);
Parser* p = parser_new(tokens);
AstNode* e = parse_constant_expression(p);
return eval(e) != 0;
diff --git a/src/preprocess.c b/src/preprocess.c
index 34c2fe0..0af146c 100644
--- a/src/preprocess.c
+++ b/src/preprocess.c
@@ -1,12 +1,12 @@
#include "preprocess.h"
#include <assert.h>
-#include <ctype.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "common.h"
#include "parse.h"
#include "sys.h"
+#include "tokenize.h"
typedef enum {
MacroKind_undef,
@@ -165,397 +165,6 @@ void macroargs_build_json(JsonBuilder* builder, MacroArgArray* macroargs) {
}
typedef struct {
- InFile* src;
- bool at_bol;
- bool expect_header_name;
- TokenArray* pp_tokens;
-} PpLexer;
-
-static PpLexer* pplexer_new(InFile* src) {
- PpLexer* ppl = calloc(1, sizeof(PpLexer));
-
- ppl->src = src;
- ppl->at_bol = true;
- ppl->expect_header_name = false;
- ppl->pp_tokens = calloc(1, sizeof(TokenArray));
- tokens_init(ppl->pp_tokens, 1024 * 16);
-
- return ppl;
-}
-
-static void pplexer_tokenize_pp_directive(PpLexer* ppl, Token* tok) {
- // Skip whitespaces after '#'.
- char c;
- while (isspace((c = infile_peek_char(ppl->src)))) {
- if (c == '\n')
- break;
- infile_next_char(ppl->src);
- }
- // '#' new-line
- if (c == '\n') {
- tok->kind = TokenKind_pp_directive_nop;
- return;
- }
-
- StrBuilder builder;
- strbuilder_init(&builder);
- while (isalnum(infile_peek_char(ppl->src))) {
- strbuilder_append_char(&builder, infile_peek_char(ppl->src));
- infile_next_char(ppl->src);
- }
- const char* pp_directive_name = builder.buf;
-
- if (builder.len == 0) {
- tok->kind = TokenKind_hash;
- } else if (strcmp(pp_directive_name, "define") == 0) {
- tok->kind = TokenKind_pp_directive_define;
- } else if (strcmp(pp_directive_name, "elif") == 0) {
- tok->kind = TokenKind_pp_directive_elif;
- } else if (strcmp(pp_directive_name, "elifdef") == 0) {
- tok->kind = TokenKind_pp_directive_elifdef;
- } else if (strcmp(pp_directive_name, "elifndef") == 0) {
- tok->kind = TokenKind_pp_directive_elifndef;
- } else if (strcmp(pp_directive_name, "else") == 0) {
- tok->kind = TokenKind_pp_directive_else;
- } else if (strcmp(pp_directive_name, "embed") == 0) {
- tok->kind = TokenKind_pp_directive_embed;
- } else if (strcmp(pp_directive_name, "endif") == 0) {
- tok->kind = TokenKind_pp_directive_endif;
- } else if (strcmp(pp_directive_name, "error") == 0) {
- tok->kind = TokenKind_pp_directive_error;
- } else if (strcmp(pp_directive_name, "if") == 0) {
- tok->kind = TokenKind_pp_directive_if;
- } else if (strcmp(pp_directive_name, "ifdef") == 0) {
- tok->kind = TokenKind_pp_directive_ifdef;
- } else if (strcmp(pp_directive_name, "ifndef") == 0) {
- tok->kind = TokenKind_pp_directive_ifndef;
- } else if (strcmp(pp_directive_name, "include") == 0) {
- ppl->expect_header_name = true;
- tok->kind = TokenKind_pp_directive_include;
- } else if (strcmp(pp_directive_name, "line") == 0) {
- tok->kind = TokenKind_pp_directive_line;
- } else if (strcmp(pp_directive_name, "pragma") == 0) {
- tok->kind = TokenKind_pp_directive_pragma;
- } else if (strcmp(pp_directive_name, "undef") == 0) {
- tok->kind = TokenKind_pp_directive_undef;
- } else if (strcmp(pp_directive_name, "warning") == 0) {
- tok->kind = TokenKind_pp_directive_warning;
- } else {
- tok->kind = TokenKind_pp_directive_non_directive;
- tok->value.string = pp_directive_name;
- }
-}
-
-static void pplexer_tokenize_all(PpLexer* ppl) {
- while (!infile_eof(ppl->src)) {
- Token* tok = tokens_push_new(ppl->pp_tokens);
- tok->loc = ppl->src->loc;
- char c = infile_peek_char(ppl->src);
-
- if (ppl->expect_header_name && c == '"') {
- infile_next_char(ppl->src);
- StrBuilder builder;
- strbuilder_init(&builder);
- strbuilder_append_char(&builder, '"');
- while (1) {
- char ch = infile_peek_char(ppl->src);
- if (ch == '"')
- break;
- strbuilder_append_char(&builder, ch);
- if (ch == '\\') {
- infile_next_char(ppl->src);
- strbuilder_append_char(&builder, infile_peek_char(ppl->src));
- }
- infile_next_char(ppl->src);
- }
- strbuilder_append_char(&builder, '"');
- infile_next_char(ppl->src);
- tok->kind = TokenKind_header_name;
- tok->value.string = builder.buf;
- ppl->expect_header_name = false;
- } else if (ppl->expect_header_name && c == '<') {
- infile_next_char(ppl->src);
- StrBuilder builder;
- strbuilder_init(&builder);
- strbuilder_append_char(&builder, '<');
- while (1) {
- char ch = infile_peek_char(ppl->src);
- if (ch == '>')
- break;
- strbuilder_append_char(&builder, ch);
- infile_next_char(ppl->src);
- }
- strbuilder_append_char(&builder, '>');
- infile_next_char(ppl->src);
- tok->kind = TokenKind_header_name;
- tok->value.string = builder.buf;
- ppl->expect_header_name = false;
- } else if (c == '(') {
- infile_next_char(ppl->src);
- tok->kind = TokenKind_paren_l;
- } else if (c == ')') {
- infile_next_char(ppl->src);
- tok->kind = TokenKind_paren_r;
- } else if (c == '{') {
- infile_next_char(ppl->src);
- tok->kind = TokenKind_brace_l;
- } else if (c == '}') {
- infile_next_char(ppl->src);
- tok->kind = TokenKind_brace_r;
- } else if (c == '[') {
- infile_next_char(ppl->src);
- tok->kind = TokenKind_bracket_l;
- } else if (c == ']') {
- infile_next_char(ppl->src);
- tok->kind = TokenKind_bracket_r;
- } else if (c == ',') {
- infile_next_char(ppl->src);
- tok->kind = TokenKind_comma;
- } else if (c == ':') {
- infile_next_char(ppl->src);
- tok->kind = TokenKind_colon;
- } else if (c == ';') {
- infile_next_char(ppl->src);
- tok->kind = TokenKind_semicolon;
- } else if (c == '^') {
- infile_next_char(ppl->src);
- if (infile_consume_if(ppl->src, '=')) {
- tok->kind = TokenKind_assign_xor;
- } else {
- tok->kind = TokenKind_xor;
- }
- } else if (c == '?') {
- infile_next_char(ppl->src);
- tok->kind = TokenKind_question;
- } else if (c == '~') {
- infile_next_char(ppl->src);
- tok->kind = TokenKind_tilde;
- } else if (c == '+') {
- infile_next_char(ppl->src);
- if (infile_consume_if(ppl->src, '=')) {
- tok->kind = TokenKind_assign_add;
- } else if (infile_consume_if(ppl->src, '+')) {
- tok->kind = TokenKind_plusplus;
- } else {
- tok->kind = TokenKind_plus;
- }
- } else if (c == '|') {
- infile_next_char(ppl->src);
- if (infile_consume_if(ppl->src, '=')) {
- tok->kind = TokenKind_assign_or;
- } else if (infile_consume_if(ppl->src, '|')) {
- tok->kind = TokenKind_oror;
- } else {
- tok->kind = TokenKind_or;
- }
- } else if (c == '&') {
- infile_next_char(ppl->src);
- if (infile_consume_if(ppl->src, '=')) {
- tok->kind = TokenKind_assign_and;
- } else if (infile_consume_if(ppl->src, '&')) {
- tok->kind = TokenKind_andand;
- } else {
- tok->kind = TokenKind_and;
- }
- } else if (c == '-') {
- infile_next_char(ppl->src);
- if (infile_consume_if(ppl->src, '>')) {
- tok->kind = TokenKind_arrow;
- } else if (infile_consume_if(ppl->src, '=')) {
- tok->kind = TokenKind_assign_sub;
- } else if (infile_consume_if(ppl->src, '-')) {
- tok->kind = TokenKind_minusminus;
- } else {
- tok->kind = TokenKind_minus;
- }
- } else if (c == '*') {
- infile_next_char(ppl->src);
- if (infile_consume_if(ppl->src, '=')) {
- tok->kind = TokenKind_assign_mul;
- } else {
- tok->kind = TokenKind_star;
- }
- } else if (c == '/') {
- infile_next_char(ppl->src);
- if (infile_consume_if(ppl->src, '=')) {
- tok->kind = TokenKind_assign_div;
- } else if (infile_consume_if(ppl->src, '/')) {
- while (!infile_eof(ppl->src) && infile_peek_char(ppl->src) != '\n') {
- infile_next_char(ppl->src);
- }
- tok->kind = TokenKind_whitespace;
- } else if (infile_consume_if(ppl->src, '*')) {
- while (infile_peek_char(ppl->src)) {
- if (infile_consume_if(ppl->src, '*')) {
- if (infile_consume_if(ppl->src, '/')) {
- break;
- }
- continue;
- }
- infile_next_char(ppl->src);
- }
- tok->kind = TokenKind_whitespace;
- } else {
- tok->kind = TokenKind_slash;
- }
- } else if (c == '%') {
- infile_next_char(ppl->src);
- if (infile_consume_if(ppl->src, '=')) {
- tok->kind = TokenKind_assign_mod;
- } else {
- tok->kind = TokenKind_percent;
- }
- } else if (c == '.') {
- infile_next_char(ppl->src);
- if (infile_consume_if(ppl->src, '.')) {
- if (infile_consume_if(ppl->src, '.')) {
- tok->kind = TokenKind_ellipsis;
- } else {
- tok->kind = TokenKind_other;
- tok->value.string = "..";
- }
- } else {
- tok->kind = TokenKind_dot;
- }
- } else if (c == '!') {
- infile_next_char(ppl->src);
- if (infile_consume_if(ppl->src, '=')) {
- tok->kind = TokenKind_ne;
- } else {
- tok->kind = TokenKind_not;
- }
- } else if (c == '=') {
- infile_next_char(ppl->src);
- if (infile_consume_if(ppl->src, '=')) {
- tok->kind = TokenKind_eq;
- } else {
- tok->kind = TokenKind_assign;
- }
- } else if (c == '<') {
- infile_next_char(ppl->src);
- if (infile_consume_if(ppl->src, '=')) {
- tok->kind = TokenKind_le;
- } else if (infile_consume_if(ppl->src, '<')) {
- if (infile_consume_if(ppl->src, '=')) {
- tok->kind = TokenKind_assign_lshift;
- } else {
- tok->kind = TokenKind_lshift;
- }
- } else {
- tok->kind = TokenKind_lt;
- }
- } else if (c == '>') {
- infile_next_char(ppl->src);
- if (infile_consume_if(ppl->src, '=')) {
- tok->kind = TokenKind_ge;
- } else if (infile_consume_if(ppl->src, '>')) {
- if (infile_consume_if(ppl->src, '=')) {
- tok->kind = TokenKind_assign_rshift;
- } else {
- tok->kind = TokenKind_rshift;
- }
- } else {
- tok->kind = TokenKind_gt;
- }
- } else if (c == '#') {
- infile_next_char(ppl->src);
- if (infile_consume_if(ppl->src, '#')) {
- tok->kind = TokenKind_hashhash;
- } else {
- if (ppl->at_bol) {
- pplexer_tokenize_pp_directive(ppl, tok);
- } else {
- tok->kind = TokenKind_hash;
- }
- }
- } else if (c == '\'') {
- infile_next_char(ppl->src);
- StrBuilder builder;
- strbuilder_init(&builder);
- strbuilder_append_char(&builder, '\'');
- strbuilder_append_char(&builder, infile_peek_char(ppl->src));
- if (infile_peek_char(ppl->src) == '\\') {
- infile_next_char(ppl->src);
- strbuilder_append_char(&builder, infile_peek_char(ppl->src));
- }
- strbuilder_append_char(&builder, '\'');
- infile_next_char(ppl->src);
- infile_next_char(ppl->src);
- tok->kind = TokenKind_character_constant;
- tok->value.string = builder.buf;
- } else if (c == '"') {
- infile_next_char(ppl->src);
- StrBuilder builder;
- strbuilder_init(&builder);
- while (1) {
- char ch = infile_peek_char(ppl->src);
- if (ch == '"')
- break;
- strbuilder_append_char(&builder, ch);
- if (ch == '\\') {
- infile_next_char(ppl->src);
- strbuilder_append_char(&builder, infile_peek_char(ppl->src));
- }
- infile_next_char(ppl->src);
- }
- infile_next_char(ppl->src);
- tok->kind = TokenKind_literal_str;
- tok->value.string = builder.buf;
- } else if (isdigit(c)) {
- // TODO: implement tokenization of pp-number.
- StrBuilder builder;
- strbuilder_init(&builder);
- while (isalnum(infile_peek_char(ppl->src))) {
- strbuilder_append_char(&builder, infile_peek_char(ppl->src));
- infile_next_char(ppl->src);
- }
- tok->kind = TokenKind_literal_int;
- tok->value.integer = atoi(builder.buf);
- } else if (isalpha(c) || c == '_') {
- StrBuilder builder;
- strbuilder_init(&builder);
- while (isalnum(infile_peek_char(ppl->src)) || infile_peek_char(ppl->src) == '_') {
- strbuilder_append_char(&builder, infile_peek_char(ppl->src));
- infile_next_char(ppl->src);
- }
- tok->kind = TokenKind_ident;
- tok->value.string = builder.buf;
- } else if (c == '\n') {
- infile_next_char(ppl->src);
- tok->kind = TokenKind_newline;
- } else if (isspace(c)) {
- while (isspace((c = infile_peek_char(ppl->src)))) {
- if (c == '\n')
- break;
- infile_next_char(ppl->src);
- }
- if (ppl->at_bol && infile_peek_char(ppl->src) == '#') {
- infile_next_char(ppl->src);
- pplexer_tokenize_pp_directive(ppl, tok);
- } else {
- tok->kind = TokenKind_whitespace;
- }
- } else {
- infile_next_char(ppl->src);
- tok->kind = TokenKind_other;
- char* buf = calloc(2, sizeof(char));
- buf[0] = c;
- tok->value.string = buf;
- }
- ppl->at_bol = tok->kind == TokenKind_newline;
- }
- Token* eof_tok = tokens_push_new(ppl->pp_tokens);
- eof_tok->loc = ppl->src->loc;
- eof_tok->kind = TokenKind_eof;
-}
-
-static TokenArray* pp_tokenize(InFile* src) {
- PpLexer* ppl = pplexer_new(src);
- pplexer_tokenize_all(ppl);
- return ppl->pp_tokens;
-}
-
-typedef struct {
TokenArray* pp_tokens;
int pos;
MacroArray* macros;
@@ -1546,7 +1155,7 @@ static char* get_ducc_include_path() {
static TokenArray* do_preprocess(InFile* src, int depth, MacroArray* macros, StrArray* included_files,
StrArray* user_include_dirs) {
- TokenArray* pp_tokens = pp_tokenize(src);
+ TokenArray* pp_tokens = tokenize(src);
Preprocessor* pp = preprocessor_new(pp_tokens, depth, macros, included_files);
    // Ducc's built-in headers have the highest priority.
diff --git a/src/tokenize.c b/src/tokenize.c
index cb945e1..fbbc92a 100644
--- a/src/tokenize.c
+++ b/src/tokenize.c
@@ -1,30 +1,412 @@
#include "tokenize.h"
+#include <ctype.h>
#include <stdlib.h>
#include <string.h>
#include "common.h"
typedef struct {
- TokenArray* src;
+ InFile* src;
+ bool at_bol;
+ bool expect_header_name;
TokenArray* tokens;
} Lexer;
-static Lexer* lexer_new(TokenArray* pp_tokens) {
+static Lexer* lexer_new(InFile* src) {
Lexer* l = calloc(1, sizeof(Lexer));
- l->src = pp_tokens;
+
+ l->src = src;
+ l->at_bol = true;
+ l->expect_header_name = false;
l->tokens = calloc(1, sizeof(TokenArray));
- // l->tokens need not store whitespace tokens.
- tokens_init(l->tokens, pp_tokens->len / 2);
+ tokens_init(l->tokens, 1024 * 16);
+
return l;
}
-static void tokenize_all(Lexer* l) {
- for (size_t pos = 0; pos < l->src->len; ++pos) {
- Token* pp_tok = &l->src->data[pos];
+static void pplexer_tokenize_pp_directive(Lexer* l, Token* tok) {
+ // Skip whitespaces after '#'.
+ char c;
+ while (isspace((c = infile_peek_char(l->src)))) {
+ if (c == '\n')
+ break;
+ infile_next_char(l->src);
+ }
+ // '#' new-line
+ if (c == '\n') {
+ tok->kind = TokenKind_pp_directive_nop;
+ return;
+ }
+
+ StrBuilder builder;
+ strbuilder_init(&builder);
+ while (isalnum(infile_peek_char(l->src))) {
+ strbuilder_append_char(&builder, infile_peek_char(l->src));
+ infile_next_char(l->src);
+ }
+ const char* pp_directive_name = builder.buf;
+
+ if (builder.len == 0) {
+ tok->kind = TokenKind_hash;
+ } else if (strcmp(pp_directive_name, "define") == 0) {
+ tok->kind = TokenKind_pp_directive_define;
+ } else if (strcmp(pp_directive_name, "elif") == 0) {
+ tok->kind = TokenKind_pp_directive_elif;
+ } else if (strcmp(pp_directive_name, "elifdef") == 0) {
+ tok->kind = TokenKind_pp_directive_elifdef;
+ } else if (strcmp(pp_directive_name, "elifndef") == 0) {
+ tok->kind = TokenKind_pp_directive_elifndef;
+ } else if (strcmp(pp_directive_name, "else") == 0) {
+ tok->kind = TokenKind_pp_directive_else;
+ } else if (strcmp(pp_directive_name, "embed") == 0) {
+ tok->kind = TokenKind_pp_directive_embed;
+ } else if (strcmp(pp_directive_name, "endif") == 0) {
+ tok->kind = TokenKind_pp_directive_endif;
+ } else if (strcmp(pp_directive_name, "error") == 0) {
+ tok->kind = TokenKind_pp_directive_error;
+ } else if (strcmp(pp_directive_name, "if") == 0) {
+ tok->kind = TokenKind_pp_directive_if;
+ } else if (strcmp(pp_directive_name, "ifdef") == 0) {
+ tok->kind = TokenKind_pp_directive_ifdef;
+ } else if (strcmp(pp_directive_name, "ifndef") == 0) {
+ tok->kind = TokenKind_pp_directive_ifndef;
+ } else if (strcmp(pp_directive_name, "include") == 0) {
+ l->expect_header_name = true;
+ tok->kind = TokenKind_pp_directive_include;
+ } else if (strcmp(pp_directive_name, "line") == 0) {
+ tok->kind = TokenKind_pp_directive_line;
+ } else if (strcmp(pp_directive_name, "pragma") == 0) {
+ tok->kind = TokenKind_pp_directive_pragma;
+ } else if (strcmp(pp_directive_name, "undef") == 0) {
+ tok->kind = TokenKind_pp_directive_undef;
+ } else if (strcmp(pp_directive_name, "warning") == 0) {
+ tok->kind = TokenKind_pp_directive_warning;
+ } else {
+ tok->kind = TokenKind_pp_directive_non_directive;
+ tok->value.string = pp_directive_name;
+ }
+}
+
+static void do_tokenize_all(Lexer* l) {
+ while (!infile_eof(l->src)) {
+ Token* tok = tokens_push_new(l->tokens);
+ tok->loc = l->src->loc;
+ char c = infile_peek_char(l->src);
+
+ if (l->expect_header_name && c == '"') {
+ infile_next_char(l->src);
+ StrBuilder builder;
+ strbuilder_init(&builder);
+ strbuilder_append_char(&builder, '"');
+ while (1) {
+ char ch = infile_peek_char(l->src);
+ if (ch == '"')
+ break;
+ strbuilder_append_char(&builder, ch);
+ if (ch == '\\') {
+ infile_next_char(l->src);
+ strbuilder_append_char(&builder, infile_peek_char(l->src));
+ }
+ infile_next_char(l->src);
+ }
+ strbuilder_append_char(&builder, '"');
+ infile_next_char(l->src);
+ tok->kind = TokenKind_header_name;
+ tok->value.string = builder.buf;
+ l->expect_header_name = false;
+ } else if (l->expect_header_name && c == '<') {
+ infile_next_char(l->src);
+ StrBuilder builder;
+ strbuilder_init(&builder);
+ strbuilder_append_char(&builder, '<');
+ while (1) {
+ char ch = infile_peek_char(l->src);
+ if (ch == '>')
+ break;
+ strbuilder_append_char(&builder, ch);
+ infile_next_char(l->src);
+ }
+ strbuilder_append_char(&builder, '>');
+ infile_next_char(l->src);
+ tok->kind = TokenKind_header_name;
+ tok->value.string = builder.buf;
+ l->expect_header_name = false;
+ } else if (c == '(') {
+ infile_next_char(l->src);
+ tok->kind = TokenKind_paren_l;
+ } else if (c == ')') {
+ infile_next_char(l->src);
+ tok->kind = TokenKind_paren_r;
+ } else if (c == '{') {
+ infile_next_char(l->src);
+ tok->kind = TokenKind_brace_l;
+ } else if (c == '}') {
+ infile_next_char(l->src);
+ tok->kind = TokenKind_brace_r;
+ } else if (c == '[') {
+ infile_next_char(l->src);
+ tok->kind = TokenKind_bracket_l;
+ } else if (c == ']') {
+ infile_next_char(l->src);
+ tok->kind = TokenKind_bracket_r;
+ } else if (c == ',') {
+ infile_next_char(l->src);
+ tok->kind = TokenKind_comma;
+ } else if (c == ':') {
+ infile_next_char(l->src);
+ tok->kind = TokenKind_colon;
+ } else if (c == ';') {
+ infile_next_char(l->src);
+ tok->kind = TokenKind_semicolon;
+ } else if (c == '^') {
+ infile_next_char(l->src);
+ if (infile_consume_if(l->src, '=')) {
+ tok->kind = TokenKind_assign_xor;
+ } else {
+ tok->kind = TokenKind_xor;
+ }
+ } else if (c == '?') {
+ infile_next_char(l->src);
+ tok->kind = TokenKind_question;
+ } else if (c == '~') {
+ infile_next_char(l->src);
+ tok->kind = TokenKind_tilde;
+ } else if (c == '+') {
+ infile_next_char(l->src);
+ if (infile_consume_if(l->src, '=')) {
+ tok->kind = TokenKind_assign_add;
+ } else if (infile_consume_if(l->src, '+')) {
+ tok->kind = TokenKind_plusplus;
+ } else {
+ tok->kind = TokenKind_plus;
+ }
+ } else if (c == '|') {
+ infile_next_char(l->src);
+ if (infile_consume_if(l->src, '=')) {
+ tok->kind = TokenKind_assign_or;
+ } else if (infile_consume_if(l->src, '|')) {
+ tok->kind = TokenKind_oror;
+ } else {
+ tok->kind = TokenKind_or;
+ }
+ } else if (c == '&') {
+ infile_next_char(l->src);
+ if (infile_consume_if(l->src, '=')) {
+ tok->kind = TokenKind_assign_and;
+ } else if (infile_consume_if(l->src, '&')) {
+ tok->kind = TokenKind_andand;
+ } else {
+ tok->kind = TokenKind_and;
+ }
+ } else if (c == '-') {
+ infile_next_char(l->src);
+ if (infile_consume_if(l->src, '>')) {
+ tok->kind = TokenKind_arrow;
+ } else if (infile_consume_if(l->src, '=')) {
+ tok->kind = TokenKind_assign_sub;
+ } else if (infile_consume_if(l->src, '-')) {
+ tok->kind = TokenKind_minusminus;
+ } else {
+ tok->kind = TokenKind_minus;
+ }
+ } else if (c == '*') {
+ infile_next_char(l->src);
+ if (infile_consume_if(l->src, '=')) {
+ tok->kind = TokenKind_assign_mul;
+ } else {
+ tok->kind = TokenKind_star;
+ }
+ } else if (c == '/') {
+ infile_next_char(l->src);
+ if (infile_consume_if(l->src, '=')) {
+ tok->kind = TokenKind_assign_div;
+ } else if (infile_consume_if(l->src, '/')) {
+ while (!infile_eof(l->src) && infile_peek_char(l->src) != '\n') {
+ infile_next_char(l->src);
+ }
+ tok->kind = TokenKind_whitespace;
+ } else if (infile_consume_if(l->src, '*')) {
+ while (infile_peek_char(l->src)) {
+ if (infile_consume_if(l->src, '*')) {
+ if (infile_consume_if(l->src, '/')) {
+ break;
+ }
+ continue;
+ }
+ infile_next_char(l->src);
+ }
+ tok->kind = TokenKind_whitespace;
+ } else {
+ tok->kind = TokenKind_slash;
+ }
+ } else if (c == '%') {
+ infile_next_char(l->src);
+ if (infile_consume_if(l->src, '=')) {
+ tok->kind = TokenKind_assign_mod;
+ } else {
+ tok->kind = TokenKind_percent;
+ }
+ } else if (c == '.') {
+ infile_next_char(l->src);
+ if (infile_consume_if(l->src, '.')) {
+ if (infile_consume_if(l->src, '.')) {
+ tok->kind = TokenKind_ellipsis;
+ } else {
+ tok->kind = TokenKind_other;
+ tok->value.string = "..";
+ }
+ } else {
+ tok->kind = TokenKind_dot;
+ }
+ } else if (c == '!') {
+ infile_next_char(l->src);
+ if (infile_consume_if(l->src, '=')) {
+ tok->kind = TokenKind_ne;
+ } else {
+ tok->kind = TokenKind_not;
+ }
+ } else if (c == '=') {
+ infile_next_char(l->src);
+ if (infile_consume_if(l->src, '=')) {
+ tok->kind = TokenKind_eq;
+ } else {
+ tok->kind = TokenKind_assign;
+ }
+ } else if (c == '<') {
+ infile_next_char(l->src);
+ if (infile_consume_if(l->src, '=')) {
+ tok->kind = TokenKind_le;
+ } else if (infile_consume_if(l->src, '<')) {
+ if (infile_consume_if(l->src, '=')) {
+ tok->kind = TokenKind_assign_lshift;
+ } else {
+ tok->kind = TokenKind_lshift;
+ }
+ } else {
+ tok->kind = TokenKind_lt;
+ }
+ } else if (c == '>') {
+ infile_next_char(l->src);
+ if (infile_consume_if(l->src, '=')) {
+ tok->kind = TokenKind_ge;
+ } else if (infile_consume_if(l->src, '>')) {
+ if (infile_consume_if(l->src, '=')) {
+ tok->kind = TokenKind_assign_rshift;
+ } else {
+ tok->kind = TokenKind_rshift;
+ }
+ } else {
+ tok->kind = TokenKind_gt;
+ }
+ } else if (c == '#') {
+ infile_next_char(l->src);
+ if (infile_consume_if(l->src, '#')) {
+ tok->kind = TokenKind_hashhash;
+ } else {
+ if (l->at_bol) {
+ pplexer_tokenize_pp_directive(l, tok);
+ } else {
+ tok->kind = TokenKind_hash;
+ }
+ }
+ } else if (c == '\'') {
+ infile_next_char(l->src);
+ StrBuilder builder;
+ strbuilder_init(&builder);
+ strbuilder_append_char(&builder, '\'');
+ strbuilder_append_char(&builder, infile_peek_char(l->src));
+ if (infile_peek_char(l->src) == '\\') {
+ infile_next_char(l->src);
+ strbuilder_append_char(&builder, infile_peek_char(l->src));
+ }
+ strbuilder_append_char(&builder, '\'');
+ infile_next_char(l->src);
+ infile_next_char(l->src);
+ tok->kind = TokenKind_character_constant;
+ tok->value.string = builder.buf;
+ } else if (c == '"') {
+ infile_next_char(l->src);
+ StrBuilder builder;
+ strbuilder_init(&builder);
+ while (1) {
+ char ch = infile_peek_char(l->src);
+ if (ch == '"')
+ break;
+ strbuilder_append_char(&builder, ch);
+ if (ch == '\\') {
+ infile_next_char(l->src);
+ strbuilder_append_char(&builder, infile_peek_char(l->src));
+ }
+ infile_next_char(l->src);
+ }
+ infile_next_char(l->src);
+ tok->kind = TokenKind_literal_str;
+ tok->value.string = builder.buf;
+ } else if (isdigit(c)) {
+ // TODO: implement tokenization of pp-number.
+ StrBuilder builder;
+ strbuilder_init(&builder);
+ while (isalnum(infile_peek_char(l->src))) {
+ strbuilder_append_char(&builder, infile_peek_char(l->src));
+ infile_next_char(l->src);
+ }
+ tok->kind = TokenKind_literal_int;
+ tok->value.integer = atoi(builder.buf);
+ } else if (isalpha(c) || c == '_') {
+ StrBuilder builder;
+ strbuilder_init(&builder);
+ while (isalnum(infile_peek_char(l->src)) || infile_peek_char(l->src) == '_') {
+ strbuilder_append_char(&builder, infile_peek_char(l->src));
+ infile_next_char(l->src);
+ }
+ tok->kind = TokenKind_ident;
+ tok->value.string = builder.buf;
+ } else if (c == '\n') {
+ infile_next_char(l->src);
+ tok->kind = TokenKind_newline;
+ } else if (isspace(c)) {
+ while (isspace((c = infile_peek_char(l->src)))) {
+ if (c == '\n')
+ break;
+ infile_next_char(l->src);
+ }
+ if (l->at_bol && infile_peek_char(l->src) == '#') {
+ infile_next_char(l->src);
+ pplexer_tokenize_pp_directive(l, tok);
+ } else {
+ tok->kind = TokenKind_whitespace;
+ }
+ } else {
+ infile_next_char(l->src);
+ tok->kind = TokenKind_other;
+ char* buf = calloc(2, sizeof(char));
+ buf[0] = c;
+ tok->value.string = buf;
+ }
+ l->at_bol = tok->kind == TokenKind_newline;
+ }
+ Token* eof_tok = tokens_push_new(l->tokens);
+ eof_tok->loc = l->src->loc;
+ eof_tok->kind = TokenKind_eof;
+}
+
+TokenArray* tokenize(InFile* src) {
+ Lexer* l = lexer_new(src);
+ do_tokenize_all(l);
+ return l->tokens;
+}
+
+TokenArray* convert_pp_tokens_to_tokens(TokenArray* pp_tokens) {
+ TokenArray* tokens = calloc(1, sizeof(TokenArray));
+ // tokens need not store whitespace tokens.
+ tokens_init(tokens, pp_tokens->len / 2);
+
+ for (size_t pos = 0; pos < pp_tokens->len; ++pos) {
+ Token* pp_tok = &pp_tokens->data[pos];
TokenKind k = pp_tok->kind;
if (k == TokenKind_removed || k == TokenKind_whitespace || k == TokenKind_newline) {
continue;
}
- Token* tok = tokens_push_new(l->tokens);
+ Token* tok = tokens_push_new(tokens);
tok->loc = pp_tok->loc;
if (k == TokenKind_character_constant) {
tok->kind = TokenKind_literal_int;
@@ -170,10 +552,6 @@ static void tokenize_all(Lexer* l) {
tok->value = pp_tok->value;
}
}
-}
-TokenArray* tokenize(TokenArray* pp_tokens) {
- Lexer* l = lexer_new(pp_tokens);
- tokenize_all(l);
- return l->tokens;
+ return tokens;
}
diff --git a/src/tokenize.h b/src/tokenize.h
index 2e28335..fd334a1 100644
--- a/src/tokenize.h
+++ b/src/tokenize.h
@@ -1,8 +1,10 @@
#ifndef DUCC_TOKENIZE_H
#define DUCC_TOKENIZE_H
-#include "preprocess.h"
+#include "io.h"
+#include "token.h"
-TokenArray* tokenize(TokenArray* pp_tokens);
+TokenArray* tokenize(InFile* src);
+TokenArray* convert_pp_tokens_to_tokens(TokenArray* pp_tokens);
#endif