aboutsummaryrefslogtreecommitdiffhomepage
path: root/tokenize.c
blob: 283755eb7f54aa4c58c5aa7d8e392a72d3279ac5 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
// State for the conversion of preprocessing tokens into tokens.
struct Lexer {
    // Input: the preprocessing tokens handed to lexer_new().
    TokenArray* src;
    // Output: the array allocated by lexer_new() and filled by tokenize_all().
    TokenArray* tokens;
};
typedef struct Lexer Lexer;

// Allocates a Lexer that reads from pp_tokens and owns a freshly
// allocated, initialized output token array.
Lexer* lexer_new(TokenArray* pp_tokens) {
    Lexer* lexer = calloc(1, sizeof(Lexer));
    TokenArray* out = calloc(1, sizeof(TokenArray));

    // Whitespace tokens are dropped during tokenization, so the output is
    // initialized with half of the input length.
    int initial_size = pp_tokens->len / 2;
    tokens_init(out, initial_size);

    lexer->src = pp_tokens;
    lexer->tokens = out;
    return lexer;
}

// Converts every preprocessing token in l->src into a token appended to
// l->tokens.
//
//  - Whitespace and newline tokens are skipped.
//  - Character constants become TokenKind_literal_int tokens whose raw text
//    is the decimal value of the character.
//  - String literals keep their kind; the first and last character of the
//    raw text (presumably the surrounding quotes) are stripped.
//  - Identifiers that spell a keyword get the matching keyword kind; their
//    raw text is not assigned here. "va_start" gets its own kind and keeps
//    its raw text. Any other identifier stays TokenKind_ident.
//  - TokenKind_other must never reach this function.
//  - Every remaining kind is copied through unchanged.
void tokenize_all(Lexer* l) {
    for (int pos = 0; pos < l->src->len; ++pos) {
        Token* pp_tok = &l->src->data[pos];
        TokenKind k = pp_tok->kind;
        if (k == TokenKind_whitespace || k == TokenKind_newline) {
            continue;
        }
        Token* tok = tokens_push_new(l->tokens);
        tok->loc = pp_tok->loc;
        if (k == TokenKind_character_constant) {
            tok->kind = TokenKind_literal_int;
            // raw.data[0] is presumably the opening quote, so the character
            // itself starts at index 1.
            int ch = pp_tok->raw.data[1];
            if (ch == '\\') {
                // Simple escape sequence. Escapes not listed below (such as
                // \\ or \') evaluate to the character that follows the
                // backslash.
                // NOTE(review): multi-character escapes such as \x41 or \101
                // are not decoded; only the first character after the
                // backslash is looked at.
                ch = pp_tok->raw.data[2];
                if (ch == 'a') {
                    ch = '\a';
                } else if (ch == 'b') {
                    ch = '\b';
                } else if (ch == 'f') {
                    ch = '\f';
                } else if (ch == 'n') {
                    ch = '\n';
                } else if (ch == 'r') {
                    ch = '\r';
                } else if (ch == 't') {
                    ch = '\t';
                } else if (ch == 'v') {
                    ch = '\v';
                } else if (ch == '0') {
                    ch = '\0';
                }
            }
            // ch is read from a char, which may be signed: a byte such as
            // 0x80 prints as "-128" and needs 5 bytes including the
            // terminator. 12 bytes hold the decimal text of any 32-bit int
            // ("-2147483648" plus the terminating NUL).
            char* buf = calloc(12, sizeof(char));
            sprintf(buf, "%d", ch);
            tok->raw.data = buf;
            tok->raw.len = strlen(buf);
        } else if (k == TokenKind_literal_str) {
            tok->kind = TokenKind_literal_str;
            // Point into the original text, skipping the first and the last
            // character.
            tok->raw.data = pp_tok->raw.data + 1;
            tok->raw.len = pp_tok->raw.len - 2;
        } else if (k == TokenKind_ident) {
            if (string_equals_cstr(&pp_tok->raw, "auto")) {
                tok->kind = TokenKind_keyword_auto;
            } else if (string_equals_cstr(&pp_tok->raw, "break")) {
                tok->kind = TokenKind_keyword_break;
            } else if (string_equals_cstr(&pp_tok->raw, "case")) {
                tok->kind = TokenKind_keyword_case;
            } else if (string_equals_cstr(&pp_tok->raw, "char")) {
                tok->kind = TokenKind_keyword_char;
            } else if (string_equals_cstr(&pp_tok->raw, "const")) {
                tok->kind = TokenKind_keyword_const;
            } else if (string_equals_cstr(&pp_tok->raw, "continue")) {
                tok->kind = TokenKind_keyword_continue;
            } else if (string_equals_cstr(&pp_tok->raw, "default")) {
                tok->kind = TokenKind_keyword_default;
            } else if (string_equals_cstr(&pp_tok->raw, "do")) {
                tok->kind = TokenKind_keyword_do;
            } else if (string_equals_cstr(&pp_tok->raw, "double")) {
                tok->kind = TokenKind_keyword_double;
            } else if (string_equals_cstr(&pp_tok->raw, "else")) {
                tok->kind = TokenKind_keyword_else;
            } else if (string_equals_cstr(&pp_tok->raw, "enum")) {
                tok->kind = TokenKind_keyword_enum;
            } else if (string_equals_cstr(&pp_tok->raw, "extern")) {
                tok->kind = TokenKind_keyword_extern;
            } else if (string_equals_cstr(&pp_tok->raw, "float")) {
                tok->kind = TokenKind_keyword_float;
            } else if (string_equals_cstr(&pp_tok->raw, "for")) {
                tok->kind = TokenKind_keyword_for;
            } else if (string_equals_cstr(&pp_tok->raw, "goto")) {
                tok->kind = TokenKind_keyword_goto;
            } else if (string_equals_cstr(&pp_tok->raw, "if")) {
                tok->kind = TokenKind_keyword_if;
            } else if (string_equals_cstr(&pp_tok->raw, "inline")) {
                tok->kind = TokenKind_keyword_inline;
            } else if (string_equals_cstr(&pp_tok->raw, "int")) {
                tok->kind = TokenKind_keyword_int;
            } else if (string_equals_cstr(&pp_tok->raw, "long")) {
                tok->kind = TokenKind_keyword_long;
            } else if (string_equals_cstr(&pp_tok->raw, "register")) {
                tok->kind = TokenKind_keyword_register;
            } else if (string_equals_cstr(&pp_tok->raw, "restrict")) {
                tok->kind = TokenKind_keyword_restrict;
            } else if (string_equals_cstr(&pp_tok->raw, "return")) {
                tok->kind = TokenKind_keyword_return;
            } else if (string_equals_cstr(&pp_tok->raw, "short")) {
                tok->kind = TokenKind_keyword_short;
            } else if (string_equals_cstr(&pp_tok->raw, "signed")) {
                tok->kind = TokenKind_keyword_signed;
            } else if (string_equals_cstr(&pp_tok->raw, "sizeof")) {
                tok->kind = TokenKind_keyword_sizeof;
            } else if (string_equals_cstr(&pp_tok->raw, "static")) {
                tok->kind = TokenKind_keyword_static;
            } else if (string_equals_cstr(&pp_tok->raw, "struct")) {
                tok->kind = TokenKind_keyword_struct;
            } else if (string_equals_cstr(&pp_tok->raw, "switch")) {
                tok->kind = TokenKind_keyword_switch;
            } else if (string_equals_cstr(&pp_tok->raw, "typedef")) {
                tok->kind = TokenKind_keyword_typedef;
            } else if (string_equals_cstr(&pp_tok->raw, "union")) {
                tok->kind = TokenKind_keyword_union;
            } else if (string_equals_cstr(&pp_tok->raw, "unsigned")) {
                tok->kind = TokenKind_keyword_unsigned;
            } else if (string_equals_cstr(&pp_tok->raw, "void")) {
                tok->kind = TokenKind_keyword_void;
            } else if (string_equals_cstr(&pp_tok->raw, "volatile")) {
                tok->kind = TokenKind_keyword_volatile;
            } else if (string_equals_cstr(&pp_tok->raw, "while")) {
                tok->kind = TokenKind_keyword_while;
            } else if (string_equals_cstr(&pp_tok->raw, "_Bool")) {
                tok->kind = TokenKind_keyword__Bool;
            } else if (string_equals_cstr(&pp_tok->raw, "_Complex")) {
                tok->kind = TokenKind_keyword__Complex;
            } else if (string_equals_cstr(&pp_tok->raw, "_Imaginary")) {
                tok->kind = TokenKind_keyword__Imaginary;
            } else if (string_equals_cstr(&pp_tok->raw, "va_start")) {
                // Not a keyword: gets a dedicated kind but keeps its
                // spelling.
                tok->kind = TokenKind_va_start;
                tok->raw = pp_tok->raw;
            } else {
                tok->kind = TokenKind_ident;
                tok->raw = pp_tok->raw;
            }
        } else if (k == TokenKind_other) {
            unreachable();
        } else {
            // Every other kind is passed through unchanged.
            tok->kind = pp_tok->kind;
            tok->raw = pp_tok->raw;
        }
    }
}

// Entry point: builds a Lexer over pp_tokens, runs it over the whole input
// and returns the resulting token array.
// The Lexer object itself is not released here; only its output array is
// handed back to the caller.
TokenArray* tokenize(TokenArray* pp_tokens) {
    Lexer* lexer = lexer_new(pp_tokens);
    tokenize_all(lexer);
    TokenArray* result = lexer->tokens;
    return result;
}