From 48d9ec8aef4c3e7f3574346a6cf6a1fa3d725561 Mon Sep 17 00:00:00 2001 From: nsfisis Date: Sun, 25 Jan 2026 18:09:51 +0900 Subject: refactor term parsing --- src/jq/compile.zig | 7 +- src/jq/parse.zig | 141 ++++++++++++++++---------------------- src/jq/tokenize.zig | 193 ++++++++++++++++++++++++++++++++++++---------------- 3 files changed, 197 insertions(+), 144 deletions(-) (limited to 'src') diff --git a/src/jq/compile.zig b/src/jq/compile.zig index ec6ef63..fb2a691 100644 --- a/src/jq/compile.zig +++ b/src/jq/compile.zig @@ -68,9 +68,12 @@ fn compileExpr(allocator: std.mem.Allocator, compile_allocator: std.mem.Allocato switch (ast.*) { .identity => try instrs.append(allocator, .nop), - .array_index => |index| { - const index_instrs = try compileExpr(allocator, compile_allocator, index); + .array_index => |arr_idx| { + const base_instrs = try compileExpr(allocator, compile_allocator, arr_idx.base); + defer allocator.free(base_instrs); + const index_instrs = try compileExpr(allocator, compile_allocator, arr_idx.index); defer allocator.free(index_instrs); + try instrs.appendSlice(allocator, base_instrs); try instrs.append(allocator, .subexp_begin); try instrs.appendSlice(allocator, index_instrs); try instrs.append(allocator, .subexp_end); diff --git a/src/jq/parse.zig b/src/jq/parse.zig index 6a76861..5df1d14 100644 --- a/src/jq/parse.zig +++ b/src/jq/parse.zig @@ -45,7 +45,7 @@ pub const BinaryOp = enum { pub const Ast = union(AstKind) { identity, - array_index: *Ast, + array_index: struct { base: *Ast, index: *Ast }, object_key: []const u8, literal: *jv.Value, binary_expr: struct { op: BinaryOp, lhs: *Ast, rhs: *Ast }, @@ -300,101 +300,75 @@ fn parseExpr7(allocator: std.mem.Allocator, parse_allocator: std.mem.Allocator, } fn parseTerm(allocator: std.mem.Allocator, parse_allocator: std.mem.Allocator, tokens: *TokenStream) !*Ast { - const first_token = try tokens.peek(); - if (first_token.kind() == .number) { - _ = try tokens.next(); - const number_value = try allocator.create(jv.Value); - const f = first_token.number; - const i: i64 = @intFromFloat(f); - if (@as(f64, @floatFromInt(i)) == f) { - number_value.* = .{ .integer = i }; + var result = try parsePrimary(allocator, parse_allocator, tokens); + while (true) { + const token = tokens.peek() catch return result; + if (token.kind() == .bracket_left) { + result = try parseSuffix(allocator, parse_allocator, tokens, result); } else { - number_value.* = .{ .float = f }; + break; } - const number_node = try parse_allocator.create(Ast); - number_node.* = .{ .literal = number_value }; - return number_node; - } - - if (first_token.kind() == .keyword_null) { - _ = try tokens.next(); - const null_value = try allocator.create(jv.Value); - null_value.* = .null; - const null_node = try parse_allocator.create(Ast); - null_node.* = .{ .literal = null_value }; - return null_node; - } - - if (first_token.kind() == .keyword_true) { - _ = try tokens.next(); - const true_value = try allocator.create(jv.Value); - true_value.* = .{ .bool = true }; - const true_node = try parse_allocator.create(Ast); - true_node.* = .{ .literal = true_value }; - return true_node; - } - - if (first_token.kind() == .keyword_false) { - _ = try tokens.next(); - const false_value = try allocator.create(jv.Value); - false_value.* = .{ .bool = false }; - const false_node = try parse_allocator.create(Ast); - false_node.* = .{ .literal = false_value }; - return false_node; } + return result; +} - _ = try tokens.expect(.dot); - - const next_token = try tokens.peek(); - switch (next_token.kind()) { - .identifier => { - return parseFieldAccess(allocator, parse_allocator, tokens); +fn parsePrimary(allocator: std.mem.Allocator, parse_allocator: std.mem.Allocator, tokens: *TokenStream) !*Ast { + const first_token = try tokens.peek(); + switch (first_token) { + .keyword_null => { + _ = try tokens.next(); + const null_value = try allocator.create(jv.Value); + null_value.* = .null; + const null_node = try parse_allocator.create(Ast); + null_node.* = .{ .literal = null_value }; + return null_node; + }, + .keyword_true => { + _ = try tokens.next(); + const true_value = try allocator.create(jv.Value); + true_value.* = .{ .bool = true }; + const true_node = try parse_allocator.create(Ast); + true_node.* = .{ .literal = true_value }; + return true_node; + }, + .keyword_false => { + _ = try tokens.next(); + const false_value = try allocator.create(jv.Value); + false_value.* = .{ .bool = false }; + const false_node = try parse_allocator.create(Ast); + false_node.* = .{ .literal = false_value }; + return false_node; }, - .bracket_left => { - return parseIndexAccess(allocator, parse_allocator, tokens); + .number => |f| { + _ = try tokens.next(); + const number_value = try allocator.create(jv.Value); + const i: i64 = @intFromFloat(f); + if (@as(f64, @floatFromInt(i)) == f) { + number_value.* = .{ .integer = i }; + } else { + number_value.* = .{ .float = f }; + } + const number_node = try parse_allocator.create(Ast); + number_node.* = .{ .literal = number_value }; + return number_node; }, - .end, - .pipe, - .comma, - .slash_slash, - .equal, - .pipe_equal, - .slash_slash_equal, - .plus_equal, - .minus_equal, - .asterisk_equal, - .slash_equal, - .percent_equal, - .keyword_or, - .keyword_and, - .equal_equal, - .not_equal, - .less_than, - .greater_than, - .less_than_equal, - .greater_than_equal, - .plus, - .minus, - .asterisk, - .slash, - .percent, - => { + .dot => { + _ = try tokens.next(); const ast = try parse_allocator.create(Ast); ast.* = .identity; return ast; }, + .field => |name| { + _ = try tokens.next(); + const ast = try parse_allocator.create(Ast); + ast.* = .{ .object_key = try allocator.dupe(u8, name) }; + return ast; + }, else => return error.InvalidQuery, } } -fn parseFieldAccess(allocator: std.mem.Allocator, parse_allocator: std.mem.Allocator, tokens: *TokenStream) !*Ast { - const token = try tokens.expect(.identifier); - const ast = try parse_allocator.create(Ast); - ast.* = .{ .object_key = try allocator.dupe(u8, token.identifier) }; - return ast; -} - -fn parseIndexAccess(allocator: std.mem.Allocator, parse_allocator: std.mem.Allocator, tokens: *TokenStream) !*Ast { +fn parseSuffix(allocator: std.mem.Allocator, parse_allocator: std.mem.Allocator, tokens: *TokenStream, base: *Ast) !*Ast { _ = try tokens.expect(.bracket_left); const index_token = try tokens.expect(.number); _ = try tokens.expect(.bracket_right); @@ -403,7 +377,8 @@ fn parseIndexAccess(allocator: std.mem.Allocator, parse_allocator: std.mem.Alloc index_value.* = .{ .integer = @intFromFloat(index_token.number) }; const index_node = try parse_allocator.create(Ast); index_node.* = .{ .literal = index_value }; + const ast = try parse_allocator.create(Ast); - ast.* = .{ .array_index = index_node }; + ast.* = .{ .array_index = .{ .base = base, .index = index_node } }; return ast; } diff --git a/src/jq/tokenize.zig b/src/jq/tokenize.zig index 60643de..f5e1a70 100644 --- a/src/jq/tokenize.zig +++ b/src/jq/tokenize.zig @@ -74,6 +74,7 @@ pub const TokenKind = enum { number, string, format, + field, }; pub const Token = union(TokenKind) { @@ -141,6 +142,7 @@ pub const Token = union(TokenKind) { number: f64, string: []const u8, format: []const u8, + field: []const u8, pub fn kind(self: @This()) TokenKind { return self; @@ -242,6 +244,22 @@ fn tokenizeIdentifier(allocator: std.mem.Allocator, reader: *std.Io.Reader, firs return buffer.toOwnedSlice(allocator); } +fn tokenizeField(allocator: std.mem.Allocator, reader: *std.Io.Reader, first: u8) ![]const u8 { + var buffer = try std.ArrayList(u8).initCapacity(allocator, 16); + try buffer.append(allocator, first); + + while (try peekByte(reader)) |c| { + if (isIdentifierContinue(c)) { + try buffer.append(allocator, c); + reader.toss(1); + } else { + break; + } + } + + return buffer.toOwnedSlice(allocator); +} + fn tokenizeNumber(allocator: std.mem.Allocator, reader: *std.Io.Reader, first: u8) !f64 { var buffer = try std.ArrayList(u8).initCapacity(allocator, 16); try buffer.append(allocator, first); @@ -484,7 +502,18 @@ pub fn tokenize(allocator: std.mem.Allocator, reader: *std.Io.Reader) ![]Token { '+' => if (try takeByteIf(reader, '=')) .plus_equal else .plus, ',' => .comma, '-' => if (try takeByteIf(reader, '=')) .minus_equal else .minus, - '.' => if (try takeByteIf(reader, '.')) .dot_dot else .dot, + '.' => blk: { + if (try takeByteIf(reader, '.')) { + break :blk .dot_dot; + } + if (try peekByte(reader)) |next| { + if (isIdentifierStart(next)) { + reader.toss(1); + break :blk Token{ .field = try tokenizeField(allocator, reader, next) }; + } + } + break :blk .dot; + }, '/' => if (try takeByteIf(reader, '/')) if (try takeByteIf(reader, '=')) .slash_slash_equal else .slash_slash else if (try takeByteIf(reader, '=')) @@ -679,12 +708,11 @@ test "tokenize identifier in complex query" { var reader = std.Io.Reader.fixed(".foo | bar::baz"); const tokens = try tokenize(allocator.allocator(), &reader); - try std.testing.expectEqual(5, tokens.len); - try std.testing.expectEqual(.dot, tokens[0]); - try std.testing.expectEqualStrings("foo", tokens[1].identifier); - try std.testing.expectEqual(.pipe, tokens[2]); - try std.testing.expectEqualStrings("bar::baz", tokens[3].identifier); - try std.testing.expectEqual(.end, tokens[4]); + try std.testing.expectEqual(4, tokens.len); + try std.testing.expectEqualStrings("foo", tokens[0].field); + try std.testing.expectEqual(.pipe, tokens[1]); + try std.testing.expectEqualStrings("bar::baz", tokens[2].identifier); + try std.testing.expectEqual(.end, tokens[3]); } test "tokenize keywords" { @@ -751,12 +779,11 @@ test "tokenize with comments" { ); const tokens = try tokenize(allocator.allocator(), &reader); - try std.testing.expectEqual(5, tokens.len); - try std.testing.expectEqual(.dot, tokens[0]); - try std.testing.expectEqualStrings("foo", tokens[1].identifier); - try std.testing.expectEqual(.pipe, tokens[2]); - try std.testing.expectEqualStrings("bar", tokens[3].identifier); - try std.testing.expectEqual(.end, tokens[4]); + try std.testing.expectEqual(4, tokens.len); + try std.testing.expectEqualStrings("foo", tokens[0].field); + try std.testing.expectEqual(.pipe, tokens[1]); + try std.testing.expectEqualStrings("bar", tokens[2].identifier); + try std.testing.expectEqual(.end, tokens[3]); } test "tokenize comment at end of input" { @@ -766,10 +793,9 @@ test "tokenize comment at end of input" { var reader = std.Io.Reader.fixed(".foo # comment without newline"); const tokens = try tokenize(allocator.allocator(), &reader); - try std.testing.expectEqual(3, tokens.len); - try std.testing.expectEqual(.dot, tokens[0]); - try std.testing.expectEqualStrings("foo", tokens[1].identifier); - try std.testing.expectEqual(.end, tokens[2]); + try std.testing.expectEqual(2, tokens.len); + try std.testing.expectEqualStrings("foo", tokens[0].field); + try std.testing.expectEqual(.end, tokens[1]); } test "tokenize comment with line continuation" { @@ -783,12 +809,11 @@ test "tokenize comment with line continuation" { ); const tokens = try tokenize(allocator.allocator(), &reader); - try std.testing.expectEqual(5, tokens.len); - try std.testing.expectEqual(.dot, tokens[0]); - try std.testing.expectEqualStrings("foo", tokens[1].identifier); - try std.testing.expectEqual(.pipe, tokens[2]); - try std.testing.expectEqualStrings("bar", tokens[3].identifier); - try std.testing.expectEqual(.end, tokens[4]); + try std.testing.expectEqual(4, tokens.len); + try std.testing.expectEqualStrings("foo", tokens[0].field); + try std.testing.expectEqual(.pipe, tokens[1]); + try std.testing.expectEqualStrings("bar", tokens[2].identifier); + try std.testing.expectEqual(.end, tokens[3]); } test "tokenize comment with escaped backslash before newline" { @@ -802,12 +827,11 @@ test "tokenize comment with escaped backslash before newline" { ); const tokens = try tokenize(allocator.allocator(), &reader); - try std.testing.expectEqual(5, tokens.len); - try std.testing.expectEqual(.dot, tokens[0]); - try std.testing.expectEqualStrings("foo", tokens[1].identifier); - try std.testing.expectEqual(.pipe, tokens[2]); - try std.testing.expectEqualStrings("bar", tokens[3].identifier); - try std.testing.expectEqual(.end, tokens[4]); + try std.testing.expectEqual(4, tokens.len); + try std.testing.expectEqualStrings("foo", tokens[0].field); + try std.testing.expectEqual(.pipe, tokens[1]); + try std.testing.expectEqualStrings("bar", tokens[2].identifier); + try std.testing.expectEqual(.end, tokens[3]); } test "tokenize comment with three backslashes before newline" { @@ -822,12 +846,11 @@ test "tokenize comment with three backslashes before newline" { ); const tokens = try tokenize(allocator.allocator(), &reader); - try std.testing.expectEqual(5, tokens.len); - try std.testing.expectEqual(.dot, tokens[0]); - try std.testing.expectEqualStrings("foo", tokens[1].identifier); - try std.testing.expectEqual(.pipe, tokens[2]); - try std.testing.expectEqualStrings("bar", tokens[3].identifier); - try std.testing.expectEqual(.end, tokens[4]); + try std.testing.expectEqual(4, tokens.len); + try std.testing.expectEqualStrings("foo", tokens[0].field); + try std.testing.expectEqual(.pipe, tokens[1]); + try std.testing.expectEqualStrings("bar", tokens[2].identifier); + try std.testing.expectEqual(.end, tokens[3]); } test "tokenize comment with CRLF" { @@ -837,12 +860,11 @@ test "tokenize comment with CRLF" { var reader = std.Io.Reader.fixed(".foo # comment\r\n| bar"); const tokens = try tokenize(allocator.allocator(), &reader); - try std.testing.expectEqual(5, tokens.len); - try std.testing.expectEqual(.dot, tokens[0]); - try std.testing.expectEqualStrings("foo", tokens[1].identifier); - try std.testing.expectEqual(.pipe, tokens[2]); - try std.testing.expectEqualStrings("bar", tokens[3].identifier); - try std.testing.expectEqual(.end, tokens[4]); + try std.testing.expectEqual(4, tokens.len); + try std.testing.expectEqualStrings("foo", tokens[0].field); + try std.testing.expectEqual(.pipe, tokens[1]); + try std.testing.expectEqualStrings("bar", tokens[2].identifier); + try std.testing.expectEqual(.end, tokens[3]); } test "tokenize comment with line continuation before CRLF" { @@ -852,12 +874,11 @@ test "tokenize comment with line continuation before CRLF" { var reader = std.Io.Reader.fixed(".foo # comment \\\r\nthis is also comment\r\n| bar"); const tokens = try tokenize(allocator.allocator(), &reader); - try std.testing.expectEqual(5, tokens.len); - try std.testing.expectEqual(.dot, tokens[0]); - try std.testing.expectEqualStrings("foo", tokens[1].identifier); - try std.testing.expectEqual(.pipe, tokens[2]); - try std.testing.expectEqualStrings("bar", tokens[3].identifier); - try std.testing.expectEqual(.end, tokens[4]); + try std.testing.expectEqual(4, tokens.len); + try std.testing.expectEqualStrings("foo", tokens[0].field); + try std.testing.expectEqual(.pipe, tokens[1]); + try std.testing.expectEqualStrings("bar", tokens[2].identifier); + try std.testing.expectEqual(.end, tokens[3]); } test "tokenize comment with single CR does not end comment" { @@ -867,12 +888,11 @@ test "tokenize comment with single CR does not end comment" { var reader = std.Io.Reader.fixed(".foo # comment\r| bar\n| baz"); const tokens = try tokenize(allocator.allocator(), &reader); - try std.testing.expectEqual(5, tokens.len); - try std.testing.expectEqual(.dot, tokens[0]); - try std.testing.expectEqualStrings("foo", tokens[1].identifier); - try std.testing.expectEqual(.pipe, tokens[2]); - try std.testing.expectEqualStrings("baz", tokens[3].identifier); - try std.testing.expectEqual(.end, tokens[4]); + try std.testing.expectEqual(4, tokens.len); + try std.testing.expectEqualStrings("foo", tokens[0].field); + try std.testing.expectEqual(.pipe, tokens[1]); + try std.testing.expectEqualStrings("baz", tokens[2].identifier); + try std.testing.expectEqual(.end, tokens[3]); } test "tokenize floating point numbers" { @@ -986,12 +1006,11 @@ test "tokenize format in expression" { var reader = std.Io.Reader.fixed(".foo | @base64"); const tokens = try tokenize(allocator.allocator(), &reader); - try std.testing.expectEqual(5, tokens.len); - try std.testing.expectEqual(.dot, tokens[0]); - try std.testing.expectEqualStrings("foo", tokens[1].identifier); - try std.testing.expectEqual(.pipe, tokens[2]); - try std.testing.expectEqualStrings("base64", tokens[3].format); - try std.testing.expectEqual(.end, tokens[4]); + try std.testing.expectEqual(4, tokens.len); + try std.testing.expectEqualStrings("foo", tokens[0].field); + try std.testing.expectEqual(.pipe, tokens[1]); + try std.testing.expectEqualStrings("base64", tokens[2].format); + try std.testing.expectEqual(.end, tokens[3]); } test "tokenize format invalid" { @@ -1150,3 +1169,59 @@ test "tokenize lone low surrogate" { try std.testing.expectError(error.InvalidUnicodeEscape, result); } + +test "tokenize field" { + var allocator = std.heap.ArenaAllocator.init(std.testing.allocator); + defer allocator.deinit(); + + var reader = std.Io.Reader.fixed(".foo"); + const tokens = try tokenize(allocator.allocator(), &reader); + + try std.testing.expectEqual(2, tokens.len); + try std.testing.expectEqualStrings("foo", tokens[0].field); + try std.testing.expectEqual(.end, tokens[1]); +} + +test "tokenize chained fields" { + var allocator = std.heap.ArenaAllocator.init(std.testing.allocator); + defer allocator.deinit(); + + var reader = std.Io.Reader.fixed(".foo.bar.baz"); + const tokens = try tokenize(allocator.allocator(), &reader); + + try std.testing.expectEqual(4, tokens.len); + try std.testing.expectEqualStrings("foo", tokens[0].field); + try std.testing.expectEqualStrings("bar", tokens[1].field); + try std.testing.expectEqualStrings("baz", tokens[2].field); + try std.testing.expectEqual(.end, tokens[3]); +} + +test "tokenize field does not support namespace" { + var allocator = std.heap.ArenaAllocator.init(std.testing.allocator); + defer allocator.deinit(); + + // Unlike identifiers, field access does not support namespace syntax + var reader = std.Io.Reader.fixed(".foo::bar"); + const tokens = try tokenize(allocator.allocator(), &reader); + + try std.testing.expectEqual(5, tokens.len); + try std.testing.expectEqualStrings("foo", tokens[0].field); + try std.testing.expectEqual(.colon, tokens[1]); + try std.testing.expectEqual(.colon, tokens[2]); + try std.testing.expectEqualStrings("bar", tokens[3].identifier); + try std.testing.expectEqual(.end, tokens[4]); +} + +test "tokenize dot with space before identifier" { + var allocator = std.heap.ArenaAllocator.init(std.testing.allocator); + defer allocator.deinit(); + + // ". foo" should be [dot, identifier("foo")], not [field("foo")] + var reader = std.Io.Reader.fixed(". foo"); + const tokens = try tokenize(allocator.allocator(), &reader); + + try std.testing.expectEqual(3, tokens.len); + try std.testing.expectEqual(.dot, tokens[0]); + try std.testing.expectEqualStrings("foo", tokens[1].identifier); + try std.testing.expectEqual(.end, tokens[2]); +} -- cgit v1.3-1-g0d28