fix: xml parser can now handle repeated root arrays
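Some services return list results as a run of identical elements placed directly under the parent, with no container element wrapping them. That is the shape the new repeated_root handling targets; a minimal illustration, mirroring the data in the new "can parse list objects" test below:

    <ListBucketResult>
      <Contents>...</Contents>
      <Contents>...</Contents>
    </ListBucketResult>

Until now the shaper only handled the collection shape, where a single container element wraps the repeated children.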

Simon Hartcher 2025-04-29 16:56:40 +10:00
parent a420528a59
commit 7438642d91
2 changed files with 126 additions and 22 deletions


@@ -25,6 +25,7 @@ pub const Element = struct {
tag: []const u8,
attributes: AttributeList,
children: ContentList,
next_sibling: ?*Element = null,
fn init(tag: []const u8, alloc: Allocator) Element {
return .{
@@ -347,7 +348,7 @@ fn parseDocument(ctx: *ParseContext, backing_allocator: Allocator) !Document {
_ = ctx.eatWs();
try trySkipComments(ctx, allocator);
doc.root = (try tryParseElement(ctx, allocator)) orelse return error.InvalidDocument;
doc.root = (try tryParseElement(ctx, allocator, null)) orelse return error.InvalidDocument;
_ = ctx.eatWs();
try trySkipComments(ctx, allocator);
@@ -415,12 +416,12 @@ fn tryParseCharData(ctx: *ParseContext, alloc: Allocator) !?[]const u8 {
return try dupeAndUnescape(alloc, ctx.source[begin..end]);
}
fn parseContent(ctx: *ParseContext, alloc: Allocator) ParseError!Content {
fn parseContent(ctx: *ParseContext, alloc: Allocator, parent: ?*Element) ParseError!Content {
if (try tryParseCharData(ctx, alloc)) |cd| {
return Content{ .CharData = cd };
} else if (try tryParseComment(ctx, alloc)) |comment| {
return Content{ .Comment = comment };
} else if (try tryParseElement(ctx, alloc)) |elem| {
} else if (try tryParseElement(ctx, alloc, parent)) |elem| {
return Content{ .Element = elem };
} else {
return error.UnexpectedCharacter;
@@ -440,7 +441,7 @@ fn tryParseAttr(ctx: *ParseContext, alloc: Allocator) !?*Attribute {
return attr;
}
fn tryParseElement(ctx: *ParseContext, alloc: Allocator) !?*Element {
fn tryParseElement(ctx: *ParseContext, alloc: Allocator, parent: ?*Element) !?*Element {
const start = ctx.offset;
if (!ctx.eat('<')) return null;
const tag = parseNameNoDupe(ctx) catch {
@@ -469,7 +470,7 @@ fn tryParseElement(ctx: *ParseContext, alloc: Allocator) !?*Element {
break;
}
const content = try parseContent(ctx, alloc);
const content = try parseContent(ctx, alloc, element);
try element.children.append(content);
}
@@ -480,6 +481,23 @@ fn tryParseElement(ctx: *ParseContext, alloc: Allocator) !?*Element {
_ = ctx.eatWs();
try ctx.expect('>');
if (parent) |p| {
var last_element: ?*Element = null;
for (0..p.children.items.len) |i| {
const child = p.children.items[p.children.items.len - i - 1];
if (child == .Element) {
last_element = child.Element;
break;
}
}
if (last_element) |lc| {
lc.next_sibling = element;
}
}
return element;
}
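The next_sibling field added above chains element children that share a parent into a singly linked list: when an element finishes parsing, the previous element child of the same parent gets its next_sibling pointed at it (text and comment children are skipped when looking for that previous element). A minimal sketch of walking that chain, assuming Element is this file's type; countSiblings and start are illustrative names, not part of the commit:

fn countSiblings(start: *Element) usize {
    // Follows the links set up in tryParseElement, visiting sibling
    // elements in document order.
    var count: usize = 0;
    var current: ?*Element = start;
    while (current) |el| : (current = el.next_sibling) {
        // The shaper change below additionally checks el.tag so that only
        // one kind of repeated element is collected.
        count += 1;
    }
    return count;
}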
@@ -490,13 +508,13 @@ test "tryParseElement" {
{
var ctx = ParseContext.init("<= a='b'/>");
try testing.expectEqual(@as(?*Element, null), try tryParseElement(&ctx, alloc));
try testing.expectEqual(@as(?*Element, null), try tryParseElement(&ctx, alloc, null));
try testing.expectEqual(@as(?u8, '<'), ctx.peek());
}
{
var ctx = ParseContext.init("<python size='15' color = \"green\"/>");
const elem = try tryParseElement(&ctx, alloc);
const elem = try tryParseElement(&ctx, alloc, null);
try testing.expectEqualSlices(u8, elem.?.tag, "python");
const size_attr = elem.?.attributes.items[0];
@@ -510,14 +528,14 @@ test "tryParseElement" {
{
var ctx = ParseContext.init("<python>test</python>");
const elem = try tryParseElement(&ctx, alloc);
const elem = try tryParseElement(&ctx, alloc, null);
try testing.expectEqualSlices(u8, elem.?.tag, "python");
try testing.expectEqualSlices(u8, elem.?.children.items[0].CharData, "test");
}
{
var ctx = ParseContext.init("<a>b<c/>d<e/>f<!--g--></a>");
const elem = try tryParseElement(&ctx, alloc);
const elem = try tryParseElement(&ctx, alloc, null);
try testing.expectEqualSlices(u8, elem.?.tag, "a");
try testing.expectEqualSlices(u8, elem.?.children.items[0].CharData, "b");
try testing.expectEqualSlices(u8, elem.?.children.items[1].Element.tag, "c");


@@ -1,6 +1,7 @@
const std = @import("std");
const xml = @import("xml.zig");
const date = @import("date");
const sm = @import("service_manifest");
const log = std.log.scoped(.xml_shaper);
@@ -94,6 +95,53 @@ pub fn parse(comptime T: type, source: []const u8, options: ParseOptions) !Parse
return Parsed(T).init(arena_allocator, try parseInternal(T, root, opts), parsed);
}
pub const XmlArrayStyle = enum {
collection, // Has a container element and list of child elements
repeated_root, // Repeats the same element without a container, e.g. S3 ListBucketResult
};
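For contrast with repeated_root, the collection style is the shape the shaper already handled: a container element whose element children are the repeated items. An illustrative fragment (tag names borrowed from the existing <Items>/<Item> comment further down, not from a real response):

    <Items>
      <Item>foo</Item>
      <Item>bar</Item>
    </Items>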
fn detectArrayStyle(comptime T: type, element: *xml.Element, options: ParseOptions) !XmlArrayStyle {
_ = options;
if (@typeInfo(T) != .@"struct") {
return .collection;
}
// does the element have child elements that match our expected struct?
const field_names = comptime blk: {
var result: [std.meta.fieldNames(T).len][]const u8 = undefined;
for (std.meta.fieldNames(T), 0..) |field_name, i| {
result[i] = if (@hasDecl(T, "fieldNameFor"))
T.fieldNameFor(undefined, field_name)
else
field_name;
}
break :blk result;
};
var matching_fields: usize = 0;
for (element.children.items) |content| {
switch (content) {
.Element => |el| {
for (field_names) |field_name| {
if (std.mem.eql(u8, field_name, el.tag)) {
matching_fields += 1;
}
}
},
else => continue,
}
}
if (matching_fields > 0) {
return .repeated_root;
}
return .collection;
}
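detectArrayStyle works off the slice's item type: it builds the set of tag names the struct expects (via T.fieldNameFor when the type declares it, otherwise the raw field names) and counts child elements of the candidate whose tags match. Any match means the candidate element already looks like one of the items itself, so the style is repeated_root; no matches falls back to collection. A hypothetical item type of the kind this matches, using the file's std import; the struct and its mapping are illustrative, not the generated service-manifest types:

// Illustrative stand-in for a generated type such as the S3 object shape.
const Object = struct {
    key: ?[]const u8 = null,
    size: ?i64 = null,

    // Optional hook consulted by detectArrayStyle to map Zig field names to
    // the tag names used on the wire.
    pub fn fieldNameFor(_: @This(), comptime field_name: []const u8) []const u8 {
        if (std.mem.eql(u8, field_name, "key")) return "Key";
        if (std.mem.eql(u8, field_name, "size")) return "Size";
        unreachable;
    }
};
// For <Contents><Key>file1.txt</Key><Size>1024</Size></Contents> the child
// tags match Object's mapped field names, so the style is .repeated_root; a
// wrapping container whose children are themselves the item elements has no
// matching child tags and stays .collection.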
fn parseInternal(comptime T: type, element: *xml.Element, options: ParseOptions) !T {
switch (@typeInfo(T)) {
.bool => {
@@ -330,23 +378,31 @@ fn parseInternal(comptime T: type, element: *xml.Element, options: ParseOptions)
// <Item>bar</Item>
// <Items>
if (ptr_info.child != u8) {
log.debug("type = {s}, ptr_info.child == {s}, element = {s}", .{ @typeName(T), @typeName(ptr_info.child), element.tag });
var iterator = element.elements();
const array_style = try detectArrayStyle(ptr_info.child, element, options);
std.log.debug("type = {s}, style = {s}, ptr_info.child == {s}, element = {s}", .{ @typeName(T), @tagName(array_style), @typeName(ptr_info.child), element.tag });
var children = std.ArrayList(ptr_info.child).init(allocator);
defer children.deinit();
switch (array_style) {
.collection => {
var iterator = element.elements();
while (iterator.next()) |child_element| {
try children.append(try parseInternal(ptr_info.child, child_element, options));
}
},
.repeated_root => {
var current: ?*Element = element;
while (current) |el| : (current = el.next_sibling) {
if (!std.mem.eql(u8, el.tag, element.tag)) continue;
try children.append(try parseInternal(ptr_info.child, el, options));
}
},
}
return children.toOwnedSlice();
// var inx: usize = 0;
// while (inx < children.len) {
// switch (element.children.items[inx]) {
// .Element => children[inx] = try parseInternal(ptr_info.child, element.children.items[inx].Element, options),
// .CharData => children[inx] = try allocator.dupe(u8, element.children.items[inx].CharData),
// .Comment => children[inx] = try allocator.dupe(u8, element.children.items[inx].Comment), // This might be an error...
// }
// inx += 1;
// }
}
return try allocator.dupe(u8, element.children.items[0].CharData);
},
@@ -738,3 +794,33 @@ test "compiler assertion failure 2" {
defer parsed_data.deinit();
try testing.expect(parsed_data.parsed_value.key_group_list.?.quantity == 42);
}
test "can parse list objects" {
const data =
\\<?xml version="1.0" encoding="UTF-8"?>
\\<ListBucketResult>
\\ <Contents>
\\ <Key>file1.txt</Key>
\\ <Size>1024</Size>
\\ </Contents>
\\ <Contents>
\\ <Key>file2.jpg</Key>
\\ <Size>2048</Size>
\\ </Contents>
\\</ListBucketResult>
;
const Response = sm.s3.list_objects_v2.Response;
const parsed_data = try parse(Response, data, .{ .allocator = testing.allocator });
defer parsed_data.deinit();
const response: Response = parsed_data.parsed_value;
const s3_objects: []sm.s3.Object = response.contents.?;
try testing.expectEqual(2, s3_objects.len);
try testing.expectEqualStrings(s3_objects[0].key.?, "file1.txt");
try testing.expectEqualStrings(s3_objects[1].key.?, "file2.jpg");
try testing.expectEqual(s3_objects[0].size.?, 1024);
try testing.expectEqual(s3_objects[1].size.?, 2048);
}