// File sourced from:
// https://github.com/Snektron/vulkan-zig/blob/797ae8af88e84753af9640266de61a985b76b580/generator/xml.zig
const std = @import("std");
const mem = std.mem;
const testing = std.testing;
const Allocator = mem.Allocator;
const ArenaAllocator = std.heap.ArenaAllocator;
const ArrayList = std.ArrayList;

/// A single `name="value"` pair attached to an element.
/// `value` is stored with entities already unescaped.
pub const Attribute = struct {
    name: []const u8,
    value: []const u8,
};

/// One child node of an element.
pub const Content = union(enum) {
    CharData: []const u8,
    Comment: []const u8,
    Element: *Element,
};

/// An XML element: its tag name, attributes and child content in document order.
pub const Element = struct {
    pub const AttributeList = ArrayList(*Attribute);
    pub const ContentList = ArrayList(Content);

    tag: []const u8,
    attributes: AttributeList,
    children: ContentList,

    /// Create an element with no attributes and no children. Both lists
    /// allocate from `alloc`.
    fn init(tag: []const u8, alloc: Allocator) Element {
        return .{
            .tag = tag,
            .attributes = AttributeList.init(alloc),
            .children = ContentList.init(alloc),
        };
    }

    /// Return the value of the first attribute named `attrib_name`, or null
    /// when the element has no such attribute.
    pub fn getAttribute(self: *Element, attrib_name: []const u8) ?[]const u8 {
        for (self.attributes.items) |child| {
            if (mem.eql(u8, child.name, attrib_name)) {
                return child.value;
            }
        }

        return null;
    }

    /// Return the text of the first child element tagged `child_tag`, but only
    /// when that child consists of exactly one character-data node. Any other
    /// shape (missing child, several children, a non-text child) and any error
    /// raised by the lookup predicate yields null.
    pub fn getCharData(self: *Element, child_tag: []const u8) ?[]const u8 {
        const child = (self.findChildByTag(child_tag) catch return null) orelse return null;
        if (child.children.items.len != 1) {
            return null;
        }

        return switch (child.children.items[0]) {
            .CharData => |char_data| char_data,
            else => null,
        };
    }

    /// Iterate over every child node (text, comments and elements).
    pub fn iterator(self: *Element) ChildIterator {
        return .{
            .items = self.children.items,
            .i = 0,
        };
    }

    /// Iterate over child nodes that are elements, skipping text and comments.
    pub fn elements(self: *Element) ChildElementIterator {
        return .{
            .inner = self.iterator(),
        };
    }

    /// Return the first child element whose tag matches `tag`, or null.
    pub fn findChildByTag(self: *Element, tag: []const u8) !?*Element {
        var it = self.findChildrenByTag(tag);
        return try it.next();
    }

    /// Iterate over the child elements whose tag matches `tag`. The returned
    /// iterator compares tags with `strictEqual` unless the caller replaces
    /// its `predicate` field.
    pub fn findChildrenByTag(self: *Element, tag: []const u8) FindChildrenByTagIterator {
        return .{
            .inner = self.elements(),
            .tag = tag,
        };
    }

    pub const ChildIterator = struct {
        items: []Content,
        i: usize,

        /// Return a pointer to the next child node, or null when exhausted.
        pub fn next(self: *ChildIterator) ?*Content {
            if (self.i < self.items.len) {
                self.i += 1;
                return &self.items[self.i - 1];
            }

            return null;
        }
    };

    pub const ChildElementIterator = struct {
        inner: ChildIterator,

        /// Return the next child that is an element, or null when exhausted.
        pub fn next(self: *ChildElementIterator) ?*Element {
            while (self.inner.next()) |child| {
                if (child.* != .Element) {
                    continue;
                }

                return child.*.Element;
            }

            return null;
        }
    };

    /// Default tag predicate: exact byte-wise equality. The options argument
    /// is ignored.
    fn strictEqual(a: []const u8, b: []const u8, _: PredicateOptions) !bool {
        return mem.eql(u8, a, b);
    }

    pub const FindChildrenByTagIterator = struct {
        inner: ChildElementIterator,
        tag: []const u8,
        predicate: *const fn (a: []const u8, b: []const u8, options: PredicateOptions) anyerror!bool = strictEqual,
        predicate_options: PredicateOptions = .{},

        /// Return the next child element for which
        /// `predicate(child.tag, tag, predicate_options)` is true, or null.
        /// Errors raised by the predicate are propagated.
        pub fn next(self: *FindChildrenByTagIterator) !?*Element {
            while (self.inner.next()) |child| {
                if (!try self.predicate(child.tag, self.tag, self.predicate_options)) {
                    continue;
                }

                return child;
            }

            return null;
        }
    };
};

/// Extra state handed to a `FindChildrenByTagIterator` predicate.
pub const PredicateOptions = struct {
    allocator: ?std.mem.Allocator = null,
};

/// Contents of the `<?xml ... ?>` declaration.
pub const XmlDecl = struct {
    version: []const u8,
    encoding: ?[]const u8,
    standalone: ?bool,
};

/// A parsed document. All nodes are allocated from `arena`; call `deinit`
/// to release everything at once.
pub const Document = struct {
    arena: ArenaAllocator,
    xml_decl: ?*XmlDecl,
    root: *Element,

    pub fn deinit(self: Document) void {
        var arena = self.arena; // Copy to stack so self can be taken by value.
        arena.deinit();
    }
};

/// Cursor over the source text that tracks byte offset, line and column.
const ParseContext = struct {
    source: []const u8,
    offset: usize,
    line: usize,
    column: usize,

    fn init(source: []const u8) ParseContext {
        return .{
            .source = source,
            .offset = 0,
            .line = 0,
            .column = 0,
        };
    }

    /// Return the byte at the cursor without consuming it, or null at EOF.
    fn peek(self: *ParseContext) ?u8 {
        return if (self.offset < self.source.len) self.source[self.offset] else null;
    }

    /// Consume and return one byte, or fail with `UnexpectedEof`.
    fn consume(self: *ParseContext) !u8 {
        if (self.offset < self.source.len) {
            return self.consumeNoEof();
        }

        return error.UnexpectedEof;
    }

    /// Consume and return one byte; the caller guarantees input remains.
    /// Advances `line`/`column` according to the byte consumed.
    fn consumeNoEof(self: *ParseContext) u8 {
        std.debug.assert(self.offset < self.source.len);
        const c = self.source[self.offset];
        self.offset += 1;

        if (c == '\n') {
            self.line += 1;
            self.column = 0;
        } else {
            self.column += 1;
        }

        return c;
    }

    /// Consume `char` if it is next; report whether it was consumed.
    fn eat(self: *ParseContext, char: u8) bool {
        self.expect(char) catch return false;
        return true;
    }

    /// Consume `expected` or fail without moving the cursor.
    fn expect(self: *ParseContext, expected: u8) !void {
        if (self.peek()) |actual| {
            if (expected != actual) {
                return error.UnexpectedCharacter;
            }

            _ = self.consumeNoEof();
            return;
        }

        return error.UnexpectedEof;
    }

    /// Consume `text` if it is next; report whether it was consumed.
    fn eatStr(self: *ParseContext, text: []const u8) bool {
        self.expectStr(text) catch return false;
        return true;
    }

    /// Consume `text` or fail without moving the cursor. Fails with
    /// `UnexpectedEof` when fewer than `text.len` bytes remain.
    fn expectStr(self: *ParseContext, text: []const u8) !void {
        if (self.source.len < self.offset + text.len) {
            return error.UnexpectedEof;
        } else if (std.mem.startsWith(u8, self.source[self.offset..], text)) {
            // Consume byte by byte so line/column stay accurate.
            var i: usize = 0;
            while (i < text.len) : (i += 1) {
                _ = self.consumeNoEof();
            }

            return;
        }

        return error.UnexpectedCharacter;
    }

    /// Skip spaces, tabs, newlines and carriage returns; report whether any
    /// were skipped.
    fn eatWs(self: *ParseContext) bool {
        var ws = false;

        while (self.peek()) |ch| {
            switch (ch) {
                ' ', '\t', '\n', '\r' => {
                    ws = true;
                    _ = self.consumeNoEof();
                },
                else => break,
            }
        }

        return ws;
    }

    /// Require at least one whitespace character.
    fn expectWs(self: *ParseContext) !void {
        if (!self.eatWs()) return error.UnexpectedCharacter;
    }

    /// Return the source line containing the cursor, without its newline.
    fn currentLine(self: ParseContext) []const u8 {
        const begin = if (mem.lastIndexOfScalar(u8, self.source[0..self.offset], '\n')) |prev_nl|
            prev_nl + 1
        else
            0;
        const end = mem.indexOfScalarPos(u8, self.source, self.offset, '\n') orelse self.source.len;
        return self.source[begin..end];
    }
};

test "ParseContext" {
    {
        var ctx = ParseContext.init("I like pythons");
        try testing.expectEqual(@as(?u8, 'I'), ctx.peek());
        try testing.expectEqual(@as(u8, 'I'), ctx.consumeNoEof());
        try testing.expectEqual(@as(?u8, ' '), ctx.peek());
        try testing.expectEqual(@as(u8, ' '), try ctx.consume());

        try testing.expect(ctx.eat('l'));
        try testing.expectEqual(@as(?u8, 'i'), ctx.peek());
        try testing.expectEqual(false, ctx.eat('a'));
        try testing.expectEqual(@as(?u8, 'i'), ctx.peek());

        try ctx.expect('i');
        try testing.expectEqual(@as(?u8, 'k'), ctx.peek());
        try testing.expectError(error.UnexpectedCharacter, ctx.expect('a'));
        try testing.expectEqual(@as(?u8, 'k'), ctx.peek());

        try testing.expect(ctx.eatStr("ke"));
        try testing.expectEqual(@as(?u8, ' '), ctx.peek());

        try testing.expect(ctx.eatWs());
        try testing.expectEqual(@as(?u8, 'p'), ctx.peek());
        try testing.expectEqual(false, ctx.eatWs());
        try testing.expectEqual(@as(?u8, 'p'), ctx.peek());

        try testing.expectEqual(false, ctx.eatStr("aaaaaaaaa"));
        try testing.expectEqual(@as(?u8, 'p'), ctx.peek());

        try testing.expectError(error.UnexpectedEof, ctx.expectStr("aaaaaaaaa"));
        try testing.expectEqual(@as(?u8, 'p'), ctx.peek());
        try testing.expectError(error.UnexpectedCharacter, ctx.expectStr("pytn"));
        try testing.expectEqual(@as(?u8, 'p'), ctx.peek());
        try ctx.expectStr("python");
        try testing.expectEqual(@as(?u8, 's'), ctx.peek());
    }

    {
        var ctx = ParseContext.init("");
        try testing.expectEqual(ctx.peek(), null);
        try testing.expectError(error.UnexpectedEof, ctx.consume());
        try testing.expectEqual(ctx.eat('p'), false);
        try testing.expectError(error.UnexpectedEof, ctx.expect('p'));
    }
}

pub const ParseError = error{
    IllegalCharacter,
    UnexpectedEof,
    UnexpectedCharacter,
    UnclosedValue,
    UnclosedComment,
    InvalidName,
    InvalidEntity,
    InvalidStandaloneValue,
    NonMatchingClosingTag,
    InvalidDocument,
    OutOfMemory,
};

/// Parse `source` into a `Document`. Every node is allocated from an arena
/// backed by `backing_allocator`; the caller releases it with
/// `Document.deinit`. On error the arena is freed before returning.
pub fn parse(backing_allocator: Allocator, source: []const u8) !Document {
    var ctx = ParseContext.init(source);
    return try parseDocument(&ctx, backing_allocator);
}

/// Parse an optional prolog, exactly one root element, and optional
/// surrounding comments/whitespace. Anything left over is `InvalidDocument`.
fn parseDocument(ctx: *ParseContext, backing_allocator: Allocator) !Document {
    var doc = Document{
        .arena = ArenaAllocator.init(backing_allocator),
        .xml_decl = null,
        .root = undefined,
    };

    errdefer doc.deinit();

    const allocator = doc.arena.allocator();

    try trySkipComments(ctx, allocator);

    doc.xml_decl = try tryParseProlog(ctx, allocator);
    _ = ctx.eatWs();
    try trySkipComments(ctx, allocator);

    doc.root = (try tryParseElement(ctx, allocator)) orelse return error.InvalidDocument;
    _ = ctx.eatWs();
    try trySkipComments(ctx, allocator);

    if (ctx.peek() != null) return error.InvalidDocument;

    return doc;
}

/// Parse a quoted attribute value (single or double quotes) and return an
/// unescaped copy. Fails with `UnclosedValue` if the closing quote is missing.
fn parseAttrValue(ctx: *ParseContext, alloc: Allocator) ![]const u8 {
    const quote = try ctx.consume();
    if (quote != '"' and quote != '\'') return error.UnexpectedCharacter;

    const begin = ctx.offset;

    while (true) {
        const c = ctx.consume() catch return error.UnclosedValue;
        if (c == quote) break;
    }

    // The closing quote has been consumed; exclude it from the value.
    const end = ctx.offset - 1;

    return try dupeAndUnescape(alloc, ctx.source[begin..end]);
}

/// Parse `= "value"` with optional whitespace around the equals sign.
fn parseEqAttrValue(ctx: *ParseContext, alloc: Allocator) ![]const u8 {
    _ = ctx.eatWs();
    try ctx.expect('=');
    _ = ctx.eatWs();

    return try parseAttrValue(ctx, alloc);
}

/// Parse a name and return it as a slice of the source (no copy).
/// Fails with `InvalidName` when no name characters are present.
fn parseNameNoDupe(ctx: *ParseContext) ![]const u8 {
    // XML's spec on names is very long, so to make this easier
    // we just take any character that is not special and not whitespace
    const begin = ctx.offset;

    while (ctx.peek()) |ch| {
        switch (ch) {
            ' ', '\t', '\n', '\r' => break,
            '&', '"', '\'', '<', '>', '?', '=', '/' => break,
            else => _ = ctx.consumeNoEof(),
        }
    }

    const end = ctx.offset;
    if (begin == end) return error.InvalidName;

    return ctx.source[begin..end];
}

/// Consume text up to the next `<` (or EOF) and return an unescaped copy,
/// or null when there is no text at the cursor.
fn tryParseCharData(ctx: *ParseContext, alloc: Allocator) !?[]const u8 {
    const begin = ctx.offset;

    while (ctx.peek()) |ch| {
        switch (ch) {
            '<' => break,
            else => _ = ctx.consumeNoEof(),
        }
    }

    const end = ctx.offset;
    if (begin == end) return null;

    return try dupeAndUnescape(alloc, ctx.source[begin..end]);
}

/// Parse one child node: character data, a comment, or a nested element,
/// tried in that order. The explicit error set is required because this
/// function and `tryParseElement` are mutually recursive.
fn parseContent(ctx: *ParseContext, alloc: Allocator) ParseError!Content {
    if (try tryParseCharData(ctx, alloc)) |cd| {
        return Content{ .CharData = cd };
    } else if (try tryParseComment(ctx, alloc)) |comment| {
        return Content{ .Comment = comment };
    } else if (try tryParseElement(ctx, alloc)) |elem| {
        return Content{ .Element = elem };
    } else {
        return error.UnexpectedCharacter;
    }
}

/// Parse `name = "value"`. Returns null (consuming nothing) when no name
/// starts at the cursor.
fn tryParseAttr(ctx: *ParseContext, alloc: Allocator) !?*Attribute {
    const name = parseNameNoDupe(ctx) catch return null;
    _ = ctx.eatWs();
    try ctx.expect('=');
    _ = ctx.eatWs();
    const value = try parseAttrValue(ctx, alloc);

    const attr = try alloc.create(Attribute);
    attr.name = try alloc.dupe(u8, name);
    attr.value = value;
    return attr;
}

/// Parse an element, either self-closing (`<a/>`) or with content and a
/// matching closing tag. Returns null and leaves the context untouched when
/// the cursor is not at `<` followed by a name.
fn tryParseElement(ctx: *ParseContext, alloc: Allocator) !?*Element {
    // Snapshot the whole context so backtracking restores line/column too.
    const saved = ctx.*;
    if (!ctx.eat('<')) return null;
    const tag = parseNameNoDupe(ctx) catch {
        ctx.* = saved;
        return null;
    };

    const element = try alloc.create(Element);
    element.* = Element.init(try alloc.dupe(u8, tag), alloc);

    // Each attribute must be preceded by whitespace.
    while (ctx.eatWs()) {
        const attr = (try tryParseAttr(ctx, alloc)) orelse break;
        try element.attributes.append(attr);
    }

    if (ctx.eatStr("/>")) {
        return element;
    }

    try ctx.expect('>');

    while (true) {
        if (ctx.peek() == null) {
            return error.UnexpectedEof;
        } else if (ctx.eatStr("</")) {
            break;
        }

        const content = try parseContent(ctx, alloc);
        try element.children.append(content);
    }

    const closing_tag = try parseNameNoDupe(ctx);
    if (!std.mem.eql(u8, tag, closing_tag)) {
        return error.NonMatchingClosingTag;
    }

    _ = ctx.eatWs();
    try ctx.expect('>');
    return element;
}

test "tryParseElement" {
    var arena = std.heap.ArenaAllocator.init(testing.allocator);
    defer arena.deinit();
    const alloc = arena.allocator();

    {
        var ctx = ParseContext.init("<= a='b'/>");
        try testing.expectEqual(@as(?*Element, null), try tryParseElement(&ctx, alloc));
        try testing.expectEqual(@as(?u8, '<'), ctx.peek());
    }

    {
        var ctx = ParseContext.init("<python size='15' color = \"green\"/>");
        const elem = try tryParseElement(&ctx, alloc);
        try testing.expectEqualSlices(u8, elem.?.tag, "python");

        const size_attr = elem.?.attributes.items[0];
        try testing.expectEqualSlices(u8, size_attr.name, "size");
        try testing.expectEqualSlices(u8, size_attr.value, "15");

        const color_attr = elem.?.attributes.items[1];
        try testing.expectEqualSlices(u8, color_attr.name, "color");
        try testing.expectEqualSlices(u8, color_attr.value, "green");
    }

    {
        var ctx = ParseContext.init("<python>test</python>");
        const elem = try tryParseElement(&ctx, alloc);
        try testing.expectEqualSlices(u8, elem.?.tag, "python");
        try testing.expectEqualSlices(u8, elem.?.children.items[0].CharData, "test");
    }

    {
        var ctx = ParseContext.init("<a>b<c/>d<e/>f<!--g--></a>");
        const elem = try tryParseElement(&ctx, alloc);
        try testing.expectEqualSlices(u8, elem.?.tag, "a");
        try testing.expectEqualSlices(u8, elem.?.children.items[0].CharData, "b");
        try testing.expectEqualSlices(u8, elem.?.children.items[1].Element.tag, "c");
        try testing.expectEqualSlices(u8, elem.?.children.items[2].CharData, "d");
        try testing.expectEqualSlices(u8, elem.?.children.items[3].Element.tag, "e");
        try testing.expectEqualSlices(u8, elem.?.children.items[4].CharData, "f");
        try testing.expectEqualSlices(u8, elem.?.children.items[5].Comment, "g");
    }
}

/// Parse the `<?xml version=... [encoding=...] [standalone=...] ?>`
/// declaration. Returns null and leaves the context untouched when the
/// cursor is not at `<?xml`.
fn tryParseProlog(ctx: *ParseContext, alloc: Allocator) !?*XmlDecl {
    // Snapshot the whole context so backtracking restores line/column too.
    const saved = ctx.*;
    if (!ctx.eatStr("<?") or !mem.eql(u8, try parseNameNoDupe(ctx), "xml")) {
        ctx.* = saved;
        return null;
    }

    const decl = try alloc.create(XmlDecl);
    decl.encoding = null;
    decl.standalone = null;

    // Version info is mandatory
    try ctx.expectWs();
    try ctx.expectStr("version");
    decl.version = try parseEqAttrValue(ctx, alloc);

    if (ctx.eatWs()) {
        // Optional encoding and standalone info
        var require_ws = false;

        if (ctx.eatStr("encoding")) {
            decl.encoding = try parseEqAttrValue(ctx, alloc);
            require_ws = true;
        }

        // Whitespace is required before `standalone` only if `encoding`
        // was parsed; otherwise it was already consumed above.
        if (require_ws == ctx.eatWs() and ctx.eatStr("standalone")) {
            const standalone = try parseEqAttrValue(ctx, alloc);
            if (std.mem.eql(u8, standalone, "yes")) {
                decl.standalone = true;
            } else if (std.mem.eql(u8, standalone, "no")) {
                decl.standalone = false;
            } else {
                return error.InvalidStandaloneValue;
            }
        }

        _ = ctx.eatWs();
    }

    try ctx.expectStr("?>");
    return decl;
}

test "tryParseProlog" {
    var arena = std.heap.ArenaAllocator.init(testing.allocator);
    defer arena.deinit();
    const alloc = arena.allocator();

    {
        var ctx = ParseContext.init("<?xmla version='aa'?>");
        try testing.expectEqual(@as(?*XmlDecl, null), try tryParseProlog(&ctx, alloc));
        try testing.expectEqual(@as(?u8, '<'), ctx.peek());
    }

    {
        var ctx = ParseContext.init("<?xml version='aa'?>");
        const decl = try tryParseProlog(&ctx, alloc);
        try testing.expectEqualSlices(u8, "aa", decl.?.version);
        try testing.expectEqual(@as(?[]const u8, null), decl.?.encoding);
        try testing.expectEqual(@as(?bool, null), decl.?.standalone);
    }

    {
        var ctx = ParseContext.init("<?xml version=\"aa\" encoding = 'bbb' standalone   \t =   'yes'?>");
        const decl = try tryParseProlog(&ctx, alloc);
        try testing.expectEqualSlices(u8, "aa", decl.?.version);
        try testing.expectEqualSlices(u8, "bbb", decl.?.encoding.?);
        try testing.expectEqual(@as(?bool, true), decl.?.standalone.?);
    }
}

/// Consume any run of comments, each optionally followed by whitespace.
/// The comment text is allocated from `alloc` and then discarded.
fn trySkipComments(ctx: *ParseContext, alloc: Allocator) !void {
    while (try tryParseComment(ctx, alloc)) |_| {
        _ = ctx.eatWs();
    }
}

/// Parse `<!-- ... -->` and return a copy of the text between the delimiters,
/// or null when the cursor is not at a comment opener. Fails with
/// `UnclosedComment` if the terminator is never found.
fn tryParseComment(ctx: *ParseContext, alloc: Allocator) !?[]const u8 {
    if (!ctx.eatStr("<!--")) return null;

    const begin = ctx.offset;
    while (!ctx.eatStr("-->")) {
        _ = ctx.consume() catch return error.UnclosedComment;
    }

    const end = ctx.offset - "-->".len;
    return try alloc.dupe(u8, ctx.source[begin..end]);
}

/// Map one of the five predefined XML entities (including the leading `&`
/// and trailing `;`) to its character. Anything else is `InvalidEntity`.
fn unescapeEntity(text: []const u8) !u8 {
    const EntitySubstition = struct { text: []const u8, replacement: u8 };

    const entities = [_]EntitySubstition{
        .{ .text = "&lt;", .replacement = '<' },
        .{ .text = "&gt;", .replacement = '>' },
        .{ .text = "&amp;", .replacement = '&' },
        .{ .text = "&apos;", .replacement = '\'' },
        .{ .text = "&quot;", .replacement = '"' },
    };

    for (entities) |entity| {
        if (std.mem.eql(u8, text, entity.text)) return entity.replacement;
    }

    return error.InvalidEntity;
}

/// Return a copy of `text`, allocated from `alloc`, with every predefined
/// entity replaced by its character. The returned slice has exactly the
/// unescaped length. Fails with `InvalidEntity` on an unterminated or
/// unknown entity; the scratch buffer is freed on every error path.
fn dupeAndUnescape(alloc: Allocator, text: []const u8) ![]const u8 {
    // Unescaping never grows the text, so `text.len` is an upper bound.
    const str = try alloc.alloc(u8, text.len);
    errdefer alloc.free(str);

    var j: usize = 0;
    var i: usize = 0;
    while (i < text.len) : (j += 1) {
        if (text[i] == '&') {
            const entity_end = 1 + (mem.indexOfScalarPos(u8, text, i, ';') orelse return error.InvalidEntity);
            str[j] = try unescapeEntity(text[i..entity_end]);
            i = entity_end;
        } else {
            str[j] = text[i];
            i += 1;
        }
    }

    // Shrink to the bytes actually written. `realloc` falls back to a fresh
    // allocation when the allocator cannot shrink in place, so a refused
    // in-place resize no longer turns into a spurious OutOfMemory.
    return alloc.realloc(str, j);
}

test "dupeAndUnescape" {
    var arena = std.heap.ArenaAllocator.init(testing.allocator);
    defer arena.deinit();
    const alloc = arena.allocator();

    try testing.expectEqualSlices(u8, "test", try dupeAndUnescape(alloc, "test"));
    try testing.expectEqualSlices(u8, "a<b&c>d\"e'f<", try dupeAndUnescape(alloc, "a&lt;b&amp;c&gt;d&quot;e&apos;f&lt;"));
    try testing.expectError(error.InvalidEntity, dupeAndUnescape(alloc, "python&"));
    try testing.expectError(error.InvalidEntity, dupeAndUnescape(alloc, "python&&"));
    try testing.expectError(error.InvalidEntity, dupeAndUnescape(alloc, "python&test;"));
    try testing.expectError(error.InvalidEntity, dupeAndUnescape(alloc, "python&boa"));
}

test "Top level comments" {
    var arena = std.heap.ArenaAllocator.init(testing.allocator);
    defer arena.deinit();
    const alloc = arena.allocator();

    const doc = try parse(alloc, "<?xml version='aa'?><!--comment--><python color='green'/><!--another comment-->");
    try testing.expectEqualSlices(u8, "python", doc.root.tag);
}